diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md new file mode 100644 index 000000000..bf9494646 --- /dev/null +++ b/.github/ISSUE_TEMPLATE.md @@ -0,0 +1,58 @@ +## Please follow the guide below + +- You will be asked some questions and requested to provide some information, please read them **carefully** and answer honestly +- Put an `x` into all the boxes [ ] relevant to your *issue* (like that [x]) +- Use *Preview* tab to see how your issue will actually look like + +--- + +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.04.06*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.04.06** + +### Before submitting an *issue* make sure you have: +- [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections +- [ ] [Searched](https://github.com/rg3/youtube-dl/search?type=Issues) the bugtracker for similar issues including closed ones + +### What is the purpose of your *issue*? +- [ ] Bug report (encountered problems with youtube-dl) +- [ ] Site support request (request for adding support for a new site) +- [ ] Feature request (request for a new functionality) +- [ ] Question +- [ ] Other + +--- + +### The following sections concretize particular purposed issues, you can erase any section (the contents between triple ---) not applicable to your *issue* + +--- + +### If the purpose of this *issue* is a *bug report*, *site support request* or you are not completely sure provide the full verbose output as follows: + +Add `-v` flag to **your command line** you run youtube-dl with, copy the **whole** output and insert it here. It should look similar to one below (replace it with **your** log inserted between triple ```): +``` +$ youtube-dl -v +[debug] System config: [] +[debug] User config: [] +[debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] +[debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 +[debug] youtube-dl version 2016.04.06 +[debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 +[debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 +[debug] Proxy map: {} +... + +``` + +--- + +### If the purpose of this *issue* is a *site support request* please provide all kinds of example URLs support for which should be included (replace following example URLs by **yours**): +- Single video: https://www.youtube.com/watch?v=BaW_jenozKc +- Single video: https://youtu.be/BaW_jenozKc +- Playlist: https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc + +--- + +### Description of your *issue*, suggested solution and other information + +Explanation of your *issue* in arbitrary form goes here. Please make sure the [description is worded well enough to be understood](https://github.com/rg3/youtube-dl#is-the-description-of-the-issue-itself-sufficient). Provide as much context and examples as possible. +If work on your *issue* required an account credentials please provide them or explain how one can obtain them. diff --git a/.github/ISSUE_TEMPLATE_tmpl.md b/.github/ISSUE_TEMPLATE_tmpl.md new file mode 100644 index 000000000..a5e6a4233 --- /dev/null +++ b/.github/ISSUE_TEMPLATE_tmpl.md @@ -0,0 +1,58 @@ +## Please follow the guide below + +- You will be asked some questions and requested to provide some information, please read them **carefully** and answer honestly +- Put an `x` into all the boxes [ ] relevant to your *issue* (like that [x]) +- Use *Preview* tab to see how your issue will actually look like + +--- + +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *%(version)s*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **%(version)s** + +### Before submitting an *issue* make sure you have: +- [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections +- [ ] [Searched](https://github.com/rg3/youtube-dl/search?type=Issues) the bugtracker for similar issues including closed ones + +### What is the purpose of your *issue*? +- [ ] Bug report (encountered problems with youtube-dl) +- [ ] Site support request (request for adding support for a new site) +- [ ] Feature request (request for a new functionality) +- [ ] Question +- [ ] Other + +--- + +### The following sections concretize particular purposed issues, you can erase any section (the contents between triple ---) not applicable to your *issue* + +--- + +### If the purpose of this *issue* is a *bug report*, *site support request* or you are not completely sure provide the full verbose output as follows: + +Add `-v` flag to **your command line** you run youtube-dl with, copy the **whole** output and insert it here. It should look similar to one below (replace it with **your** log inserted between triple ```): +``` +$ youtube-dl -v +[debug] System config: [] +[debug] User config: [] +[debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] +[debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 +[debug] youtube-dl version %(version)s +[debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 +[debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 +[debug] Proxy map: {} +... + +``` + +--- + +### If the purpose of this *issue* is a *site support request* please provide all kinds of example URLs support for which should be included (replace following example URLs by **yours**): +- Single video: https://www.youtube.com/watch?v=BaW_jenozKc +- Single video: https://youtu.be/BaW_jenozKc +- Playlist: https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc + +--- + +### Description of your *issue*, suggested solution and other information + +Explanation of your *issue* in arbitrary form goes here. Please make sure the [description is worded well enough to be understood](https://github.com/rg3/youtube-dl#is-the-description-of-the-issue-itself-sufficient). Provide as much context and examples as possible. +If work on your *issue* required an account credentials please provide them or explain how one can obtain them. diff --git a/.gitignore b/.gitignore index 0422adf44..72c10425d 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ *.pyc *.pyo +*.class *~ *.DS_Store wine-py2exe/ @@ -12,6 +13,7 @@ README.txt youtube-dl.1 youtube-dl.bash-completion youtube-dl.fish +youtube_dl/extractor/lazy_extractors.py youtube-dl youtube-dl.exe youtube-dl.tar.gz @@ -32,4 +34,4 @@ test/testdata .tox youtube-dl.zsh .idea -.idea/* \ No newline at end of file +.idea/* diff --git a/AUTHORS b/AUTHORS index 0e90a3ecb..ea8d39978 100644 --- a/AUTHORS +++ b/AUTHORS @@ -160,3 +160,10 @@ Erwin de Haan Jens Wille Robin Houtevelts Patrick Griffis +Aidan Rowe +mutantmonkey +Ben Congdon +Kacper Michajłow +José Joaquín Atria +Viťas Strádal +Kagami Hiiragi diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 39472c554..0df6193fb 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -85,14 +85,16 @@ To run the test, simply invoke your favorite test runner, or execute a test file If you want to create a build of youtube-dl yourself, you'll need * python -* make +* make (both GNU make and BSD make are supported) * pandoc * zip * nosetests ### Adding support for a new site -If you want to add support for a new site, you can follow this quick list (assuming your service is called `yourextractor`): +If you want to add support for a new site, first of all **make sure** this site is **not dedicated to [copyright infringement](#can-you-add-support-for-this-anime-video-site-or-site-which-shows-current-movies-for-free)**. youtube-dl does **not support** such sites thus pull requests adding support for them **will be rejected**. + +After you have ensured this site is distributing it's content legally, you can follow this quick list (assuming your service is called `yourextractor`): 1. [Fork this repository](https://github.com/rg3/youtube-dl/fork) 2. Check out the source code with `git clone git@github.com:YOUR_GITHUB_USERNAME/youtube-dl.git` @@ -140,16 +142,17 @@ If you want to add support for a new site, you can follow this quick list (assum ``` 5. Add an import in [`youtube_dl/extractor/__init__.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/__init__.py). 6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc. -7. Have a look at [`youtube_dl/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L62-L200). Add tests and code for as many as you want. -8. If you can, check the code with [flake8](https://pypi.python.org/pypi/flake8). -9. When the tests pass, [add](http://git-scm.com/docs/git-add) the new files and [commit](http://git-scm.com/docs/git-commit) them and [push](http://git-scm.com/docs/git-push) the result, like this: +7. Have a look at [`youtube_dl/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/rg3/youtube-dl/blob/58525c94d547be1c8167d16c298bdd75506db328/youtube_dl/extractor/common.py#L68-L226). Add tests and code for as many as you want. +8. Keep in mind that the only mandatory fields in info dict for successful extraction process are `id`, `title` and either `url` or `formats`, i.e. these are the critical data the extraction does not make any sense without. This means that [any field](https://github.com/rg3/youtube-dl/blob/58525c94d547be1c8167d16c298bdd75506db328/youtube_dl/extractor/common.py#L138-L226) apart from aforementioned mandatory ones should be treated **as optional** and extraction should be **tolerate** to situations when sources for these fields can potentially be unavailable (even if they always available at the moment) and **future-proof** in order not to break the extraction of general purpose mandatory fields. For example, if you have some intermediate dict `meta` that is a source of metadata and it has a key `summary` that you want to extract and put into resulting info dict as `description`, you should be ready that this key may be missing from the `meta` dict, i.e. you should extract it as `meta.get('summary')` and not `meta['summary']`. Similarly, you should pass `fatal=False` when extracting data from a webpage with `_search_regex/_html_search_regex`. +9. Check the code with [flake8](https://pypi.python.org/pypi/flake8). +10. When the tests pass, [add](http://git-scm.com/docs/git-add) the new files and [commit](http://git-scm.com/docs/git-commit) them and [push](http://git-scm.com/docs/git-push) the result, like this: $ git add youtube_dl/extractor/__init__.py $ git add youtube_dl/extractor/yourextractor.py $ git commit -m '[yourextractor] Add new extractor' $ git push origin yourextractor -10. Finally, [create a pull request](https://help.github.com/articles/creating-a-pull-request). We'll then review and merge it. +11. Finally, [create a pull request](https://help.github.com/articles/creating-a-pull-request). We'll then review and merge it. In any case, thank you very much for your contributions! diff --git a/Makefile b/Makefile index cb449b7e6..06cffcb71 100644 --- a/Makefile +++ b/Makefile @@ -1,8 +1,9 @@ all: youtube-dl README.md CONTRIBUTING.md README.txt youtube-dl.1 youtube-dl.bash-completion youtube-dl.zsh youtube-dl.fish supportedsites clean: - rm -rf youtube-dl.1.temp.md youtube-dl.1 youtube-dl.bash-completion README.txt MANIFEST build/ dist/ .coverage cover/ youtube-dl.tar.gz youtube-dl.zsh youtube-dl.fish *.dump *.part *.info.json *.mp4 *.flv *.mp3 *.avi CONTRIBUTING.md.tmp youtube-dl youtube-dl.exe + rm -rf youtube-dl.1.temp.md youtube-dl.1 youtube-dl.bash-completion README.txt MANIFEST build/ dist/ .coverage cover/ youtube-dl.tar.gz youtube-dl.zsh youtube-dl.fish youtube_dl/extractor/lazy_extractors.py *.dump *.part *.info.json *.mp4 *.flv *.mp3 *.avi CONTRIBUTING.md.tmp ISSUE_TEMPLATE.md.tmp youtube-dl youtube-dl.exe find . -name "*.pyc" -delete + find . -name "*.class" -delete PREFIX ?= /usr/local BINDIR ?= $(PREFIX)/bin @@ -11,15 +12,7 @@ SHAREDIR ?= $(PREFIX)/share PYTHON ?= /usr/bin/env python # set SYSCONFDIR to /etc if PREFIX=/usr or PREFIX=/usr/local -ifeq ($(PREFIX),/usr) - SYSCONFDIR=/etc -else - ifeq ($(PREFIX),/usr/local) - SYSCONFDIR=/etc - else - SYSCONFDIR=$(PREFIX)/etc - endif -endif +SYSCONFDIR != if [ $(PREFIX) = /usr -o $(PREFIX) = /usr/local ]; then echo /etc; else echo $(PREFIX)/etc; fi install: youtube-dl youtube-dl.1 youtube-dl.bash-completion youtube-dl.zsh youtube-dl.fish install -d $(DESTDIR)$(BINDIR) @@ -44,7 +37,7 @@ test: ot: offlinetest offlinetest: codetest - nosetests --verbose test --exclude test_download.py --exclude test_age_restriction.py --exclude test_subtitles.py --exclude test_write_annotations.py --exclude test_youtube_lists.py --exclude test_iqiyi_sdk_interpreter.py + $(PYTHON) -m nose --verbose test --exclude test_download.py --exclude test_age_restriction.py --exclude test_subtitles.py --exclude test_write_annotations.py --exclude test_youtube_lists.py --exclude test_iqiyi_sdk_interpreter.py tar: youtube-dl.tar.gz @@ -66,6 +59,9 @@ README.md: youtube_dl/*.py youtube_dl/*/*.py CONTRIBUTING.md: README.md $(PYTHON) devscripts/make_contributing.py README.md CONTRIBUTING.md +.github/ISSUE_TEMPLATE.md: devscripts/make_issue_template.py .github/ISSUE_TEMPLATE_tmpl.md youtube_dl/version.py + $(PYTHON) devscripts/make_issue_template.py .github/ISSUE_TEMPLATE_tmpl.md .github/ISSUE_TEMPLATE.md + supportedsites: $(PYTHON) devscripts/make_supportedsites.py docs/supportedsites.md @@ -92,6 +88,12 @@ youtube-dl.fish: youtube_dl/*.py youtube_dl/*/*.py devscripts/fish-completion.in fish-completion: youtube-dl.fish +lazy-extractors: youtube_dl/extractor/lazy_extractors.py + +_EXTRACTOR_FILES != find youtube_dl/extractor -iname '*.py' -and -not -iname 'lazy_extractors.py' +youtube_dl/extractor/lazy_extractors.py: devscripts/make_lazy_extractors.py devscripts/lazy_load_template.py $(_EXTRACTOR_FILES) + $(PYTHON) devscripts/make_lazy_extractors.py $@ + youtube-dl.tar.gz: youtube-dl README.md README.txt youtube-dl.1 youtube-dl.bash-completion youtube-dl.zsh youtube-dl.fish @tar -czf youtube-dl.tar.gz --transform "s|^|youtube-dl/|" --owner 0 --group 0 \ --exclude '*.DS_Store' \ diff --git a/README.md b/README.md index cf03e0d54..cd18edd87 100644 --- a/README.md +++ b/README.md @@ -80,6 +80,8 @@ which means you can modify it, redistribute it or use it however you like. on Windows) --flat-playlist Do not extract the videos of a playlist, only list them. + --mark-watched Mark videos watched (YouTube only) + --no-mark-watched Do not mark videos watched (YouTube only) --no-color Do not emit color codes in output ## Network Options: @@ -162,6 +164,8 @@ which means you can modify it, redistribute it or use it however you like. (e.g. 50K or 4.2M) -R, --retries RETRIES Number of retries (default is 10), or "infinite". + --fragment-retries RETRIES Number of retries for a fragment (default + is 10), or "infinite" (DASH only) --buffer-size SIZE Size of download buffer (e.g. 1024 or 16K) (default is 1024) --no-resize-buffer Do not automatically adjust the buffer @@ -179,7 +183,7 @@ which means you can modify it, redistribute it or use it however you like. to play it) --external-downloader COMMAND Use the specified external downloader. Currently supports - aria2c,axel,curl,httpie,wget + aria2c,avconv,axel,curl,ffmpeg,httpie,wget --external-downloader-args ARGS Give these arguments to the external downloader @@ -374,8 +378,8 @@ which means you can modify it, redistribute it or use it however you like. --no-post-overwrites Do not overwrite post-processed files; the post-processed files are overwritten by default - --embed-subs Embed subtitles in the video (only for mkv - and mp4 videos) + --embed-subs Embed subtitles in the video (only for mp4, + webm and mkv videos) --embed-thumbnail Embed thumbnail in the audio as cover art --add-metadata Write metadata to the video file --metadata-from-title FORMAT Parse additional metadata like song title / @@ -409,13 +413,18 @@ which means you can modify it, redistribute it or use it however you like. # CONFIGURATION -You can configure youtube-dl by placing any supported command line option to a configuration file. On Linux, the system wide configuration file is located at `/etc/youtube-dl.conf` and the user wide configuration file at `~/.config/youtube-dl/config`. On Windows, the user wide configuration file locations are `%APPDATA%\youtube-dl\config.txt` or `C:\Users\\youtube-dl.conf`. For example, with the following configuration file youtube-dl will always extract the audio, not copy the mtime and use a proxy: +You can configure youtube-dl by placing any supported command line option to a configuration file. On Linux, the system wide configuration file is located at `/etc/youtube-dl.conf` and the user wide configuration file at `~/.config/youtube-dl/config`. On Windows, the user wide configuration file locations are `%APPDATA%\youtube-dl\config.txt` or `C:\Users\\youtube-dl.conf`. + +For example, with the following configuration file youtube-dl will always extract the audio, not copy the mtime, use a proxy and save all videos under `Movies` directory in your home directory: ``` ---extract-audio +-x --no-mtime --proxy 127.0.0.1:3128 +-o ~/Movies/%(title)s.%(ext)s ``` +Note that options in configuration file are just the same options aka switches used in regular command line calls thus there **must be no whitespace** after `-` or `--`, e.g. `-o` or `--proxy` but not `- o` or `-- proxy`. + You can use `--ignore-config` if you want to disable the configuration file for a particular youtube-dl run. ### Authentication with `.netrc` file @@ -453,6 +462,7 @@ The basic usage is not to set any template arguments when downloading a single f - `alt_title`: A secondary title of the video - `display_id`: An alternative identifier for the video - `uploader`: Full name of the video uploader + - `license`: License name the video is licensed under - `creator`: The main artist who created the video - `release_date`: The date (YYYYMMDD) when the video was released - `timestamp`: UNIX timestamp of the moment the video became available @@ -590,6 +600,7 @@ Also filtering work for comparisons `=` (equals), `!=` (not equals), `^=` (begin - `vcodec`: Name of the video codec in use - `container`: Name of the container format - `protocol`: The protocol that will be used for the actual download, lower-case. `http`, `https`, `rtsp`, `rtmp`, `rtmpe`, `m3u8`, or `m3u8_native` + - `format_id`: A short description of the format Note that none of the aforementioned meta fields are guaranteed to be present since this solely depends on the metadata obtained by particular extractor, i.e. the metadata offered by video hoster. @@ -599,7 +610,7 @@ You can merge the video and audio of two formats into a single file using `-f '), {'x': 'y'}) + self.assertEqual(extract_attributes(""), {'x': 'y'}) + self.assertEqual(extract_attributes(''), {'x': 'y'}) + self.assertEqual(extract_attributes(''), {'x': "a 'b' c"}) + self.assertEqual(extract_attributes(''), {'x': 'a "b" c'}) + self.assertEqual(extract_attributes(''), {'x': 'y'}) + self.assertEqual(extract_attributes(''), {'x': 'y'}) + self.assertEqual(extract_attributes(''), {'x': '&'}) # XML + self.assertEqual(extract_attributes(''), {'x': '"'}) + self.assertEqual(extract_attributes(''), {'x': '£'}) # HTML 3.2 + self.assertEqual(extract_attributes(''), {'x': 'λ'}) # HTML 4.0 + self.assertEqual(extract_attributes(''), {'x': '&foo'}) + self.assertEqual(extract_attributes(''), {'x': "'"}) + self.assertEqual(extract_attributes(''), {'x': '"'}) + self.assertEqual(extract_attributes(''), {'x': None}) + self.assertEqual(extract_attributes(''), {'x': 'y', 'a': None}) + self.assertEqual(extract_attributes(''), {'x': 'y'}) + self.assertEqual(extract_attributes(''), {'y': '2', 'x': '3'}) + self.assertEqual(extract_attributes(''), {'x': 'y'}) + self.assertEqual(extract_attributes(''), {'x': 'y'}) + self.assertEqual(extract_attributes(""), {'x': 'y'}) + self.assertEqual(extract_attributes(''), {'x': '\ny\n'}) + self.assertEqual(extract_attributes(''), {'caps': 'x'}) # Names lowercased + self.assertEqual(extract_attributes(''), {'x': '2'}) + self.assertEqual(extract_attributes(''), {'x': '2'}) + self.assertEqual(extract_attributes(''), {'_:funny-name1': '1'}) + self.assertEqual(extract_attributes(''), {'x': 'Fáilte 世界 \U0001f600'}) + self.assertEqual(extract_attributes(''), {'x': 'décompose\u0301'}) + # "Narrow" Python builds don't support unicode code points outside BMP. + try: + compat_chr(0x10000) + supports_outside_bmp = True + except ValueError: + supports_outside_bmp = False + if supports_outside_bmp: + self.assertEqual(extract_attributes(''), {'x': 'Smile \U0001f600!'}) + def test_clean_html(self): self.assertEqual(clean_html('a:\nb'), 'a: b') self.assertEqual(clean_html('a:\n "b"'), 'a: "b"') @@ -614,6 +702,17 @@ class TestUtil(unittest.TestCase): self.assertEqual(parse_filesize('1.2Tb'), 1200000000000) self.assertEqual(parse_filesize('1,24 KB'), 1240) + def test_parse_count(self): + self.assertEqual(parse_count(None), None) + self.assertEqual(parse_count(''), None) + self.assertEqual(parse_count('0'), 0) + self.assertEqual(parse_count('1000'), 1000) + self.assertEqual(parse_count('1.000'), 1000) + self.assertEqual(parse_count('1.1k'), 1100) + self.assertEqual(parse_count('1.1kk'), 1100000) + self.assertEqual(parse_count('1.1kk '), 1100000) + self.assertEqual(parse_count('1.1kk views'), 1100000) + def test_version_tuple(self): self.assertEqual(version_tuple('1'), (1,)) self.assertEqual(version_tuple('10.23.344'), (10, 23, 344)) @@ -801,5 +900,16 @@ The first line ohdave_rsa_encrypt(b'aa111222', e, N), '726664bd9a23fd0c70f9f1b84aab5e3905ce1e45a584e9cbcf9bcc7510338fc1986d6c599ff990d923aa43c51c0d9013cd572e13bc58f4ae48f2ed8c0b0ba881') + def test_encode_base_n(self): + self.assertEqual(encode_base_n(0, 30), '0') + self.assertEqual(encode_base_n(80, 30), '2k') + + custom_table = '9876543210ZYXWVUTSRQPONMLKJIHGFEDCBA' + self.assertEqual(encode_base_n(0, 30, custom_table), '9') + self.assertEqual(encode_base_n(80, 30, custom_table), '7P') + + self.assertRaises(ValueError, encode_base_n, 0, 70) + self.assertRaises(ValueError, encode_base_n, 0, 60, custom_table) + if __name__ == '__main__': unittest.main() diff --git a/tox.ini b/tox.ini index 48504329f..2d7134005 100644 --- a/tox.ini +++ b/tox.ini @@ -8,6 +8,6 @@ deps = passenv = HOME defaultargs = test --exclude test_download.py --exclude test_age_restriction.py --exclude test_subtitles.py --exclude test_write_annotations.py - --exclude test_youtube_lists.py + --exclude test_youtube_lists.py --exclude test_iqiyi_sdk_interpreter.py commands = nosetests --verbose {posargs:{[testenv]defaultargs}} # --with-coverage --cover-package=youtube_dl --cover-html # test.test_download:TestDownload.test_NowVideo diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index f4324039c..a89a71a25 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -24,9 +24,6 @@ import time import tokenize import traceback -if os.name == 'nt': - import ctypes - from .compat import ( compat_basestring, compat_cookiejar, @@ -34,6 +31,7 @@ from .compat import ( compat_get_terminal_size, compat_http_client, compat_kwargs, + compat_os_name, compat_str, compat_tokenize_tokenize, compat_urllib_error, @@ -41,6 +39,8 @@ from .compat import ( compat_urllib_request_DataHandler, ) from .utils import ( + age_restricted, + args_to_str, ContentTooShortError, date_from_str, DateRange, @@ -60,13 +60,16 @@ from .utils import ( PagedList, parse_filesize, PerRequestProxyHandler, - PostProcessingError, platform_name, + PostProcessingError, preferredencoding, + prepend_extension, render_table, + replace_extension, SameFileError, sanitize_filename, sanitize_path, + sanitize_url, sanitized_Request, std_headers, subtitles_filename, @@ -77,16 +80,13 @@ from .utils import ( write_string, YoutubeDLCookieProcessor, YoutubeDLHandler, - prepend_extension, - replace_extension, - args_to_str, - age_restricted, ) from .cache import Cache -from .extractor import get_info_extractor, gen_extractors +from .extractor import get_info_extractor, gen_extractor_classes, _LAZY_LOADER from .downloader import get_suitable_downloader from .downloader.rtmp import rtmpdump_version from .postprocessor import ( + FFmpegFixupM3u8PP, FFmpegFixupM4aPP, FFmpegFixupStretchedPP, FFmpegMergerPP, @@ -95,6 +95,9 @@ from .postprocessor import ( ) from .version import __version__ +if compat_os_name == 'nt': + import ctypes + class YoutubeDL(object): """YoutubeDL class. @@ -375,8 +378,9 @@ class YoutubeDL(object): def add_info_extractor(self, ie): """Add an InfoExtractor object to the end of the list.""" self._ies.append(ie) - self._ies_instances[ie.ie_key()] = ie - ie.set_downloader(self) + if not isinstance(ie, type): + self._ies_instances[ie.ie_key()] = ie + ie.set_downloader(self) def get_info_extractor(self, ie_key): """ @@ -394,7 +398,7 @@ class YoutubeDL(object): """ Add the InfoExtractors returned by gen_extractors to the end of the list """ - for ie in gen_extractors(): + for ie in gen_extractor_classes(): self.add_info_extractor(ie) def add_post_processor(self, pp): @@ -450,7 +454,7 @@ class YoutubeDL(object): def to_console_title(self, message): if not self.params.get('consoletitle', False): return - if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow(): + if compat_os_name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow(): # c_wchar_p() might not be necessary if `message` is # already of type unicode() ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message)) @@ -521,7 +525,7 @@ class YoutubeDL(object): else: if self.params.get('no_warnings'): return - if not self.params.get('no_color') and self._err_file.isatty() and os.name != 'nt': + if not self.params.get('no_color') and self._err_file.isatty() and compat_os_name != 'nt': _msg_header = '\033[0;33mWARNING:\033[0m' else: _msg_header = 'WARNING:' @@ -533,7 +537,7 @@ class YoutubeDL(object): Do the same as trouble, but prefixes the message with 'ERROR:', colored in red if stderr is a tty file. ''' - if not self.params.get('no_color') and self._err_file.isatty() and os.name != 'nt': + if not self.params.get('no_color') and self._err_file.isatty() and compat_os_name != 'nt': _msg_header = '\033[0;31mERROR:\033[0m' else: _msg_header = 'ERROR:' @@ -566,7 +570,7 @@ class YoutubeDL(object): elif template_dict.get('height'): template_dict['resolution'] = '%sp' % template_dict['height'] elif template_dict.get('width'): - template_dict['resolution'] = '?x%d' % template_dict['width'] + template_dict['resolution'] = '%dx?' % template_dict['width'] sanitize = lambda k, v: sanitize_filename( compat_str(v), @@ -658,6 +662,7 @@ class YoutubeDL(object): if not ie.suitable(url): continue + ie = self.get_info_extractor(ie.ie_key()) if not ie.working(): self.report_warning('The program functionality for this site has been marked as broken, ' 'and will probably not work.') @@ -903,7 +908,7 @@ class YoutubeDL(object): '*=': lambda attr, value: value in attr, } str_operator_rex = re.compile(r'''(?x) - \s*(?Pext|acodec|vcodec|container|protocol) + \s*(?Pext|acodec|vcodec|container|protocol|format_id) \s*(?P%s)(?P\s*\?)? \s*(?P[a-zA-Z0-9._-]+) \s*$ @@ -1227,12 +1232,20 @@ class YoutubeDL(object): t.get('preference'), t.get('width'), t.get('height'), t.get('id'), t.get('url'))) for i, t in enumerate(thumbnails): + t['url'] = sanitize_url(t['url']) if t.get('width') and t.get('height'): t['resolution'] = '%dx%d' % (t['width'], t['height']) if t.get('id') is None: t['id'] = '%d' % i - if thumbnails and 'thumbnail' not in info_dict: + if self.params.get('list_thumbnails'): + self.list_thumbnails(info_dict) + return + + thumbnail = info_dict.get('thumbnail') + if thumbnail: + info_dict['thumbnail'] = sanitize_url(thumbnail) + elif thumbnails: info_dict['thumbnail'] = thumbnails[-1]['url'] if 'display_id' not in info_dict and 'id' in info_dict: @@ -1257,6 +1270,8 @@ class YoutubeDL(object): if subtitles: for _, subtitle in subtitles.items(): for subtitle_format in subtitle: + if subtitle_format.get('url'): + subtitle_format['url'] = sanitize_url(subtitle_format['url']) if 'ext' not in subtitle_format: subtitle_format['ext'] = determine_ext(subtitle_format['url']).lower() @@ -1286,6 +1301,8 @@ class YoutubeDL(object): if 'url' not in format: raise ExtractorError('Missing "url" key in result (index %d)' % i) + format['url'] = sanitize_url(format['url']) + if format.get('format_id') is None: format['format_id'] = compat_str(i) else: @@ -1333,9 +1350,6 @@ class YoutubeDL(object): if self.params.get('listformats'): self.list_formats(info_dict) return - if self.params.get('list_thumbnails'): - self.list_thumbnails(info_dict) - return req_format = self.params.get('format') if req_format is None: @@ -1631,12 +1645,14 @@ class YoutubeDL(object): self.report_error('content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded)) return - if success: + if success and filename != '-': # Fixup content fixup_policy = self.params.get('fixup') if fixup_policy is None: fixup_policy = 'detect_or_warn' + INSTALL_FFMPEG_MESSAGE = 'Install ffmpeg or avconv to fix this automatically.' + stretched_ratio = info_dict.get('stretched_ratio') if stretched_ratio is not None and stretched_ratio != 1: if fixup_policy == 'warn': @@ -1649,15 +1665,18 @@ class YoutubeDL(object): info_dict['__postprocessors'].append(stretched_pp) else: self.report_warning( - '%s: Non-uniform pixel ratio (%s). Install ffmpeg or avconv to fix this automatically.' % ( - info_dict['id'], stretched_ratio)) + '%s: Non-uniform pixel ratio (%s). %s' + % (info_dict['id'], stretched_ratio, INSTALL_FFMPEG_MESSAGE)) else: assert fixup_policy in ('ignore', 'never') - if info_dict.get('requested_formats') is None and info_dict.get('container') == 'm4a_dash': + if (info_dict.get('requested_formats') is None and + info_dict.get('container') == 'm4a_dash'): if fixup_policy == 'warn': - self.report_warning('%s: writing DASH m4a. Only some players support this container.' % ( - info_dict['id'])) + self.report_warning( + '%s: writing DASH m4a. ' + 'Only some players support this container.' + % info_dict['id']) elif fixup_policy == 'detect_or_warn': fixup_pp = FFmpegFixupM4aPP(self) if fixup_pp.available: @@ -1665,8 +1684,27 @@ class YoutubeDL(object): info_dict['__postprocessors'].append(fixup_pp) else: self.report_warning( - '%s: writing DASH m4a. Only some players support this container. Install ffmpeg or avconv to fix this automatically.' % ( - info_dict['id'])) + '%s: writing DASH m4a. ' + 'Only some players support this container. %s' + % (info_dict['id'], INSTALL_FFMPEG_MESSAGE)) + else: + assert fixup_policy in ('ignore', 'never') + + if (info_dict.get('protocol') == 'm3u8_native' or + info_dict.get('protocol') == 'm3u8' and + self.params.get('hls_prefer_native')): + if fixup_policy == 'warn': + self.report_warning('%s: malformated aac bitstream.' % ( + info_dict['id'])) + elif fixup_policy == 'detect_or_warn': + fixup_pp = FFmpegFixupM3u8PP(self) + if fixup_pp.available: + info_dict.setdefault('__postprocessors', []) + info_dict['__postprocessors'].append(fixup_pp) + else: + self.report_warning( + '%s: malformated aac bitstream. %s' + % (info_dict['id'], INSTALL_FFMPEG_MESSAGE)) else: assert fixup_policy in ('ignore', 'never') @@ -1809,7 +1847,7 @@ class YoutubeDL(object): if fdict.get('language'): if res: res += ' ' - res += '[%s]' % fdict['language'] + res += '[%s] ' % fdict['language'] if fdict.get('format_note') is not None: res += fdict['format_note'] + ' ' if fdict.get('tbr') is not None: @@ -1830,7 +1868,9 @@ class YoutubeDL(object): if fdict.get('vbr') is not None: res += '%4dk' % fdict['vbr'] if fdict.get('fps') is not None: - res += ', %sfps' % fdict['fps'] + if res: + res += ', ' + res += '%sfps' % fdict['fps'] if fdict.get('acodec') is not None: if res: res += ', ' @@ -1873,13 +1913,8 @@ class YoutubeDL(object): def list_thumbnails(self, info_dict): thumbnails = info_dict.get('thumbnails') if not thumbnails: - tn_url = info_dict.get('thumbnail') - if tn_url: - thumbnails = [{'id': '0', 'url': tn_url}] - else: - self.to_screen( - '[info] No thumbnails present for %s' % info_dict['id']) - return + self.to_screen('[info] No thumbnails present for %s' % info_dict['id']) + return self.to_screen( '[info] Thumbnails for %s:' % info_dict['id']) @@ -1924,6 +1959,8 @@ class YoutubeDL(object): write_string(encoding_str, encoding=None) self._write_string('[debug] youtube-dl version ' + __version__ + '\n') + if _LAZY_LOADER: + self._write_string('[debug] Lazy loading extractors enabled' + '\n') try: sp = subprocess.Popen( ['git', 'rev-parse', '--short', 'HEAD'], diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index f5f064241..737f6545d 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -144,14 +144,20 @@ def _real_main(argv=None): if numeric_limit is None: parser.error('invalid max_filesize specified') opts.max_filesize = numeric_limit - if opts.retries is not None: - if opts.retries in ('inf', 'infinite'): - opts_retries = float('inf') + + def parse_retries(retries): + if retries in ('inf', 'infinite'): + parsed_retries = float('inf') else: try: - opts_retries = int(opts.retries) + parsed_retries = int(retries) except (TypeError, ValueError): parser.error('invalid retry count specified') + return parsed_retries + if opts.retries is not None: + opts.retries = parse_retries(opts.retries) + if opts.fragment_retries is not None: + opts.fragment_retries = parse_retries(opts.fragment_retries) if opts.buffersize is not None: numeric_buffersize = FileDownloader.parse_bytes(opts.buffersize) if numeric_buffersize is None: @@ -299,7 +305,8 @@ def _real_main(argv=None): 'force_generic_extractor': opts.force_generic_extractor, 'ratelimit': opts.ratelimit, 'nooverwrites': opts.nooverwrites, - 'retries': opts_retries, + 'retries': opts.retries, + 'fragment_retries': opts.fragment_retries, 'buffersize': opts.buffersize, 'noresizebuffer': opts.noresizebuffer, 'continuedl': opts.continue_dl, @@ -355,6 +362,7 @@ def _real_main(argv=None): 'youtube_include_dash_manifest': opts.youtube_include_dash_manifest, 'encoding': opts.encoding, 'extract_flat': opts.extract_flat, + 'mark_watched': opts.mark_watched, 'merge_output_format': opts.merge_output_format, 'postprocessors': postprocessors, 'fixup': opts.fixup, diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index b497da696..0b6c5ca7a 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -77,6 +77,11 @@ try: except ImportError: # Python 2 from urllib import urlretrieve as compat_urlretrieve +try: + from html.parser import HTMLParser as compat_HTMLParser +except ImportError: # Python 2 + from HTMLParser import HTMLParser as compat_HTMLParser + try: from subprocess import DEVNULL @@ -164,6 +169,32 @@ except ImportError: # Python 2 string = string.replace('+', ' ') return compat_urllib_parse_unquote(string, encoding, errors) +try: + from urllib.parse import urlencode as compat_urllib_parse_urlencode +except ImportError: # Python 2 + # Python 2 will choke in urlencode on mixture of byte and unicode strings. + # Possible solutions are to either port it from python 3 with all + # the friends or manually ensure input query contains only byte strings. + # We will stick with latter thus recursively encoding the whole query. + def compat_urllib_parse_urlencode(query, doseq=0, encoding='utf-8'): + def encode_elem(e): + if isinstance(e, dict): + e = encode_dict(e) + elif isinstance(e, (list, tuple,)): + list_e = encode_list(e) + e = tuple(list_e) if isinstance(e, tuple) else list_e + elif isinstance(e, compat_str): + e = e.encode(encoding) + return e + + def encode_dict(d): + return dict((encode_elem(k), encode_elem(v)) for k, v in d.items()) + + def encode_list(l): + return [encode_elem(e) for e in l] + + return compat_urllib_parse.urlencode(encode_elem(query), doseq=doseq) + try: from urllib.request import DataHandler as compat_urllib_request_DataHandler except ImportError: # Python < 3.4 @@ -251,6 +282,16 @@ else: el.text = el.text.decode('utf-8') return doc +if sys.version_info < (2, 7): + # Here comes the crazy part: In 2.6, if the xpath is a unicode, + # .//node does not match if a node is a direct child of . ! + def compat_xpath(xpath): + if isinstance(xpath, compat_str): + xpath = xpath.encode('ascii') + return xpath +else: + compat_xpath = lambda xpath: xpath + try: from urllib.parse import parse_qs as compat_parse_qs except ImportError: # Python 2 @@ -326,6 +367,9 @@ def compat_ord(c): return ord(c) +compat_os_name = os._name if os.name == 'java' else os.name + + if sys.version_info >= (3, 0): compat_getenv = os.getenv compat_expanduser = os.path.expanduser @@ -346,7 +390,7 @@ else: # The following are os.path.expanduser implementations from cpython 2.7.8 stdlib # for different platforms with correct environment variables decoding. - if os.name == 'posix': + if compat_os_name == 'posix': def compat_expanduser(path): """Expand ~ and ~user constructions. If user or $HOME is unknown, do nothing.""" @@ -370,7 +414,7 @@ else: userhome = pwent.pw_dir userhome = userhome.rstrip('/') return (userhome + path[i:]) or '/' - elif os.name == 'nt' or os.name == 'ce': + elif compat_os_name == 'nt' or compat_os_name == 'ce': def compat_expanduser(path): """Expand ~ and ~user constructs. @@ -540,6 +584,7 @@ else: from tokenize import generate_tokens as compat_tokenize_tokenize __all__ = [ + 'compat_HTMLParser', 'compat_HTTPError', 'compat_basestring', 'compat_chr', @@ -556,6 +601,7 @@ __all__ = [ 'compat_itertools_count', 'compat_kwargs', 'compat_ord', + 'compat_os_name', 'compat_parse_qs', 'compat_print', 'compat_shlex_split', @@ -568,6 +614,7 @@ __all__ = [ 'compat_urllib_parse_unquote', 'compat_urllib_parse_unquote_plus', 'compat_urllib_parse_unquote_to_bytes', + 'compat_urllib_parse_urlencode', 'compat_urllib_parse_urlparse', 'compat_urllib_request', 'compat_urllib_request_DataHandler', @@ -575,6 +622,7 @@ __all__ = [ 'compat_urlparse', 'compat_urlretrieve', 'compat_xml_parse_error', + 'compat_xpath', 'shlex_quote', 'subprocess_check_output', 'workaround_optparse_bug9161', diff --git a/youtube_dl/downloader/__init__.py b/youtube_dl/downloader/__init__.py index dccc59212..73b34fdae 100644 --- a/youtube_dl/downloader/__init__.py +++ b/youtube_dl/downloader/__init__.py @@ -1,14 +1,16 @@ from __future__ import unicode_literals from .common import FileDownloader -from .external import get_external_downloader from .f4m import F4mFD from .hls import HlsFD -from .hls import NativeHlsFD from .http import HttpFD -from .rtsp import RtspFD from .rtmp import RtmpFD from .dash import DashSegmentsFD +from .rtsp import RtspFD +from .external import ( + get_external_downloader, + FFmpegFD, +) from ..utils import ( determine_protocol, @@ -16,8 +18,8 @@ from ..utils import ( PROTOCOL_MAP = { 'rtmp': RtmpFD, - 'm3u8_native': NativeHlsFD, - 'm3u8': HlsFD, + 'm3u8_native': HlsFD, + 'm3u8': FFmpegFD, 'mms': RtspFD, 'rtsp': RtspFD, 'f4m': F4mFD, @@ -30,14 +32,17 @@ def get_suitable_downloader(info_dict, params={}): protocol = determine_protocol(info_dict) info_dict['protocol'] = protocol + # if (info_dict.get('start_time') or info_dict.get('end_time')) and not info_dict.get('requested_formats') and FFmpegFD.can_download(info_dict): + # return FFmpegFD + external_downloader = params.get('external_downloader') if external_downloader is not None: ed = get_external_downloader(external_downloader) - if ed.supports(info_dict): + if ed.can_download(info_dict): return ed if protocol == 'm3u8' and params.get('hls_prefer_native'): - return NativeHlsFD + return HlsFD return PROTOCOL_MAP.get(protocol, HttpFD) diff --git a/youtube_dl/downloader/common.py b/youtube_dl/downloader/common.py index 2d5154051..1dba9f49a 100644 --- a/youtube_dl/downloader/common.py +++ b/youtube_dl/downloader/common.py @@ -5,6 +5,7 @@ import re import sys import time +from ..compat import compat_os_name from ..utils import ( encodeFilename, error_to_compat_str, @@ -114,6 +115,10 @@ class FileDownloader(object): return '%10s' % '---b/s' return '%10s' % ('%s/s' % format_bytes(speed)) + @staticmethod + def format_retries(retries): + return 'inf' if retries == float('inf') else '%.0f' % retries + @staticmethod def best_block_size(elapsed_time, bytes): new_min = max(bytes / 2.0, 1.0) @@ -219,7 +224,7 @@ class FileDownloader(object): if self.params.get('progress_with_newline', False): self.to_screen(fullmsg) else: - if os.name == 'nt': + if compat_os_name == 'nt': prev_len = getattr(self, '_report_progress_prev_line_length', 0) if prev_len > len(fullmsg): @@ -296,7 +301,9 @@ class FileDownloader(object): def report_retry(self, count, retries): """Report retry in case of HTTP error 5xx""" - self.to_screen('[download] Got server HTTP error. Retrying (attempt %d of %.0f)...' % (count, retries)) + self.to_screen( + '[download] Got server HTTP error. Retrying (attempt %d of %s)...' + % (count, self.format_retries(retries))) def report_file_already_downloaded(self, file_name): """Report file has already been fully downloaded.""" diff --git a/youtube_dl/downloader/dash.py b/youtube_dl/downloader/dash.py index 8b1b17c6e..8bbab9dbc 100644 --- a/youtube_dl/downloader/dash.py +++ b/youtube_dl/downloader/dash.py @@ -4,6 +4,7 @@ import os import re from .fragment import FragmentFD +from ..compat import compat_urllib_error from ..utils import ( sanitize_open, encodeFilename, @@ -36,20 +37,41 @@ class DashSegmentsFD(FragmentFD): segments_filenames = [] - def append_url_to_file(target_url, target_filename): - success = ctx['dl'].download(target_filename, {'url': combine_url(base_url, target_url)}) - if not success: + fragment_retries = self.params.get('fragment_retries', 0) + + def append_url_to_file(target_url, tmp_filename, segment_name): + target_filename = '%s-%s' % (tmp_filename, segment_name) + count = 0 + while count <= fragment_retries: + try: + success = ctx['dl'].download(target_filename, {'url': combine_url(base_url, target_url)}) + if not success: + return False + down, target_sanitized = sanitize_open(target_filename, 'rb') + ctx['dest_stream'].write(down.read()) + down.close() + segments_filenames.append(target_sanitized) + break + except (compat_urllib_error.HTTPError, ) as err: + # YouTube may often return 404 HTTP error for a fragment causing the + # whole download to fail. However if the same fragment is immediately + # retried with the same request data this usually succeeds (1-2 attemps + # is usually enough) thus allowing to download the whole file successfully. + # So, we will retry all fragments that fail with 404 HTTP error for now. + if err.code != 404: + raise + # Retry fragment + count += 1 + if count <= fragment_retries: + self.report_retry_fragment(segment_name, count, fragment_retries) + if count > fragment_retries: + self.report_error('giving up after %s fragment retries' % fragment_retries) return False - down, target_sanitized = sanitize_open(target_filename, 'rb') - ctx['dest_stream'].write(down.read()) - down.close() - segments_filenames.append(target_sanitized) if initialization_url: - append_url_to_file(initialization_url, ctx['tmpfilename'] + '-Init') + append_url_to_file(initialization_url, ctx['tmpfilename'], 'Init') for i, segment_url in enumerate(segment_urls): - segment_filename = '%s-Seg%d' % (ctx['tmpfilename'], i) - append_url_to_file(segment_url, segment_filename) + append_url_to_file(segment_url, ctx['tmpfilename'], 'Seg%d' % i) self._finish_frag_download(ctx) diff --git a/youtube_dl/downloader/external.py b/youtube_dl/downloader/external.py index 2bc011266..30277dc20 100644 --- a/youtube_dl/downloader/external.py +++ b/youtube_dl/downloader/external.py @@ -2,8 +2,11 @@ from __future__ import unicode_literals import os.path import subprocess +import sys +import re from .common import FileDownloader +from ..postprocessor.ffmpeg import FFmpegPostProcessor, EXT_TO_OUT_FORMATS from ..utils import ( cli_option, cli_valueless_option, @@ -11,6 +14,8 @@ from ..utils import ( cli_configuration_args, encodeFilename, encodeArgument, + handle_youtubedl_headers, + check_executable, ) @@ -45,10 +50,18 @@ class ExternalFD(FileDownloader): def exe(self): return self.params.get('external_downloader') + @classmethod + def available(cls): + return check_executable(cls.get_basename(), [cls.AVAILABLE_OPT]) + @classmethod def supports(cls, info_dict): return info_dict['protocol'] in ('http', 'https', 'ftp', 'ftps') + @classmethod + def can_download(cls, info_dict): + return cls.available() and cls.supports(info_dict) + def _option(self, command_option, param): return cli_option(self.params, command_option, param) @@ -76,6 +89,8 @@ class ExternalFD(FileDownloader): class CurlFD(ExternalFD): + AVAILABLE_OPT = '-V' + def _make_cmd(self, tmpfilename, info_dict): cmd = [self.exe, '--location', '-o', tmpfilename] for key, val in info_dict['http_headers'].items(): @@ -89,6 +104,8 @@ class CurlFD(ExternalFD): class AxelFD(ExternalFD): + AVAILABLE_OPT = '-V' + def _make_cmd(self, tmpfilename, info_dict): cmd = [self.exe, '-o', tmpfilename] for key, val in info_dict['http_headers'].items(): @@ -99,6 +116,8 @@ class AxelFD(ExternalFD): class WgetFD(ExternalFD): + AVAILABLE_OPT = '--version' + def _make_cmd(self, tmpfilename, info_dict): cmd = [self.exe, '-O', tmpfilename, '-nv', '--no-cookies'] for key, val in info_dict['http_headers'].items(): @@ -112,6 +131,8 @@ class WgetFD(ExternalFD): class Aria2cFD(ExternalFD): + AVAILABLE_OPT = '-v' + def _make_cmd(self, tmpfilename, info_dict): cmd = [self.exe, '-c'] cmd += self._configuration_args([ @@ -130,12 +151,112 @@ class Aria2cFD(ExternalFD): class HttpieFD(ExternalFD): + @classmethod + def available(cls): + return check_executable('http', ['--version']) + def _make_cmd(self, tmpfilename, info_dict): cmd = ['http', '--download', '--output', tmpfilename, info_dict['url']] for key, val in info_dict['http_headers'].items(): cmd += ['%s:%s' % (key, val)] return cmd + +class FFmpegFD(ExternalFD): + @classmethod + def supports(cls, info_dict): + return info_dict['protocol'] in ('http', 'https', 'ftp', 'ftps', 'm3u8', 'rtsp', 'rtmp', 'mms') + + @classmethod + def available(cls): + return FFmpegPostProcessor().available + + def _call_downloader(self, tmpfilename, info_dict): + url = info_dict['url'] + ffpp = FFmpegPostProcessor(downloader=self) + if not ffpp.available: + self.report_error('m3u8 download detected but ffmpeg or avconv could not be found. Please install one.') + return False + ffpp.check_version() + + args = [ffpp.executable, '-y'] + + args += self._configuration_args() + + # start_time = info_dict.get('start_time') or 0 + # if start_time: + # args += ['-ss', compat_str(start_time)] + # end_time = info_dict.get('end_time') + # if end_time: + # args += ['-t', compat_str(end_time - start_time)] + + if info_dict['http_headers'] and re.match(r'^https?://', url): + # Trailing \r\n after each HTTP header is important to prevent warning from ffmpeg/avconv: + # [http @ 00000000003d2fa0] No trailing CRLF found in HTTP header. + headers = handle_youtubedl_headers(info_dict['http_headers']) + args += [ + '-headers', + ''.join('%s: %s\r\n' % (key, val) for key, val in headers.items())] + + protocol = info_dict.get('protocol') + + if protocol == 'rtmp': + player_url = info_dict.get('player_url') + page_url = info_dict.get('page_url') + app = info_dict.get('app') + play_path = info_dict.get('play_path') + tc_url = info_dict.get('tc_url') + flash_version = info_dict.get('flash_version') + live = info_dict.get('rtmp_live', False) + if player_url is not None: + args += ['-rtmp_swfverify', player_url] + if page_url is not None: + args += ['-rtmp_pageurl', page_url] + if app is not None: + args += ['-rtmp_app', app] + if play_path is not None: + args += ['-rtmp_playpath', play_path] + if tc_url is not None: + args += ['-rtmp_tcurl', tc_url] + if flash_version is not None: + args += ['-rtmp_flashver', flash_version] + if live: + args += ['-rtmp_live', 'live'] + + args += ['-i', url, '-c', 'copy'] + if protocol == 'm3u8': + if self.params.get('hls_use_mpegts', False): + args += ['-f', 'mpegts'] + else: + args += ['-f', 'mp4', '-bsf:a', 'aac_adtstoasc'] + elif protocol == 'rtmp': + args += ['-f', 'flv'] + else: + args += ['-f', EXT_TO_OUT_FORMATS.get(info_dict['ext'], info_dict['ext'])] + + args = [encodeArgument(opt) for opt in args] + args.append(encodeFilename(ffpp._ffmpeg_filename_argument(tmpfilename), True)) + + self._debug_cmd(args) + + proc = subprocess.Popen(args, stdin=subprocess.PIPE) + try: + retval = proc.wait() + except KeyboardInterrupt: + # subprocces.run would send the SIGKILL signal to ffmpeg and the + # mp4 file couldn't be played, but if we ask ffmpeg to quit it + # produces a file that is playable (this is mostly useful for live + # streams). Note that Windows is not affected and produces playable + # files (see https://github.com/rg3/youtube-dl/issues/8300). + if sys.platform != 'win32': + proc.communicate(b'q') + raise + return retval + + +class AVconvFD(FFmpegFD): + pass + _BY_NAME = dict( (klass.get_basename(), klass) for name, klass in globals().items() diff --git a/youtube_dl/downloader/f4m.py b/youtube_dl/downloader/f4m.py index fc9642905..664d87543 100644 --- a/youtube_dl/downloader/f4m.py +++ b/youtube_dl/downloader/f4m.py @@ -223,6 +223,12 @@ def write_metadata_tag(stream, metadata): write_unsigned_int(stream, FLV_TAG_HEADER_LEN + len(metadata)) +def remove_encrypted_media(media): + return list(filter(lambda e: 'drmAdditionalHeaderId' not in e.attrib and + 'drmAdditionalHeaderSetId' not in e.attrib, + media)) + + def _add_ns(prop): return '{http://ns.adobe.com/f4m/1.0}%s' % prop @@ -244,9 +250,7 @@ class F4mFD(FragmentFD): # without drmAdditionalHeaderId or drmAdditionalHeaderSetId attribute if 'id' not in e.attrib: self.report_error('Missing ID in f4m DRM') - media = list(filter(lambda e: 'drmAdditionalHeaderId' not in e.attrib and - 'drmAdditionalHeaderSetId' not in e.attrib, - media)) + media = remove_encrypted_media(media) if not media: self.report_error('Unsupported DRM') return media diff --git a/youtube_dl/downloader/fragment.py b/youtube_dl/downloader/fragment.py index 5bc99492b..ba903ae10 100644 --- a/youtube_dl/downloader/fragment.py +++ b/youtube_dl/downloader/fragment.py @@ -19,8 +19,17 @@ class HttpQuietDownloader(HttpFD): class FragmentFD(FileDownloader): """ A base file downloader class for fragmented media (e.g. f4m/m3u8 manifests). + + Available options: + + fragment_retries: Number of times to retry a fragment for HTTP error (DASH only) """ + def report_retry_fragment(self, fragment_name, count, retries): + self.to_screen( + '[download] Got server HTTP error. Retrying fragment %s (attempt %d of %s)...' + % (fragment_name, count, self.format_retries(retries))) + def _prepare_and_start_frag_download(self, ctx): self._prepare_frag_download(ctx) self._start_frag_download(ctx) @@ -99,7 +108,8 @@ class FragmentFD(FileDownloader): state['eta'] = self.calc_eta( start, time_now, estimated_size, state['downloaded_bytes']) - state['speed'] = s.get('speed') + state['speed'] = s.get('speed') or ctx.get('speed') + ctx['speed'] = state['speed'] ctx['prev_frag_downloaded_bytes'] = frag_downloaded_bytes self._hook_progress(state) diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py index 2a775bf00..a01dac031 100644 --- a/youtube_dl/downloader/hls.py +++ b/youtube_dl/downloader/hls.py @@ -1,87 +1,19 @@ from __future__ import unicode_literals -import os +import os.path import re -import subprocess -import sys -from .common import FileDownloader from .fragment import FragmentFD from ..compat import compat_urlparse -from ..postprocessor.ffmpeg import FFmpegPostProcessor from ..utils import ( - encodeArgument, encodeFilename, sanitize_open, - handle_youtubedl_headers, ) -class HlsFD(FileDownloader): - def real_download(self, filename, info_dict): - url = info_dict['url'] - self.report_destination(filename) - tmpfilename = self.temp_name(filename) - - ffpp = FFmpegPostProcessor(downloader=self) - if not ffpp.available: - self.report_error('m3u8 download detected but ffmpeg or avconv could not be found. Please install one.') - return False - ffpp.check_version() - - args = [ffpp.executable, '-y'] - - if info_dict['http_headers'] and re.match(r'^https?://', url): - # Trailing \r\n after each HTTP header is important to prevent warning from ffmpeg/avconv: - # [http @ 00000000003d2fa0] No trailing CRLF found in HTTP header. - headers = handle_youtubedl_headers(info_dict['http_headers']) - args += [ - '-headers', - ''.join('%s: %s\r\n' % (key, val) for key, val in headers.items())] - - args += ['-i', url, '-c', 'copy'] - if self.params.get('hls_use_mpegts', False): - args += ['-f', 'mpegts'] - else: - args += ['-f', 'mp4', '-bsf:a', 'aac_adtstoasc'] - - args = [encodeArgument(opt) for opt in args] - args.append(encodeFilename(ffpp._ffmpeg_filename_argument(tmpfilename), True)) - - self._debug_cmd(args) - - proc = subprocess.Popen(args, stdin=subprocess.PIPE) - try: - retval = proc.wait() - except KeyboardInterrupt: - # subprocces.run would send the SIGKILL signal to ffmpeg and the - # mp4 file couldn't be played, but if we ask ffmpeg to quit it - # produces a file that is playable (this is mostly useful for live - # streams). Note that Windows is not affected and produces playable - # files (see https://github.com/rg3/youtube-dl/issues/8300). - if sys.platform != 'win32': - proc.communicate(b'q') - raise - if retval == 0: - fsize = os.path.getsize(encodeFilename(tmpfilename)) - self.to_screen('\r[%s] %s bytes' % (args[0], fsize)) - self.try_rename(tmpfilename, filename) - self._hook_progress({ - 'downloaded_bytes': fsize, - 'total_bytes': fsize, - 'filename': filename, - 'status': 'finished', - }) - return True - else: - self.to_stderr('\n') - self.report_error('%s exited with code %d' % (ffpp.basename, retval)) - return False - - -class NativeHlsFD(FragmentFD): - """ A more limited implementation that does not require ffmpeg """ +class HlsFD(FragmentFD): + """ A limited implementation that does not require ffmpeg """ FD_NAME = 'hlsnative' diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index a15572d7e..18d8dbcd6 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -1,975 +1,33 @@ from __future__ import unicode_literals -from .abc import ABCIE -from .abc7news import Abc7NewsIE -from .academicearth import AcademicEarthCourseIE -from .acast import ( - ACastIE, - ACastChannelIE, -) -from .addanime import AddAnimeIE -from .adobetv import ( - AdobeTVIE, - AdobeTVShowIE, - AdobeTVChannelIE, - AdobeTVVideoIE, -) -from .adultswim import AdultSwimIE -from .aenetworks import AENetworksIE -from .aftonbladet import AftonbladetIE -from .airmozilla import AirMozillaIE -from .aljazeera import AlJazeeraIE -from .alphaporno import AlphaPornoIE -from .animeondemand import AnimeOnDemandIE -from .anitube import AnitubeIE -from .anysex import AnySexIE -from .aol import AolIE -from .allocine import AllocineIE -from .aparat import AparatIE -from .appleconnect import AppleConnectIE -from .appletrailers import ( - AppleTrailersIE, - AppleTrailersSectionIE, -) -from .archiveorg import ArchiveOrgIE -from .ard import ( - ARDIE, - ARDMediathekIE, - SportschauIE, -) -from .arkenaplay import ArkenaPlayIE -from .arte import ( - ArteTvIE, - ArteTVPlus7IE, - ArteTVCreativeIE, - ArteTVConcertIE, - ArteTVFutureIE, - ArteTVCinemaIE, - ArteTVDDCIE, - ArteTVMagazineIE, - ArteTVEmbedIE, -) -from .atresplayer import AtresPlayerIE -from .atttechchannel import ATTTechChannelIE -from .audimedia import AudiMediaIE -from .audiomack import AudiomackIE, AudiomackAlbumIE -from .azubu import AzubuIE, AzubuLiveIE -from .baidu import BaiduVideoIE -from .bambuser import BambuserIE, BambuserChannelIE -from .bandcamp import BandcampIE, BandcampAlbumIE -from .bbc import ( - BBCCoUkIE, - BBCCoUkArticleIE, - BBCIE, -) -from .beeg import BeegIE -from .behindkink import BehindKinkIE -from .beatportpro import BeatportProIE -from .bet import BetIE -from .bigflix import BigflixIE -from .bild import BildIE -from .bilibili import BiliBiliIE -from .bleacherreport import ( - BleacherReportIE, - BleacherReportCMSIE, -) -from .blinkx import BlinkxIE -from .bloomberg import BloombergIE -from .bpb import BpbIE -from .br import BRIE -from .breakcom import BreakIE -from .brightcove import ( - BrightcoveLegacyIE, - BrightcoveNewIE, -) -from .buzzfeed import BuzzFeedIE -from .byutv import BYUtvIE -from .c56 import C56IE -from .camdemy import ( - CamdemyIE, - CamdemyFolderIE -) -from .canalplus import CanalplusIE -from .canalc2 import Canalc2IE -from .canvas import CanvasIE -from .cbc import ( - CBCIE, - CBCPlayerIE, -) -from .cbs import CBSIE -from .cbsnews import ( - CBSNewsIE, - CBSNewsLiveVideoIE, -) -from .cbssports import CBSSportsIE -from .ccc import CCCIE -from .ceskatelevize import CeskaTelevizeIE -from .channel9 import Channel9IE -from .chaturbate import ChaturbateIE -from .chilloutzone import ChilloutzoneIE -from .chirbit import ( - ChirbitIE, - ChirbitProfileIE, -) -from .cinchcast import CinchcastIE -from .cinemassacre import CinemassacreIE -from .clipfish import ClipfishIE -from .cliphunter import CliphunterIE -from .clipsyndicate import ClipsyndicateIE -from .cloudy import CloudyIE -from .clubic import ClubicIE -from .clyp import ClypIE -from .cmt import CMTIE -from .cnet import CNETIE -from .cnn import ( - CNNIE, - CNNBlogsIE, - CNNArticleIE, -) -from .collegehumor import CollegeHumorIE -from .collegerama import CollegeRamaIE -from .comedycentral import ComedyCentralIE, ComedyCentralShowsIE -from .comcarcoff import ComCarCoffIE -from .commonmistakes import CommonMistakesIE, UnicodeBOMIE -from .condenast import CondeNastIE -from .cracked import CrackedIE -from .crackle import CrackleIE -from .criterion import CriterionIE -from .crooksandliars import CrooksAndLiarsIE -from .crunchyroll import ( - CrunchyrollIE, - CrunchyrollShowPlaylistIE -) -from .cspan import CSpanIE -from .ctsnews import CtsNewsIE -from .cultureunplugged import CultureUnpluggedIE -from .cwtv import CWTVIE -from .dailymotion import ( - DailymotionIE, - DailymotionPlaylistIE, - DailymotionUserIE, - DailymotionCloudIE, -) -from .daum import ( - DaumIE, - DaumClipIE, - DaumPlaylistIE, - DaumUserIE, -) -from .dbtv import DBTVIE -from .dcn import ( - DCNIE, - DCNVideoIE, - DCNLiveIE, - DCNSeasonIE, -) -from .dctp import DctpTvIE -from .deezer import DeezerPlaylistIE -from .democracynow import DemocracynowIE -from .dfb import DFBIE -from .dhm import DHMIE -from .dotsub import DotsubIE -from .douyutv import DouyuTVIE -from .dplay import DPlayIE -from .dramafever import ( - DramaFeverIE, - DramaFeverSeriesIE, -) -from .dreisat import DreiSatIE -from .drbonanza import DRBonanzaIE -from .drtuber import DrTuberIE -from .drtv import DRTVIE -from .dvtv import DVTVIE -from .dump import DumpIE -from .dumpert import DumpertIE -from .defense import DefenseGouvFrIE -from .discovery import DiscoveryIE -from .dropbox import DropboxIE -from .eagleplatform import EaglePlatformIE -from .ebaumsworld import EbaumsWorldIE -from .echomsk import EchoMskIE -from .ehow import EHowIE -from .eighttracks import EightTracksIE -from .einthusan import EinthusanIE -from .eitb import EitbIE -from .ellentv import ( - EllenTVIE, - EllenTVClipsIE, -) -from .elpais import ElPaisIE -from .embedly import EmbedlyIE -from .engadget import EngadgetIE -from .eporner import EpornerIE -from .eroprofile import EroProfileIE -from .escapist import EscapistIE -from .espn import ESPNIE -from .esri import EsriVideoIE -from .europa import EuropaIE -from .everyonesmixtape import EveryonesMixtapeIE -from .exfm import ExfmIE -from .expotv import ExpoTVIE -from .extremetube import ExtremeTubeIE -from .facebook import ( - FacebookIE, - FacebookPostIE, -) -from .faz import FazIE -from .fc2 import FC2IE -from .fczenit import FczenitIE -from .firstpost import FirstpostIE -from .firsttv import FirstTVIE -from .fivemin import FiveMinIE -from .fivetv import FiveTVIE -from .fktv import FKTVIE -from .flickr import FlickrIE -from .folketinget import FolketingetIE -from .footyroom import FootyRoomIE -from .fourtube import FourTubeIE -from .fox import FOXIE -from .foxgay import FoxgayIE -from .foxnews import FoxNewsIE -from .foxsports import FoxSportsIE -from .franceculture import ( - FranceCultureIE, - FranceCultureEmissionIE, -) -from .franceinter import FranceInterIE -from .francetv import ( - PluzzIE, - FranceTvInfoIE, - FranceTVIE, - GenerationQuoiIE, - CultureboxIE, -) -from .freesound import FreesoundIE -from .freespeech import FreespeechIE -from .freevideo import FreeVideoIE -from .funimation import FunimationIE -from .funnyordie import FunnyOrDieIE -from .gameinformer import GameInformerIE -from .gamekings import GamekingsIE -from .gameone import ( - GameOneIE, - GameOnePlaylistIE, -) -from .gamersyde import GamersydeIE -from .gamespot import GameSpotIE -from .gamestar import GameStarIE -from .gametrailers import GametrailersIE -from .gazeta import GazetaIE -from .gdcvault import GDCVaultIE -from .generic import GenericIE -from .gfycat import GfycatIE -from .giantbomb import GiantBombIE -from .giga import GigaIE -from .glide import GlideIE -from .globo import ( - GloboIE, - GloboArticleIE, -) -from .godtube import GodTubeIE -from .goldenmoustache import GoldenMoustacheIE -from .golem import GolemIE -from .googledrive import GoogleDriveIE -from .googleplus import GooglePlusIE -from .googlesearch import GoogleSearchIE -from .goshgay import GoshgayIE -from .gputechconf import GPUTechConfIE -from .groupon import GrouponIE -from .hark import HarkIE -from .hearthisat import HearThisAtIE -from .heise import HeiseIE -from .hellporno import HellPornoIE -from .helsinki import HelsinkiIE -from .hentaistigma import HentaiStigmaIE -from .historicfilms import HistoricFilmsIE -from .hitbox import HitboxIE, HitboxLiveIE -from .hornbunny import HornBunnyIE -from .hotnewhiphop import HotNewHipHopIE -from .hotstar import HotStarIE -from .howcast import HowcastIE -from .howstuffworks import HowStuffWorksIE -from .huffpost import HuffPostIE -from .hypem import HypemIE -from .iconosquare import IconosquareIE -from .ign import ( - IGNIE, - OneUPIE, - PCMagIE, -) -from .imdb import ( - ImdbIE, - ImdbListIE -) -from .imgur import ( - ImgurIE, - ImgurAlbumIE, -) -from .ina import InaIE -from .indavideo import ( - IndavideoIE, - IndavideoEmbedIE, -) -from .infoq import InfoQIE -from .instagram import InstagramIE, InstagramUserIE -from .internetvideoarchive import InternetVideoArchiveIE -from .iprima import IPrimaIE -from .iqiyi import IqiyiIE -from .ir90tv import Ir90TvIE -from .ivi import ( - IviIE, - IviCompilationIE -) -from .ivideon import IvideonIE -from .izlesene import IzleseneIE -from .jadorecettepub import JadoreCettePubIE -from .jeuxvideo import JeuxVideoIE -from .jove import JoveIE -from .jwplatform import JWPlatformIE -from .jpopsukitv import JpopsukiIE -from .kaltura import KalturaIE -from .kanalplay import KanalPlayIE -from .kankan import KankanIE -from .karaoketv import KaraoketvIE -from .karrierevideos import KarriereVideosIE -from .keezmovies import KeezMoviesIE -from .khanacademy import KhanAcademyIE -from .kickstarter import KickStarterIE -from .keek import KeekIE -from .konserthusetplay import KonserthusetPlayIE -from .kontrtube import KontrTubeIE -from .krasview import KrasViewIE -from .ku6 import Ku6IE -from .kuwo import ( - KuwoIE, - KuwoAlbumIE, - KuwoChartIE, - KuwoSingerIE, - KuwoCategoryIE, - KuwoMvIE, -) -from .la7 import LA7IE -from .laola1tv import Laola1TvIE -from .lcp import LcpIE -from .lecture2go import Lecture2GoIE -from .lemonde import LemondeIE -from .letv import ( - LetvIE, - LetvTvIE, - LetvPlaylistIE, - LetvCloudIE, -) -from .libsyn import LibsynIE -from .lifenews import ( - LifeNewsIE, - LifeEmbedIE, -) -from .limelight import ( - LimelightMediaIE, - LimelightChannelIE, - LimelightChannelListIE, -) -from .liveleak import LiveLeakIE -from .livestream import ( - LivestreamIE, - LivestreamOriginalIE, - LivestreamShortenerIE, -) -from .lnkgo import LnkGoIE -from .lovehomeporn import LoveHomePornIE -from .lrt import LRTIE -from .lynda import ( - LyndaIE, - LyndaCourseIE -) -from .m6 import M6IE -from .macgamestore import MacGameStoreIE -from .mailru import MailRuIE -from .makertv import MakerTVIE -from .malemotion import MalemotionIE -from .matchtv import MatchTVIE -from .mdr import MDRIE -from .metacafe import MetacafeIE -from .metacritic import MetacriticIE -from .mgoon import MgoonIE -from .minhateca import MinhatecaIE -from .ministrygrid import MinistryGridIE -from .miomio import MioMioIE -from .mit import TechTVMITIE, MITIE, OCWMITIE -from .mitele import MiTeleIE -from .mixcloud import MixcloudIE -from .mlb import MLBIE -from .mpora import MporaIE -from .moevideo import MoeVideoIE -from .mofosex import MofosexIE -from .mojvideo import MojvideoIE -from .moniker import MonikerIE -from .mooshare import MooshareIE -from .morningstar import MorningstarIE -from .motherless import MotherlessIE -from .motorsport import MotorsportIE -from .movieclips import MovieClipsIE -from .moviezine import MoviezineIE -from .mtv import ( - MTVIE, - MTVServicesEmbeddedIE, - MTVIggyIE, - MTVDEIE, -) -from .muenchentv import MuenchenTVIE -from .musicplayon import MusicPlayOnIE -from .muzu import MuzuTVIE -from .mwave import MwaveIE -from .myspace import MySpaceIE, MySpaceAlbumIE -from .myspass import MySpassIE -from .myvi import MyviIE -from .myvideo import MyVideoIE -from .myvidster import MyVidsterIE -from .nationalgeographic import NationalGeographicIE -from .naver import NaverIE -from .nba import NBAIE -from .nbc import ( - NBCIE, - NBCNewsIE, - NBCSportsIE, - NBCSportsVPlayerIE, - MSNBCIE, -) -from .ndr import ( - NDRIE, - NJoyIE, - NDREmbedBaseIE, - NDREmbedIE, - NJoyEmbedIE, -) -from .ndtv import NDTVIE -from .netzkino import NetzkinoIE -from .nerdcubed import NerdCubedFeedIE -from .nerdist import NerdistIE -from .neteasemusic import ( - NetEaseMusicIE, - NetEaseMusicAlbumIE, - NetEaseMusicSingerIE, - NetEaseMusicListIE, - NetEaseMusicMvIE, - NetEaseMusicProgramIE, - NetEaseMusicDjRadioIE, -) -from .newgrounds import NewgroundsIE -from .newstube import NewstubeIE -from .nextmedia import ( - NextMediaIE, - NextMediaActionNewsIE, - AppleDailyIE, -) -from .nextmovie import NextMovieIE -from .nfb import NFBIE -from .nfl import NFLIE -from .nhl import ( - NHLIE, - NHLNewsIE, - NHLVideocenterIE, -) -from .nick import NickIE -from .niconico import NiconicoIE, NiconicoPlaylistIE -from .ninegag import NineGagIE -from .noco import NocoIE -from .normalboots import NormalbootsIE -from .nosvideo import NosVideoIE -from .nova import NovaIE -from .novamov import ( - NovaMovIE, - WholeCloudIE, - NowVideoIE, - VideoWeedIE, - CloudTimeIE, -) -from .nowness import ( - NownessIE, - NownessPlaylistIE, - NownessSeriesIE, -) -from .nowtv import ( - NowTVIE, - NowTVListIE, -) -from .noz import NozIE -from .npo import ( - NPOIE, - NPOLiveIE, - NPORadioIE, - NPORadioFragmentIE, - SchoolTVIE, - VPROIE, - WNLIE -) -from .npr import NprIE -from .nrk import ( - NRKIE, - NRKPlaylistIE, - NRKTVIE, -) -from .ntvde import NTVDeIE -from .ntvru import NTVRuIE -from .nytimes import ( - NYTimesIE, - NYTimesArticleIE, -) -from .nuvid import NuvidIE -from .odnoklassniki import OdnoklassnikiIE -from .oktoberfesttv import OktoberfestTVIE -from .onionstudios import OnionStudiosIE -from .ooyala import ( - OoyalaIE, - OoyalaExternalIE, -) -from .ora import OraTVIE -from .orf import ( - ORFTVthekIE, - ORFOE1IE, - ORFFM4IE, - ORFIPTVIE, -) -from .pandoratv import PandoraTVIE -from .parliamentliveuk import ParliamentLiveUKIE -from .patreon import PatreonIE -from .pbs import PBSIE -from .periscope import PeriscopeIE -from .philharmoniedeparis import PhilharmonieDeParisIE -from .phoenix import PhoenixIE -from .photobucket import PhotobucketIE -from .pinkbike import PinkbikeIE -from .planetaplay import PlanetaPlayIE -from .pladform import PladformIE -from .played import PlayedIE -from .playfm import PlayFMIE -from .plays import PlaysTVIE -from .playtvak import PlaytvakIE -from .playvid import PlayvidIE -from .playwire import PlaywireIE -from .pluralsight import ( - PluralsightIE, - PluralsightCourseIE, -) -from .podomatic import PodomaticIE -from .porn91 import Porn91IE -from .pornhd import PornHdIE -from .pornhub import ( - PornHubIE, - PornHubPlaylistIE, - PornHubUserVideosIE, -) -from .pornotube import PornotubeIE -from .pornovoisines import PornoVoisinesIE -from .pornoxo import PornoXOIE -from .primesharetv import PrimeShareTVIE -from .promptfile import PromptFileIE -from .prosiebensat1 import ProSiebenSat1IE -from .puls4 import Puls4IE -from .pyvideo import PyvideoIE -from .qqmusic import ( - QQMusicIE, - QQMusicSingerIE, - QQMusicAlbumIE, - QQMusicToplistIE, - QQMusicPlaylistIE, -) -from .quickvid import QuickVidIE -from .r7 import R7IE -from .radiode import RadioDeIE -from .radiojavan import RadioJavanIE -from .radiobremen import RadioBremenIE -from .radiofrance import RadioFranceIE -from .rai import ( - RaiTVIE, - RaiIE, -) -from .rbmaradio import RBMARadioIE -from .rds import RDSIE -from .redtube import RedTubeIE -from .regiotv import RegioTVIE -from .restudy import RestudyIE -from .reverbnation import ReverbNationIE -from .revision3 import Revision3IE -from .ringtv import RingTVIE -from .ro220 import Ro220IE -from .rottentomatoes import RottenTomatoesIE -from .roxwel import RoxwelIE -from .rtbf import RTBFIE -from .rte import RteIE, RteRadioIE -from .rtlnl import RtlNlIE -from .rtl2 import RTL2IE -from .rtp import RTPIE -from .rts import RTSIE -from .rtve import RTVEALaCartaIE, RTVELiveIE, RTVEInfantilIE -from .rtvnh import RTVNHIE -from .ruhd import RUHDIE -from .ruleporn import RulePornIE -from .rutube import ( - RutubeIE, - RutubeChannelIE, - RutubeEmbedIE, - RutubeMovieIE, - RutubePersonIE, -) -from .rutv import RUTVIE -from .ruutu import RuutuIE -from .sandia import SandiaIE -from .safari import ( - SafariIE, - SafariCourseIE, -) -from .sapo import SapoIE -from .savefrom import SaveFromIE -from .sbs import SBSIE -from .scivee import SciVeeIE -from .screencast import ScreencastIE -from .screencastomatic import ScreencastOMaticIE -from .screenjunkies import ScreenJunkiesIE -from .screenwavemedia import ScreenwaveMediaIE, TeamFourIE -from .senateisvp import SenateISVPIE -from .servingsys import ServingSysIE -from .sexu import SexuIE -from .sexykarma import SexyKarmaIE -from .shahid import ShahidIE -from .shared import SharedIE -from .sharesix import ShareSixIE -from .sina import SinaIE -from .skynewsarabia import ( - SkyNewsArabiaIE, - SkyNewsArabiaArticleIE, -) -from .slideshare import SlideshareIE -from .slutload import SlutloadIE -from .smotri import ( - SmotriIE, - SmotriCommunityIE, - SmotriUserIE, - SmotriBroadcastIE, -) -from .snagfilms import ( - SnagFilmsIE, - SnagFilmsEmbedIE, -) -from .snotr import SnotrIE -from .sohu import SohuIE -from .soundcloud import ( - SoundcloudIE, - SoundcloudSetIE, - SoundcloudUserIE, - SoundcloudPlaylistIE, - SoundcloudSearchIE -) -from .soundgasm import ( - SoundgasmIE, - SoundgasmProfileIE -) -from .southpark import ( - SouthParkIE, - SouthParkDeIE, - SouthParkDkIE, - SouthParkEsIE, - SouthParkNlIE -) -from .space import SpaceIE -from .spankbang import SpankBangIE -from .spankwire import SpankwireIE -from .spiegel import SpiegelIE, SpiegelArticleIE -from .spiegeltv import SpiegeltvIE -from .spike import SpikeIE -from .stitcher import StitcherIE -from .sport5 import Sport5IE -from .sportbox import ( - SportBoxIE, - SportBoxEmbedIE, -) -from .sportdeutschland import SportDeutschlandIE -from .srgssr import ( - SRGSSRIE, - SRGSSRPlayIE, -) -from .srmediathek import SRMediathekIE -from .ssa import SSAIE -from .stanfordoc import StanfordOpenClassroomIE -from .steam import SteamIE -from .streamcloud import StreamcloudIE -from .streamcz import StreamCZIE -from .streetvoice import StreetVoiceIE -from .sunporno import SunPornoIE -from .svt import ( - SVTIE, - SVTPlayIE, -) -from .swrmediathek import SWRMediathekIE -from .syfy import SyfyIE -from .sztvhu import SztvHuIE -from .tagesschau import TagesschauIE -from .tapely import TapelyIE -from .tass import TassIE -from .teachertube import ( - TeacherTubeIE, - TeacherTubeUserIE, -) -from .teachingchannel import TeachingChannelIE -from .teamcoco import TeamcocoIE -from .techtalks import TechTalksIE -from .ted import TEDIE -from .tele13 import Tele13IE -from .telebruxelles import TeleBruxellesIE -from .telecinco import TelecincoIE -from .telegraaf import TelegraafIE -from .telemb import TeleMBIE -from .teletask import TeleTaskIE -from .tenplay import TenPlayIE -from .testurl import TestURLIE -from .tf1 import TF1IE -from .theintercept import TheInterceptIE -from .theonion import TheOnionIE -from .theplatform import ( - ThePlatformIE, - ThePlatformFeedIE, -) -from .thesixtyone import TheSixtyOneIE -from .thisamericanlife import ThisAmericanLifeIE -from .thisav import ThisAVIE -from .tinypic import TinyPicIE -from .tlc import TlcDeIE -from .tmz import ( - TMZIE, - TMZArticleIE, -) -from .tnaflix import ( - TNAFlixIE, - EMPFlixIE, - MovieFapIE, -) -from .toggle import ToggleIE -from .thvideo import ( - THVideoIE, - THVideoPlaylistIE -) -from .toutv import TouTvIE -from .toypics import ToypicsUserIE, ToypicsIE -from .traileraddict import TrailerAddictIE -from .trilulilu import TriluliluIE -from .trollvids import TrollvidsIE -from .trutube import TruTubeIE -from .tube8 import Tube8IE -from .tubitv import TubiTvIE -from .tudou import ( - TudouIE, - TudouPlaylistIE, - TudouAlbumIE, -) -from .tumblr import TumblrIE -from .tunein import ( - TuneInClipIE, - TuneInStationIE, - TuneInProgramIE, - TuneInTopicIE, - TuneInShortenerIE, -) -from .turbo import TurboIE -from .tutv import TutvIE -from .tv2 import ( - TV2IE, - TV2ArticleIE, -) -from .tv4 import TV4IE -from .tvc import ( - TVCIE, - TVCArticleIE, -) -from .tvigle import TvigleIE -from .tvland import TVLandIE -from .tvp import TvpIE, TvpSeriesIE -from .tvplay import TVPlayIE -from .tweakers import TweakersIE -from .twentyfourvideo import TwentyFourVideoIE -from .twentymin import TwentyMinutenIE -from .twentytwotracks import ( - TwentyTwoTracksIE, - TwentyTwoTracksGenreIE -) -from .twitch import ( - TwitchVideoIE, - TwitchChapterIE, - TwitchVodIE, - TwitchProfileIE, - TwitchPastBroadcastsIE, - TwitchBookmarksIE, - TwitchStreamIE, -) -from .twitter import ( - TwitterCardIE, - TwitterIE, - TwitterAmplifyIE, -) -from .ubu import UbuIE -from .udemy import ( - UdemyIE, - UdemyCourseIE -) -from .udn import UDNEmbedIE -from .digiteka import DigitekaIE -from .unistra import UnistraIE -from .urort import UrortIE -from .ustream import UstreamIE, UstreamChannelIE -from .varzesh3 import Varzesh3IE -from .vbox7 import Vbox7IE -from .veehd import VeeHDIE -from .veoh import VeohIE -from .vessel import VesselIE -from .vesti import VestiIE -from .vevo import VevoIE -from .vgtv import ( - BTArticleIE, - BTVestlendingenIE, - VGTVIE, -) -from .vh1 import VH1IE -from .vice import ViceIE -from .viddler import ViddlerIE -from .videodetective import VideoDetectiveIE -from .videofyme import VideofyMeIE -from .videomega import VideoMegaIE -from .videomore import ( - VideomoreIE, - VideomoreVideoIE, - VideomoreSeasonIE, -) -from .videopremium import VideoPremiumIE -from .videott import VideoTtIE -from .vidme import ( - VidmeIE, - VidmeUserIE, - VidmeUserLikesIE, -) -from .vidzi import VidziIE -from .vier import VierIE, VierVideosIE -from .viewster import ViewsterIE -from .viidea import ViideaIE -from .vimeo import ( - VimeoIE, - VimeoAlbumIE, - VimeoChannelIE, - VimeoGroupsIE, - VimeoLikesIE, - VimeoReviewIE, - VimeoUserIE, - VimeoWatchLaterIE, -) -from .vimple import VimpleIE -from .vine import ( - VineIE, - VineUserIE, -) -from .viki import ( - VikiIE, - VikiChannelIE, -) -from .vk import ( - VKIE, - VKUserVideosIE, -) -from .vlive import VLiveIE -from .vodlocker import VodlockerIE -from .voicerepublic import VoiceRepublicIE -from .vporn import VpornIE -from .vrt import VRTIE -from .vube import VubeIE -from .vuclip import VuClipIE -from .vulture import VultureIE -from .walla import WallaIE -from .washingtonpost import WashingtonPostIE -from .wat import WatIE -from .wayofthemaster import WayOfTheMasterIE -from .wdr import ( - WDRIE, - WDRMobileIE, - WDRMausIE, -) -from .webofstories import ( - WebOfStoriesIE, - WebOfStoriesPlaylistIE, -) -from .weibo import WeiboIE -from .weiqitv import WeiqiTVIE -from .wimp import WimpIE -from .wistia import WistiaIE -from .worldstarhiphop import WorldStarHipHopIE -from .wrzuta import WrzutaIE -from .wsj import WSJIE -from .xbef import XBefIE -from .xboxclips import XboxClipsIE -from .xfileshare import XFileShareIE -from .xhamster import ( - XHamsterIE, - XHamsterEmbedIE, -) -from .xminus import XMinusIE -from .xnxx import XNXXIE -from .xstream import XstreamIE -from .xtube import XTubeUserIE, XTubeIE -from .xuite import XuiteIE -from .xvideos import XVideosIE -from .xxxymovies import XXXYMoviesIE -from .yahoo import ( - YahooIE, - YahooSearchIE, -) -from .yam import YamIE -from .yandexmusic import ( - YandexMusicTrackIE, - YandexMusicAlbumIE, - YandexMusicPlaylistIE, -) -from .yesjapan import YesJapanIE -from .yinyuetai import YinYueTaiIE -from .ynet import YnetIE -from .youjizz import YouJizzIE -from .youku import YoukuIE -from .youporn import YouPornIE -from .yourupload import YourUploadIE -from .youtube import ( - YoutubeIE, - YoutubeChannelIE, - YoutubeFavouritesIE, - YoutubeHistoryIE, - YoutubePlaylistIE, - YoutubeRecommendedIE, - YoutubeSearchDateIE, - YoutubeSearchIE, - YoutubeSearchURLIE, - YoutubeShowIE, - YoutubeSubscriptionsIE, - YoutubeTruncatedIDIE, - YoutubeTruncatedURLIE, - YoutubeUserIE, - YoutubePlaylistsIE, - YoutubeWatchLaterIE, -) -from .zapiks import ZapiksIE -from .zdf import ZDFIE, ZDFChannelIE -from .zingmp3 import ( - ZingMp3SongIE, - ZingMp3AlbumIE, -) -from .zippcast import ZippCastIE +try: + from .lazy_extractors import * + from .lazy_extractors import _ALL_CLASSES + _LAZY_LOADER = True +except ImportError: + _LAZY_LOADER = False + from .extractors import * -_ALL_CLASSES = [ - klass - for name, klass in globals().items() - if name.endswith('IE') and name != 'GenericIE' -] -_ALL_CLASSES.append(GenericIE) + _ALL_CLASSES = [ + klass + for name, klass in globals().items() + if name.endswith('IE') and name != 'GenericIE' + ] + _ALL_CLASSES.append(GenericIE) + + +def gen_extractor_classes(): + """ Return a list of supported extractors. + The order does matter; the first extractor matched is the one handling the URL. + """ + return _ALL_CLASSES def gen_extractors(): """ Return a list of an instance of every supported extractor. The order does matter; the first extractor matched is the one handling the URL. """ - return [klass() for klass in _ALL_CLASSES] + return [klass() for klass in gen_extractor_classes()] def list_extractors(age_limit): diff --git a/youtube_dl/extractor/abc.py b/youtube_dl/extractor/abc.py index 6a29e587f..b584277be 100644 --- a/youtube_dl/extractor/abc.py +++ b/youtube_dl/extractor/abc.py @@ -12,7 +12,7 @@ from ..utils import ( class ABCIE(InfoExtractor): IE_NAME = 'abc.net.au' - _VALID_URL = r'http://www\.abc\.net\.au/news/(?:[^/]+/){1,2}(?P\d+)' + _VALID_URL = r'https?://www\.abc\.net\.au/news/(?:[^/]+/){1,2}(?P\d+)' _TESTS = [{ 'url': 'http://www.abc.net.au/news/2014-11-05/australia-to-staff-ebola-treatment-centre-in-sierra-leone/5868334', diff --git a/youtube_dl/extractor/abc7news.py b/youtube_dl/extractor/abc7news.py index 122dc9099..c04949c21 100644 --- a/youtube_dl/extractor/abc7news.py +++ b/youtube_dl/extractor/abc7news.py @@ -44,6 +44,7 @@ class Abc7NewsIE(InfoExtractor): 'contentURL', webpage, 'm3u8 url', fatal=True) formats = self._extract_m3u8_formats(m3u8, display_id, 'mp4') + self._sort_formats(formats) title = self._og_search_title(webpage).strip() description = self._og_search_description(webpage).strip() diff --git a/youtube_dl/extractor/acast.py b/youtube_dl/extractor/acast.py index 92eee8119..94ce88c83 100644 --- a/youtube_dl/extractor/acast.py +++ b/youtube_dl/extractor/acast.py @@ -2,10 +2,14 @@ from __future__ import unicode_literals import re +import functools from .common import InfoExtractor from ..compat import compat_str -from ..utils import int_or_none +from ..utils import ( + int_or_none, + OnDemandPagedList, +) class ACastIE(InfoExtractor): @@ -26,13 +30,8 @@ class ACastIE(InfoExtractor): def _real_extract(self, url): channel, display_id = re.match(self._VALID_URL, url).groups() - - embed_page = self._download_webpage( - re.sub('(?:www\.)?acast\.com', 'embedcdn.acast.com', url), display_id) - cast_data = self._parse_json(self._search_regex( - r'window\[\'acast/queries\'\]\s*=\s*([^;]+);', embed_page, 'acast data'), - display_id)['GetAcast/%s/%s' % (channel, display_id)] - + cast_data = self._download_json( + 'https://embed.acast.com/api/acasts/%s/%s' % (channel, display_id), display_id) return { 'id': compat_str(cast_data['id']), 'display_id': display_id, @@ -58,15 +57,26 @@ class ACastChannelIE(InfoExtractor): 'playlist_mincount': 20, } _API_BASE_URL = 'https://www.acast.com/api/' + _PAGE_SIZE = 10 @classmethod def suitable(cls, url): return False if ACastIE.suitable(url) else super(ACastChannelIE, cls).suitable(url) - def _real_extract(self, url): - display_id = self._match_id(url) - channel_data = self._download_json(self._API_BASE_URL + 'channels/%s' % display_id, display_id) - casts = self._download_json(self._API_BASE_URL + 'channels/%s/acasts' % display_id, display_id) - entries = [self.url_result('https://www.acast.com/%s/%s' % (display_id, cast['url']), 'ACast') for cast in casts] + def _fetch_page(self, channel_slug, page): + casts = self._download_json( + self._API_BASE_URL + 'channels/%s/acasts?page=%s' % (channel_slug, page), + channel_slug, note='Download page %d of channel data' % page) + for cast in casts: + yield self.url_result( + 'https://www.acast.com/%s/%s' % (channel_slug, cast['url']), + 'ACast', cast['id']) - return self.playlist_result(entries, compat_str(channel_data['id']), channel_data['name'], channel_data.get('description')) + def _real_extract(self, url): + channel_slug = self._match_id(url) + channel_data = self._download_json( + self._API_BASE_URL + 'channels/%s' % channel_slug, channel_slug) + entries = OnDemandPagedList(functools.partial( + self._fetch_page, channel_slug), self._PAGE_SIZE) + return self.playlist_result(entries, compat_str( + channel_data['id']), channel_data['name'], channel_data.get('description')) diff --git a/youtube_dl/extractor/addanime.py b/youtube_dl/extractor/addanime.py index e3e6d2113..55a9322a7 100644 --- a/youtube_dl/extractor/addanime.py +++ b/youtube_dl/extractor/addanime.py @@ -6,7 +6,7 @@ from .common import InfoExtractor from ..compat import ( compat_HTTPError, compat_str, - compat_urllib_parse, + compat_urllib_parse_urlencode, compat_urllib_parse_urlparse, ) from ..utils import ( @@ -16,7 +16,7 @@ from ..utils import ( class AddAnimeIE(InfoExtractor): - _VALID_URL = r'http://(?:\w+\.)?add-anime\.net/(?:watch_video\.php\?(?:.*?)v=|video/)(?P[\w_]+)' + _VALID_URL = r'https?://(?:\w+\.)?add-anime\.net/(?:watch_video\.php\?(?:.*?)v=|video/)(?P[\w_]+)' _TESTS = [{ 'url': 'http://www.add-anime.net/watch_video.php?v=24MR3YO5SAS9', 'md5': '72954ea10bc979ab5e2eb288b21425a0', @@ -60,7 +60,7 @@ class AddAnimeIE(InfoExtractor): confirm_url = ( parsed_url.scheme + '://' + parsed_url.netloc + action + '?' + - compat_urllib_parse.urlencode({ + compat_urllib_parse_urlencode({ 'jschl_vc': vc, 'jschl_answer': compat_str(av_val)})) self._download_webpage( confirm_url, video_id, diff --git a/youtube_dl/extractor/aenetworks.py b/youtube_dl/extractor/aenetworks.py index 6018ae79a..1bbfe2641 100644 --- a/youtube_dl/extractor/aenetworks.py +++ b/youtube_dl/extractor/aenetworks.py @@ -1,13 +1,19 @@ from __future__ import unicode_literals +import re + from .common import InfoExtractor -from ..utils import smuggle_url +from ..utils import ( + smuggle_url, + update_url_query, + unescapeHTML, +) class AENetworksIE(InfoExtractor): IE_NAME = 'aenetworks' IE_DESC = 'A+E Networks: A&E, Lifetime, History.com, FYI Network' - _VALID_URL = r'https?://(?:www\.)?(?:(?:history|aetv|mylifetime)\.com|fyi\.tv)/(?:[^/]+/)+(?P[^/]+?)(?:$|[?#])' + _VALID_URL = r'https?://(?:www\.)?(?:(?:history|aetv|mylifetime)\.com|fyi\.tv)/(?P[^/]+)/(?:[^/]+/)+(?P[^/]+?)(?:$|[?#])' _TESTS = [{ 'url': 'http://www.history.com/topics/valentines-day/history-of-valentines-day/videos/bet-you-didnt-know-valentines-day?m=528e394da93ae&s=undefined&f=1&free=false', @@ -16,6 +22,9 @@ class AENetworksIE(InfoExtractor): 'ext': 'mp4', 'title': "Bet You Didn't Know: Valentine's Day", 'description': 'md5:7b57ea4829b391995b405fa60bd7b5f7', + 'timestamp': 1375819729, + 'upload_date': '20130806', + 'uploader': 'AENE-NEW', }, 'params': { # m3u8 download @@ -25,15 +34,15 @@ class AENetworksIE(InfoExtractor): 'expected_warnings': ['JSON-LD'], }, { 'url': 'http://www.history.com/shows/mountain-men/season-1/episode-1', + 'md5': '8ff93eb073449f151d6b90c0ae1ef0c7', 'info_dict': { 'id': 'eg47EERs_JsZ', 'ext': 'mp4', 'title': 'Winter Is Coming', 'description': 'md5:641f424b7a19d8e24f26dea22cf59d74', - }, - 'params': { - # m3u8 download - 'skip_download': True, + 'timestamp': 1338306241, + 'upload_date': '20120529', + 'uploader': 'AENE-NEW', }, 'add_ie': ['ThePlatform'], }, { @@ -48,7 +57,7 @@ class AENetworksIE(InfoExtractor): }] def _real_extract(self, url): - video_id = self._match_id(url) + page_type, video_id = re.match(self._VALID_URL, url).groups() webpage = self._download_webpage(url, video_id) @@ -56,11 +65,23 @@ class AENetworksIE(InfoExtractor): r'data-href="[^"]*/%s"[^>]+data-release-url="([^"]+)"' % video_id, r"media_url\s*=\s*'([^']+)'" ] - video_url = self._search_regex(video_url_re, webpage, 'video url') + video_url = unescapeHTML(self._search_regex(video_url_re, webpage, 'video url')) + query = {'mbr': 'true'} + if page_type == 'shows': + query['assetTypes'] = 'medium_video_s3' + if 'switch=hds' in video_url: + query['switch'] = 'hls' info = self._search_json_ld(webpage, video_id, fatal=False) info.update({ '_type': 'url_transparent', - 'url': smuggle_url(video_url, {'sig': {'key': 'crazyjava', 'secret': 's3cr3t'}}), + 'url': smuggle_url( + update_url_query(video_url, query), + { + 'sig': { + 'key': 'crazyjava', + 'secret': 's3cr3t'}, + 'force_smil_url': True + }), }) return info diff --git a/youtube_dl/extractor/aftonbladet.py b/youtube_dl/extractor/aftonbladet.py index e0518cf26..d548592fe 100644 --- a/youtube_dl/extractor/aftonbladet.py +++ b/youtube_dl/extractor/aftonbladet.py @@ -6,7 +6,7 @@ from ..utils import int_or_none class AftonbladetIE(InfoExtractor): - _VALID_URL = r'http://tv\.aftonbladet\.se/abtv/articles/(?P[0-9]+)' + _VALID_URL = r'https?://tv\.aftonbladet\.se/abtv/articles/(?P[0-9]+)' _TEST = { 'url': 'http://tv.aftonbladet.se/abtv/articles/36015', 'info_dict': { diff --git a/youtube_dl/extractor/aljazeera.py b/youtube_dl/extractor/aljazeera.py index 5b2c0dc9a..b081695d8 100644 --- a/youtube_dl/extractor/aljazeera.py +++ b/youtube_dl/extractor/aljazeera.py @@ -4,7 +4,7 @@ from .common import InfoExtractor class AlJazeeraIE(InfoExtractor): - _VALID_URL = r'http://www\.aljazeera\.com/programmes/.*?/(?P[^/]+)\.html' + _VALID_URL = r'https?://www\.aljazeera\.com/programmes/.*?/(?P[^/]+)\.html' _TEST = { 'url': 'http://www.aljazeera.com/programmes/the-slum/2014/08/deliverance-201482883754237240.html', @@ -13,24 +13,18 @@ class AlJazeeraIE(InfoExtractor): 'ext': 'mp4', 'title': 'The Slum - Episode 1: Deliverance', 'description': 'As a birth attendant advocating for family planning, Remy is on the frontline of Tondo\'s battle with overcrowding.', - 'uploader': 'Al Jazeera English', + 'uploader_id': '665003303001', + 'timestamp': 1411116829, + 'upload_date': '20140919', }, - 'add_ie': ['BrightcoveLegacy'], + 'add_ie': ['BrightcoveNew'], 'skip': 'Not accessible from Travis CI server', } + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/665003303001/default_default/index.html?videoId=%s' def _real_extract(self, url): program_name = self._match_id(url) webpage = self._download_webpage(url, program_name) brightcove_id = self._search_regex( r'RenderPagesVideo\(\'(.+?)\'', webpage, 'brightcove id') - - return { - '_type': 'url', - 'url': ( - 'brightcove:' - 'playerKey=AQ~~%2CAAAAmtVJIFk~%2CTVGOQ5ZTwJbeMWnq5d_H4MOM57xfzApc' - '&%40videoPlayer={0}'.format(brightcove_id) - ), - 'ie_key': 'BrightcoveLegacy', - } + return self.url_result(self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, 'BrightcoveNew', brightcove_id) diff --git a/youtube_dl/extractor/amp.py b/youtube_dl/extractor/amp.py index 69e6baff7..138fa0808 100644 --- a/youtube_dl/extractor/amp.py +++ b/youtube_dl/extractor/amp.py @@ -69,12 +69,14 @@ class AMPIE(InfoExtractor): self._sort_formats(formats) + timestamp = parse_iso8601(item.get('pubDate'), ' ') or parse_iso8601(item.get('dc-date')) + return { 'id': video_id, 'title': get_media_node('title'), 'description': get_media_node('description'), 'thumbnails': thumbnails, - 'timestamp': parse_iso8601(item.get('pubDate'), ' '), + 'timestamp': timestamp, 'duration': int_or_none(media_content[0].get('@attributes', {}).get('duration')), 'subtitles': subtitles, 'formats': formats, diff --git a/youtube_dl/extractor/animeondemand.py b/youtube_dl/extractor/animeondemand.py index a7d8daf7b..9b01e38f5 100644 --- a/youtube_dl/extractor/animeondemand.py +++ b/youtube_dl/extractor/animeondemand.py @@ -3,10 +3,13 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_urlparse +from ..compat import ( + compat_urlparse, + compat_str, +) from ..utils import ( determine_ext, - encode_dict, + extract_attributes, ExtractorError, sanitized_Request, urlencode_postdata, @@ -18,7 +21,7 @@ class AnimeOnDemandIE(InfoExtractor): _LOGIN_URL = 'https://www.anime-on-demand.de/users/sign_in' _APPLY_HTML5_URL = 'https://www.anime-on-demand.de/html5apply' _NETRC_MACHINE = 'animeondemand' - _TEST = { + _TESTS = [{ 'url': 'https://www.anime-on-demand.de/anime/161', 'info_dict': { 'id': '161', @@ -26,7 +29,19 @@ class AnimeOnDemandIE(InfoExtractor): 'description': 'md5:6681ce3c07c7189d255ac6ab23812d31', }, 'playlist_mincount': 4, - } + }, { + # Film wording is used instead of Episode + 'url': 'https://www.anime-on-demand.de/anime/39', + 'only_matching': True, + }, { + # Episodes without titles + 'url': 'https://www.anime-on-demand.de/anime/162', + 'only_matching': True, + }, { + # ger/jap, Dub/OmU, account required + 'url': 'https://www.anime-on-demand.de/anime/169', + 'only_matching': True, + }] def _login(self): (username, password) = self._get_login_info() @@ -36,6 +51,10 @@ class AnimeOnDemandIE(InfoExtractor): login_page = self._download_webpage( self._LOGIN_URL, None, 'Downloading login page') + if '>Our licensing terms allow the distribution of animes only to German-speaking countries of Europe' in login_page: + self.raise_geo_restricted( + '%s is only available in German-speaking countries of Europe' % self.IE_NAME) + login_form = self._form_hidden_inputs('new_user', login_page) login_form.update({ @@ -51,7 +70,7 @@ class AnimeOnDemandIE(InfoExtractor): post_url = compat_urlparse.urljoin(self._LOGIN_URL, post_url) request = sanitized_Request( - post_url, urlencode_postdata(encode_dict(login_form))) + post_url, urlencode_postdata(login_form)) request.add_header('Referer', self._LOGIN_URL) response = self._download_webpage( @@ -91,14 +110,22 @@ class AnimeOnDemandIE(InfoExtractor): entries = [] - for episode_html in re.findall(r'(?s)]+class="episodebox-title".+?>Episodeninhalt<', webpage): - m = re.search( - r'class="episodebox-title"[^>]+title="Episode (?P\d+) - (?P.+?)"', episode_html) - if not m: + for num, episode_html in enumerate(re.findall( + r'(?s)<h3[^>]+class="episodebox-title".+?>Episodeninhalt<', webpage), 1): + episodebox_title = self._search_regex( + (r'class="episodebox-title"[^>]+title=(["\'])(?P<title>.+?)\1', + r'class="episodebox-title"[^>]+>(?P<title>.+?)<'), + episode_html, 'episodebox title', default=None, group='title') + if not episodebox_title: continue - episode_number = int(m.group('number')) - episode_title = m.group('title') + episode_number = int(self._search_regex( + r'(?:Episode|Film)\s*(\d+)', + episodebox_title, 'episode number', default=num)) + episode_title = self._search_regex( + r'(?:Episode|Film)\s*\d+\s*-\s*(.+)', + episodebox_title, 'episode title', default=None) + video_id = 'episode-%d' % episode_number common_info = { @@ -110,33 +137,86 @@ class AnimeOnDemandIE(InfoExtractor): formats = [] - playlist_url = self._search_regex( - r'data-playlist=(["\'])(?P<url>.+?)\1', - episode_html, 'data playlist', default=None, group='url') - if playlist_url: - request = sanitized_Request( - compat_urlparse.urljoin(url, playlist_url), - headers={ - 'X-Requested-With': 'XMLHttpRequest', - 'X-CSRF-Token': csrf_token, - 'Referer': url, - 'Accept': 'application/json, text/javascript, */*; q=0.01', - }) + for input_ in re.findall( + r'<input[^>]+class=["\'].*?streamstarter_html5[^>]+>', episode_html): + attributes = extract_attributes(input_) + playlist_urls = [] + for playlist_key in ('data-playlist', 'data-otherplaylist'): + playlist_url = attributes.get(playlist_key) + if isinstance(playlist_url, compat_str) and re.match( + r'/?[\da-zA-Z]+', playlist_url): + playlist_urls.append(attributes[playlist_key]) + if not playlist_urls: + continue - playlist = self._download_json( - request, video_id, 'Downloading playlist JSON', fatal=False) - if playlist: - playlist = playlist['playlist'][0] - title = playlist['title'] + lang = attributes.get('data-lang') + lang_note = attributes.get('value') + + for playlist_url in playlist_urls: + kind = self._search_regex( + r'videomaterialurl/\d+/([^/]+)/', + playlist_url, 'media kind', default=None) + format_id_list = [] + if lang: + format_id_list.append(lang) + if kind: + format_id_list.append(kind) + if not format_id_list: + format_id_list.append(compat_str(num)) + format_id = '-'.join(format_id_list) + format_note = ', '.join(filter(None, (kind, lang_note))) + request = sanitized_Request( + compat_urlparse.urljoin(url, playlist_url), + headers={ + 'X-Requested-With': 'XMLHttpRequest', + 'X-CSRF-Token': csrf_token, + 'Referer': url, + 'Accept': 'application/json, text/javascript, */*; q=0.01', + }) + playlist = self._download_json( + request, video_id, 'Downloading %s playlist JSON' % format_id, + fatal=False) + if not playlist: + continue + start_video = playlist.get('startvideo', 0) + playlist = playlist.get('playlist') + if not playlist or not isinstance(playlist, list): + continue + playlist = playlist[start_video] + title = playlist.get('title') + if not title: + continue description = playlist.get('description') for source in playlist.get('sources', []): file_ = source.get('file') - if file_ and determine_ext(file_) == 'm3u8': - formats = self._extract_m3u8_formats( + if not file_: + continue + ext = determine_ext(file_) + format_id_list = [lang, kind] + if ext == 'm3u8': + format_id_list.append('hls') + elif source.get('type') == 'video/dash' or ext == 'mpd': + format_id_list.append('dash') + format_id = '-'.join(filter(None, format_id_list)) + if ext == 'm3u8': + file_formats = self._extract_m3u8_formats( file_, video_id, 'mp4', - entry_protocol='m3u8_native', m3u8_id='hls') + entry_protocol='m3u8_native', m3u8_id=format_id, fatal=False) + elif source.get('type') == 'video/dash' or ext == 'mpd': + continue + file_formats = self._extract_mpd_formats( + file_, video_id, mpd_id=format_id, fatal=False) + else: + continue + for f in file_formats: + f.update({ + 'language': lang, + 'format_note': format_note, + }) + formats.extend(file_formats) if formats: + self._sort_formats(formats) f = common_info.copy() f.update({ 'title': title, @@ -145,16 +225,18 @@ class AnimeOnDemandIE(InfoExtractor): }) entries.append(f) - m = re.search( - r'data-dialog-header=(["\'])(?P<title>.+?)\1[^>]+href=(["\'])(?P<href>.+?)\3[^>]*>Teaser<', - episode_html) - if m: - f = common_info.copy() - f.update({ - 'id': '%s-teaser' % f['id'], - 'title': m.group('title'), - 'url': compat_urlparse.urljoin(url, m.group('href')), - }) - entries.append(f) + # Extract teaser only when full episode is not available + if not formats: + m = re.search( + r'data-dialog-header=(["\'])(?P<title>.+?)\1[^>]+href=(["\'])(?P<href>.+?)\3[^>]*>Teaser<', + episode_html) + if m: + f = common_info.copy() + f.update({ + 'id': '%s-teaser' % f['id'], + 'title': m.group('title'), + 'url': compat_urlparse.urljoin(url, m.group('href')), + }) + entries.append(f) return self.playlist_result(entries, anime_id, anime_title, anime_description) diff --git a/youtube_dl/extractor/aol.py b/youtube_dl/extractor/aol.py index b51eafc45..d4801a25b 100644 --- a/youtube_dl/extractor/aol.py +++ b/youtube_dl/extractor/aol.py @@ -1,24 +1,18 @@ +# coding: utf-8 from __future__ import unicode_literals import re from .common import InfoExtractor +from ..utils import ( + ExtractorError, + int_or_none, +) class AolIE(InfoExtractor): IE_NAME = 'on.aol.com' - _VALID_URL = r'''(?x) - (?: - aol-video:| - http://on\.aol\.com/ - (?: - video/.*-| - playlist/(?P<playlist_display_id>[^/?#]+?)-(?P<playlist_id>[0-9]+)[?#].*_videoid= - ) - ) - (?P<id>[0-9]+) - (?:$|\?) - ''' + _VALID_URL = r'(?:aol-video:|https?://on\.aol\.com/video/.*-)(?P<id>[^/?-]+)' _TESTS = [{ 'url': 'http://on.aol.com/video/u-s--official-warns-of-largest-ever-irs-phone-scam-518167793?icid=OnHomepageC2Wide_MustSee_Img', @@ -27,44 +21,99 @@ class AolIE(InfoExtractor): 'id': '518167793', 'ext': 'mp4', 'title': 'U.S. Official Warns Of \'Largest Ever\' IRS Phone Scam', + 'description': 'A major phone scam has cost thousands of taxpayers more than $1 million, with less than a month until income tax returns are due to the IRS.', + 'timestamp': 1395405060, + 'upload_date': '20140321', + 'uploader': 'Newsy Studio', }, - 'add_ie': ['FiveMin'], + 'params': { + # m3u8 download + 'skip_download': True, + } }, { - 'url': 'http://on.aol.com/playlist/brace-yourself---todays-weirdest-news-152147?icid=OnHomepageC4_Omg_Img#_videoid=518184316', + 'url': 'http://on.aol.com/video/netflix-is-raising-rates-5707d6b8e4b090497b04f706?context=PC:homepage:PL1944:1460189336183', 'info_dict': { - 'id': '152147', - 'title': 'Brace Yourself - Today\'s Weirdest News', + 'id': '5707d6b8e4b090497b04f706', + 'ext': 'mp4', + 'title': 'Netflix is Raising Rates', + 'description': 'Netflix is rewarding millions of it’s long-standing members with an increase in cost. Veuer’s Carly Figueroa has more.', + 'upload_date': '20160408', + 'timestamp': 1460123280, + 'uploader': 'Veuer', }, - 'playlist_mincount': 10, + 'params': { + # m3u8 download + 'skip_download': True, + } }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - playlist_id = mobj.group('playlist_id') - if not playlist_id or self._downloader.params.get('noplaylist'): - return self.url_result('5min:%s' % video_id) + video_id = self._match_id(url) - self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id)) + response = self._download_json( + 'https://feedapi.b2c.on.aol.com/v1.0/app/videos/aolon/%s/details' % video_id, + video_id)['response'] + if response['statusText'] != 'Ok': + raise ExtractorError('%s said: %s' % (self.IE_NAME, response['statusText']), expected=True) - webpage = self._download_webpage(url, playlist_id) - title = self._html_search_regex( - r'<h1 class="video-title[^"]*">(.+?)</h1>', webpage, 'title') - playlist_html = self._search_regex( - r"(?s)<ul\s+class='video-related[^']*'>(.*?)</ul>", webpage, - 'playlist HTML') - entries = [{ - '_type': 'url', - 'url': 'aol-video:%s' % m.group('id'), - 'ie_key': 'Aol', - } for m in re.finditer( - r"<a\s+href='.*videoid=(?P<id>[0-9]+)'\s+class='video-thumb'>", - playlist_html)] + video_data = response['data'] + formats = [] + m3u8_url = video_data.get('videoMasterPlaylist') + if m3u8_url: + formats.extend(self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) + for rendition in video_data.get('renditions', []): + video_url = rendition.get('url') + if not video_url: + continue + ext = rendition.get('format') + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + video_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) + else: + f = { + 'url': video_url, + 'format_id': rendition.get('quality'), + } + mobj = re.search(r'(\d+)x(\d+)', video_url) + if mobj: + f.update({ + 'width': int(mobj.group(1)), + 'height': int(mobj.group(2)), + }) + formats.append(f) + self._sort_formats(formats, ('width', 'height', 'tbr', 'format_id')) return { - '_type': 'playlist', - 'id': playlist_id, - 'display_id': mobj.group('playlist_display_id'), - 'title': title, - 'entries': entries, + 'id': video_id, + 'title': video_data['title'], + 'duration': int_or_none(video_data.get('duration')), + 'timestamp': int_or_none(video_data.get('publishDate')), + 'view_count': int_or_none(video_data.get('views')), + 'description': video_data.get('description'), + 'uploader': video_data.get('videoOwner'), + 'formats': formats, } + + +class AolFeaturesIE(InfoExtractor): + IE_NAME = 'features.aol.com' + _VALID_URL = r'https?://features\.aol\.com/video/(?P<id>[^/?#]+)' + + _TESTS = [{ + 'url': 'http://features.aol.com/video/behind-secret-second-careers-late-night-talk-show-hosts', + 'md5': '7db483bb0c09c85e241f84a34238cc75', + 'info_dict': { + 'id': '519507715', + 'ext': 'mp4', + 'title': 'What To Watch - February 17, 2016', + }, + 'add_ie': ['FiveMin'], + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + return self.url_result(self._search_regex( + r'<script type="text/javascript" src="(https?://[^/]*?5min\.com/Scripts/PlayerSeed\.js[^"]+)"', + webpage, '5min embed url'), 'FiveMin') diff --git a/youtube_dl/extractor/arkenaplay.py b/youtube_dl/extractor/arkenaplay.py index 0061ea196..db01d67bd 100644 --- a/youtube_dl/extractor/arkenaplay.py +++ b/youtube_dl/extractor/arkenaplay.py @@ -148,4 +148,4 @@ class ArkenaPlayIE(InfoExtractor): formats.extend(self._extract_mpd_formats(video_url, video_id, mpd_id='dash', fatal=False)) self._sort_formats(formats) - return formats \ No newline at end of file + return formats diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index efde7e207..ae0f27dcb 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -23,7 +23,7 @@ from ..utils import ( class ArteTvIE(InfoExtractor): - _VALID_URL = r'http://videos\.arte\.tv/(?P<lang>fr|de|en|es)/.*-(?P<id>.*?)\.html' + _VALID_URL = r'https?://videos\.arte\.tv/(?P<lang>fr|de|en|es)/.*-(?P<id>.*?)\.html' IE_NAME = 'arte.tv' def _real_extract(self, url): @@ -121,15 +121,18 @@ class ArteTVPlus7IE(InfoExtractor): json_url = compat_parse_qs( compat_urllib_parse_urlparse(iframe_url).query)['json_url'][0] if json_url: - return self._extract_from_json_url(json_url, video_id, lang) - # Differend kind of embed URL (e.g. + title = self._search_regex( + r'<h3[^>]+title=(["\'])(?P<title>.+?)\1', + webpage, 'title', default=None, group='title') + return self._extract_from_json_url(json_url, video_id, lang, title=title) + # Different kind of embed URL (e.g. # http://www.arte.tv/magazine/trepalium/fr/episode-0406-replay-trepalium) embed_url = self._search_regex( r'<iframe[^>]+src=(["\'])(?P<url>.+?)\1', webpage, 'embed url', group='url') return self.url_result(embed_url) - def _extract_from_json_url(self, json_url, video_id, lang): + def _extract_from_json_url(self, json_url, video_id, lang, title=None): info = self._download_json(json_url, video_id) player_info = info['videoJsonPlayer'] @@ -137,7 +140,7 @@ class ArteTVPlus7IE(InfoExtractor): if not upload_date_str: upload_date_str = (player_info.get('VRA') or player_info.get('VDA') or '').split(' ')[0] - title = player_info['VTI'].strip() + title = (player_info.get('VTI') or title or player_info['VID']).strip() subtitle = player_info.get('VSU', '').strip() if subtitle: title += ' - %s' % subtitle diff --git a/youtube_dl/extractor/atresplayer.py b/youtube_dl/extractor/atresplayer.py index b8f9ae005..d2f388964 100644 --- a/youtube_dl/extractor/atresplayer.py +++ b/youtube_dl/extractor/atresplayer.py @@ -6,16 +6,14 @@ import hashlib import re from .common import InfoExtractor -from ..compat import ( - compat_str, - compat_urllib_parse, -) +from ..compat import compat_str from ..utils import ( - int_or_none, - float_or_none, - sanitized_Request, - xpath_text, ExtractorError, + float_or_none, + int_or_none, + sanitized_Request, + urlencode_postdata, + xpath_text, ) @@ -86,7 +84,7 @@ class AtresPlayerIE(InfoExtractor): } request = sanitized_Request( - self._LOGIN_URL, compat_urllib_parse.urlencode(login_form).encode('utf-8')) + self._LOGIN_URL, urlencode_postdata(login_form)) request.add_header('Content-Type', 'application/x-www-form-urlencoded') response = self._download_webpage( request, None, 'Logging in as %s' % username) diff --git a/youtube_dl/extractor/audimedia.py b/youtube_dl/extractor/audimedia.py index 3b2effa15..aa6925623 100644 --- a/youtube_dl/extractor/audimedia.py +++ b/youtube_dl/extractor/audimedia.py @@ -10,9 +10,9 @@ from ..utils import ( class AudiMediaIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?audimedia\.tv/(?:en|de)/vid/(?P<id>[^/?#]+)' + _VALID_URL = r'https?://(?:www\.)?audi-mediacenter\.com/(?:en|de)/audimediatv/(?P<id>[^/?#]+)' _TEST = { - 'url': 'https://audimedia.tv/en/vid/60-seconds-of-audi-sport-104-2015-wec-bahrain-rookie-test', + 'url': 'https://www.audi-mediacenter.com/en/audimediatv/60-seconds-of-audi-sport-104-2015-wec-bahrain-rookie-test-1467', 'md5': '79a8b71c46d49042609795ab59779b66', 'info_dict': { 'id': '1565', @@ -32,7 +32,10 @@ class AudiMediaIE(InfoExtractor): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - raw_payload = self._search_regex(r'<script[^>]+class="amtv-embed"[^>]+id="([^"]+)"', webpage, 'raw payload') + raw_payload = self._search_regex([ + r'class="amtv-embed"[^>]+id="([^"]+)"', + r'class=\\"amtv-embed\\"[^>]+id=\\"([^"]+)\\"', + ], webpage, 'raw payload') _, stage_mode, video_id, lang = raw_payload.split('-') # TODO: handle s and e stage_mode (live streams and ended live streams) @@ -59,13 +62,19 @@ class AudiMediaIE(InfoExtractor): video_version_url = video_version.get('download_url') or video_version.get('stream_url') if not video_version_url: continue - formats.append({ + f = { 'url': video_version_url, 'width': int_or_none(video_version.get('width')), 'height': int_or_none(video_version.get('height')), 'abr': int_or_none(video_version.get('audio_bitrate')), 'vbr': int_or_none(video_version.get('video_bitrate')), - }) + } + bitrate = self._search_regex(r'(\d+)k', video_version_url, 'bitrate', default=None) + if bitrate: + f.update({ + 'format_id': 'http-%s' % bitrate, + }) + formats.append(f) self._sort_formats(formats) return { diff --git a/youtube_dl/extractor/audioboom.py b/youtube_dl/extractor/audioboom.py new file mode 100644 index 000000000..2ec2d7092 --- /dev/null +++ b/youtube_dl/extractor/audioboom.py @@ -0,0 +1,66 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import float_or_none + + +class AudioBoomIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?audioboom\.com/boos/(?P<id>[0-9]+)' + _TEST = { + 'url': 'https://audioboom.com/boos/4279833-3-09-2016-czaban-hour-3?t=0', + 'md5': '63a8d73a055c6ed0f1e51921a10a5a76', + 'info_dict': { + 'id': '4279833', + 'ext': 'mp3', + 'title': '3/09/2016 Czaban Hour 3', + 'description': 'Guest: Nate Davis - NFL free agency, Guest: Stan Gans', + 'duration': 2245.72, + 'uploader': 'Steve Czaban', + 'uploader_url': 're:https?://(?:www\.)?audioboom\.com/channel/steveczabanyahoosportsradio', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + clip = None + + clip_store = self._parse_json( + self._search_regex( + r'data-new-clip-store=(["\'])(?P<json>{.*?"clipId"\s*:\s*%s.*?})\1' % video_id, + webpage, 'clip store', default='{}', group='json'), + video_id, fatal=False) + if clip_store: + clips = clip_store.get('clips') + if clips and isinstance(clips, list) and isinstance(clips[0], dict): + clip = clips[0] + + def from_clip(field): + if clip: + clip.get(field) + + audio_url = from_clip('clipURLPriorToLoading') or self._og_search_property( + 'audio', webpage, 'audio url') + title = from_clip('title') or self._og_search_title(webpage) + description = from_clip('description') or self._og_search_description(webpage) + + duration = float_or_none(from_clip('duration') or self._html_search_meta( + 'weibo:audio:duration', webpage)) + + uploader = from_clip('author') or self._og_search_property( + 'audio:artist', webpage, 'uploader', fatal=False) + uploader_url = from_clip('author_url') or self._html_search_meta( + 'audioboo:channel', webpage, 'uploader url') + + return { + 'id': video_id, + 'url': audio_url, + 'title': title, + 'description': description, + 'duration': duration, + 'uploader': uploader, + 'uploader_url': uploader_url, + } diff --git a/youtube_dl/extractor/azubu.py b/youtube_dl/extractor/azubu.py index 011edf128..efa624de1 100644 --- a/youtube_dl/extractor/azubu.py +++ b/youtube_dl/extractor/azubu.py @@ -98,7 +98,7 @@ class AzubuIE(InfoExtractor): class AzubuLiveIE(InfoExtractor): - _VALID_URL = r'http://www.azubu.tv/(?P<id>[^/]+)$' + _VALID_URL = r'https?://www.azubu.tv/(?P<id>[^/]+)$' _TEST = { 'url': 'http://www.azubu.tv/MarsTVMDLen', @@ -120,6 +120,7 @@ class AzubuLiveIE(InfoExtractor): bc_info = self._download_json(req, user) m3u8_url = next(source['src'] for source in bc_info['sources'] if source['container'] == 'M2TS') formats = self._extract_m3u8_formats(m3u8_url, user, ext='mp4') + self._sort_formats(formats) return { 'id': info['id'], diff --git a/youtube_dl/extractor/baidu.py b/youtube_dl/extractor/baidu.py index 76b21e596..234a661d3 100644 --- a/youtube_dl/extractor/baidu.py +++ b/youtube_dl/extractor/baidu.py @@ -9,7 +9,7 @@ from ..utils import unescapeHTML class BaiduVideoIE(InfoExtractor): IE_DESC = '百度视频' - _VALID_URL = r'http://v\.baidu\.com/(?P<type>[a-z]+)/(?P<id>\d+)\.htm' + _VALID_URL = r'https?://v\.baidu\.com/(?P<type>[a-z]+)/(?P<id>\d+)\.htm' _TESTS = [{ 'url': 'http://v.baidu.com/comic/1069.htm?frp=bdbrand&q=%E4%B8%AD%E5%8D%8E%E5%B0%8F%E5%BD%93%E5%AE%B6', 'info_dict': { diff --git a/youtube_dl/extractor/bambuser.py b/youtube_dl/extractor/bambuser.py index da986e063..0eb1930c2 100644 --- a/youtube_dl/extractor/bambuser.py +++ b/youtube_dl/extractor/bambuser.py @@ -4,15 +4,13 @@ import re import itertools from .common import InfoExtractor -from ..compat import ( - compat_urllib_parse, - compat_str, -) +from ..compat import compat_str from ..utils import ( ExtractorError, - int_or_none, float_or_none, + int_or_none, sanitized_Request, + urlencode_postdata, ) @@ -58,7 +56,7 @@ class BambuserIE(InfoExtractor): } request = sanitized_Request( - self._LOGIN_URL, compat_urllib_parse.urlencode(login_form).encode('utf-8')) + self._LOGIN_URL, urlencode_postdata(login_form)) request.add_header('Referer', self._LOGIN_URL) response = self._download_webpage( request, None, 'Logging in as %s' % username) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 9d0dfb961..425f08f2b 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -10,7 +10,6 @@ from ..utils import ( int_or_none, parse_duration, parse_iso8601, - remove_end, unescapeHTML, ) from ..compat import ( @@ -329,6 +328,7 @@ class BBCCoUkIE(InfoExtractor): 'format_id': '%s_%s' % (service, format['format_id']), 'abr': abr, 'acodec': acodec, + 'vcodec': 'none', }) formats.extend(conn_formats) return formats @@ -561,7 +561,7 @@ class BBCIE(BBCCoUkIE): 'url': 'http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460', 'info_dict': { 'id': '3662a707-0af9-3149-963f-47bea720b460', - 'title': 'BBC Blogs - Adam Curtis - BUGGER', + 'title': 'BUGGER', }, 'playlist_count': 18, }, { @@ -670,9 +670,17 @@ class BBCIE(BBCCoUkIE): 'url': 'http://www.bbc.com/sport/0/football/34475836', 'info_dict': { 'id': '34475836', - 'title': 'What Liverpool can expect from Klopp', + 'title': 'Jurgen Klopp: Furious football from a witty and winning coach', }, 'playlist_count': 3, + }, { + # school report article with single video + 'url': 'http://www.bbc.co.uk/schoolreport/35744779', + 'info_dict': { + 'id': '35744779', + 'title': 'School which breaks down barriers in Jerusalem', + }, + 'playlist_count': 1, }, { # single video with playlist URL from weather section 'url': 'http://www.bbc.com/weather/features/33601775', @@ -681,6 +689,10 @@ class BBCIE(BBCCoUkIE): # custom redirection to www.bbc.com 'url': 'http://www.bbc.co.uk/news/science-environment-33661876', 'only_matching': True, + }, { + # single video article embedded with data-media-vpid + 'url': 'http://www.bbc.co.uk/sport/rowing/35908187', + 'only_matching': True, }] @classmethod @@ -735,8 +747,17 @@ class BBCIE(BBCCoUkIE): json_ld_info = self._search_json_ld(webpage, playlist_id, default=None) timestamp = json_ld_info.get('timestamp') + playlist_title = json_ld_info.get('title') - playlist_description = json_ld_info.get('description') + if not playlist_title: + playlist_title = self._og_search_title( + webpage, default=None) or self._html_search_regex( + r'<title>(.+?)', webpage, 'playlist title', default=None) + if playlist_title: + playlist_title = re.sub(r'(.+)\s*-\s*BBC.*?$', r'\1', playlist_title).strip() + + playlist_description = json_ld_info.get( + 'description') or self._og_search_description(webpage, default=None) if not timestamp: timestamp = parse_iso8601(self._search_regex( @@ -797,13 +818,11 @@ class BBCIE(BBCCoUkIE): playlist.get('progressiveDownloadUrl'), playlist_id, timestamp)) if entries: - playlist_title = playlist_title or remove_end(self._og_search_title(webpage), ' - BBC News') - playlist_description = playlist_description or self._og_search_description(webpage, default=None) return self.playlist_result(entries, playlist_id, playlist_title, playlist_description) # single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret) programme_id = self._search_regex( - [r'data-video-player-vpid="(%s)"' % self._ID_REGEX, + [r'data-(?:video-player|media)-vpid="(%s)"' % self._ID_REGEX, r']+name="externalIdentifier"[^>]+value="(%s)"' % self._ID_REGEX, r'videoId\s*:\s*["\'](%s)["\']' % self._ID_REGEX], webpage, 'vpid', default=None) @@ -829,10 +848,6 @@ class BBCIE(BBCCoUkIE): 'subtitles': subtitles, } - playlist_title = self._html_search_regex( - r'(.*?)(?:\s*-\s*BBC [^ ]+)?', webpage, 'playlist title') - playlist_description = self._og_search_description(webpage, default=None) - def extract_all(pattern): return list(filter(None, map( lambda s: self._parse_json(s, playlist_id, fatal=False), @@ -932,7 +947,7 @@ class BBCIE(BBCCoUkIE): class BBCCoUkArticleIE(InfoExtractor): - _VALID_URL = 'http://www.bbc.co.uk/programmes/articles/(?P[a-zA-Z0-9]+)' + _VALID_URL = r'https?://www.bbc.co.uk/programmes/articles/(?P[a-zA-Z0-9]+)' IE_NAME = 'bbc.co.uk:article' IE_DESC = 'BBC articles' diff --git a/youtube_dl/extractor/beeg.py b/youtube_dl/extractor/beeg.py index 34c2a756f..956c7680e 100644 --- a/youtube_dl/extractor/beeg.py +++ b/youtube_dl/extractor/beeg.py @@ -33,8 +33,33 @@ class BeegIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + cpl_url = self._search_regex( + r']+src=(["\'])(?P(?:https?:)?//static\.beeg\.com/cpl/\d+\.js.*?)\1', + webpage, 'cpl', default=None, group='url') + + beeg_version, beeg_salt = [None] * 2 + + if cpl_url: + cpl = self._download_webpage( + self._proto_relative_url(cpl_url), video_id, + 'Downloading cpl JS', fatal=False) + if cpl: + beeg_version = self._search_regex( + r'beeg_version\s*=\s*(\d+)', cpl, + 'beeg version', default=None) or self._search_regex( + r'/(\d+)\.js', cpl_url, 'beeg version', default=None) + beeg_salt = self._search_regex( + r'beeg_salt\s*=\s*(["\'])(?P.+?)\1', cpl, 'beeg beeg_salt', + default=None, group='beeg_salt') + + beeg_version = beeg_version or '1750' + beeg_salt = beeg_salt or 'MIDtGaw96f0N1kMMAM1DE46EC9pmFr' + video = self._download_json( - 'https://api.beeg.com/api/v5/video/%s' % video_id, video_id) + 'http://api.beeg.com/api/v6/%s/video/%s' % (beeg_version, video_id), + video_id) def split(o, e): def cut(s, x): @@ -50,8 +75,8 @@ class BeegIE(InfoExtractor): return n def decrypt_key(key): - # Reverse engineered from http://static.beeg.com/cpl/1105.js - a = '5ShMcIQlssOd7zChAIOlmeTZDaUxULbJRnywYaiB' + # Reverse engineered from http://static.beeg.com/cpl/1738.js + a = beeg_salt e = compat_urllib_parse_unquote(key) o = ''.join([ compat_chr(compat_ord(e[n]) - compat_ord(a[n % len(a)]) % 21) @@ -101,5 +126,5 @@ class BeegIE(InfoExtractor): 'duration': duration, 'tags': tags, 'formats': formats, - 'age_limit': 18, + 'age_limit': self._rta_search(webpage), } diff --git a/youtube_dl/extractor/behindkink.py b/youtube_dl/extractor/behindkink.py index 1bdc25812..9bca853b3 100644 --- a/youtube_dl/extractor/behindkink.py +++ b/youtube_dl/extractor/behindkink.py @@ -8,7 +8,7 @@ from ..utils import url_basename class BehindKinkIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?behindkink\.com/(?P[0-9]{4})/(?P[0-9]{2})/(?P[0-9]{2})/(?P[^/#?_]+)' + _VALID_URL = r'https?://(?:www\.)?behindkink\.com/(?P[0-9]{4})/(?P[0-9]{2})/(?P[0-9]{2})/(?P[^/#?_]+)' _TEST = { 'url': 'http://www.behindkink.com/2014/12/05/what-are-you-passionate-about-marley-blaze/', 'md5': '507b57d8fdcd75a41a9a7bdb7989c762', diff --git a/youtube_dl/extractor/bet.py b/youtube_dl/extractor/bet.py index 03dad4636..986245bf0 100644 --- a/youtube_dl/extractor/bet.py +++ b/youtube_dl/extractor/bet.py @@ -94,6 +94,7 @@ class BetIE(InfoExtractor): xpath_with_ns('./media:thumbnail', NS_MAP)).get('url') formats = self._extract_smil_formats(smil_url, display_id) + self._sort_formats(formats) return { 'id': video_id, diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py index 59beb11bc..8baff2041 100644 --- a/youtube_dl/extractor/bilibili.py +++ b/youtube_dl/extractor/bilibili.py @@ -14,7 +14,7 @@ from ..utils import ( class BiliBiliIE(InfoExtractor): - _VALID_URL = r'http://www\.bilibili\.(?:tv|com)/video/av(?P\d+)(?:/index_(?P\d+).html)?' + _VALID_URL = r'https?://www\.bilibili\.(?:tv|com)/video/av(?P\d+)(?:/index_(?P\d+).html)?' _TESTS = [{ 'url': 'http://www.bilibili.tv/video/av1074402/', diff --git a/youtube_dl/extractor/biobiochiletv.py b/youtube_dl/extractor/biobiochiletv.py new file mode 100644 index 000000000..133228133 --- /dev/null +++ b/youtube_dl/extractor/biobiochiletv.py @@ -0,0 +1,86 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import remove_end + + +class BioBioChileTVIE(InfoExtractor): + _VALID_URL = r'https?://tv\.biobiochile\.cl/notas/(?:[^/]+/)+(?P[^/]+)\.shtml' + + _TESTS = [{ + 'url': 'http://tv.biobiochile.cl/notas/2015/10/21/sobre-camaras-y-camarillas-parlamentarias.shtml', + 'md5': '26f51f03cf580265defefb4518faec09', + 'info_dict': { + 'id': 'sobre-camaras-y-camarillas-parlamentarias', + 'ext': 'mp4', + 'title': 'Sobre Cámaras y camarillas parlamentarias', + 'thumbnail': 're:^https?://.*\.jpg$', + 'uploader': 'Fernando Atria', + }, + }, { + # different uploader layout + 'url': 'http://tv.biobiochile.cl/notas/2016/03/18/natalia-valdebenito-repasa-a-diputado-hasbun-paso-a-la-categoria-de-hablar-brutalidades.shtml', + 'md5': 'edc2e6b58974c46d5b047dea3c539ff3', + 'info_dict': { + 'id': 'natalia-valdebenito-repasa-a-diputado-hasbun-paso-a-la-categoria-de-hablar-brutalidades', + 'ext': 'mp4', + 'title': 'Natalia Valdebenito repasa a diputado Hasbún: Pasó a la categoría de hablar brutalidades', + 'thumbnail': 're:^https?://.*\.jpg$', + 'uploader': 'Piangella Obrador', + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://tv.biobiochile.cl/notas/2015/10/22/ninos-transexuales-de-quien-es-la-decision.shtml', + 'only_matching': True, + }, { + 'url': 'http://tv.biobiochile.cl/notas/2015/10/21/exclusivo-hector-pinto-formador-de-chupete-revela-version-del-ex-delantero-albo.shtml', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + title = remove_end(self._og_search_title(webpage), ' - BioBioChile TV') + + file_url = self._search_regex( + r'loadFWPlayerVideo\([^,]+,\s*(["\'])(?P.+?)\1', + webpage, 'file url', group='url') + + base_url = self._search_regex( + r'file\s*:\s*(["\'])(?P.+?)\1\s*\+\s*fileURL', webpage, + 'base url', default='http://unlimited2-cl.digitalproserver.com/bbtv/', + group='url') + + formats = self._extract_m3u8_formats( + '%s%s/playlist.m3u8' % (base_url, file_url), video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls', fatal=False) + f = { + 'url': '%s%s' % (base_url, file_url), + 'format_id': 'http', + 'protocol': 'http', + 'preference': 1, + } + if formats: + f_copy = formats[-1].copy() + f_copy.update(f) + f = f_copy + formats.append(f) + self._sort_formats(formats) + + thumbnail = self._og_search_thumbnail(webpage) + uploader = self._html_search_regex( + r']+href=["\']https?://busca\.biobiochile\.cl/author[^>]+>(.+?)', + webpage, 'uploader', fatal=False) + + return { + 'id': video_id, + 'title': title, + 'thumbnail': thumbnail, + 'uploader': uploader, + 'formats': formats, + } diff --git a/youtube_dl/extractor/bleacherreport.py b/youtube_dl/extractor/bleacherreport.py index 38bda3af5..7a8e1f60b 100644 --- a/youtube_dl/extractor/bleacherreport.py +++ b/youtube_dl/extractor/bleacherreport.py @@ -28,10 +28,10 @@ class BleacherReportIE(InfoExtractor): 'add_ie': ['Ooyala'], }, { 'url': 'http://bleacherreport.com/articles/2586817-aussie-golfers-get-fright-of-their-lives-after-being-chased-by-angry-kangaroo', - 'md5': 'af5f90dc9c7ba1c19d0a3eac806bbf50', + 'md5': '6a5cd403418c7b01719248ca97fb0692', 'info_dict': { 'id': '2586817', - 'ext': 'mp4', + 'ext': 'webm', 'title': 'Aussie Golfers Get Fright of Their Lives After Being Chased by Angry Kangaroo', 'timestamp': 1446839961, 'uploader': 'Sean Fay', @@ -93,10 +93,14 @@ class BleacherReportCMSIE(AMPIE): 'md5': '8c2c12e3af7805152675446c905d159b', 'info_dict': { 'id': '8fd44c2f-3dc5-4821-9118-2c825a98c0e1', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Cena vs. Rollins Would Expose the Heavyweight Division', 'description': 'md5:984afb4ade2f9c0db35f3267ed88b36e', }, + 'params': { + # m3u8 download + 'skip_download': True, + }, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/bokecc.py b/youtube_dl/extractor/bokecc.py new file mode 100644 index 000000000..86a7f4d7d --- /dev/null +++ b/youtube_dl/extractor/bokecc.py @@ -0,0 +1,60 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_parse_qs +from ..utils import ExtractorError + + +class BokeCCBaseIE(InfoExtractor): + def _extract_bokecc_formats(self, webpage, video_id, format_id=None): + player_params_str = self._html_search_regex( + r'<(?:script|embed)[^>]+src="http://p\.bokecc\.com/player\?([^"]+)', + webpage, 'player params') + + player_params = compat_parse_qs(player_params_str) + + info_xml = self._download_xml( + 'http://p.bokecc.com/servlet/playinfo?uid=%s&vid=%s&m=1' % ( + player_params['siteid'][0], player_params['vid'][0]), video_id) + + formats = [{ + 'format_id': format_id, + 'url': quality.find('./copy').attrib['playurl'], + 'preference': int(quality.attrib['value']), + } for quality in info_xml.findall('./video/quality')] + + self._sort_formats(formats) + + return formats + + +class BokeCCIE(BokeCCBaseIE): + _IE_DESC = 'CC视频' + _VALID_URL = r'https?://union\.bokecc\.com/playvideo\.bo\?(?P.*)' + + _TESTS = [{ + 'url': 'http://union.bokecc.com/playvideo.bo?vid=E44D40C15E65EA30&uid=CD0C5D3C8614B28B', + 'info_dict': { + 'id': 'CD0C5D3C8614B28B_E44D40C15E65EA30', + 'ext': 'flv', + 'title': 'BokeCC Video', + }, + }] + + def _real_extract(self, url): + qs = compat_parse_qs(re.match(self._VALID_URL, url).group('query')) + if not qs.get('vid') or not qs.get('uid'): + raise ExtractorError('Invalid URL', expected=True) + + video_id = '%s_%s' % (qs['uid'][0], qs['vid'][0]) + + webpage = self._download_webpage(url, video_id) + + return { + 'id': video_id, + 'title': 'BokeCC Video', # no title provided in the webpage + 'formats': self._extract_bokecc_formats(webpage, video_id), + } diff --git a/youtube_dl/extractor/bpb.py b/youtube_dl/extractor/bpb.py index c28e72927..6ad45a1e6 100644 --- a/youtube_dl/extractor/bpb.py +++ b/youtube_dl/extractor/bpb.py @@ -12,7 +12,7 @@ from ..utils import ( class BpbIE(InfoExtractor): IE_DESC = 'Bundeszentrale für politische Bildung' - _VALID_URL = r'http://www\.bpb\.de/mediathek/(?P[0-9]+)/' + _VALID_URL = r'https?://www\.bpb\.de/mediathek/(?P[0-9]+)/' _TEST = { 'url': 'http://www.bpb.de/mediathek/297/joachim-gauck-zu-1989-und-die-erinnerung-an-die-ddr', diff --git a/youtube_dl/extractor/bravotv.py b/youtube_dl/extractor/bravotv.py new file mode 100644 index 000000000..541c76944 --- /dev/null +++ b/youtube_dl/extractor/bravotv.py @@ -0,0 +1,31 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import smuggle_url + + +class BravoTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?bravotv\.com/(?:[^/]+/)+videos/(?P[^/?]+)' + _TEST = { + 'url': 'http://www.bravotv.com/last-chance-kitchen/season-5/videos/lck-ep-12-fishy-finale', + 'md5': 'd60cdf68904e854fac669bd26cccf801', + 'info_dict': { + 'id': 'LitrBdX64qLn', + 'ext': 'mp4', + 'title': 'Last Chance Kitchen Returns', + 'description': 'S13: Last Chance Kitchen Returns for Top Chef Season 13', + 'timestamp': 1448926740, + 'upload_date': '20151130', + 'uploader': 'NBCU-BRAV', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + account_pid = self._search_regex(r'"account_pid"\s*:\s*"([^"]+)"', webpage, 'account pid') + release_pid = self._search_regex(r'"release_pid"\s*:\s*"([^"]+)"', webpage, 'release pid') + return self.url_result(smuggle_url( + 'http://link.theplatform.com/s/%s/%s?mbr=true&switch=progressive' % (account_pid, release_pid), + {'force_smil_url': True}), 'ThePlatform', release_pid) diff --git a/youtube_dl/extractor/breakcom.py b/youtube_dl/extractor/breakcom.py index aa08051b1..725859b4d 100644 --- a/youtube_dl/extractor/breakcom.py +++ b/youtube_dl/extractor/breakcom.py @@ -11,7 +11,7 @@ from ..utils import ( class BreakIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?break\.com/video/(?:[^/]+/)*.+-(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?break\.com/video/(?:[^/]+/)*.+-(?P\d+)' _TESTS = [{ 'url': 'http://www.break.com/video/when-girls-act-like-guys-2468056', 'info_dict': { diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index c947337f9..c718cf385 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -9,10 +9,10 @@ from ..compat import ( compat_etree_fromstring, compat_parse_qs, compat_str, - compat_urllib_parse, compat_urllib_parse_urlparse, compat_urlparse, compat_xml_parse_error, + compat_HTTPError, ) from ..utils import ( determine_ext, @@ -23,16 +23,16 @@ from ..utils import ( js_to_json, int_or_none, parse_iso8601, - sanitized_Request, unescapeHTML, unsmuggle_url, + update_url_query, ) class BrightcoveLegacyIE(InfoExtractor): IE_NAME = 'brightcove:legacy' _VALID_URL = r'(?:https?://.*brightcove\.com/(services|viewer).*?\?|brightcove:)(?P.*)' - _FEDERATED_URL_TEMPLATE = 'http://c.brightcove.com/services/viewer/htmlFederated?%s' + _FEDERATED_URL = 'http://c.brightcove.com/services/viewer/htmlFederated' _TESTS = [ { @@ -46,6 +46,9 @@ class BrightcoveLegacyIE(InfoExtractor): 'title': 'Xavier Sala i Martín: “Un banc que no presta és un banc zombi que no serveix per a res”', 'uploader': '8TV', 'description': 'md5:a950cc4285c43e44d763d036710cd9cd', + 'timestamp': 1368213670, + 'upload_date': '20130510', + 'uploader_id': '1589608506001', } }, { @@ -57,6 +60,9 @@ class BrightcoveLegacyIE(InfoExtractor): 'title': 'JVMLS 2012: Arrays 2.0 - Opportunities and Challenges', 'description': 'John Rose speaks at the JVM Language Summit, August 1, 2012.', 'uploader': 'Oracle', + 'timestamp': 1344975024, + 'upload_date': '20120814', + 'uploader_id': '1460825906', }, }, { @@ -68,6 +74,9 @@ class BrightcoveLegacyIE(InfoExtractor): 'title': 'This Bracelet Acts as a Personal Thermostat', 'description': 'md5:547b78c64f4112766ccf4e151c20b6a0', 'uploader': 'Mashable', + 'timestamp': 1382041798, + 'upload_date': '20131017', + 'uploader_id': '1130468786001', }, }, { @@ -85,14 +94,17 @@ class BrightcoveLegacyIE(InfoExtractor): { # test flv videos served by akamaihd.net # From http://www.redbull.com/en/bike/stories/1331655643987/replay-uci-dh-world-cup-2014-from-fort-william - 'url': 'http://c.brightcove.com/services/viewer/htmlFederated?%40videoPlayer=ref%3ABC2996102916001&linkBaseURL=http%3A%2F%2Fwww.redbull.com%2Fen%2Fbike%2Fvideos%2F1331655630249%2Freplay-uci-fort-william-2014-dh&playerKey=AQ%7E%7E%2CAAAApYJ7UqE%7E%2Cxqr_zXk0I-zzNndy8NlHogrCb5QdyZRf&playerID=1398061561001#__youtubedl_smuggle=%7B%22Referer%22%3A+%22http%3A%2F%2Fwww.redbull.com%2Fen%2Fbike%2Fstories%2F1331655643987%2Freplay-uci-dh-world-cup-2014-from-fort-william%22%7D', + 'url': 'http://c.brightcove.com/services/viewer/htmlFederated?%40videoPlayer=ref%3Aevent-stream-356&linkBaseURL=http%3A%2F%2Fwww.redbull.com%2Fen%2Fbike%2Fvideos%2F1331655630249%2Freplay-uci-fort-william-2014-dh&playerKey=AQ%7E%7E%2CAAAApYJ7UqE%7E%2Cxqr_zXk0I-zzNndy8NlHogrCb5QdyZRf&playerID=1398061561001#__youtubedl_smuggle=%7B%22Referer%22%3A+%22http%3A%2F%2Fwww.redbull.com%2Fen%2Fbike%2Fstories%2F1331655643987%2Freplay-uci-dh-world-cup-2014-from-fort-william%22%7D', # The md5 checksum changes on each download 'info_dict': { - 'id': '2996102916001', + 'id': '3750436379001', 'ext': 'flv', 'title': 'UCI MTB World Cup 2014: Fort William, UK - Downhill Finals', - 'uploader': 'Red Bull TV', + 'uploader': 'RBTV Old (do not use)', 'description': 'UCI MTB World Cup 2014: Fort William, UK - Downhill Finals', + 'timestamp': 1409122195, + 'upload_date': '20140827', + 'uploader_id': '710858724001', }, }, { @@ -106,6 +118,12 @@ class BrightcoveLegacyIE(InfoExtractor): 'playlist_mincount': 7, }, ] + FLV_VCODECS = { + 1: 'SORENSON', + 2: 'ON2', + 3: 'H264', + 4: 'VP8', + } @classmethod def _build_brighcove_url(cls, object_str): @@ -136,13 +154,16 @@ class BrightcoveLegacyIE(InfoExtractor): else: flashvars = {} + data_url = object_doc.attrib.get('data', '') + data_url_params = compat_parse_qs(compat_urllib_parse_urlparse(data_url).query) + def find_param(name): if name in flashvars: return flashvars[name] node = find_xpath_attr(object_doc, './param', 'name', name) if node is not None: return node.attrib['value'] - return None + return data_url_params.get(name) params = {} @@ -155,8 +176,8 @@ class BrightcoveLegacyIE(InfoExtractor): # Not all pages define this value if playerKey is not None: params['playerKey'] = playerKey - # The three fields hold the id of the video - videoPlayer = find_param('@videoPlayer') or find_param('videoId') or find_param('videoID') + # These fields hold the id of the video + videoPlayer = find_param('@videoPlayer') or find_param('videoId') or find_param('videoID') or find_param('@videoList') if videoPlayer is not None: params['@videoPlayer'] = videoPlayer linkBase = find_param('linkBaseURL') @@ -184,8 +205,7 @@ class BrightcoveLegacyIE(InfoExtractor): @classmethod def _make_brightcove_url(cls, params): - data = compat_urllib_parse.urlencode(params) - return cls._FEDERATED_URL_TEMPLATE % data + return update_url_query(cls._FEDERATED_URL, params) @classmethod def _extract_brightcove_url(cls, webpage): @@ -239,7 +259,7 @@ class BrightcoveLegacyIE(InfoExtractor): # We set the original url as the default 'Referer' header referer = smuggled_data.get('Referer', url) return self._get_video_info( - videoPlayer[0], query_str, query, referer=referer) + videoPlayer[0], query, referer=referer) elif 'playerKey' in query: player_key = query['playerKey'] return self._get_playlist_info(player_key[0]) @@ -248,15 +268,14 @@ class BrightcoveLegacyIE(InfoExtractor): 'Cannot find playerKey= variable. Did you forget quotes in a shell invocation?', expected=True) - def _get_video_info(self, video_id, query_str, query, referer=None): - request_url = self._FEDERATED_URL_TEMPLATE % query_str - req = sanitized_Request(request_url) + def _get_video_info(self, video_id, query, referer=None): + headers = {} linkBase = query.get('linkBaseURL') if linkBase is not None: referer = linkBase[0] if referer is not None: - req.add_header('Referer', referer) - webpage = self._download_webpage(req, video_id) + headers['Referer'] = referer + webpage = self._download_webpage(self._FEDERATED_URL, video_id, headers=headers, query=query) error_msg = self._html_search_regex( r"

We're sorry.

([\s\n]*

.*?

)+", webpage, @@ -288,15 +307,19 @@ class BrightcoveLegacyIE(InfoExtractor): playlist_title=playlist_info['mediaCollectionDTO']['displayName']) def _extract_video_info(self, video_info): + publisher_id = video_info.get('publisherId') info = { 'id': compat_str(video_info['id']), 'title': video_info['displayName'].strip(), 'description': video_info.get('shortDescription'), 'thumbnail': video_info.get('videoStillURL') or video_info.get('thumbnailURL'), 'uploader': video_info.get('publisherName'), + 'uploader_id': compat_str(publisher_id) if publisher_id else None, + 'duration': float_or_none(video_info.get('length'), 1000), + 'timestamp': int_or_none(video_info.get('creationDate'), 1000), } - renditions = video_info.get('renditions') + renditions = video_info.get('renditions', []) + video_info.get('IOSRenditions', []) if renditions: formats = [] for rend in renditions: @@ -317,19 +340,42 @@ class BrightcoveLegacyIE(InfoExtractor): ext = 'flv' if ext is None: ext = determine_ext(url) - size = rend.get('size') - formats.append({ + tbr = int_or_none(rend.get('encodingRate'), 1000), + a_format = { + 'format_id': 'http%s' % ('-%s' % tbr if tbr else ''), 'url': url, 'ext': ext, - 'height': rend.get('frameHeight'), - 'width': rend.get('frameWidth'), - 'filesize': size if size != 0 else None, - }) + 'filesize': int_or_none(rend.get('size')) or None, + 'tbr': tbr, + } + if rend.get('audioOnly'): + a_format.update({ + 'vcodec': 'none', + }) + else: + a_format.update({ + 'height': int_or_none(rend.get('frameHeight')), + 'width': int_or_none(rend.get('frameWidth')), + 'vcodec': rend.get('videoCodec'), + }) + + # m3u8 manifests with remote == false are media playlists + # Not calling _extract_m3u8_formats here to save network traffic + if ext == 'm3u8': + a_format.update({ + 'format_id': 'hls%s' % ('-%s' % tbr if tbr else ''), + 'ext': 'mp4', + 'protocol': 'm3u8', + }) + + formats.append(a_format) self._sort_formats(formats) info['formats'] = formats elif video_info.get('FLVFullLengthURL') is not None: info.update({ 'url': video_info['FLVFullLengthURL'], + 'vcodec': self.FLV_VCODECS.get(video_info.get('FLVFullCodec')), + 'filesize': int_or_none(video_info.get('FLVFullSize')), }) if self._downloader.params.get('include_ads', False): @@ -355,7 +401,7 @@ class BrightcoveLegacyIE(InfoExtractor): class BrightcoveNewIE(InfoExtractor): IE_NAME = 'brightcove:new' - _VALID_URL = r'https?://players\.brightcove\.net/(?P\d+)/(?P[^/]+)_(?P[^/]+)/index\.html\?.*videoId=(?P(?:ref:)?\d+)' + _VALID_URL = r'https?://players\.brightcove\.net/(?P\d+)/(?P[^/]+)_(?P[^/]+)/index\.html\?.*videoId=(?P\d+|ref:[^&]+)' _TESTS = [{ 'url': 'http://players.brightcove.net/929656772001/e41d32dc-ec74-459e-a845-6c69f7b724ea_default/index.html?videoId=4463358922001', 'md5': 'c8100925723840d4b0d243f7025703be', @@ -385,12 +431,17 @@ class BrightcoveNewIE(InfoExtractor): 'formats': 'mincount:41', }, 'params': { + # m3u8 download 'skip_download': True, } }, { # ref: prefixed video id 'url': 'http://players.brightcove.net/3910869709001/21519b5c-4b3b-4363-accb-bdc8f358f823_default/index.html?videoId=ref:7069442', 'only_matching': True, + }, { + # non numeric ref: prefixed video id + 'url': 'http://players.brightcove.net/710858724001/default_default/index.html?videoId=ref:event-stream-356', + 'only_matching': True, }] @staticmethod @@ -410,8 +461,8 @@ class BrightcoveNewIE(InfoExtractor): # Look for iframe embeds [1] for _, url in re.findall( - r']+src=(["\'])((?:https?:)//players\.brightcove\.net/\d+/[^/]+/index\.html.+?)\1', webpage): - entries.append(url) + r']+src=(["\'])((?:https?:)?//players\.brightcove\.net/\d+/[^/]+/index\.html.+?)\1', webpage): + entries.append(url if url.startswith('http') else 'http:' + url) # Look for embed_in_page embeds [2] for video_id, account_id, player_id, embed in re.findall( @@ -420,11 +471,11 @@ class BrightcoveNewIE(InfoExtractor): # According to [4] data-video-id may be prefixed with ref: r'''(?sx) ]+ - data-video-id=["\']((?:ref:)?\d+)["\'][^>]*>.*? + data-video-id=["\'](\d+|ref:[^"\']+)["\'][^>]*>.*? .*? ]+ src=["\'](?:https?:)?//players\.brightcove\.net/ - (\d+)/([\da-f-]+)_([^/]+)/index\.min\.js + (\d+)/([^/]+)_([^/]+)/index(?:\.min)?\.js ''', webpage): entries.append( 'http://players.brightcove.net/%s/%s_%s/index.html?videoId=%s' @@ -454,24 +505,33 @@ class BrightcoveNewIE(InfoExtractor): r'policyKey\s*:\s*(["\'])(?P.+?)\1', webpage, 'policy key', group='pk') - req = sanitized_Request( - 'https://edge.api.brightcove.com/playback/v1/accounts/%s/videos/%s' - % (account_id, video_id), - headers={'Accept': 'application/json;pk=%s' % policy_key}) - json_data = self._download_json(req, video_id) + api_url = 'https://edge.api.brightcove.com/playback/v1/accounts/%s/videos/%s' % (account_id, video_id) + try: + json_data = self._download_json(api_url, video_id, headers={ + 'Accept': 'application/json;pk=%s' % policy_key + }) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + json_data = self._parse_json(e.cause.read().decode(), video_id) + raise ExtractorError(json_data[0]['message'], expected=True) + raise - title = json_data['name'] + title = json_data['name'].strip() formats = [] for source in json_data.get('sources', []): + container = source.get('container') source_type = source.get('type') src = source.get('src') - if source_type == 'application/x-mpegURL': + if source_type == 'application/x-mpegURL' or container == 'M2TS': if not src: continue formats.extend(self._extract_m3u8_formats( - src, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False)) + src, video_id, 'mp4', m3u8_id='hls', fatal=False)) + elif source_type == 'application/dash+xml': + if not src: + continue + formats.extend(self._extract_mpd_formats(src, video_id, 'dash', fatal=False)) else: streaming_src = source.get('streaming_src') stream_name, app_name = source.get('stream_name'), source.get('app_name') @@ -479,15 +539,23 @@ class BrightcoveNewIE(InfoExtractor): continue tbr = float_or_none(source.get('avg_bitrate'), 1000) height = int_or_none(source.get('height')) + width = int_or_none(source.get('width')) f = { 'tbr': tbr, - 'width': int_or_none(source.get('width')), - 'height': height, 'filesize': int_or_none(source.get('size')), - 'container': source.get('container'), - 'vcodec': source.get('codec'), - 'ext': source.get('container').lower(), + 'container': container, + 'ext': container.lower(), } + if width == 0 and height == 0: + f.update({ + 'vcodec': 'none', + }) + else: + f.update({ + 'width': width, + 'height': height, + 'vcodec': source.get('codec'), + }) def build_format_id(kind): format_id = kind @@ -501,7 +569,7 @@ class BrightcoveNewIE(InfoExtractor): f.update({ 'url': src or streaming_src, 'format_id': build_format_id('http' if src else 'http-streaming'), - 'preference': 2 if src else 1, + 'source_preference': 0 if src else -1, }) else: f.update({ @@ -512,20 +580,22 @@ class BrightcoveNewIE(InfoExtractor): formats.append(f) self._sort_formats(formats) - description = json_data.get('description') - thumbnail = json_data.get('thumbnail') - timestamp = parse_iso8601(json_data.get('published_at')) - duration = float_or_none(json_data.get('duration'), 1000) - tags = json_data.get('tags', []) + subtitles = {} + for text_track in json_data.get('text_tracks', []): + if text_track.get('src'): + subtitles.setdefault(text_track.get('srclang'), []).append({ + 'url': text_track['src'], + }) return { 'id': video_id, 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'duration': duration, - 'timestamp': timestamp, + 'description': json_data.get('description'), + 'thumbnail': json_data.get('thumbnail') or json_data.get('poster'), + 'duration': float_or_none(json_data.get('duration'), 1000), + 'timestamp': parse_iso8601(json_data.get('published_at')), 'uploader_id': account_id, 'formats': formats, - 'tags': tags, + 'subtitles': subtitles, + 'tags': json_data.get('tags', []), } diff --git a/youtube_dl/extractor/c56.py b/youtube_dl/extractor/c56.py index cb96c3876..cac8fdcba 100644 --- a/youtube_dl/extractor/c56.py +++ b/youtube_dl/extractor/c56.py @@ -4,12 +4,13 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..utils import js_to_json class C56IE(InfoExtractor): _VALID_URL = r'https?://(?:(?:www|player)\.)?56\.com/(?:.+?/)?(?:v_|(?:play_album.+-))(?P.+?)\.(?:html|swf)' IE_NAME = '56.com' - _TEST = { + _TESTS = [{ 'url': 'http://www.56.com/u39/v_OTM0NDA3MTY.html', 'md5': 'e59995ac63d0457783ea05f93f12a866', 'info_dict': { @@ -18,12 +19,29 @@ class C56IE(InfoExtractor): 'title': '网事知多少 第32期:车怒', 'duration': 283.813, }, - } + }, { + 'url': 'http://www.56.com/u47/v_MTM5NjQ5ODc2.html', + 'md5': '', + 'info_dict': { + 'id': '82247482', + 'title': '爱的诅咒之杜鹃花开', + }, + 'playlist_count': 7, + 'add_ie': ['Sohu'], + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url, flags=re.VERBOSE) text_id = mobj.group('textid') + webpage = self._download_webpage(url, text_id) + sohu_video_info_str = self._search_regex( + r'var\s+sohuVideoInfo\s*=\s*({[^}]+});', webpage, 'Sohu video info', default=None) + if sohu_video_info_str: + sohu_video_info = self._parse_json( + sohu_video_info_str, text_id, transform_source=js_to_json) + return self.url_result(sohu_video_info['url'], 'Sohu') + page = self._download_json( 'http://vxml.56.com/json/%s/' % text_id, text_id, 'Downloading video info') diff --git a/youtube_dl/extractor/camdemy.py b/youtube_dl/extractor/camdemy.py index 897f3a104..6ffbeabd3 100644 --- a/youtube_dl/extractor/camdemy.py +++ b/youtube_dl/extractor/camdemy.py @@ -6,7 +6,7 @@ import re from .common import InfoExtractor from ..compat import ( - compat_urllib_parse, + compat_urllib_parse_urlencode, compat_urlparse, ) from ..utils import ( @@ -16,7 +16,7 @@ from ..utils import ( class CamdemyIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?camdemy\.com/media/(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?camdemy\.com/media/(?P\d+)' _TESTS = [{ # single file 'url': 'http://www.camdemy.com/media/5181/', @@ -104,7 +104,7 @@ class CamdemyIE(InfoExtractor): class CamdemyFolderIE(InfoExtractor): - _VALID_URL = r'http://www.camdemy.com/folder/(?P\d+)' + _VALID_URL = r'https?://www.camdemy.com/folder/(?P\d+)' _TESTS = [{ # links with trailing slash 'url': 'http://www.camdemy.com/folder/450', @@ -139,7 +139,7 @@ class CamdemyFolderIE(InfoExtractor): parsed_url = list(compat_urlparse.urlparse(url)) query = dict(compat_urlparse.parse_qsl(parsed_url[4])) query.update({'displayMode': 'list'}) - parsed_url[4] = compat_urllib_parse.urlencode(query) + parsed_url[4] = compat_urllib_parse_urlencode(query) final_url = compat_urlparse.urlunparse(parsed_url) page = self._download_webpage(final_url, folder_id) diff --git a/youtube_dl/extractor/camwithher.py b/youtube_dl/extractor/camwithher.py new file mode 100644 index 000000000..afbc5ea26 --- /dev/null +++ b/youtube_dl/extractor/camwithher.py @@ -0,0 +1,87 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_duration, + unified_strdate, +) + + +class CamWithHerIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?camwithher\.tv/view_video\.php\?.*\bviewkey=(?P\w+)' + + _TESTS = [{ + 'url': 'http://camwithher.tv/view_video.php?viewkey=6e9a24e2c0e842e1f177&page=&viewtype=&category=', + 'info_dict': { + 'id': '5644', + 'ext': 'flv', + 'title': 'Periscope Tease', + 'description': 'In the clouds teasing on periscope to my favorite song', + 'duration': 240, + 'view_count': int, + 'comment_count': int, + 'uploader': 'MileenaK', + 'upload_date': '20160322', + }, + 'params': { + 'skip_download': True, + } + }, { + 'url': 'http://camwithher.tv/view_video.php?viewkey=6dfd8b7c97531a459937', + 'only_matching': True, + }, { + 'url': 'http://camwithher.tv/view_video.php?page=&viewkey=6e9a24e2c0e842e1f177&viewtype=&category=', + 'only_matching': True, + }, { + 'url': 'http://camwithher.tv/view_video.php?viewkey=b6c3b5bea9515d1a1fc4&page=&viewtype=&category=mv', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + flv_id = self._html_search_regex( + r']+href=["\']/download/\?v=(\d+)', webpage, 'video id') + + # Video URL construction algorithm is reverse-engineered from cwhplayer.swf + rtmp_url = 'rtmp://camwithher.tv/clipshare/%s' % ( + ('mp4:%s.mp4' % flv_id) if int(flv_id) > 2010 else flv_id) + + title = self._html_search_regex( + r']+style="float:left"[^>]*>\s*

(.+?)

', webpage, 'title') + description = self._html_search_regex( + r'>Description:(.+?)', webpage, 'description', default=None) + + runtime = self._search_regex( + r'Runtime\s*:\s*(.+?) \|', webpage, 'duration', default=None) + if runtime: + runtime = re.sub(r'[\s-]', '', runtime) + duration = parse_duration(runtime) + view_count = int_or_none(self._search_regex( + r'Views\s*:\s*(\d+)', webpage, 'view count', default=None)) + comment_count = int_or_none(self._search_regex( + r'Comments\s*:\s*(\d+)', webpage, 'comment count', default=None)) + + uploader = self._search_regex( + r'Added by\s*:\s*]+>([^<]+)', webpage, 'uploader', default=None) + upload_date = unified_strdate(self._search_regex( + r'Added on\s*:\s*([\d-]+)', webpage, 'upload date', default=None)) + + return { + 'id': flv_id, + 'url': rtmp_url, + 'ext': 'flv', + 'no_resume': True, + 'title': title, + 'description': description, + 'duration': duration, + 'view_count': view_count, + 'comment_count': comment_count, + 'uploader': uploader, + 'upload_date': upload_date, + } diff --git a/youtube_dl/extractor/cbs.py b/youtube_dl/extractor/cbs.py index 40d07ab18..c621a08d5 100644 --- a/youtube_dl/extractor/cbs.py +++ b/youtube_dl/extractor/cbs.py @@ -1,24 +1,41 @@ from __future__ import unicode_literals -from .common import InfoExtractor +from .theplatform import ThePlatformIE from ..utils import ( - sanitized_Request, - smuggle_url, + xpath_text, + xpath_element, + int_or_none, + ExtractorError, + find_xpath_attr, ) -class CBSIE(InfoExtractor): +class CBSBaseIE(ThePlatformIE): + def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'): + closed_caption_e = find_xpath_attr(smil, self._xpath_ns('.//param', namespace), 'name', 'ClosedCaptionURL') + return { + 'en': [{ + 'ext': 'ttml', + 'url': closed_caption_e.attrib['value'], + }] + } if closed_caption_e is not None and closed_caption_e.attrib.get('value') else [] + + +class CBSIE(CBSBaseIE): _VALID_URL = r'https?://(?:www\.)?(?:cbs\.com/shows/[^/]+/(?:video|artist)|colbertlateshow\.com/(?:video|podcasts))/[^/]+/(?P[^/]+)' _TESTS = [{ 'url': 'http://www.cbs.com/shows/garth-brooks/video/_u7W953k6la293J7EPTd9oHkSPs6Xn6_/connect-chat-feat-garth-brooks/', 'info_dict': { - 'id': '4JUVEwq3wUT7', + 'id': '_u7W953k6la293J7EPTd9oHkSPs6Xn6_', 'display_id': 'connect-chat-feat-garth-brooks', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Connect Chat feat. Garth Brooks', 'description': 'Connect with country music singer Garth Brooks, as he chats with fans on Wednesday November 27, 2013. Be sure to tune in to Garth Brooks: Live from Las Vegas, Friday November 29, at 9/8c on CBS!', 'duration': 1495, + 'timestamp': 1385585425, + 'upload_date': '20131127', + 'uploader': 'CBSI-NEW', }, 'params': { # rtmp download @@ -47,22 +64,46 @@ class CBSIE(InfoExtractor): 'url': 'http://www.colbertlateshow.com/podcasts/dYSwjqPs_X1tvbV_P2FcPWRa_qT6akTC/in-the-bad-room-with-stephen/', 'only_matching': True, }] + TP_RELEASE_URL_TEMPLATE = 'http://link.theplatform.com/s/dJ5BDC/%s?manifest=m3u&mbr=true' def _real_extract(self, url): display_id = self._match_id(url) - request = sanitized_Request(url) - # Android UA is served with higher quality (720p) streams (see - # https://github.com/rg3/youtube-dl/issues/7490) - request.add_header('User-Agent', 'Mozilla/5.0 (Linux; Android 4.4; Nexus 5)') - webpage = self._download_webpage(request, display_id) - real_id = self._search_regex( - [r"video\.settings\.pid\s*=\s*'([^']+)';", r"cbsplayer\.pid\s*=\s*'([^']+)';"], - webpage, 'real video ID') - return { - '_type': 'url_transparent', - 'ie_key': 'ThePlatform', - 'url': smuggle_url( - 'http://link.theplatform.com/s/dJ5BDC/%s?mbr=true&manifest=m3u' % real_id, - {'force_smil_url': True}), + webpage = self._download_webpage(url, display_id) + content_id = self._search_regex( + [r"video\.settings\.content_id\s*=\s*'([^']+)';", r"cbsplayer\.contentId\s*=\s*'([^']+)';"], + webpage, 'content id') + items_data = self._download_xml( + 'http://can.cbs.com/thunder/player/videoPlayerService.php', + content_id, query={'partner': 'cbs', 'contentId': content_id}) + video_data = xpath_element(items_data, './/item') + title = xpath_text(video_data, 'videoTitle', 'title', True) + + subtitles = {} + formats = [] + for item in items_data.findall('.//item'): + pid = xpath_text(item, 'pid') + if not pid: + continue + try: + tp_formats, tp_subtitles = self._extract_theplatform_smil( + self.TP_RELEASE_URL_TEMPLATE % pid, content_id, 'Downloading %s SMIL data' % pid) + except ExtractorError: + continue + formats.extend(tp_formats) + subtitles = self._merge_subtitles(subtitles, tp_subtitles) + self._sort_formats(formats) + + info = self.get_metadata('dJ5BDC/media/guid/2198311517/%s' % content_id, content_id) + info.update({ + 'id': content_id, 'display_id': display_id, - } + 'title': title, + 'series': xpath_text(video_data, 'seriesTitle'), + 'season_number': int_or_none(xpath_text(video_data, 'seasonNumber')), + 'episode_number': int_or_none(xpath_text(video_data, 'episodeNumber')), + 'duration': int_or_none(xpath_text(video_data, 'videoLength'), 1000), + 'thumbnail': xpath_text(video_data, 'previewImageURL'), + 'formats': formats, + 'subtitles': subtitles, + }) + return info diff --git a/youtube_dl/extractor/cnet.py b/youtube_dl/extractor/cbsinteractive.py similarity index 59% rename from youtube_dl/extractor/cnet.py rename to youtube_dl/extractor/cbsinteractive.py index 5c3908f72..0011c3029 100644 --- a/youtube_dl/extractor/cnet.py +++ b/youtube_dl/extractor/cbsinteractive.py @@ -1,12 +1,14 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .theplatform import ThePlatformIE from ..utils import int_or_none -class CNETIE(ThePlatformIE): - _VALID_URL = r'https?://(?:www\.)?cnet\.com/videos/(?P[^/]+)/' +class CBSInteractiveIE(ThePlatformIE): + _VALID_URL = r'https?://(?:www\.)?(?Pcnet|zdnet)\.com/(?:videos|video/share)/(?P[^/?]+)' _TESTS = [{ 'url': 'http://www.cnet.com/videos/hands-on-with-microsofts-windows-8-1-update/', 'info_dict': { @@ -17,6 +19,8 @@ class CNETIE(ThePlatformIE): 'uploader_id': '6085384d-619e-11e3-b231-14feb5ca9861', 'uploader': 'Sarah Mitroff', 'duration': 70, + 'timestamp': 1396479627, + 'upload_date': '20140402', }, }, { 'url': 'http://www.cnet.com/videos/whiny-pothole-tweets-at-local-government-when-hit-by-cars-tomorrow-daily-187/', @@ -28,15 +32,38 @@ class CNETIE(ThePlatformIE): 'uploader_id': 'b163284d-6b73-44fc-b3e6-3da66c392d40', 'uploader': 'Ashley Esqueda', 'duration': 1482, + 'timestamp': 1433289889, + 'upload_date': '20150603', }, + }, { + 'url': 'http://www.zdnet.com/video/share/video-keeping-android-smartphones-and-tablets-secure/', + 'info_dict': { + 'id': 'bc1af9f0-a2b5-4e54-880d-0d95525781c0', + 'ext': 'mp4', + 'title': 'Video: Keeping Android smartphones and tablets secure', + 'description': 'Here\'s the best way to keep Android devices secure, and what you do when they\'ve come to the end of their lives.', + 'uploader_id': 'f2d97ea2-8175-11e2-9d12-0018fe8a00b0', + 'uploader': 'Adrian Kingsley-Hughes', + 'timestamp': 1448961720, + 'upload_date': '20151201', + }, + 'params': { + # m3u8 download + 'skip_download': True, + } }] + TP_RELEASE_URL_TEMPLATE = 'http://link.theplatform.com/s/kYEXFC/%s?mbr=true' + MPX_ACCOUNTS = { + 'cnet': 2288573011, + 'zdnet': 2387448114, + } def _real_extract(self, url): - display_id = self._match_id(url) + site, display_id = re.match(self._VALID_URL, url).groups() webpage = self._download_webpage(url, display_id) data_json = self._html_search_regex( - r"data-cnet-video(?:-uvp)?-options='([^']+)'", + r"data-(?:cnet|zdnet)-video(?:-uvp)?-options='([^']+)'", webpage, 'data json') data = self._parse_json(data_json, display_id) vdata = data.get('video') or data['videos'][0] @@ -51,18 +78,15 @@ class CNETIE(ThePlatformIE): uploader = None uploader_id = None - mpx_account = data['config']['uvpConfig']['default']['mpx_account'] - - metadata = self.get_metadata('%s/%s' % (mpx_account, list(vdata['files'].values())[0]), video_id) - description = vdata.get('description') or metadata.get('description') - duration = int_or_none(vdata.get('duration')) or metadata.get('duration') - - formats = [] - subtitles = {} + media_guid_path = 'media/guid/%d/%s' % (self.MPX_ACCOUNTS[site], vdata['mpxRefId']) + formats, subtitles = [], {} + if site == 'cnet': + formats, subtitles = self._extract_theplatform_smil( + self.TP_RELEASE_URL_TEMPLATE % media_guid_path, video_id) for (fkey, vid) in vdata['files'].items(): if fkey == 'hls_phone' and 'hls_tablet' in vdata['files']: continue - release_url = 'http://link.theplatform.com/s/%s/%s?format=SMIL&mbr=true' % (mpx_account, vid) + release_url = self.TP_RELEASE_URL_TEMPLATE % vid if fkey == 'hds': release_url += '&manifest=f4m' tp_formats, tp_subtitles = self._extract_theplatform_smil(release_url, video_id, 'Downloading %s SMIL data' % fkey) @@ -70,15 +94,15 @@ class CNETIE(ThePlatformIE): subtitles = self._merge_subtitles(subtitles, tp_subtitles) self._sort_formats(formats) - return { + info = self.get_metadata('kYEXFC/%s' % media_guid_path, video_id) + info.update({ 'id': video_id, 'display_id': display_id, 'title': title, - 'description': description, - 'thumbnail': metadata.get('thumbnail'), - 'duration': duration, + 'duration': int_or_none(vdata.get('duration')), 'uploader': uploader, 'uploader_id': uploader_id, 'subtitles': subtitles, 'formats': formats, - } + }) + return info diff --git a/youtube_dl/extractor/cbsnews.py b/youtube_dl/extractor/cbsnews.py index 7319ee1b7..79ddc20a0 100644 --- a/youtube_dl/extractor/cbsnews.py +++ b/youtube_dl/extractor/cbsnews.py @@ -2,16 +2,15 @@ from __future__ import unicode_literals from .common import InfoExtractor -from .theplatform import ThePlatformIE +from .cbs import CBSBaseIE from ..utils import ( parse_duration, - find_xpath_attr, ) -class CBSNewsIE(ThePlatformIE): +class CBSNewsIE(CBSBaseIE): IE_DESC = 'CBS News' - _VALID_URL = r'http://(?:www\.)?cbsnews\.com/(?:news|videos)/(?P[\da-z_-]+)' + _VALID_URL = r'https?://(?:www\.)?cbsnews\.com/(?:news|videos)/(?P[\da-z_-]+)' _TESTS = [ { @@ -49,15 +48,6 @@ class CBSNewsIE(ThePlatformIE): }, ] - def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'): - closed_caption_e = find_xpath_attr(smil, self._xpath_ns('.//param', namespace), 'name', 'ClosedCaptionURL') - return { - 'en': [{ - 'ext': 'ttml', - 'url': closed_caption_e.attrib['value'], - }] - } if closed_caption_e is not None and closed_caption_e.attrib.get('value') else [] - def _real_extract(self, url): video_id = self._match_id(url) @@ -78,7 +68,7 @@ class CBSNewsIE(ThePlatformIE): pid = item.get('media' + format_id) if not pid: continue - release_url = 'http://link.theplatform.com/s/dJ5BDC/%s?format=SMIL&mbr=true' % pid + release_url = 'http://link.theplatform.com/s/dJ5BDC/%s?mbr=true' % pid tp_formats, tp_subtitles = self._extract_theplatform_smil(release_url, video_id, 'Downloading %s SMIL data' % pid) formats.extend(tp_formats) subtitles = self._merge_subtitles(subtitles, tp_subtitles) @@ -96,7 +86,7 @@ class CBSNewsIE(ThePlatformIE): class CBSNewsLiveVideoIE(InfoExtractor): IE_DESC = 'CBS News Live Videos' - _VALID_URL = r'http://(?:www\.)?cbsnews\.com/live/video/(?P[\da-z_-]+)' + _VALID_URL = r'https?://(?:www\.)?cbsnews\.com/live/video/(?P[\da-z_-]+)' _TEST = { 'url': 'http://www.cbsnews.com/live/video/clinton-sanders-prepare-to-face-off-in-nh/', @@ -122,6 +112,7 @@ class CBSNewsLiveVideoIE(InfoExtractor): for entry in f4m_formats: # URLs without the extra param induce an 404 error entry.update({'extra_param_to_segment_url': hdcore_sign}) + self._sort_formats(f4m_formats) return { 'id': video_id, diff --git a/youtube_dl/extractor/cbssports.py b/youtube_dl/extractor/cbssports.py index ae47e74cc..549ae32f3 100644 --- a/youtube_dl/extractor/cbssports.py +++ b/youtube_dl/extractor/cbssports.py @@ -6,7 +6,7 @@ from .common import InfoExtractor class CBSSportsIE(InfoExtractor): - _VALID_URL = r'http://www\.cbssports\.com/video/player/(?P
[^/]+)/(?P[^/]+)' + _VALID_URL = r'https?://www\.cbssports\.com/video/player/(?P
[^/]+)/(?P[^/]+)' _TEST = { 'url': 'http://www.cbssports.com/video/player/tennis/318462531970/0/us-open-flashbacks-1990s', diff --git a/youtube_dl/extractor/cda.py b/youtube_dl/extractor/cda.py new file mode 100755 index 000000000..498d2c0d8 --- /dev/null +++ b/youtube_dl/extractor/cda.py @@ -0,0 +1,96 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + decode_packed_codes, + ExtractorError, + parse_duration +) + + +class CDAIE(InfoExtractor): + _VALID_URL = r'https?://(?:(?:www\.)?cda\.pl/video|ebd\.cda\.pl/[0-9]+x[0-9]+)/(?P[0-9a-z]+)' + _TESTS = [{ + 'url': 'http://www.cda.pl/video/5749950c', + 'md5': '6f844bf51b15f31fae165365707ae970', + 'info_dict': { + 'id': '5749950c', + 'ext': 'mp4', + 'height': 720, + 'title': 'Oto dlaczego przed zakrętem należy zwolnić.', + 'duration': 39 + } + }, { + 'url': 'http://www.cda.pl/video/57413289', + 'md5': 'a88828770a8310fc00be6c95faf7f4d5', + 'info_dict': { + 'id': '57413289', + 'ext': 'mp4', + 'title': 'Lądowanie na lotnisku na Maderze', + 'duration': 137 + } + }, { + 'url': 'http://ebd.cda.pl/0x0/5749950c', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage('http://ebd.cda.pl/0x0/' + video_id, video_id) + + if 'Ten film jest dostępny dla użytkowników premium' in webpage: + raise ExtractorError('This video is only available for premium users.', expected=True) + + title = self._html_search_regex(r'(.+?)', webpage, 'title') + + formats = [] + + info_dict = { + 'id': video_id, + 'title': title, + 'formats': formats, + 'duration': None, + } + + def extract_format(page, version): + unpacked = decode_packed_codes(page) + format_url = self._search_regex( + r"url:\\'(.+?)\\'", unpacked, '%s url' % version, fatal=False) + if not format_url: + return + f = { + 'url': format_url, + } + m = re.search( + r']+data-quality="(?P[^"]+)"[^>]+href="[^"]+"[^>]+class="[^"]*quality-btn-active[^"]*">(?P[0-9]+)p', + page) + if m: + f.update({ + 'format_id': m.group('format_id'), + 'height': int(m.group('height')), + }) + info_dict['formats'].append(f) + if not info_dict['duration']: + info_dict['duration'] = parse_duration(self._search_regex( + r"duration:\\'(.+?)\\'", unpacked, 'duration', fatal=False)) + + extract_format(webpage, 'default') + + for href, resolution in re.findall( + r']+data-quality="[^"]+"[^>]+href="([^"]+)"[^>]+class="quality-btn"[^>]*>([0-9]+p)', + webpage): + webpage = self._download_webpage( + href, video_id, 'Downloading %s version information' % resolution, fatal=False) + if not webpage: + # Manually report warning because empty page is returned when + # invalid version is requested. + self.report_warning('Unable to download %s version information' % resolution) + continue + extract_format(webpage, resolution) + + self._sort_formats(formats) + + return info_dict diff --git a/youtube_dl/extractor/ceskatelevize.py b/youtube_dl/extractor/ceskatelevize.py index b27b4e670..6652c8e42 100644 --- a/youtube_dl/extractor/ceskatelevize.py +++ b/youtube_dl/extractor/ceskatelevize.py @@ -5,7 +5,6 @@ import re from .common import InfoExtractor from ..compat import ( - compat_urllib_parse, compat_urllib_parse_unquote, compat_urllib_parse_urlparse, ) @@ -13,6 +12,7 @@ from ..utils import ( ExtractorError, float_or_none, sanitized_Request, + urlencode_postdata, ) @@ -102,7 +102,7 @@ class CeskaTelevizeIE(InfoExtractor): req = sanitized_Request( 'http://www.ceskatelevize.cz/ivysilani/ajax/get-client-playlist', - data=compat_urllib_parse.urlencode(data)) + data=urlencode_postdata(data)) req.add_header('Content-type', 'application/x-www-form-urlencoded') req.add_header('x-addr', '127.0.0.1') @@ -129,7 +129,8 @@ class CeskaTelevizeIE(InfoExtractor): formats = [] for format_id, stream_url in item['streamUrls'].items(): formats.extend(self._extract_m3u8_formats( - stream_url, playlist_id, 'mp4', entry_protocol='m3u8_native')) + stream_url, playlist_id, 'mp4', + entry_protocol='m3u8_native', fatal=False)) self._sort_formats(formats) item_id = item.get('id') or item['assetId'] diff --git a/youtube_dl/extractor/chaturbate.py b/youtube_dl/extractor/chaturbate.py index 242fba311..b2234549e 100644 --- a/youtube_dl/extractor/chaturbate.py +++ b/youtube_dl/extractor/chaturbate.py @@ -48,6 +48,7 @@ class ChaturbateIE(InfoExtractor): raise ExtractorError('Unable to find stream URL') formats = self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4') + self._sort_formats(formats) return { 'id': video_id, diff --git a/youtube_dl/extractor/cinemassacre.py b/youtube_dl/extractor/cinemassacre.py index 6d9cd8abd..042c4f2f1 100644 --- a/youtube_dl/extractor/cinemassacre.py +++ b/youtube_dl/extractor/cinemassacre.py @@ -21,6 +21,10 @@ class CinemassacreIE(InfoExtractor): 'title': '“Angry Video Game Nerd: The Movie” – Trailer', 'description': 'md5:fb87405fcb42a331742a0dce2708560b', }, + 'params': { + # m3u8 download + 'skip_download': True, + }, }, { 'url': 'http://cinemassacre.com/2013/10/02/the-mummys-hand-1940', @@ -31,14 +35,18 @@ class CinemassacreIE(InfoExtractor): 'upload_date': '20131002', 'title': 'The Mummy’s Hand (1940)', }, + 'params': { + # m3u8 download + 'skip_download': True, + }, }, { # Youtube embedded video 'url': 'http://cinemassacre.com/2006/12/07/chronologically-confused-about-bad-movie-and-video-game-sequel-titles/', - 'md5': 'df4cf8a1dcedaec79a73d96d83b99023', + 'md5': 'ec9838a5520ef5409b3e4e42fcb0a3b9', 'info_dict': { 'id': 'OEVzPCY2T-g', - 'ext': 'mp4', + 'ext': 'webm', 'title': 'AVGN: Chronologically Confused about Bad Movie and Video Game Sequel Titles', 'upload_date': '20061207', 'uploader': 'Cinemassacre', @@ -49,12 +57,12 @@ class CinemassacreIE(InfoExtractor): { # Youtube embedded video 'url': 'http://cinemassacre.com/2006/09/01/mckids/', - 'md5': '6eb30961fa795fedc750eac4881ad2e1', + 'md5': '7393c4e0f54602ad110c793eb7a6513a', 'info_dict': { 'id': 'FnxsNhuikpo', - 'ext': 'mp4', + 'ext': 'webm', 'upload_date': '20060901', - 'uploader': 'Cinemassacre Extras', + 'uploader': 'Cinemassacre Extra', 'description': 'md5:de9b751efa9e45fbaafd9c8a1123ed53', 'uploader_id': 'Cinemassacre', 'title': 'AVGN: McKids', @@ -69,7 +77,11 @@ class CinemassacreIE(InfoExtractor): 'description': 'Let’s Play Mario Kart 64 !! Mario Kart 64 is a classic go-kart racing game released for the Nintendo 64 (N64). Today James & Mike do 4 player Battle Mode with Kyle and Bootsy!', 'title': 'Mario Kart 64 (Nintendo 64) James & Mike Mondays', 'upload_date': '20150525', - } + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, } ] diff --git a/youtube_dl/extractor/cliphunter.py b/youtube_dl/extractor/cliphunter.py index 2996b6b09..19f8b397e 100644 --- a/youtube_dl/extractor/cliphunter.py +++ b/youtube_dl/extractor/cliphunter.py @@ -19,7 +19,7 @@ def _decode(s): class CliphunterIE(InfoExtractor): IE_NAME = 'cliphunter' - _VALID_URL = r'''(?x)http://(?:www\.)?cliphunter\.com/w/ + _VALID_URL = r'''(?x)https?://(?:www\.)?cliphunter\.com/w/ (?P[0-9]+)/ (?P.+?)(?:$|[#\?]) ''' diff --git a/youtube_dl/extractor/cliprs.py b/youtube_dl/extractor/cliprs.py new file mode 100644 index 000000000..4f9320ea5 --- /dev/null +++ b/youtube_dl/extractor/cliprs.py @@ -0,0 +1,90 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + float_or_none, + int_or_none, + parse_iso8601, +) + + +class ClipRsIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?clip\.rs/(?P[^/]+)/\d+' + _TEST = { + 'url': 'http://www.clip.rs/premijera-frajle-predstavljaju-novi-spot-za-pesmu-moli-me-moli/3732', + 'md5': 'c412d57815ba07b56f9edc7b5d6a14e5', + 'info_dict': { + 'id': '1488842.1399140381', + 'ext': 'mp4', + 'title': 'PREMIJERA Frajle predstavljaju novi spot za pesmu Moli me, moli', + 'description': 'md5:56ce2c3b4ab31c5a2e0b17cb9a453026', + 'duration': 229, + 'timestamp': 1459850243, + 'upload_date': '20160405', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + video_id = self._search_regex( + r'id=(["\'])mvp:(?P.+?)\1', webpage, 'mvp id', group='id') + + response = self._download_json( + 'http://qi.ckm.onetapi.pl/', video_id, + query={ + 'body[id]': video_id, + 'body[jsonrpc]': '2.0', + 'body[method]': 'get_asset_detail', + 'body[params][ID_Publikacji]': video_id, + 'body[params][Service]': 'www.onet.pl', + 'content-type': 'application/jsonp', + 'x-onet-app': 'player.front.onetapi.pl', + }) + + error = response.get('error') + if error: + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, error['message']), expected=True) + + video = response['result'].get('0') + + formats = [] + for _, formats_dict in video['formats'].items(): + if not isinstance(formats_dict, dict): + continue + for format_id, format_list in formats_dict.items(): + if not isinstance(format_list, list): + continue + for f in format_list: + if not f.get('url'): + continue + formats.append({ + 'url': f['url'], + 'format_id': format_id, + 'height': int_or_none(f.get('vertical_resolution')), + 'width': int_or_none(f.get('horizontal_resolution')), + 'abr': float_or_none(f.get('audio_bitrate')), + 'vbr': float_or_none(f.get('video_bitrate')), + }) + self._sort_formats(formats) + + meta = video.get('meta', {}) + + title = self._og_search_title(webpage, default=None) or meta['title'] + description = self._og_search_description(webpage, default=None) or meta.get('description') + duration = meta.get('length') or meta.get('lenght') + timestamp = parse_iso8601(meta.get('addDate'), ' ') + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'duration': duration, + 'timestamp': timestamp, + 'formats': formats, + } diff --git a/youtube_dl/extractor/clipsyndicate.py b/youtube_dl/extractor/clipsyndicate.py index 8306d6fb7..0b6ad895f 100644 --- a/youtube_dl/extractor/clipsyndicate.py +++ b/youtube_dl/extractor/clipsyndicate.py @@ -8,7 +8,7 @@ from ..utils import ( class ClipsyndicateIE(InfoExtractor): - _VALID_URL = r'http://(?:chic|www)\.clipsyndicate\.com/video/play(list/\d+)?/(?P\d+)' + _VALID_URL = r'https?://(?:chic|www)\.clipsyndicate\.com/video/play(list/\d+)?/(?P\d+)' _TESTS = [{ 'url': 'http://www.clipsyndicate.com/video/play/4629301/brick_briscoe', diff --git a/youtube_dl/extractor/cloudy.py b/youtube_dl/extractor/cloudy.py index 0fa720ee8..9e267e6c0 100644 --- a/youtube_dl/extractor/cloudy.py +++ b/youtube_dl/extractor/cloudy.py @@ -6,7 +6,7 @@ import re from .common import InfoExtractor from ..compat import ( compat_parse_qs, - compat_urllib_parse, + compat_urllib_parse_urlencode, compat_HTTPError, ) from ..utils import ( @@ -64,7 +64,7 @@ class CloudyIE(InfoExtractor): 'errorUrl': error_url, }) - data_url = self._API_URL % (video_host, compat_urllib_parse.urlencode(form)) + data_url = self._API_URL % (video_host, compat_urllib_parse_urlencode(form)) player_data = self._download_webpage( data_url, video_id, 'Downloading player data') data = compat_parse_qs(player_data) diff --git a/youtube_dl/extractor/clubic.py b/youtube_dl/extractor/clubic.py index 1dfa7c12e..2fba93543 100644 --- a/youtube_dl/extractor/clubic.py +++ b/youtube_dl/extractor/clubic.py @@ -12,7 +12,7 @@ from ..utils import ( class ClubicIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?clubic\.com/video/(?:[^/]+/)*video.*-(?P[0-9]+)\.html' + _VALID_URL = r'https?://(?:www\.)?clubic\.com/video/(?:[^/]+/)*video.*-(?P[0-9]+)\.html' _TESTS = [{ 'url': 'http://www.clubic.com/video/clubic-week/video-clubic-week-2-0-le-fbi-se-lance-dans-la-photo-d-identite-448474.html', diff --git a/youtube_dl/extractor/cnbc.py b/youtube_dl/extractor/cnbc.py new file mode 100644 index 000000000..d354d9f95 --- /dev/null +++ b/youtube_dl/extractor/cnbc.py @@ -0,0 +1,36 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import smuggle_url + + +class CNBCIE(InfoExtractor): + _VALID_URL = r'https?://video\.cnbc\.com/gallery/\?video=(?P[0-9]+)' + _TEST = { + 'url': 'http://video.cnbc.com/gallery/?video=3000503714', + 'info_dict': { + 'id': '3000503714', + 'ext': 'mp4', + 'title': 'Fighting zombies is big business', + 'description': 'md5:0c100d8e1a7947bd2feec9a5550e519e', + 'timestamp': 1459332000, + 'upload_date': '20160330', + 'uploader': 'NBCU-CNBC', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + return { + '_type': 'url_transparent', + 'ie_key': 'ThePlatform', + 'url': smuggle_url( + 'http://link.theplatform.com/s/gZWlPC/media/guid/2408950221/%s?mbr=true&manifest=m3u' % video_id, + {'force_smil_url': True}), + 'id': video_id, + } diff --git a/youtube_dl/extractor/comcarcoff.py b/youtube_dl/extractor/comcarcoff.py index 7dff68492..747c245c8 100644 --- a/youtube_dl/extractor/comcarcoff.py +++ b/youtube_dl/extractor/comcarcoff.py @@ -11,7 +11,7 @@ from ..utils import ( class ComCarCoffIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?comediansincarsgettingcoffee\.com/(?P[a-z0-9\-]*)' + _VALID_URL = r'https?://(?:www\.)?comediansincarsgettingcoffee\.com/(?P[a-z0-9\-]*)' _TESTS = [{ 'url': 'http://comediansincarsgettingcoffee.com/miranda-sings-happy-thanksgiving-miranda/', 'info_dict': { @@ -41,7 +41,13 @@ class ComCarCoffIE(InfoExtractor): display_id = full_data['activeVideo']['video'] video_data = full_data.get('videos', {}).get(display_id) or full_data['singleshots'][display_id] + video_id = compat_str(video_data['mediaId']) + title = video_data['title'] + formats = self._extract_m3u8_formats( + video_data['mediaUrl'], video_id, 'mp4') + self._sort_formats(formats) + thumbnails = [{ 'url': video_data['images']['thumb'], }, { @@ -54,15 +60,14 @@ class ComCarCoffIE(InfoExtractor): video_data.get('duration')) return { - '_type': 'url_transparent', - 'url': 'crackle:%s' % video_id, 'id': video_id, 'display_id': display_id, - 'title': video_data['title'], + 'title': title, 'description': video_data.get('description'), 'timestamp': timestamp, 'duration': duration, 'thumbnails': thumbnails, + 'formats': formats, 'season_number': int_or_none(video_data.get('season')), 'episode_number': int_or_none(video_data.get('episode')), 'webpage_url': 'http://comediansincarsgettingcoffee.com/%s' % (video_data.get('urlSlug', video_data.get('slug'))), diff --git a/youtube_dl/extractor/comedycentral.py b/youtube_dl/extractor/comedycentral.py index 5b1b99675..0c59102e0 100644 --- a/youtube_dl/extractor/comedycentral.py +++ b/youtube_dl/extractor/comedycentral.py @@ -5,7 +5,7 @@ import re from .mtv import MTVServicesInfoExtractor from ..compat import ( compat_str, - compat_urllib_parse, + compat_urllib_parse_urlencode, ) from ..utils import ( ExtractorError, @@ -201,7 +201,7 @@ class ComedyCentralShowsIE(MTVServicesInfoExtractor): # Correct cc.com in uri uri = re.sub(r'(episode:[^.]+)(\.cc)?\.com', r'\1.com', uri) - index_url = 'http://%s.cc.com/feeds/mrss?%s' % (show_name, compat_urllib_parse.urlencode({'uri': uri})) + index_url = 'http://%s.cc.com/feeds/mrss?%s' % (show_name, compat_urllib_parse_urlencode({'uri': uri})) idoc = self._download_xml( index_url, epTitle, 'Downloading show index', 'Unable to download episode index') diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 14f575635..5269059d0 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -15,14 +15,17 @@ import math from ..compat import ( compat_cookiejar, compat_cookies, + compat_etree_fromstring, compat_getpass, compat_http_client, - compat_urllib_error, - compat_urllib_parse, - compat_urlparse, + compat_os_name, compat_str, - compat_etree_fromstring, + compat_urllib_error, + compat_urllib_parse_urlencode, + compat_urllib_request, + compat_urlparse, ) +from ..downloader.f4m import remove_encrypted_media from ..utils import ( NO_DEFAULT, age_restricted, @@ -47,6 +50,8 @@ from ..utils import ( determine_protocol, parse_duration, mimetype2ext, + update_Request, + update_url_query, ) @@ -104,7 +109,7 @@ class InfoExtractor(object): * protocol The protocol that will be used for the actual download, lower-case. "http", "https", "rtsp", "rtmp", "rtmpe", - "m3u8", or "m3u8_native". + "m3u8", "m3u8_native" or "http_dash_segments". * preference Order number of this format. If this field is present and not None, the formats get sorted by this field, regardless of all other values. @@ -157,12 +162,14 @@ class InfoExtractor(object): thumbnail: Full URL to a video thumbnail image. description: Full video description. uploader: Full name of the video uploader. + license: License name the video is licensed under. creator: The main artist who created the video. release_date: The date (YYYYMMDD) when the video was released. timestamp: UNIX timestamp of the moment the video became available. upload_date: Video upload date (YYYYMMDD). If not explicitly set, calculated from timestamp. uploader_id: Nickname or id of the video uploader. + uploader_url: Full URL to a personal webpage of the video uploader. location: Physical location where the video was filmed. subtitles: The available subtitles as a dictionary in the format {language: subformats}. "subformats" is a list sorted from @@ -225,6 +232,24 @@ class InfoExtractor(object): episode_number: Number of the video episode within a season, as an integer. episode_id: Id of the video episode, as a unicode string. + The following fields should only be used when the media is a track or a part of + a music album: + + track: Title of the track. + track_number: Number of the track within an album or a disc, as an integer. + track_id: Id of the track (useful in case of custom indexing, e.g. 6.iii), + as a unicode string. + artist: Artist(s) of the track. + genre: Genre(s) of the track. + album: Title of the album the track belongs to. + album_type: Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc). + album_artist: List of all artists appeared on the album (e.g. + "Ash Borer / Fell Voices" or "Various Artists", useful for splits + and compilations). + disc_number: Number of the disc or other physical medium the track belongs to, + as an integer. + release_year: Year (YYYY) when the album was released. + Unless mentioned otherwise, the fields should be Unicode strings. Unless mentioned otherwise, None is equivalent to absence of information. @@ -342,7 +367,7 @@ class InfoExtractor(object): def IE_NAME(self): return compat_str(type(self).__name__[:-2]) - def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True): + def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}): """ Returns the response handle """ if note is None: self.report_download_webpage(video_id) @@ -351,6 +376,14 @@ class InfoExtractor(object): self.to_screen('%s' % (note,)) else: self.to_screen('%s: %s' % (video_id, note)) + if isinstance(url_or_request, compat_urllib_request.Request): + url_or_request = update_Request( + url_or_request, data=data, headers=headers, query=query) + else: + if query: + url_or_request = update_url_query(url_or_request, query) + if data or headers: + url_or_request = sanitized_Request(url_or_request, data, headers) try: return self._downloader.urlopen(url_or_request) except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: @@ -366,13 +399,13 @@ class InfoExtractor(object): self._downloader.report_warning(errmsg) return False - def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None): + def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}): """ Returns a tuple (page content as string, URL handle) """ # Strip hashes from the URL (#1038) if isinstance(url_or_request, (compat_str, str)): url_or_request = url_or_request.partition('#')[0] - urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal) + urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query) if urlh is False: assert not fatal return False @@ -425,7 +458,7 @@ class InfoExtractor(object): self.to_screen('Saving request to ' + filename) # Working around MAX_PATH limitation on Windows (see # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx) - if os.name == 'nt': + if compat_os_name == 'nt': absfilepath = os.path.abspath(filename) if len(absfilepath) > 259: filename = '\\\\?\\' + absfilepath @@ -459,13 +492,13 @@ class InfoExtractor(object): return content - def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None): + def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None, data=None, headers={}, query={}): """ Returns the data of the page as a string """ success = False try_count = 0 while success is False: try: - res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding) + res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding, data=data, headers=headers, query=query) success = True except compat_http_client.IncompleteRead as e: try_count += 1 @@ -480,10 +513,10 @@ class InfoExtractor(object): def _download_xml(self, url_or_request, video_id, note='Downloading XML', errnote='Unable to download XML', - transform_source=None, fatal=True, encoding=None): + transform_source=None, fatal=True, encoding=None, data=None, headers={}, query={}): """Return the xml as an xml.etree.ElementTree.Element""" xml_string = self._download_webpage( - url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding) + url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding, data=data, headers=headers, query=query) if xml_string is False: return xml_string if transform_source: @@ -494,10 +527,10 @@ class InfoExtractor(object): note='Downloading JSON metadata', errnote='Unable to download JSON metadata', transform_source=None, - fatal=True, encoding=None): + fatal=True, encoding=None, data=None, headers={}, query={}): json_string = self._download_webpage( url_or_request, video_id, note, errnote, fatal=fatal, - encoding=encoding) + encoding=encoding, data=data, headers=headers, query=query) if (not fatal) and json_string is False: return None return self._parse_json( @@ -594,7 +627,7 @@ class InfoExtractor(object): if mobj: break - if not self._downloader.params.get('no_color') and os.name != 'nt' and sys.stderr.isatty(): + if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty(): _name = '\033[0;34m%s\033[0m' % name else: _name = name @@ -809,7 +842,7 @@ class InfoExtractor(object): for input in re.findall(r'(?i)]+)>', html): if not re.search(r'type=(["\'])(?:hidden|submit)\1', input): continue - name = re.search(r'name=(["\'])(?P.+?)\1', input) + name = re.search(r'(?:name|id)=(["\'])(?P.+?)\1', input) if not name: continue value = re.search(r'value=(["\'])(?P.*?)\1', input) @@ -852,6 +885,7 @@ class InfoExtractor(object): proto_preference = 0 if determine_protocol(f) in ['http', 'https'] else -0.1 if f.get('vcodec') == 'none': # audio only + preference -= 50 if self._downloader.params.get('prefer_free_formats'): ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus'] else: @@ -862,6 +896,8 @@ class InfoExtractor(object): except ValueError: audio_ext_preference = -1 else: + if f.get('acodec') == 'none': # video only + preference -= 40 if self._downloader.params.get('prefer_free_formats'): ORDER = ['flv', 'mp4', 'webm'] else: @@ -963,12 +999,24 @@ class InfoExtractor(object): if manifest is False: return [] + return self._parse_f4m_formats( + manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id, + transform_source=transform_source, fatal=fatal) + + def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None, + transform_source=lambda s: fix_xml_ampersands(s).strip(), + fatal=True): formats = [] manifest_version = '1.0' media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media') if not media_nodes: manifest_version = '2.0' media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media') + # Remove unsupported DRM protected media from final formats + # rendition (see https://github.com/rg3/youtube-dl/issues/8573). + media_nodes = remove_encrypted_media(media_nodes) + if not media_nodes: + return formats base_url = xpath_text( manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'], 'base URL', default=None) @@ -988,7 +1036,8 @@ class InfoExtractor(object): # bitrate in f4m downloader if determine_ext(manifest_url) == 'f4m': formats.extend(self._extract_f4m_formats( - manifest_url, video_id, preference, f4m_id, fatal=fatal)) + manifest_url, video_id, preference=preference, f4m_id=f4m_id, + transform_source=transform_source, fatal=fatal)) continue tbr = int_or_none(media_el.attrib.get('bitrate')) formats.append({ @@ -1000,8 +1049,6 @@ class InfoExtractor(object): 'height': int_or_none(media_el.attrib.get('height')), 'preference': preference, }) - self._sort_formats(formats) - return formats def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None, @@ -1033,11 +1080,21 @@ class InfoExtractor(object): return [] m3u8_doc, urlh = res m3u8_url = urlh.geturl() - # A Media Playlist Tag MUST NOT appear in a Master Playlist - # https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3 - # The EXT-X-TARGETDURATION tag is REQUIRED for every M3U8 Media Playlists - # https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1 - if '#EXT-X-TARGETDURATION' in m3u8_doc: + + # We should try extracting formats only from master playlists [1], i.e. + # playlists that describe available qualities. On the other hand media + # playlists [2] should be returned as is since they contain just the media + # without qualities renditions. + # Fortunately, master playlist can be easily distinguished from media + # playlist based on particular tags availability. As of [1, 2] master + # playlist tags MUST NOT appear in a media playist and vice versa. + # As of [3] #EXT-X-TARGETDURATION tag is REQUIRED for every media playlist + # and MUST NOT appear in master playlist thus we can clearly detect media + # playlist with this criterion. + # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.4 + # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3 + # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1 + if '#EXT-X-TARGETDURATION' in m3u8_doc: # media playlist, return as is return [{ 'url': m3u8_url, 'format_id': m3u8_id, @@ -1084,25 +1141,34 @@ class InfoExtractor(object): 'protocol': entry_protocol, 'preference': preference, } - codecs = last_info.get('CODECS') - if codecs: - # TODO: looks like video codec is not always necessarily goes first - va_codecs = codecs.split(',') - if va_codecs[0]: - f['vcodec'] = va_codecs[0] - if len(va_codecs) > 1 and va_codecs[1]: - f['acodec'] = va_codecs[1] resolution = last_info.get('RESOLUTION') if resolution: width_str, height_str = resolution.split('x') f['width'] = int(width_str) f['height'] = int(height_str) + codecs = last_info.get('CODECS') + if codecs: + vcodec, acodec = [None] * 2 + va_codecs = codecs.split(',') + if len(va_codecs) == 1: + # Audio only entries usually come with single codec and + # no resolution. For more robustness we also check it to + # be mp4 audio. + if not resolution and va_codecs[0].startswith('mp4a'): + vcodec, acodec = 'none', va_codecs[0] + else: + vcodec = va_codecs[0] + else: + vcodec, acodec = va_codecs[:2] + f.update({ + 'acodec': acodec, + 'vcodec': vcodec, + }) if last_media is not None: f['m3u8_media'] = last_media last_media = None formats.append(f) last_info = {} - self._sort_formats(formats) return formats @staticmethod @@ -1117,8 +1183,8 @@ class InfoExtractor(object): out.append('{%s}%s' % (namespace, c)) return '/'.join(out) - def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None): - smil = self._download_smil(smil_url, video_id, fatal=fatal) + def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None): + smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source) if smil is False: assert not fatal @@ -1135,10 +1201,10 @@ class InfoExtractor(object): return {} return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params) - def _download_smil(self, smil_url, video_id, fatal=True): + def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None): return self._download_xml( smil_url, video_id, 'Downloading SMIL file', - 'Unable to download SMIL file', fatal=fatal) + 'Unable to download SMIL file', fatal=fatal, transform_source=transform_source) def _parse_smil(self, smil, smil_url, video_id, f4m_params=None): namespace = self._parse_smil_namespace(smil) @@ -1259,7 +1325,7 @@ class InfoExtractor(object): 'plugin': 'flowplayer-3.2.0.1', } f4m_url += '&' if '?' in f4m_url else '?' - f4m_url += compat_urllib_parse.urlencode(f4m_params) + f4m_url += compat_urllib_parse_urlencode(f4m_params) formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False)) continue @@ -1276,8 +1342,6 @@ class InfoExtractor(object): }) continue - self._sort_formats(formats) - return formats def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'): @@ -1288,7 +1352,7 @@ class InfoExtractor(object): if not src or src in urls: continue urls.append(src) - ext = textstream.get('ext') or determine_ext(src) or mimetype2ext(textstream.get('type')) + ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src) lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang subtitles.setdefault(lang, []).append({ 'url': src, @@ -1424,8 +1488,9 @@ class InfoExtractor(object): continue representation_attrib = adaptation_set.attrib.copy() representation_attrib.update(representation.attrib) - mime_type = representation_attrib.get('mimeType') - content_type = mime_type.split('/')[0] if mime_type else representation_attrib.get('contentType') + # According to page 41 of ISO/IEC 29001-1:2014, @mimeType is mandatory + mime_type = representation_attrib['mimeType'] + content_type = mime_type.split('/')[0] if content_type == 'text': # TODO implement WebVTT downloading pass @@ -1448,6 +1513,7 @@ class InfoExtractor(object): f = { 'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id, 'url': base_url, + 'ext': mimetype2ext(mime_type), 'width': int_or_none(representation_attrib.get('width')), 'height': int_or_none(representation_attrib.get('height')), 'tbr': int_or_none(representation_attrib.get('bandwidth'), 1000), @@ -1466,9 +1532,16 @@ class InfoExtractor(object): representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration)) media_template = representation_ms_info['media_template'] media_template = media_template.replace('$RepresentationID$', representation_id) - media_template = re.sub(r'\$(Number|Bandwidth)(?:%(0\d+)d)?\$', r'%(\1)\2d', media_template) + media_template = re.sub(r'\$(Number|Bandwidth)\$', r'%(\1)d', media_template) + media_template = re.sub(r'\$(Number|Bandwidth)%([^$]+)\$', r'%(\1)\2', media_template) media_template.replace('$$', '$') - representation_ms_info['segment_urls'] = [media_template % {'Number': segment_number, 'Bandwidth': representation_attrib.get('bandwidth')} for segment_number in range(representation_ms_info['start_number'], representation_ms_info['total_number'] + representation_ms_info['start_number'])] + representation_ms_info['segment_urls'] = [ + media_template % { + 'Number': segment_number, + 'Bandwidth': representation_attrib.get('bandwidth')} + for segment_number in range( + representation_ms_info['start_number'], + representation_ms_info['total_number'] + representation_ms_info['start_number'])] if 'segment_urls' in representation_ms_info: f.update({ 'segment_urls': representation_ms_info['segment_urls'], @@ -1493,7 +1566,6 @@ class InfoExtractor(object): existing_format.update(f) else: self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type) - self._sort_formats(formats) return formats def _live_title(self, name): @@ -1600,6 +1672,15 @@ class InfoExtractor(object): def _get_automatic_captions(self, *args, **kwargs): raise NotImplementedError('This method must be implemented by subclasses') + def mark_watched(self, *args, **kwargs): + if (self._downloader.params.get('mark_watched', False) and + (self._get_login_info()[0] is not None or + self._downloader.params.get('cookiefile') is not None)): + self._mark_watched(*args, **kwargs) + + def _mark_watched(self, *args, **kwargs): + raise NotImplementedError('This method must be implemented by subclasses') + class SearchInfoExtractor(InfoExtractor): """ diff --git a/youtube_dl/extractor/commonprotocols.py b/youtube_dl/extractor/commonprotocols.py new file mode 100644 index 000000000..5d130a170 --- /dev/null +++ b/youtube_dl/extractor/commonprotocols.py @@ -0,0 +1,36 @@ +from __future__ import unicode_literals + +import os + +from .common import InfoExtractor +from ..compat import ( + compat_urllib_parse_unquote, + compat_urlparse, +) +from ..utils import url_basename + + +class RtmpIE(InfoExtractor): + IE_DESC = False # Do not list + _VALID_URL = r'(?i)rtmp[est]?://.+' + + _TESTS = [{ + 'url': 'rtmp://cp44293.edgefcs.net/ondemand?auth=daEcTdydfdqcsb8cZcDbAaCbhamacbbawaS-bw7dBb-bWG-GqpGFqCpNCnGoyL&aifp=v001&slist=public/unsecure/audio/2c97899446428e4301471a8cb72b4b97--audio--pmg-20110908-0900a_flv_aac_med_int.mp4', + 'only_matching': True, + }, { + 'url': 'rtmp://edge.live.hitbox.tv/live/dimak', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0]) + title = compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]) + return { + 'id': video_id, + 'title': title, + 'formats': [{ + 'url': url, + 'ext': 'flv', + 'format_id': compat_urlparse.urlparse(url).scheme, + }], + } diff --git a/youtube_dl/extractor/condenast.py b/youtube_dl/extractor/condenast.py index 6f92ae2ed..e8f2b5a07 100644 --- a/youtube_dl/extractor/condenast.py +++ b/youtube_dl/extractor/condenast.py @@ -5,7 +5,7 @@ import re from .common import InfoExtractor from ..compat import ( - compat_urllib_parse, + compat_urllib_parse_urlencode, compat_urllib_parse_urlparse, compat_urlparse, ) @@ -45,7 +45,7 @@ class CondeNastIE(InfoExtractor): 'wmagazine': 'W Magazine', } - _VALID_URL = r'http://(?:video|www|player)\.(?P%s)\.com/(?Pwatch|series|video|embed(?:js)?)/(?P[^/?#]+)' % '|'.join(_SITES.keys()) + _VALID_URL = r'https?://(?:video|www|player)\.(?P%s)\.com/(?Pwatch|series|video|embed(?:js)?)/(?P[^/?#]+)' % '|'.join(_SITES.keys()) IE_DESC = 'Condé Nast media group: %s' % ', '.join(sorted(_SITES.values())) EMBED_URL = r'(?:https?:)?//player\.(?P%s)\.com/(?Pembed(?:js)?)/.+?' % '|'.join(_SITES.keys()) @@ -97,7 +97,7 @@ class CondeNastIE(InfoExtractor): video_id = self._search_regex(r'videoId: [\'"](.+?)[\'"]', params, 'video id') player_id = self._search_regex(r'playerId: [\'"](.+?)[\'"]', params, 'player id') target = self._search_regex(r'target: [\'"](.+?)[\'"]', params, 'target') - data = compat_urllib_parse.urlencode({'videoId': video_id, + data = compat_urllib_parse_urlencode({'videoId': video_id, 'playerId': player_id, 'target': target, }) diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index c7032ffa2..8ae3f2890 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -11,8 +11,8 @@ from math import pow, sqrt, floor from .common import InfoExtractor from ..compat import ( compat_etree_fromstring, - compat_urllib_parse, compat_urllib_parse_unquote, + compat_urllib_parse_urlencode, compat_urllib_request, compat_urlparse, ) @@ -54,7 +54,7 @@ class CrunchyrollBaseIE(InfoExtractor): def _real_initialize(self): self._login() - def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None): + def _download_webpage(self, url_or_request, *args, **kwargs): request = (url_or_request if isinstance(url_or_request, compat_urllib_request.Request) else sanitized_Request(url_or_request)) # Accept-Language must be set explicitly to accept any language to avoid issues @@ -65,8 +65,7 @@ class CrunchyrollBaseIE(InfoExtractor): # Crunchyroll to not work in georestriction cases in some browsers that don't place # the locale lang first in header. However allowing any language seems to workaround the issue. request.add_header('Accept-Language', '*') - return super(CrunchyrollBaseIE, self)._download_webpage( - request, video_id, note, errnote, fatal, tries, timeout, encoding) + return super(CrunchyrollBaseIE, self)._download_webpage(request, *args, **kwargs) @staticmethod def _add_skip_wall(url): @@ -79,7 +78,7 @@ class CrunchyrollBaseIE(InfoExtractor): # See https://github.com/rg3/youtube-dl/issues/7202. qs['skip_wall'] = ['1'] return compat_urlparse.urlunparse( - parsed_url._replace(query=compat_urllib_parse.urlencode(qs, True))) + parsed_url._replace(query=compat_urllib_parse_urlencode(qs, True))) class CrunchyrollIE(CrunchyrollBaseIE): @@ -309,7 +308,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text playerdata_url = compat_urllib_parse_unquote(self._html_search_regex(r'"config_url":"([^"]+)', webpage, 'playerdata_url')) playerdata_req = sanitized_Request(playerdata_url) - playerdata_req.data = compat_urllib_parse.urlencode({'current_page': webpage_url}) + playerdata_req.data = urlencode_postdata({'current_page': webpage_url}) playerdata_req.add_header('Content-Type', 'application/x-www-form-urlencoded') playerdata = self._download_webpage(playerdata_req, video_id, note='Downloading media info') @@ -323,7 +322,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text streamdata_req = sanitized_Request( 'http://www.crunchyroll.com/xml/?req=RpcApiVideoPlayer_GetStandardConfig&media_id=%s&video_format=%s&video_quality=%s' % (stream_id, stream_format, stream_quality), - compat_urllib_parse.urlencode({'current_page': url}).encode('utf-8')) + compat_urllib_parse_urlencode({'current_page': url}).encode('utf-8')) streamdata_req.add_header('Content-Type', 'application/x-www-form-urlencoded') streamdata = self._download_xml( streamdata_req, video_id, diff --git a/youtube_dl/extractor/cspan.py b/youtube_dl/extractor/cspan.py index b8b9d058d..84b36f44c 100644 --- a/youtube_dl/extractor/cspan.py +++ b/youtube_dl/extractor/cspan.py @@ -15,7 +15,7 @@ from .senateisvp import SenateISVPIE class CSpanIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?c-span\.org/video/\?(?P[0-9a-f]+)' + _VALID_URL = r'https?://(?:www\.)?c-span\.org/video/\?(?P[0-9a-f]+)' IE_DESC = 'C-SPAN' _TESTS = [{ 'url': 'http://www.c-span.org/video/?313572-1/HolderonV', diff --git a/youtube_dl/extractor/ctsnews.py b/youtube_dl/extractor/ctsnews.py index 45049bf37..1622fc844 100644 --- a/youtube_dl/extractor/ctsnews.py +++ b/youtube_dl/extractor/ctsnews.py @@ -8,7 +8,7 @@ from ..utils import parse_iso8601, ExtractorError class CtsNewsIE(InfoExtractor): IE_DESC = '華視新聞' # https connection failed (Connection reset) - _VALID_URL = r'http://news\.cts\.com\.tw/[a-z]+/[a-z]+/\d+/(?P\d+)\.html' + _VALID_URL = r'https?://news\.cts\.com\.tw/[a-z]+/[a-z]+/\d+/(?P\d+)\.html' _TESTS = [{ 'url': 'http://news.cts.com.tw/cts/international/201501/201501291578109.html', 'md5': 'a9875cb790252b08431186d741beaabe', diff --git a/youtube_dl/extractor/cwtv.py b/youtube_dl/extractor/cwtv.py index 36af67013..f5cefd966 100644 --- a/youtube_dl/extractor/cwtv.py +++ b/youtube_dl/extractor/cwtv.py @@ -57,6 +57,7 @@ class CWTVIE(InfoExtractor): formats = self._extract_m3u8_formats( video_data['videos']['variantplaylist']['uri'], video_id, 'mp4') + self._sort_formats(formats) thumbnails = [{ 'url': image['uri'], diff --git a/youtube_dl/extractor/daum.py b/youtube_dl/extractor/daum.py index c84c51058..86024a745 100644 --- a/youtube_dl/extractor/daum.py +++ b/youtube_dl/extractor/daum.py @@ -8,8 +8,8 @@ import itertools from .common import InfoExtractor from ..compat import ( compat_parse_qs, - compat_urllib_parse, compat_urllib_parse_unquote, + compat_urllib_parse_urlencode, compat_urlparse, ) from ..utils import ( @@ -70,7 +70,7 @@ class DaumIE(InfoExtractor): def _real_extract(self, url): video_id = compat_urllib_parse_unquote(self._match_id(url)) - query = compat_urllib_parse.urlencode({'vid': video_id}) + query = compat_urllib_parse_urlencode({'vid': video_id}) movie_data = self._download_json( 'http://videofarm.daum.net/controller/api/closed/v1_2/IntegratedMovieData.json?' + query, video_id, 'Downloading video formats info') @@ -86,7 +86,7 @@ class DaumIE(InfoExtractor): formats = [] for format_el in movie_data['output_list']['output_list']: profile = format_el['profile'] - format_query = compat_urllib_parse.urlencode({ + format_query = compat_urllib_parse_urlencode({ 'vid': video_id, 'profile': profile, }) diff --git a/youtube_dl/extractor/dcn.py b/youtube_dl/extractor/dcn.py index 15a1c40f7..5deff5f30 100644 --- a/youtube_dl/extractor/dcn.py +++ b/youtube_dl/extractor/dcn.py @@ -6,7 +6,7 @@ import base64 from .common import InfoExtractor from ..compat import ( - compat_urllib_parse, + compat_urllib_parse_urlencode, compat_str, ) from ..utils import ( @@ -15,6 +15,7 @@ from ..utils import ( sanitized_Request, smuggle_url, unsmuggle_url, + urlencode_postdata, ) @@ -106,7 +107,7 @@ class DCNVideoIE(DCNBaseIE): webpage = self._download_webpage( 'http://admin.mangomolo.com/analytics/index.php/customers/embed/video?' + - compat_urllib_parse.urlencode({ + compat_urllib_parse_urlencode({ 'id': video_data['id'], 'user_id': video_data['user_id'], 'signature': video_data['signature'], @@ -133,7 +134,7 @@ class DCNLiveIE(DCNBaseIE): webpage = self._download_webpage( 'http://admin.mangomolo.com/analytics/index.php/customers/embed/index?' + - compat_urllib_parse.urlencode({ + compat_urllib_parse_urlencode({ 'id': base64.b64encode(channel_data['user_id'].encode()).decode(), 'channelid': base64.b64encode(channel_data['id'].encode()).decode(), 'signature': channel_data['signature'], @@ -174,7 +175,7 @@ class DCNSeasonIE(InfoExtractor): data['show_id'] = show_id request = sanitized_Request( 'http://admin.mangomolo.com/analytics/index.php/plus/show', - compat_urllib_parse.urlencode(data), + urlencode_postdata(data), { 'Origin': 'http://www.dcndigital.ae', 'Content-Type': 'application/x-www-form-urlencoded' diff --git a/youtube_dl/extractor/dctp.py b/youtube_dl/extractor/dctp.py index aa2c09eb6..9099f5046 100644 --- a/youtube_dl/extractor/dctp.py +++ b/youtube_dl/extractor/dctp.py @@ -6,7 +6,7 @@ from ..compat import compat_str class DctpTvIE(InfoExtractor): - _VALID_URL = r'http://www.dctp.tv/(#/)?filme/(?P.+?)/$' + _VALID_URL = r'https?://www.dctp.tv/(#/)?filme/(?P.+?)/$' _TEST = { 'url': 'http://www.dctp.tv/filme/videoinstallation-fuer-eine-kaufhausfassade/', 'info_dict': { diff --git a/youtube_dl/extractor/deezer.py b/youtube_dl/extractor/deezer.py index c3205ff5f..7a07f3267 100644 --- a/youtube_dl/extractor/deezer.py +++ b/youtube_dl/extractor/deezer.py @@ -41,7 +41,9 @@ class DeezerPlaylistIE(InfoExtractor): 'Deezer said: %s' % geoblocking_msg, expected=True) data_json = self._search_regex( - r'naboo\.display\(\'[^\']+\',\s*(.*?)\);\n', webpage, 'data JSON') + (r'__DZR_APP_STATE__\s*=\s*({.+?})\s*', + r'naboo\.display\(\'[^\']+\',\s*(.*?)\);\n'), + webpage, 'data JSON') data = json.loads(data_json) playlist_title = data.get('DATA', {}).get('TITLE') diff --git a/youtube_dl/extractor/defense.py b/youtube_dl/extractor/defense.py index 98e3aedfd..9fe144e14 100644 --- a/youtube_dl/extractor/defense.py +++ b/youtube_dl/extractor/defense.py @@ -5,7 +5,7 @@ from .common import InfoExtractor class DefenseGouvFrIE(InfoExtractor): IE_NAME = 'defense.gouv.fr' - _VALID_URL = r'http://.*?\.defense\.gouv\.fr/layout/set/ligthboxvideo/base-de-medias/webtv/(?P[^/?#]*)' + _VALID_URL = r'https?://.*?\.defense\.gouv\.fr/layout/set/ligthboxvideo/base-de-medias/webtv/(?P[^/?#]*)' _TEST = { 'url': 'http://www.defense.gouv.fr/layout/set/ligthboxvideo/base-de-medias/webtv/attaque-chimique-syrienne-du-21-aout-2013-1', diff --git a/youtube_dl/extractor/democracynow.py b/youtube_dl/extractor/democracynow.py index 6cd395e11..65a98d789 100644 --- a/youtube_dl/extractor/democracynow.py +++ b/youtube_dl/extractor/democracynow.py @@ -17,37 +17,53 @@ class DemocracynowIE(InfoExtractor): IE_NAME = 'democracynow' _TESTS = [{ 'url': 'http://www.democracynow.org/shows/2015/7/3', - 'md5': 'fbb8fe3d7a56a5e12431ce2f9b2fab0d', + 'md5': '3757c182d3d84da68f5c8f506c18c196', 'info_dict': { 'id': '2015-0703-001', 'ext': 'mp4', - 'title': 'July 03, 2015 - Democracy Now!', - 'description': 'A daily independent global news hour with Amy Goodman & Juan González "What to the Slave is 4th of July?": James Earl Jones Reads Frederick Douglass\u2019 Historic Speech : "This Flag Comes Down Today": Bree Newsome Scales SC Capitol Flagpole, Takes Down Confederate Flag : "We Shall Overcome": Remembering Folk Icon, Activist Pete Seeger in His Own Words & Songs', + 'title': 'Daily Show', }, }, { 'url': 'http://www.democracynow.org/2015/7/3/this_flag_comes_down_today_bree', - 'md5': 'fbb8fe3d7a56a5e12431ce2f9b2fab0d', 'info_dict': { 'id': '2015-0703-001', 'ext': 'mp4', 'title': '"This Flag Comes Down Today": Bree Newsome Scales SC Capitol Flagpole, Takes Down Confederate Flag', 'description': 'md5:4d2bc4f0d29f5553c2210a4bc7761a21', }, + 'params': { + 'skip_download': True, + }, }] def _real_extract(self, url): display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) - description = self._og_search_description(webpage) json_data = self._parse_json(self._search_regex( r']+type="text/json"[^>]*>\s*({[^>]+})', webpage, 'json'), display_id) - video_id = None + + title = json_data['title'] formats = [] - default_lang = 'en' + video_id = None + for key in ('file', 'audio', 'video', 'high_res_video'): + media_url = json_data.get(key, '') + if not media_url: + continue + media_url = re.sub(r'\?.*', '', compat_urlparse.urljoin(url, media_url)) + video_id = video_id or remove_start(os.path.splitext(url_basename(media_url))[0], 'dn') + formats.append({ + 'url': media_url, + 'vcodec': 'none' if key == 'audio' else None, + }) + + self._sort_formats(formats) + + default_lang = 'en' subtitles = {} def add_subtitle_item(lang, info_dict): @@ -67,22 +83,13 @@ class DemocracynowIE(InfoExtractor): 'url': compat_urlparse.urljoin(url, subtitle_item['url']), }) - for key in ('file', 'audio', 'video'): - media_url = json_data.get(key, '') - if not media_url: - continue - media_url = re.sub(r'\?.*', '', compat_urlparse.urljoin(url, media_url)) - video_id = video_id or remove_start(os.path.splitext(url_basename(media_url))[0], 'dn') - formats.append({ - 'url': media_url, - }) - - self._sort_formats(formats) + description = self._og_search_description(webpage, default=None) return { 'id': video_id or display_id, - 'title': json_data['title'], + 'title': title, 'description': description, + 'thumbnail': json_data.get('image'), 'subtitles': subtitles, 'formats': formats, } diff --git a/youtube_dl/extractor/dfb.py b/youtube_dl/extractor/dfb.py index 263532cc6..cdfeccacb 100644 --- a/youtube_dl/extractor/dfb.py +++ b/youtube_dl/extractor/dfb.py @@ -38,6 +38,7 @@ class DFBIE(InfoExtractor): token_el = f4m_info.find('token') manifest_url = token_el.attrib['url'] + '?' + 'hdnea=' + token_el.attrib['auth'] + '&hdcore=3.2.0' formats = self._extract_f4m_formats(manifest_url, display_id) + self._sort_formats(formats) return { 'id': video_id, diff --git a/youtube_dl/extractor/discovery.py b/youtube_dl/extractor/discovery.py index ce680a9f3..5f1275b39 100644 --- a/youtube_dl/extractor/discovery.py +++ b/youtube_dl/extractor/discovery.py @@ -9,7 +9,7 @@ from ..compat import compat_str class DiscoveryIE(InfoExtractor): - _VALID_URL = r'''(?x)http://(?:www\.)?(?: + _VALID_URL = r'''(?x)https?://(?:www\.)?(?: discovery| investigationdiscovery| discoverylife| @@ -63,18 +63,23 @@ class DiscoveryIE(InfoExtractor): video_title = info.get('playlist_title') or info.get('video_title') - entries = [{ - 'id': compat_str(video_info['id']), - 'formats': self._extract_m3u8_formats( + entries = [] + + for idx, video_info in enumerate(info['playlist']): + formats = self._extract_m3u8_formats( video_info['src'], display_id, 'mp4', 'm3u8_native', m3u8_id='hls', - note='Download m3u8 information for video %d' % (idx + 1)), - 'title': video_info['title'], - 'description': video_info.get('description'), - 'duration': parse_duration(video_info.get('video_length')), - 'webpage_url': video_info.get('href') or video_info.get('url'), - 'thumbnail': video_info.get('thumbnailURL'), - 'alt_title': video_info.get('secondary_title'), - 'timestamp': parse_iso8601(video_info.get('publishedDate')), - } for idx, video_info in enumerate(info['playlist'])] + note='Download m3u8 information for video %d' % (idx + 1)) + self._sort_formats(formats) + entries.append({ + 'id': compat_str(video_info['id']), + 'formats': formats, + 'title': video_info['title'], + 'description': video_info.get('description'), + 'duration': parse_duration(video_info.get('video_length')), + 'webpage_url': video_info.get('href') or video_info.get('url'), + 'thumbnail': video_info.get('thumbnailURL'), + 'alt_title': video_info.get('secondary_title'), + 'timestamp': parse_iso8601(video_info.get('publishedDate')), + }) return self.playlist_result(entries, display_id, video_title) diff --git a/youtube_dl/extractor/douyutv.py b/youtube_dl/extractor/douyutv.py index 373b3b4b4..3915cb182 100644 --- a/youtube_dl/extractor/douyutv.py +++ b/youtube_dl/extractor/douyutv.py @@ -10,7 +10,7 @@ from ..compat import (compat_str, compat_basestring) class DouyuTVIE(InfoExtractor): IE_DESC = '斗鱼' - _VALID_URL = r'http://(?:www\.)?douyutv\.com/(?P[A-Za-z0-9]+)' + _VALID_URL = r'https?://(?:www\.)?douyu(?:tv)?\.com/(?P[A-Za-z0-9]+)' _TESTS = [{ 'url': 'http://www.douyutv.com/iseven', 'info_dict': { @@ -18,7 +18,7 @@ class DouyuTVIE(InfoExtractor): 'display_id': 'iseven', 'ext': 'flv', 'title': 're:^清晨醒脑!T-ara根本停不下来! [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', - 'description': 'md5:c93d6692dde6fe33809a46edcbecca44', + 'description': 'md5:f34981259a03e980a3c6404190a3ed61', 'thumbnail': 're:^https?://.*\.jpg$', 'uploader': '7师傅', 'uploader_id': '431925', @@ -26,7 +26,7 @@ class DouyuTVIE(InfoExtractor): }, 'params': { 'skip_download': True, - } + }, }, { 'url': 'http://www.douyutv.com/85982', 'info_dict': { @@ -42,7 +42,27 @@ class DouyuTVIE(InfoExtractor): }, 'params': { 'skip_download': True, - } + }, + 'skip': 'Romm not found', + }, { + 'url': 'http://www.douyutv.com/17732', + 'info_dict': { + 'id': '17732', + 'display_id': '17732', + 'ext': 'flv', + 'title': 're:^清晨醒脑!T-ara根本停不下来! [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'description': 'md5:f34981259a03e980a3c6404190a3ed61', + 'thumbnail': 're:^https?://.*\.jpg$', + 'uploader': '7师傅', + 'uploader_id': '431925', + 'is_live': True, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://www.douyu.com/xiaocang', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/dplay.py b/youtube_dl/extractor/dplay.py index 6cda56a7f..66bbfc6ca 100644 --- a/youtube_dl/extractor/dplay.py +++ b/youtube_dl/extractor/dplay.py @@ -1,6 +1,8 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals +import json +import re import time from .common import InfoExtractor @@ -8,44 +10,127 @@ from ..utils import int_or_none class DPlayIE(InfoExtractor): - _VALID_URL = r'http://www\.dplay\.se/[^/]+/(?P[^/?#]+)' + _VALID_URL = r'https?://(?Pit\.dplay\.com|www\.dplay\.(?:dk|se|no))/[^/]+/(?P[^/?#]+)' - _TEST = { + _TESTS = [{ + 'url': 'http://it.dplay.com/take-me-out/stagione-1-episodio-25/', + 'info_dict': { + 'id': '1255600', + 'display_id': 'stagione-1-episodio-25', + 'ext': 'mp4', + 'title': 'Episodio 25', + 'description': 'md5:cae5f40ad988811b197d2d27a53227eb', + 'duration': 2761, + 'timestamp': 1454701800, + 'upload_date': '20160205', + 'creator': 'RTIT', + 'series': 'Take me out', + 'season_number': 1, + 'episode_number': 25, + 'age_limit': 0, + }, + 'expected_warnings': ['Unable to download f4m manifest'], + }, { 'url': 'http://www.dplay.se/nugammalt-77-handelser-som-format-sverige/season-1-svensken-lar-sig-njuta-av-livet/', 'info_dict': { 'id': '3172', - 'ext': 'mp4', 'display_id': 'season-1-svensken-lar-sig-njuta-av-livet', + 'ext': 'flv', 'title': 'Svensken lär sig njuta av livet', + 'description': 'md5:d3819c9bccffd0fe458ca42451dd50d8', 'duration': 2650, + 'timestamp': 1365454320, + 'upload_date': '20130408', + 'creator': 'Kanal 5 (Home)', + 'series': 'Nugammalt - 77 händelser som format Sverige', + 'season_number': 1, + 'episode_number': 1, + 'age_limit': 0, }, - } + }, { + 'url': 'http://www.dplay.dk/mig-og-min-mor/season-6-episode-12/', + 'info_dict': { + 'id': '70816', + 'display_id': 'season-6-episode-12', + 'ext': 'flv', + 'title': 'Episode 12', + 'description': 'md5:9c86e51a93f8a4401fc9641ef9894c90', + 'duration': 2563, + 'timestamp': 1429696800, + 'upload_date': '20150422', + 'creator': 'Kanal 4', + 'series': 'Mig og min mor', + 'season_number': 6, + 'episode_number': 12, + 'age_limit': 0, + }, + }, { + 'url': 'http://www.dplay.no/pga-tour/season-1-hoydepunkter-18-21-februar/', + 'only_matching': True, + }] def _real_extract(self, url): - display_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + display_id = mobj.group('id') + domain = mobj.group('domain') + webpage = self._download_webpage(url, display_id) + video_id = self._search_regex( - r'data-video-id="(\d+)"', webpage, 'video id') + r'data-video-id=["\'](\d+)', webpage, 'video id') info = self._download_json( - 'http://www.dplay.se/api/v2/ajax/videos?video_id=' + video_id, + 'http://%s/api/v2/ajax/videos?video_id=%s' % (domain, video_id), video_id)['data'][0] - self._set_cookie( - 'secure.dplay.se', 'dsc-geo', - '{"countryCode":"NL","expiry":%d}' % ((time.time() + 20 * 60) * 1000)) - # TODO: consider adding support for 'stream_type=hds', it seems to - # require setting some cookies - manifest_url = self._download_json( - 'https://secure.dplay.se/secure/api/v2/user/authorization/stream/%s?stream_type=hls' % video_id, - video_id, 'Getting manifest url for hls stream')['hls'] - formats = self._extract_m3u8_formats( - manifest_url, video_id, ext='mp4', entry_protocol='m3u8_native') + title = info['title'] + + PROTOCOLS = ('hls', 'hds') + formats = [] + + def extract_formats(protocol, manifest_url): + if protocol == 'hls': + formats.extend(self._extract_m3u8_formats( + manifest_url, video_id, ext='mp4', + entry_protocol='m3u8_native', m3u8_id=protocol, fatal=False)) + elif protocol == 'hds': + formats.extend(self._extract_f4m_formats( + manifest_url + '&hdcore=3.8.0&plugin=flowplayer-3.8.0.0', + video_id, f4m_id=protocol, fatal=False)) + + domain_tld = domain.split('.')[-1] + if domain_tld in ('se', 'dk'): + for protocol in PROTOCOLS: + self._set_cookie( + 'secure.dplay.%s' % domain_tld, 'dsc-geo', + json.dumps({ + 'countryCode': domain_tld.upper(), + 'expiry': (time.time() + 20 * 60) * 1000, + })) + stream = self._download_json( + 'https://secure.dplay.%s/secure/api/v2/user/authorization/stream/%s?stream_type=%s' + % (domain_tld, video_id, protocol), video_id, + 'Downloading %s stream JSON' % protocol, fatal=False) + if stream and stream.get(protocol): + extract_formats(protocol, stream[protocol]) + else: + for protocol in PROTOCOLS: + if info.get(protocol): + extract_formats(protocol, info[protocol]) + + self._sort_formats(formats) return { 'id': video_id, 'display_id': display_id, - 'title': info['title'], - 'formats': formats, + 'title': title, + 'description': info.get('video_metadata_longDescription'), 'duration': int_or_none(info.get('video_metadata_length'), scale=1000), + 'timestamp': int_or_none(info.get('video_publish_date')), + 'creator': info.get('video_metadata_homeChannel'), + 'series': info.get('video_metadata_show'), + 'season_number': int_or_none(info.get('season')), + 'episode_number': int_or_none(info.get('episode')), + 'age_limit': int_or_none(info.get('minimum_age')), + 'formats': formats, } diff --git a/youtube_dl/extractor/dramafever.py b/youtube_dl/extractor/dramafever.py index d35e88881..3b6529f4b 100644 --- a/youtube_dl/extractor/dramafever.py +++ b/youtube_dl/extractor/dramafever.py @@ -6,7 +6,6 @@ import itertools from .amp import AMPIE from ..compat import ( compat_HTTPError, - compat_urllib_parse, compat_urlparse, ) from ..utils import ( @@ -14,6 +13,7 @@ from ..utils import ( clean_html, int_or_none, sanitized_Request, + urlencode_postdata ) @@ -50,7 +50,7 @@ class DramaFeverBaseIE(AMPIE): } request = sanitized_Request( - self._LOGIN_URL, compat_urllib_parse.urlencode(login_form).encode('utf-8')) + self._LOGIN_URL, urlencode_postdata(login_form)) response = self._download_webpage( request, None, 'Logging in as %s' % username) diff --git a/youtube_dl/extractor/dreisat.py b/youtube_dl/extractor/dreisat.py index 028144f20..0040e70d4 100644 --- a/youtube_dl/extractor/dreisat.py +++ b/youtube_dl/extractor/dreisat.py @@ -7,7 +7,7 @@ from .zdf import ZDFIE class DreiSatIE(ZDFIE): IE_NAME = '3sat' - _VALID_URL = r'(?:http://)?(?:www\.)?3sat\.de/mediathek/(?:index\.php|mediathek\.php)?\?(?:(?:mode|display)=[^&]+&)*obj=(?P[0-9]+)$' + _VALID_URL = r'(?:https?://)?(?:www\.)?3sat\.de/mediathek/(?:index\.php|mediathek\.php)?\?(?:(?:mode|display)=[^&]+&)*obj=(?P[0-9]+)$' _TESTS = [ { 'url': 'http://www.3sat.de/mediathek/index.php?mode=play&obj=45918', diff --git a/youtube_dl/extractor/dvtv.py b/youtube_dl/extractor/dvtv.py index c1a4bc757..974c69dbc 100644 --- a/youtube_dl/extractor/dvtv.py +++ b/youtube_dl/extractor/dvtv.py @@ -15,7 +15,7 @@ class DVTVIE(InfoExtractor): IE_NAME = 'dvtv' IE_DESC = 'http://video.aktualne.cz/' - _VALID_URL = r'http://video\.aktualne\.cz/(?:[^/]+/)+r~(?P[0-9a-f]{32})' + _VALID_URL = r'https?://video\.aktualne\.cz/(?:[^/]+/)+r~(?P[0-9a-f]{32})' _TESTS = [{ 'url': 'http://video.aktualne.cz/dvtv/vondra-o-ceskem-stoleti-pri-pohledu-na-havla-mi-bylo-trapne/r~e5efe9ca855511e4833a0025900fea04/', diff --git a/youtube_dl/extractor/dw.py b/youtube_dl/extractor/dw.py new file mode 100644 index 000000000..ae7c571bd --- /dev/null +++ b/youtube_dl/extractor/dw.py @@ -0,0 +1,85 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import int_or_none +from ..compat import compat_urlparse + + +class DWIE(InfoExtractor): + IE_NAME = 'dw' + _VALID_URL = r'https?://(?:www\.)?dw\.com/(?:[^/]+/)+av-(?P\d+)' + _TESTS = [{ + # video + 'url': 'http://www.dw.com/en/intelligent-light/av-19112290', + 'md5': '7372046e1815c5a534b43f3c3c36e6e9', + 'info_dict': { + 'id': '19112290', + 'ext': 'mp4', + 'title': 'Intelligent light', + 'description': 'md5:90e00d5881719f2a6a5827cb74985af1', + 'upload_date': '20160311', + } + }, { + # audio + 'url': 'http://www.dw.com/en/worldlink-my-business/av-19111941', + 'md5': '2814c9a1321c3a51f8a7aeb067a360dd', + 'info_dict': { + 'id': '19111941', + 'ext': 'mp3', + 'title': 'WorldLink: My business', + 'description': 'md5:bc9ca6e4e063361e21c920c53af12405', + 'upload_date': '20160311', + } + }] + + def _real_extract(self, url): + media_id = self._match_id(url) + webpage = self._download_webpage(url, media_id) + hidden_inputs = self._hidden_inputs(webpage) + title = hidden_inputs['media_title'] + + if hidden_inputs.get('player_type') == 'video' and hidden_inputs.get('stream_file') == '1': + formats = self._extract_smil_formats( + 'http://www.dw.com/smil/v-%s' % media_id, media_id, + transform_source=lambda s: s.replace( + 'rtmp://tv-od.dw.de/flash/', + 'http://tv-download.dw.de/dwtv_video/flv/')) + self._sort_formats(formats) + else: + formats = [{'url': hidden_inputs['file_name']}] + + return { + 'id': media_id, + 'title': title, + 'description': self._og_search_description(webpage), + 'thumbnail': hidden_inputs.get('preview_image'), + 'duration': int_or_none(hidden_inputs.get('file_duration')), + 'upload_date': hidden_inputs.get('display_date'), + 'formats': formats, + } + + +class DWArticleIE(InfoExtractor): + IE_NAME = 'dw:article' + _VALID_URL = r'https?://(?:www\.)?dw\.com/(?:[^/]+/)+a-(?P\d+)' + _TEST = { + 'url': 'http://www.dw.com/en/no-hope-limited-options-for-refugees-in-idomeni/a-19111009', + 'md5': '8ca657f9d068bbef74d6fc38b97fc869', + 'info_dict': { + 'id': '19105868', + 'ext': 'mp4', + 'title': 'The harsh life of refugees in Idomeni', + 'description': 'md5:196015cc7e48ebf474db9399420043c7', + 'upload_date': '20160310', + } + } + + def _real_extract(self, url): + article_id = self._match_id(url) + webpage = self._download_webpage(url, article_id) + hidden_inputs = self._hidden_inputs(webpage) + media_id = hidden_inputs['media_id'] + media_path = self._search_regex(r'href="([^"]+av-%s)"\s+class="overlayLink"' % media_id, webpage, 'media url') + media_url = compat_urlparse.urljoin(url, media_path) + return self.url_result(media_url, 'DW', media_id) diff --git a/youtube_dl/extractor/ebaumsworld.py b/youtube_dl/extractor/ebaumsworld.py index b6bfd2b2d..c97682cd3 100644 --- a/youtube_dl/extractor/ebaumsworld.py +++ b/youtube_dl/extractor/ebaumsworld.py @@ -4,10 +4,10 @@ from .common import InfoExtractor class EbaumsWorldIE(InfoExtractor): - _VALID_URL = r'https?://www\.ebaumsworld\.com/video/watch/(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?ebaumsworld\.com/videos/[^/]+/(?P\d+)' _TEST = { - 'url': 'http://www.ebaumsworld.com/video/watch/83367677/', + 'url': 'http://www.ebaumsworld.com/videos/a-giant-python-opens-the-door/83367677/', 'info_dict': { 'id': '83367677', 'ext': 'mp4', diff --git a/youtube_dl/extractor/echomsk.py b/youtube_dl/extractor/echomsk.py index d2d94049d..6b7cc652f 100644 --- a/youtube_dl/extractor/echomsk.py +++ b/youtube_dl/extractor/echomsk.py @@ -7,7 +7,7 @@ from .common import InfoExtractor class EchoMskIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?echo\.msk\.ru/sounds/(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?echo\.msk\.ru/sounds/(?P\d+)' _TEST = { 'url': 'http://www.echo.msk.ru/sounds/1464134.html', 'md5': '2e44b3b78daff5b458e4dbc37f191f7c', diff --git a/youtube_dl/extractor/elpais.py b/youtube_dl/extractor/elpais.py index 00a69e631..8c725a4e6 100644 --- a/youtube_dl/extractor/elpais.py +++ b/youtube_dl/extractor/elpais.py @@ -9,7 +9,7 @@ class ElPaisIE(InfoExtractor): _VALID_URL = r'https?://(?:[^.]+\.)?elpais\.com/.*/(?P[^/#?]+)\.html(?:$|[?#])' IE_DESC = 'El País' - _TEST = { + _TESTS = [{ 'url': 'http://blogs.elpais.com/la-voz-de-inaki/2014/02/tiempo-nuevo-recetas-viejas.html', 'md5': '98406f301f19562170ec071b83433d55', 'info_dict': { @@ -19,30 +19,41 @@ class ElPaisIE(InfoExtractor): 'description': 'De lunes a viernes, a partir de las ocho de la mañana, Iñaki Gabilondo nos cuenta su visión de la actualidad nacional e internacional.', 'upload_date': '20140206', } - } + }, { + 'url': 'http://elcomidista.elpais.com/elcomidista/2016/02/24/articulo/1456340311_668921.html#?id_externo_nwl=newsletter_diaria20160303t', + 'md5': '3bd5b09509f3519d7d9e763179b013de', + 'info_dict': { + 'id': '1456340311_668921', + 'ext': 'mp4', + 'title': 'Cómo hacer el mejor café con cafetera italiana', + 'description': 'Que sí, que las cápsulas son cómodas. Pero si le pides algo más a la vida, quizá deberías aprender a usar bien la cafetera italiana. No tienes más que ver este vídeo y seguir sus siete normas básicas.', + 'upload_date': '20160303', + } + }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) prefix = self._html_search_regex( - r'var url_cache = "([^"]+)";', webpage, 'URL prefix') + r'var\s+url_cache\s*=\s*"([^"]+)";', webpage, 'URL prefix') video_suffix = self._search_regex( - r"URLMediaFile = url_cache \+ '([^']+)'", webpage, 'video URL') + r"(?:URLMediaFile|urlVideo_\d+)\s*=\s*url_cache\s*\+\s*'([^']+)'", webpage, 'video URL') video_url = prefix + video_suffix thumbnail_suffix = self._search_regex( - r"URLMediaStill = url_cache \+ '([^']+)'", webpage, 'thumbnail URL', - fatal=False) + r"(?:URLMediaStill|urlFotogramaFijo_\d+)\s*=\s*url_cache\s*\+\s*'([^']+)'", + webpage, 'thumbnail URL', fatal=False) thumbnail = ( None if thumbnail_suffix is None else prefix + thumbnail_suffix) title = self._html_search_regex( - '

', - webpage, 'upload date', fatal=False) - upload_date = (None if date_str is None else unified_strdate(date_str)) + webpage, 'upload date', default=None) or self._html_search_meta( + 'datePublished', webpage, 'timestamp')) return { 'id': video_id, diff --git a/youtube_dl/extractor/engadget.py b/youtube_dl/extractor/engadget.py index e4180701d..e5e57d485 100644 --- a/youtube_dl/extractor/engadget.py +++ b/youtube_dl/extractor/engadget.py @@ -1,21 +1,13 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor -from ..utils import ( - url_basename, -) class EngadgetIE(InfoExtractor): - _VALID_URL = r'''(?x)https?://www.engadget.com/ - (?:video(?:/5min)?/(?P\d+)| - [\d/]+/.*?) - ''' + _VALID_URL = r'https?://www.engadget.com/video/(?P\d+)' _TEST = { - 'url': 'http://www.engadget.com/video/5min/518153925/', + 'url': 'http://www.engadget.com/video/518153925/', 'md5': 'c6820d4828a5064447a4d9fc73f312c9', 'info_dict': { 'id': '518153925', @@ -27,15 +19,4 @@ class EngadgetIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - - if video_id is not None: - return self.url_result('5min:%s' % video_id) - else: - title = url_basename(url) - webpage = self._download_webpage(url, title) - ids = re.findall(r']+?playList=(\d+)', webpage) - return { - '_type': 'playlist', - 'title': title, - 'entries': [self.url_result('5min:%s' % vid) for vid in ids] - } + return self.url_result('5min:%s' % video_id) diff --git a/youtube_dl/extractor/eroprofile.py b/youtube_dl/extractor/eroprofile.py index 7fcd0151d..297f8a6f5 100644 --- a/youtube_dl/extractor/eroprofile.py +++ b/youtube_dl/extractor/eroprofile.py @@ -3,7 +3,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_urllib_parse +from ..compat import compat_urllib_parse_urlencode from ..utils import ( ExtractorError, unescapeHTML @@ -43,7 +43,7 @@ class EroProfileIE(InfoExtractor): if username is None: return - query = compat_urllib_parse.urlencode({ + query = compat_urllib_parse_urlencode({ 'username': username, 'password': password, 'url': 'http://www.eroprofile.com/', diff --git a/youtube_dl/extractor/exfm.py b/youtube_dl/extractor/exfm.py index 0c0fe6d65..09ed4f2b5 100644 --- a/youtube_dl/extractor/exfm.py +++ b/youtube_dl/extractor/exfm.py @@ -8,7 +8,7 @@ from .common import InfoExtractor class ExfmIE(InfoExtractor): IE_NAME = 'exfm' IE_DESC = 'ex.fm' - _VALID_URL = r'http://(?:www\.)?ex\.fm/song/(?P[^/]+)' + _VALID_URL = r'https?://(?:www\.)?ex\.fm/song/(?P[^/]+)' _SOUNDCLOUD_URL = r'http://(?:www\.)?api\.soundcloud\.com/tracks/([^/]+)/stream' _TESTS = [ { diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py new file mode 100644 index 000000000..d262b9795 --- /dev/null +++ b/youtube_dl/extractor/extractors.py @@ -0,0 +1,995 @@ +# flake8: noqa +from __future__ import unicode_literals + +from .abc import ABCIE +from .abc7news import Abc7NewsIE +from .academicearth import AcademicEarthCourseIE +from .acast import ( + ACastIE, + ACastChannelIE, +) +from .addanime import AddAnimeIE +from .adobetv import ( + AdobeTVIE, + AdobeTVShowIE, + AdobeTVChannelIE, + AdobeTVVideoIE, +) +from .adultswim import AdultSwimIE +from .aenetworks import AENetworksIE +from .aftonbladet import AftonbladetIE +from .airmozilla import AirMozillaIE +from .aljazeera import AlJazeeraIE +from .alphaporno import AlphaPornoIE +from .animeondemand import AnimeOnDemandIE +from .anitube import AnitubeIE +from .anysex import AnySexIE +from .aol import ( + AolIE, + AolFeaturesIE, +) +from .allocine import AllocineIE +from .aparat import AparatIE +from .appleconnect import AppleConnectIE +from .appletrailers import ( + AppleTrailersIE, + AppleTrailersSectionIE, +) +from .archiveorg import ArchiveOrgIE +from .ard import ( + ARDIE, + ARDMediathekIE, + SportschauIE, +) +from .arkenaplay import ArkenaPlayIE +from .arte import ( + ArteTvIE, + ArteTVPlus7IE, + ArteTVCreativeIE, + ArteTVConcertIE, + ArteTVFutureIE, + ArteTVCinemaIE, + ArteTVDDCIE, + ArteTVMagazineIE, + ArteTVEmbedIE, +) +from .atresplayer import AtresPlayerIE +from .atttechchannel import ATTTechChannelIE +from .audimedia import AudiMediaIE +from .audioboom import AudioBoomIE +from .audiomack import AudiomackIE, AudiomackAlbumIE +from .azubu import AzubuIE, AzubuLiveIE +from .baidu import BaiduVideoIE +from .bambuser import BambuserIE, BambuserChannelIE +from .bandcamp import BandcampIE, BandcampAlbumIE +from .bbc import ( + BBCCoUkIE, + BBCCoUkArticleIE, + BBCIE, +) +from .beeg import BeegIE +from .behindkink import BehindKinkIE +from .beatportpro import BeatportProIE +from .bet import BetIE +from .bigflix import BigflixIE +from .bild import BildIE +from .bilibili import BiliBiliIE +from .biobiochiletv import BioBioChileTVIE +from .bleacherreport import ( + BleacherReportIE, + BleacherReportCMSIE, +) +from .blinkx import BlinkxIE +from .bloomberg import BloombergIE +from .bokecc import BokeCCIE +from .bpb import BpbIE +from .br import BRIE +from .bravotv import BravoTVIE +from .breakcom import BreakIE +from .brightcove import ( + BrightcoveLegacyIE, + BrightcoveNewIE, +) +from .buzzfeed import BuzzFeedIE +from .byutv import BYUtvIE +from .c56 import C56IE +from .camdemy import ( + CamdemyIE, + CamdemyFolderIE +) +from .camwithher import CamWithHerIE +from .canalplus import CanalplusIE +from .canalc2 import Canalc2IE +from .canvas import CanvasIE +from .cbc import ( + CBCIE, + CBCPlayerIE, +) +from .cbs import CBSIE +from .cbsinteractive import CBSInteractiveIE +from .cbsnews import ( + CBSNewsIE, + CBSNewsLiveVideoIE, +) +from .cbssports import CBSSportsIE +from .ccc import CCCIE +from .cda import CDAIE +from .ceskatelevize import CeskaTelevizeIE +from .channel9 import Channel9IE +from .chaturbate import ChaturbateIE +from .chilloutzone import ChilloutzoneIE +from .chirbit import ( + ChirbitIE, + ChirbitProfileIE, +) +from .cinchcast import CinchcastIE +from .cinemassacre import CinemassacreIE +from .cliprs import ClipRsIE +from .clipfish import ClipfishIE +from .cliphunter import CliphunterIE +from .clipsyndicate import ClipsyndicateIE +from .cloudy import CloudyIE +from .clubic import ClubicIE +from .clyp import ClypIE +from .cmt import CMTIE +from .cnbc import CNBCIE +from .cnn import ( + CNNIE, + CNNBlogsIE, + CNNArticleIE, +) +from .collegehumor import CollegeHumorIE +from .collegerama import CollegeRamaIE +from .comedycentral import ComedyCentralIE, ComedyCentralShowsIE +from .comcarcoff import ComCarCoffIE +from .commonmistakes import CommonMistakesIE, UnicodeBOMIE +from .commonprotocols import RtmpIE +from .condenast import CondeNastIE +from .cracked import CrackedIE +from .crackle import CrackleIE +from .criterion import CriterionIE +from .crooksandliars import CrooksAndLiarsIE +from .crunchyroll import ( + CrunchyrollIE, + CrunchyrollShowPlaylistIE +) +from .cspan import CSpanIE +from .ctsnews import CtsNewsIE +from .cultureunplugged import CultureUnpluggedIE +from .cwtv import CWTVIE +from .dailymotion import ( + DailymotionIE, + DailymotionPlaylistIE, + DailymotionUserIE, + DailymotionCloudIE, +) +from .daum import ( + DaumIE, + DaumClipIE, + DaumPlaylistIE, + DaumUserIE, +) +from .dbtv import DBTVIE +from .dcn import ( + DCNIE, + DCNVideoIE, + DCNLiveIE, + DCNSeasonIE, +) +from .dctp import DctpTvIE +from .deezer import DeezerPlaylistIE +from .democracynow import DemocracynowIE +from .dfb import DFBIE +from .dhm import DHMIE +from .dotsub import DotsubIE +from .douyutv import DouyuTVIE +from .dplay import DPlayIE +from .dramafever import ( + DramaFeverIE, + DramaFeverSeriesIE, +) +from .dreisat import DreiSatIE +from .drbonanza import DRBonanzaIE +from .drtuber import DrTuberIE +from .drtv import DRTVIE +from .dvtv import DVTVIE +from .dump import DumpIE +from .dumpert import DumpertIE +from .defense import DefenseGouvFrIE +from .discovery import DiscoveryIE +from .dropbox import DropboxIE +from .dw import ( + DWIE, + DWArticleIE, +) +from .eagleplatform import EaglePlatformIE +from .ebaumsworld import EbaumsWorldIE +from .echomsk import EchoMskIE +from .ehow import EHowIE +from .eighttracks import EightTracksIE +from .einthusan import EinthusanIE +from .eitb import EitbIE +from .ellentv import ( + EllenTVIE, + EllenTVClipsIE, +) +from .elpais import ElPaisIE +from .embedly import EmbedlyIE +from .engadget import EngadgetIE +from .eporner import EpornerIE +from .eroprofile import EroProfileIE +from .escapist import EscapistIE +from .espn import ESPNIE +from .esri import EsriVideoIE +from .europa import EuropaIE +from .everyonesmixtape import EveryonesMixtapeIE +from .exfm import ExfmIE +from .expotv import ExpoTVIE +from .extremetube import ExtremeTubeIE +from .facebook import FacebookIE +from .faz import FazIE +from .fc2 import FC2IE +from .fczenit import FczenitIE +from .firstpost import FirstpostIE +from .firsttv import FirstTVIE +from .fivemin import FiveMinIE +from .fivetv import FiveTVIE +from .fktv import FKTVIE +from .flickr import FlickrIE +from .folketinget import FolketingetIE +from .footyroom import FootyRoomIE +from .fourtube import FourTubeIE +from .fox import FOXIE +from .foxgay import FoxgayIE +from .foxnews import FoxNewsIE +from .foxsports import FoxSportsIE +from .franceculture import ( + FranceCultureIE, + FranceCultureEmissionIE, +) +from .franceinter import FranceInterIE +from .francetv import ( + PluzzIE, + FranceTvInfoIE, + FranceTVIE, + GenerationQuoiIE, + CultureboxIE, +) +from .freesound import FreesoundIE +from .freespeech import FreespeechIE +from .freevideo import FreeVideoIE +from .funimation import FunimationIE +from .funnyordie import FunnyOrDieIE +from .gameinformer import GameInformerIE +from .gamekings import GamekingsIE +from .gameone import ( + GameOneIE, + GameOnePlaylistIE, +) +from .gamersyde import GamersydeIE +from .gamespot import GameSpotIE +from .gamestar import GameStarIE +from .gametrailers import GametrailersIE +from .gazeta import GazetaIE +from .gdcvault import GDCVaultIE +from .generic import GenericIE +from .gfycat import GfycatIE +from .giantbomb import GiantBombIE +from .giga import GigaIE +from .glide import GlideIE +from .globo import ( + GloboIE, + GloboArticleIE, +) +from .godtube import GodTubeIE +from .goldenmoustache import GoldenMoustacheIE +from .golem import GolemIE +from .googledrive import GoogleDriveIE +from .googleplus import GooglePlusIE +from .googlesearch import GoogleSearchIE +from .goshgay import GoshgayIE +from .gputechconf import GPUTechConfIE +from .groupon import GrouponIE +from .hark import HarkIE +from .hbo import HBOIE +from .hearthisat import HearThisAtIE +from .heise import HeiseIE +from .hellporno import HellPornoIE +from .helsinki import HelsinkiIE +from .hentaistigma import HentaiStigmaIE +from .historicfilms import HistoricFilmsIE +from .hitbox import HitboxIE, HitboxLiveIE +from .hornbunny import HornBunnyIE +from .hotnewhiphop import HotNewHipHopIE +from .hotstar import HotStarIE +from .howcast import HowcastIE +from .howstuffworks import HowStuffWorksIE +from .huffpost import HuffPostIE +from .hypem import HypemIE +from .iconosquare import IconosquareIE +from .ign import ( + IGNIE, + OneUPIE, + PCMagIE, +) +from .imdb import ( + ImdbIE, + ImdbListIE +) +from .imgur import ( + ImgurIE, + ImgurAlbumIE, +) +from .ina import InaIE +from .indavideo import ( + IndavideoIE, + IndavideoEmbedIE, +) +from .infoq import InfoQIE +from .instagram import InstagramIE, InstagramUserIE +from .internetvideoarchive import InternetVideoArchiveIE +from .iprima import IPrimaIE +from .iqiyi import IqiyiIE +from .ir90tv import Ir90TvIE +from .ivi import ( + IviIE, + IviCompilationIE +) +from .ivideon import IvideonIE +from .izlesene import IzleseneIE +from .jadorecettepub import JadoreCettePubIE +from .jeuxvideo import JeuxVideoIE +from .jove import JoveIE +from .jwplatform import JWPlatformIE +from .jpopsukitv import JpopsukiIE +from .kaltura import KalturaIE +from .kanalplay import KanalPlayIE +from .kankan import KankanIE +from .karaoketv import KaraoketvIE +from .karrierevideos import KarriereVideosIE +from .keezmovies import KeezMoviesIE +from .khanacademy import KhanAcademyIE +from .kickstarter import KickStarterIE +from .keek import KeekIE +from .konserthusetplay import KonserthusetPlayIE +from .kontrtube import KontrTubeIE +from .krasview import KrasViewIE +from .ku6 import Ku6IE +from .kusi import KUSIIE +from .kuwo import ( + KuwoIE, + KuwoAlbumIE, + KuwoChartIE, + KuwoSingerIE, + KuwoCategoryIE, + KuwoMvIE, +) +from .la7 import LA7IE +from .laola1tv import Laola1TvIE +from .lcp import LcpIE +from .lecture2go import Lecture2GoIE +from .lemonde import LemondeIE +from .leeco import ( + LeIE, + LePlaylistIE, + LetvCloudIE, +) +from .libsyn import LibsynIE +from .lifenews import ( + LifeNewsIE, + LifeEmbedIE, +) +from .limelight import ( + LimelightMediaIE, + LimelightChannelIE, + LimelightChannelListIE, +) +from .liveleak import LiveLeakIE +from .livestream import ( + LivestreamIE, + LivestreamOriginalIE, + LivestreamShortenerIE, +) +from .lnkgo import LnkGoIE +from .lovehomeporn import LoveHomePornIE +from .lrt import LRTIE +from .lynda import ( + LyndaIE, + LyndaCourseIE +) +from .m6 import M6IE +from .macgamestore import MacGameStoreIE +from .mailru import MailRuIE +from .makerschannel import MakersChannelIE +from .makertv import MakerTVIE +from .malemotion import MalemotionIE +from .matchtv import MatchTVIE +from .mdr import MDRIE +from .metacafe import MetacafeIE +from .metacritic import MetacriticIE +from .mgoon import MgoonIE +from .minhateca import MinhatecaIE +from .ministrygrid import MinistryGridIE +from .minoto import MinotoIE +from .miomio import MioMioIE +from .mit import TechTVMITIE, MITIE, OCWMITIE +from .mitele import MiTeleIE +from .mixcloud import MixcloudIE +from .mlb import MLBIE +from .mnet import MnetIE +from .mpora import MporaIE +from .moevideo import MoeVideoIE +from .mofosex import MofosexIE +from .mojvideo import MojvideoIE +from .moniker import MonikerIE +from .mooshare import MooshareIE +from .morningstar import MorningstarIE +from .motherless import MotherlessIE +from .motorsport import MotorsportIE +from .movieclips import MovieClipsIE +from .moviezine import MoviezineIE +from .mtv import ( + MTVIE, + MTVServicesEmbeddedIE, + MTVIggyIE, + MTVDEIE, +) +from .muenchentv import MuenchenTVIE +from .musicplayon import MusicPlayOnIE +from .muzu import MuzuTVIE +from .mwave import MwaveIE +from .myspace import MySpaceIE, MySpaceAlbumIE +from .myspass import MySpassIE +from .myvi import MyviIE +from .myvideo import MyVideoIE +from .myvidster import MyVidsterIE +from .nationalgeographic import ( + NationalGeographicIE, + NationalGeographicChannelIE, +) +from .naver import NaverIE +from .nba import NBAIE +from .nbc import ( + CSNNEIE, + NBCIE, + NBCNewsIE, + NBCSportsIE, + NBCSportsVPlayerIE, + MSNBCIE, +) +from .ndr import ( + NDRIE, + NJoyIE, + NDREmbedBaseIE, + NDREmbedIE, + NJoyEmbedIE, +) +from .ndtv import NDTVIE +from .netzkino import NetzkinoIE +from .nerdcubed import NerdCubedFeedIE +from .nerdist import NerdistIE +from .neteasemusic import ( + NetEaseMusicIE, + NetEaseMusicAlbumIE, + NetEaseMusicSingerIE, + NetEaseMusicListIE, + NetEaseMusicMvIE, + NetEaseMusicProgramIE, + NetEaseMusicDjRadioIE, +) +from .newgrounds import NewgroundsIE +from .newstube import NewstubeIE +from .nextmedia import ( + NextMediaIE, + NextMediaActionNewsIE, + AppleDailyIE, +) +from .nextmovie import NextMovieIE +from .nfb import NFBIE +from .nfl import NFLIE +from .nhl import ( + NHLIE, + NHLNewsIE, + NHLVideocenterIE, +) +from .nick import NickIE +from .niconico import NiconicoIE, NiconicoPlaylistIE +from .ninegag import NineGagIE +from .noco import NocoIE +from .normalboots import NormalbootsIE +from .nosvideo import NosVideoIE +from .nova import NovaIE +from .novamov import ( + AuroraVidIE, + CloudTimeIE, + NowVideoIE, + VideoWeedIE, + WholeCloudIE, +) +from .nowness import ( + NownessIE, + NownessPlaylistIE, + NownessSeriesIE, +) +from .nowtv import ( + NowTVIE, + NowTVListIE, +) +from .noz import NozIE +from .npo import ( + NPOIE, + NPOLiveIE, + NPORadioIE, + NPORadioFragmentIE, + SchoolTVIE, + VPROIE, + WNLIE +) +from .npr import NprIE +from .nrk import ( + NRKIE, + NRKPlaylistIE, + NRKSkoleIE, + NRKTVIE, +) +from .ntvde import NTVDeIE +from .ntvru import NTVRuIE +from .nytimes import ( + NYTimesIE, + NYTimesArticleIE, +) +from .nuvid import NuvidIE +from .odnoklassniki import OdnoklassnikiIE +from .oktoberfesttv import OktoberfestTVIE +from .onionstudios import OnionStudiosIE +from .ooyala import ( + OoyalaIE, + OoyalaExternalIE, +) +from .openload import OpenloadIE +from .ora import OraTVIE +from .orf import ( + ORFTVthekIE, + ORFOE1IE, + ORFFM4IE, + ORFIPTVIE, +) +from .pandoratv import PandoraTVIE +from .parliamentliveuk import ParliamentLiveUKIE +from .patreon import PatreonIE +from .pbs import PBSIE +from .periscope import PeriscopeIE +from .philharmoniedeparis import PhilharmonieDeParisIE +from .phoenix import PhoenixIE +from .photobucket import PhotobucketIE +from .pinkbike import PinkbikeIE +from .planetaplay import PlanetaPlayIE +from .pladform import PladformIE +from .played import PlayedIE +from .playfm import PlayFMIE +from .plays import PlaysTVIE +from .playtvak import PlaytvakIE +from .playvid import PlayvidIE +from .playwire import PlaywireIE +from .pluralsight import ( + PluralsightIE, + PluralsightCourseIE, +) +from .podomatic import PodomaticIE +from .porn91 import Porn91IE +from .pornhd import PornHdIE +from .pornhub import ( + PornHubIE, + PornHubPlaylistIE, + PornHubUserVideosIE, +) +from .pornotube import PornotubeIE +from .pornovoisines import PornoVoisinesIE +from .pornoxo import PornoXOIE +from .presstv import PressTVIE +from .primesharetv import PrimeShareTVIE +from .promptfile import PromptFileIE +from .prosiebensat1 import ProSiebenSat1IE +from .puls4 import Puls4IE +from .pyvideo import PyvideoIE +from .qqmusic import ( + QQMusicIE, + QQMusicSingerIE, + QQMusicAlbumIE, + QQMusicToplistIE, + QQMusicPlaylistIE, +) +from .quickvid import QuickVidIE +from .r7 import R7IE +from .radiode import RadioDeIE +from .radiojavan import RadioJavanIE +from .radiobremen import RadioBremenIE +from .radiofrance import RadioFranceIE +from .rai import ( + RaiTVIE, + RaiIE, +) +from .rbmaradio import RBMARadioIE +from .rds import RDSIE +from .redtube import RedTubeIE +from .regiotv import RegioTVIE +from .restudy import RestudyIE +from .reverbnation import ReverbNationIE +from .revision3 import Revision3IE +from .rice import RICEIE +from .ringtv import RingTVIE +from .ro220 import Ro220IE +from .rottentomatoes import RottenTomatoesIE +from .roxwel import RoxwelIE +from .rtbf import RTBFIE +from .rte import RteIE, RteRadioIE +from .rtlnl import RtlNlIE +from .rtl2 import RTL2IE +from .rtp import RTPIE +from .rts import RTSIE +from .rtve import RTVEALaCartaIE, RTVELiveIE, RTVEInfantilIE +from .rtvnh import RTVNHIE +from .ruhd import RUHDIE +from .ruleporn import RulePornIE +from .rutube import ( + RutubeIE, + RutubeChannelIE, + RutubeEmbedIE, + RutubeMovieIE, + RutubePersonIE, +) +from .rutv import RUTVIE +from .ruutu import RuutuIE +from .sandia import SandiaIE +from .safari import ( + SafariIE, + SafariApiIE, + SafariCourseIE, +) +from .sapo import SapoIE +from .savefrom import SaveFromIE +from .sbs import SBSIE +from .scivee import SciVeeIE +from .screencast import ScreencastIE +from .screencastomatic import ScreencastOMaticIE +from .screenjunkies import ScreenJunkiesIE +from .screenwavemedia import ScreenwaveMediaIE, TeamFourIE +from .senateisvp import SenateISVPIE +from .servingsys import ServingSysIE +from .sexu import SexuIE +from .sexykarma import SexyKarmaIE +from .shahid import ShahidIE +from .shared import SharedIE +from .sharesix import ShareSixIE +from .sina import SinaIE +from .skynewsarabia import ( + SkyNewsArabiaIE, + SkyNewsArabiaArticleIE, +) +from .slideshare import SlideshareIE +from .slutload import SlutloadIE +from .smotri import ( + SmotriIE, + SmotriCommunityIE, + SmotriUserIE, + SmotriBroadcastIE, +) +from .snagfilms import ( + SnagFilmsIE, + SnagFilmsEmbedIE, +) +from .snotr import SnotrIE +from .sohu import SohuIE +from .soundcloud import ( + SoundcloudIE, + SoundcloudSetIE, + SoundcloudUserIE, + SoundcloudPlaylistIE, + SoundcloudSearchIE +) +from .soundgasm import ( + SoundgasmIE, + SoundgasmProfileIE +) +from .southpark import ( + SouthParkIE, + SouthParkDeIE, + SouthParkDkIE, + SouthParkEsIE, + SouthParkNlIE +) +from .spankbang import SpankBangIE +from .spankwire import SpankwireIE +from .spiegel import SpiegelIE, SpiegelArticleIE +from .spiegeltv import SpiegeltvIE +from .spike import SpikeIE +from .stitcher import StitcherIE +from .sport5 import Sport5IE +from .sportbox import ( + SportBoxIE, + SportBoxEmbedIE, +) +from .sportdeutschland import SportDeutschlandIE +from .srgssr import ( + SRGSSRIE, + SRGSSRPlayIE, +) +from .srmediathek import SRMediathekIE +from .ssa import SSAIE +from .stanfordoc import StanfordOpenClassroomIE +from .steam import SteamIE +from .streamcloud import StreamcloudIE +from .streamcz import StreamCZIE +from .streetvoice import StreetVoiceIE +from .sunporno import SunPornoIE +from .svt import ( + SVTIE, + SVTPlayIE, +) +from .swrmediathek import SWRMediathekIE +from .syfy import SyfyIE +from .sztvhu import SztvHuIE +from .tagesschau import TagesschauIE +from .tapely import TapelyIE +from .tass import TassIE +from .teachertube import ( + TeacherTubeIE, + TeacherTubeUserIE, +) +from .teachingchannel import TeachingChannelIE +from .teamcoco import TeamcocoIE +from .techtalks import TechTalksIE +from .ted import TEDIE +from .tele13 import Tele13IE +from .telebruxelles import TeleBruxellesIE +from .telecinco import TelecincoIE +from .telegraaf import TelegraafIE +from .telemb import TeleMBIE +from .teletask import TeleTaskIE +from .testurl import TestURLIE +from .tf1 import TF1IE +from .theintercept import TheInterceptIE +from .theonion import TheOnionIE +from .theplatform import ( + ThePlatformIE, + ThePlatformFeedIE, +) +from .thescene import TheSceneIE +from .thesixtyone import TheSixtyOneIE +from .thestar import TheStarIE +from .thisamericanlife import ThisAmericanLifeIE +from .thisav import ThisAVIE +from .tinypic import TinyPicIE +from .tlc import TlcDeIE +from .tmz import ( + TMZIE, + TMZArticleIE, +) +from .tnaflix import ( + TNAFlixNetworkEmbedIE, + TNAFlixIE, + EMPFlixIE, + MovieFapIE, +) +from .toggle import ToggleIE +from .thvideo import ( + THVideoIE, + THVideoPlaylistIE +) +from .toutv import TouTvIE +from .toypics import ToypicsUserIE, ToypicsIE +from .traileraddict import TrailerAddictIE +from .trilulilu import TriluliluIE +from .trollvids import TrollvidsIE +from .trutube import TruTubeIE +from .tube8 import Tube8IE +from .tubitv import TubiTvIE +from .tudou import ( + TudouIE, + TudouPlaylistIE, + TudouAlbumIE, +) +from .tumblr import TumblrIE +from .tunein import ( + TuneInClipIE, + TuneInStationIE, + TuneInProgramIE, + TuneInTopicIE, + TuneInShortenerIE, +) +from .turbo import TurboIE +from .tutv import TutvIE +from .tv2 import ( + TV2IE, + TV2ArticleIE, +) +from .tv3 import TV3IE +from .tv4 import TV4IE +from .tvc import ( + TVCIE, + TVCArticleIE, +) +from .tvigle import TvigleIE +from .tvland import TVLandIE +from .tvp import TvpIE, TvpSeriesIE +from .tvplay import TVPlayIE +from .tweakers import TweakersIE +from .twentyfourvideo import TwentyFourVideoIE +from .twentymin import TwentyMinutenIE +from .twentytwotracks import ( + TwentyTwoTracksIE, + TwentyTwoTracksGenreIE +) +from .twitch import ( + TwitchVideoIE, + TwitchChapterIE, + TwitchVodIE, + TwitchProfileIE, + TwitchPastBroadcastsIE, + TwitchBookmarksIE, + TwitchStreamIE, +) +from .twitter import ( + TwitterCardIE, + TwitterIE, + TwitterAmplifyIE, +) +from .ubu import UbuIE +from .udemy import ( + UdemyIE, + UdemyCourseIE +) +from .udn import UDNEmbedIE +from .digiteka import DigitekaIE +from .unistra import UnistraIE +from .urort import UrortIE +from .usatoday import USATodayIE +from .ustream import UstreamIE, UstreamChannelIE +from .ustudio import UstudioIE +from .varzesh3 import Varzesh3IE +from .vbox7 import Vbox7IE +from .veehd import VeeHDIE +from .veoh import VeohIE +from .vessel import VesselIE +from .vesti import VestiIE +from .vevo import VevoIE +from .vgtv import ( + BTArticleIE, + BTVestlendingenIE, + VGTVIE, +) +from .vh1 import VH1IE +from .vice import ( + ViceIE, + ViceShowIE, +) +from .viddler import ViddlerIE +from .videodetective import VideoDetectiveIE +from .videofyme import VideofyMeIE +from .videomega import VideoMegaIE +from .videomore import ( + VideomoreIE, + VideomoreVideoIE, + VideomoreSeasonIE, +) +from .videopremium import VideoPremiumIE +from .videott import VideoTtIE +from .vidme import ( + VidmeIE, + VidmeUserIE, + VidmeUserLikesIE, +) +from .vidzi import VidziIE +from .vier import VierIE, VierVideosIE +from .viewster import ViewsterIE +from .viidea import ViideaIE +from .vimeo import ( + VimeoIE, + VimeoAlbumIE, + VimeoChannelIE, + VimeoGroupsIE, + VimeoLikesIE, + VimeoOndemandIE, + VimeoReviewIE, + VimeoUserIE, + VimeoWatchLaterIE, +) +from .vimple import VimpleIE +from .vine import ( + VineIE, + VineUserIE, +) +from .viki import ( + VikiIE, + VikiChannelIE, +) +from .vk import ( + VKIE, + VKUserVideosIE, +) +from .vlive import VLiveIE +from .vodlocker import VodlockerIE +from .voicerepublic import VoiceRepublicIE +from .voxmedia import VoxMediaIE +from .vporn import VpornIE +from .vrt import VRTIE +from .vube import VubeIE +from .vuclip import VuClipIE +from .vulture import VultureIE +from .walla import WallaIE +from .washingtonpost import WashingtonPostIE +from .wat import WatIE +from .wayofthemaster import WayOfTheMasterIE +from .wdr import ( + WDRIE, + WDRMobileIE, + WDRMausIE, +) +from .webofstories import ( + WebOfStoriesIE, + WebOfStoriesPlaylistIE, +) +from .weibo import WeiboIE +from .weiqitv import WeiqiTVIE +from .wimp import WimpIE +from .wistia import WistiaIE +from .worldstarhiphop import WorldStarHipHopIE +from .wrzuta import WrzutaIE +from .wsj import WSJIE +from .xbef import XBefIE +from .xboxclips import XboxClipsIE +from .xfileshare import XFileShareIE +from .xhamster import ( + XHamsterIE, + XHamsterEmbedIE, +) +from .xminus import XMinusIE +from .xnxx import XNXXIE +from .xstream import XstreamIE +from .xtube import XTubeUserIE, XTubeIE +from .xuite import XuiteIE +from .xvideos import XVideosIE +from .xxxymovies import XXXYMoviesIE +from .yahoo import ( + YahooIE, + YahooSearchIE, +) +from .yam import YamIE +from .yandexmusic import ( + YandexMusicTrackIE, + YandexMusicAlbumIE, + YandexMusicPlaylistIE, +) +from .yesjapan import YesJapanIE +from .yinyuetai import YinYueTaiIE +from .ynet import YnetIE +from .youjizz import YouJizzIE +from .youku import YoukuIE +from .youporn import YouPornIE +from .yourupload import YourUploadIE +from .youtube import ( + YoutubeIE, + YoutubeChannelIE, + YoutubeFavouritesIE, + YoutubeHistoryIE, + YoutubeLiveIE, + YoutubePlaylistIE, + YoutubePlaylistsIE, + YoutubeRecommendedIE, + YoutubeSearchDateIE, + YoutubeSearchIE, + YoutubeSearchURLIE, + YoutubeShowIE, + YoutubeSubscriptionsIE, + YoutubeTruncatedIDIE, + YoutubeTruncatedURLIE, + YoutubeUserIE, + YoutubeWatchLaterIE, +) +from .zapiks import ZapiksIE +from .zdf import ZDFIE, ZDFChannelIE +from .zingmp3 import ( + ZingMp3SongIE, + ZingMp3AlbumIE, +) +from .zippcast import ZippCastIE diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index d2ff38140..f5bbd39d2 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -37,7 +37,9 @@ class FacebookIE(InfoExtractor): video/embed| story\.php )\?(?:.*?)(?:v|video_id|story_fbid)=| - [^/]+/videos/(?:[^/]+/)? + [^/]+/videos/(?:[^/]+/)?| + [^/]+/posts/| + groups/[^/]+/permalink/ )| facebook: ) @@ -50,6 +52,8 @@ class FacebookIE(InfoExtractor): _CHROME_USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.97 Safari/537.36' + _VIDEO_PAGE_TEMPLATE = 'https://www.facebook.com/video/video.php?v=%s' + _TESTS = [{ 'url': 'https://www.facebook.com/video.php?v=637842556329505&fref=nf', 'md5': '6a40d33c0eccbb1af76cf0485a052659', @@ -81,6 +85,33 @@ class FacebookIE(InfoExtractor): 'title': 'When you post epic content on instagram.com/433 8 million followers, this is ...', 'uploader': 'Demy de Zeeuw', }, + }, { + 'url': 'https://www.facebook.com/maxlayn/posts/10153807558977570', + 'md5': '037b1fa7f3c2d02b7a0d7bc16031ecc6', + 'info_dict': { + 'id': '544765982287235', + 'ext': 'mp4', + 'title': '"What are you doing running in the snow?"', + 'uploader': 'FailArmy', + } + }, { + 'url': 'https://m.facebook.com/story.php?story_fbid=1035862816472149&id=116132035111903', + 'md5': '1deb90b6ac27f7efcf6d747c8a27f5e3', + 'info_dict': { + 'id': '1035862816472149', + 'ext': 'mp4', + 'title': 'What the Flock Is Going On In New Zealand Credit: ViralHog', + 'uploader': 'S. Saint', + }, + }, { + 'note': 'swf params escaped', + 'url': 'https://www.facebook.com/barackobama/posts/10153664894881749', + 'md5': '97ba073838964d12c70566e0085c2b91', + 'info_dict': { + 'id': '10153664894881749', + 'ext': 'mp4', + 'title': 'Facebook video #10153664894881749', + }, }, { 'url': 'https://www.facebook.com/video.php?v=10204634152394104', 'only_matching': True, @@ -94,7 +125,7 @@ class FacebookIE(InfoExtractor): 'url': 'facebook:544765982287235', 'only_matching': True, }, { - 'url': 'https://m.facebook.com/story.php?story_fbid=1035862816472149&id=116132035111903', + 'url': 'https://www.facebook.com/groups/164828000315060/permalink/764967300301124/', 'only_matching': True, }] @@ -164,19 +195,19 @@ class FacebookIE(InfoExtractor): def _real_initialize(self): self._login() - def _real_extract(self, url): - video_id = self._match_id(url) - req = sanitized_Request('https://www.facebook.com/video/video.php?v=%s' % video_id) + def _extract_from_url(self, url, video_id, fatal_if_no_video=True): + req = sanitized_Request(url) req.add_header('User-Agent', self._CHROME_USER_AGENT) webpage = self._download_webpage(req, video_id) video_data = None - BEFORE = '{swf.addParam(param[0], param[1]);});\n' + BEFORE = '{swf.addParam(param[0], param[1]);});' AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});' - m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage) + m = re.search(re.escape(BEFORE) + '(?:\n|\\\\n)(.*?)' + re.escape(AFTER), webpage) if m: - data = dict(json.loads(m.group(1))) + swf_params = m.group(1).replace('\\\\', '\\').replace('\\"', '"') + data = dict(json.loads(swf_params)) params_raw = compat_urllib_parse_unquote(data['params']) video_data = json.loads(params_raw)['video_data'] @@ -189,13 +220,15 @@ class FacebookIE(InfoExtractor): if not video_data: server_js_data = self._parse_json(self._search_regex( - r'handleServerJS\(({.+})\);', webpage, 'server js data'), video_id) + r'handleServerJS\(({.+})\);', webpage, 'server js data', default='{}'), video_id) for item in server_js_data.get('instances', []): if item[1][0] == 'VideoConfig': video_data = video_data_list2dict(item[2][0]['videoData']) break if not video_data: + if not fatal_if_no_video: + return webpage, False m_msg = re.search(r'class="[^"]*uiInterstitialContent[^"]*">
(.*?)
', webpage) if m_msg is not None: raise ExtractorError( @@ -212,10 +245,13 @@ class FacebookIE(InfoExtractor): for src_type in ('src', 'src_no_ratelimit'): src = f[0].get('%s_%s' % (quality, src_type)) if src: + preference = -10 if format_id == 'progressive' else 0 + if quality == 'hd': + preference += 5 formats.append({ 'format_id': '%s_%s_%s' % (format_id, quality, src_type), 'url': src, - 'preference': -10 if format_id == 'progressive' else 0, + 'preference': preference, }) dash_manifest = f[0].get('dash_manifest') if dash_manifest: @@ -238,39 +274,36 @@ class FacebookIE(InfoExtractor): video_title = 'Facebook video #%s' % video_id uploader = clean_html(get_element_by_id('fbPhotoPageAuthorName', webpage)) - return { + info_dict = { 'id': video_id, 'title': video_title, 'formats': formats, 'uploader': uploader, } - -class FacebookPostIE(InfoExtractor): - IE_NAME = 'facebook:post' - _VALID_URL = r'https?://(?:\w+\.)?facebook\.com/[^/]+/posts/(?P\d+)' - _TEST = { - 'url': 'https://www.facebook.com/maxlayn/posts/10153807558977570', - 'md5': '037b1fa7f3c2d02b7a0d7bc16031ecc6', - 'info_dict': { - 'id': '544765982287235', - 'ext': 'mp4', - 'title': '"What are you doing running in the snow?"', - 'uploader': 'FailArmy', - } - } + return webpage, info_dict def _real_extract(self, url): - post_id = self._match_id(url) + video_id = self._match_id(url) - webpage = self._download_webpage(url, post_id) + real_url = self._VIDEO_PAGE_TEMPLATE % video_id if url.startswith('facebook:') else url + webpage, info_dict = self._extract_from_url(real_url, video_id, fatal_if_no_video=False) - entries = [ - self.url_result('facebook:%s' % video_id, FacebookIE.ie_key()) - for video_id in self._parse_json( - self._search_regex( - r'(["\'])video_ids\1\s*:\s*(?P\[.+?\])', - webpage, 'video ids', group='ids'), - post_id)] + if info_dict: + return info_dict - return self.playlist_result(entries, post_id) + if '/posts/' in url: + entries = [ + self.url_result('facebook:%s' % vid, FacebookIE.ie_key()) + for vid in self._parse_json( + self._search_regex( + r'(["\'])video_ids\1\s*:\s*(?P\[.+?\])', + webpage, 'video ids', group='ids'), + video_id)] + + return self.playlist_result(entries, video_id) + else: + _, info_dict = self._extract_from_url( + self._VIDEO_PAGE_TEMPLATE % video_id, + video_id, fatal_if_no_video=True) + return info_dict diff --git a/youtube_dl/extractor/fc2.py b/youtube_dl/extractor/fc2.py index 9580f5c0c..c7d69ff1f 100644 --- a/youtube_dl/extractor/fc2.py +++ b/youtube_dl/extractor/fc2.py @@ -5,19 +5,18 @@ import hashlib from .common import InfoExtractor from ..compat import ( - compat_urllib_parse, compat_urllib_request, compat_urlparse, ) from ..utils import ( - encode_dict, ExtractorError, sanitized_Request, + urlencode_postdata, ) class FC2IE(InfoExtractor): - _VALID_URL = r'^http://video\.fc2\.com/(?:[^/]+/)*content/(?P[^/]+)' + _VALID_URL = r'^https?://video\.fc2\.com/(?:[^/]+/)*content/(?P[^/]+)' IE_NAME = 'fc2' _NETRC_MACHINE = 'fc2' _TESTS = [{ @@ -57,7 +56,7 @@ class FC2IE(InfoExtractor): 'Submit': ' Login ', } - login_data = compat_urllib_parse.urlencode(encode_dict(login_form_strs)).encode('utf-8') + login_data = urlencode_postdata(login_form_strs) request = sanitized_Request( 'https://secure.id.fc2.com/index.php?mode=login&switch_language=en', login_data) diff --git a/youtube_dl/extractor/firstpost.py b/youtube_dl/extractor/firstpost.py index 298227d57..e8936cb24 100644 --- a/youtube_dl/extractor/firstpost.py +++ b/youtube_dl/extractor/firstpost.py @@ -4,7 +4,7 @@ from .common import InfoExtractor class FirstpostIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?firstpost\.com/[^/]+/.*-(?P[0-9]+)\.html' + _VALID_URL = r'https?://(?:www\.)?firstpost\.com/[^/]+/.*-(?P[0-9]+)\.html' _TEST = { 'url': 'http://www.firstpost.com/india/india-to-launch-indigenous-aircraft-carrier-monday-1025403.html', diff --git a/youtube_dl/extractor/firsttv.py b/youtube_dl/extractor/firsttv.py index 510d4b108..88bca1007 100644 --- a/youtube_dl/extractor/firsttv.py +++ b/youtube_dl/extractor/firsttv.py @@ -2,78 +2,133 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import int_or_none +from ..compat import compat_xpath +from ..utils import ( + int_or_none, + qualities, + unified_strdate, + xpath_attr, + xpath_element, + xpath_text, + xpath_with_ns, +) class FirstTVIE(InfoExtractor): IE_NAME = '1tv' IE_DESC = 'Первый канал' - _VALID_URL = r'http://(?:www\.)?1tv\.ru/(?:[^/]+/)+(?P.+)' + _VALID_URL = r'https?://(?:www\.)?1tv\.ru/(?:[^/]+/)+p?(?P\d+)' _TESTS = [{ - 'url': 'http://www.1tv.ru/videoarchive/73390', - 'md5': '777f525feeec4806130f4f764bc18a4f', - 'info_dict': { - 'id': '73390', - 'ext': 'mp4', - 'title': 'Олимпийские канатные дороги', - 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', - 'thumbnail': 're:^https?://.*\.(?:jpg|JPG)$', - 'duration': 149, - 'like_count': int, - 'dislike_count': int, - }, - 'skip': 'Only works from Russia', - }, { + # single format via video_materials.json API 'url': 'http://www.1tv.ru/prj/inprivate/vypusk/35930', - 'md5': 'a1b6b60d530ebcf8daacf4565762bbaf', + 'md5': '82a2777648acae812d58b3f5bd42882b', 'info_dict': { 'id': '35930', 'ext': 'mp4', - 'title': 'Наедине со всеми. Людмила Сенчина', - 'description': 'md5:89553aed1d641416001fe8d450f06cb9', + 'title': 'Гость Людмила Сенчина. Наедине со всеми. Выпуск от 12.02.2015', + 'description': 'md5:357933adeede13b202c7c21f91b871b2', 'thumbnail': 're:^https?://.*\.(?:jpg|JPG)$', + 'upload_date': '20150212', 'duration': 2694, }, - 'skip': 'Only works from Russia', + }, { + # multiple formats via video_materials.json API + 'url': 'http://www.1tv.ru/video_archive/projects/dobroeutro/p113641', + 'info_dict': { + 'id': '113641', + 'ext': 'mp4', + 'title': 'Весенняя аллергия. Доброе утро. Фрагмент выпуска от 07.04.2016', + 'description': 'md5:8dcebb3dded0ff20fade39087fd1fee2', + 'thumbnail': 're:^https?://.*\.(?:jpg|JPG)$', + 'upload_date': '20160407', + 'duration': 179, + 'formats': 'mincount:3', + }, + 'params': { + 'skip_download': True, + }, + }, { + # single format only available via ONE_ONLINE_VIDEOS.archive_single_xml API + 'url': 'http://www.1tv.ru/video_archive/series/f7552/p47038', + 'md5': '519d306c5b5669761fd8906c39dbee23', + 'info_dict': { + 'id': '47038', + 'ext': 'mp4', + 'title': '"Побег". Второй сезон. 3 серия', + 'description': 'md5:3abf8f6b9bce88201c33e9a3d794a00b', + 'thumbnail': 're:^https?://.*\.(?:jpg|JPG)$', + 'upload_date': '20120516', + 'duration': 3080, + }, + }, { + 'url': 'http://www.1tv.ru/videoarchive/9967', + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id, 'Downloading page') + # Videos with multiple formats only available via this API + video = self._download_json( + 'http://www.1tv.ru/video_materials.json?legacy_id=%s' % video_id, + video_id, fatal=False) - video_url = self._html_search_regex( - r'''(?s)(?:jwplayer\('flashvideoportal_1'\)\.setup\({|var\s+playlistObj\s*=).*?'file'\s*:\s*'([^']+)'.*?}\);''', - webpage, 'video URL') + description, thumbnail, upload_date, duration = [None] * 4 - title = self._html_search_regex( - [r'
\s*

([^<]*)', - r"'title'\s*:\s*'([^']+)'"], webpage, 'title') - description = self._html_search_regex( - r'
\s*
 
\s*

([^<]*)

', - webpage, 'description', default=None) or self._html_search_meta( + if video: + item = video[0] + title = item['title'] + quality = qualities(('ld', 'sd', 'hd', )) + formats = [{ + 'url': f['src'], + 'format_id': f.get('name'), + 'quality': quality(f.get('name')), + } for f in item['mbr'] if f.get('src')] + thumbnail = item.get('poster') + else: + # Some videos are not available via video_materials.json + video = self._download_xml( + 'http://www.1tv.ru/owa/win/ONE_ONLINE_VIDEOS.archive_single_xml?pid=%s' % video_id, + video_id) + + NS_MAP = { + 'media': 'http://search.yahoo.com/mrss/', + } + + item = xpath_element(video, './channel/item', fatal=True) + title = xpath_text(item, './title', fatal=True) + formats = [{ + 'url': content.attrib['url'], + } for content in item.findall( + compat_xpath(xpath_with_ns('./media:content', NS_MAP))) if content.attrib.get('url')] + thumbnail = xpath_attr( + item, xpath_with_ns('./media:thumbnail', NS_MAP), 'url') + + self._sort_formats(formats) + + webpage = self._download_webpage(url, video_id, 'Downloading page', fatal=False) + if webpage: + title = self._html_search_regex( + (r'
\s*

([^<]*)', + r"'title'\s*:\s*'([^']+)'"), + webpage, 'title', default=None) or title + description = self._html_search_regex( + r'
\s*
 
\s*

([^<]*)

', + webpage, 'description', default=None) or self._html_search_meta( 'description', webpage, 'description') - - thumbnail = self._og_search_thumbnail(webpage) - duration = self._og_search_property( - 'video:duration', webpage, - 'video duration', fatal=False) - - like_count = self._html_search_regex( - r'title="Понравилось".*?/> \[(\d+)\]', - webpage, 'like count', default=None) - dislike_count = self._html_search_regex( - r'title="Не понравилось".*?/> \[(\d+)\]', - webpage, 'dislike count', default=None) + thumbnail = thumbnail or self._og_search_thumbnail(webpage) + duration = int_or_none(self._html_search_meta( + 'video:duration', webpage, 'video duration', fatal=False)) + upload_date = unified_strdate(self._html_search_meta( + 'ya:ovs:upload_date', webpage, 'upload date', fatal=False)) return { 'id': video_id, - 'url': video_url, 'thumbnail': thumbnail, 'title': title, 'description': description, + 'upload_date': upload_date, 'duration': int_or_none(duration), - 'like_count': int_or_none(like_count), - 'dislike_count': int_or_none(dislike_count), + 'formats': formats } diff --git a/youtube_dl/extractor/fivemin.py b/youtube_dl/extractor/fivemin.py index 2955965d9..6b8345416 100644 --- a/youtube_dl/extractor/fivemin.py +++ b/youtube_dl/extractor/fivemin.py @@ -1,9 +1,11 @@ from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..compat import ( - compat_urllib_parse, compat_parse_qs, + compat_urllib_parse_urlencode, compat_urllib_parse_urlparse, compat_urlparse, ) @@ -16,12 +18,7 @@ from ..utils import ( class FiveMinIE(InfoExtractor): IE_NAME = '5min' - _VALID_URL = r'''(?x) - (?:https?://[^/]*?5min\.com/Scripts/PlayerSeed\.js\?(?:.*?&)?playList=| - https?://(?:(?:massively|www)\.)?joystiq\.com/video/| - 5min:) - (?P\d+) - ''' + _VALID_URL = r'(?:5min:(?P\d+)(?::(?P\d+))?|https?://[^/]*?5min\.com/Scripts/PlayerSeed\.js\?(?P.*))' _TESTS = [ { @@ -45,6 +42,7 @@ class FiveMinIE(InfoExtractor): 'title': 'How to Make a Next-Level Fruit Salad', 'duration': 184, }, + 'skip': 'no longer available', }, ] _ERRORS = { @@ -91,20 +89,33 @@ class FiveMinIE(InfoExtractor): } def _real_extract(self, url): - video_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + sid = mobj.group('sid') + + if mobj.group('query'): + qs = compat_parse_qs(mobj.group('query')) + if not qs.get('playList'): + raise ExtractorError('Invalid URL', expected=True) + video_id = qs['playList'][0] + if qs.get('sid'): + sid = qs['sid'][0] + embed_url = 'https://embed.5min.com/playerseed/?playList=%s' % video_id - embed_page = self._download_webpage(embed_url, video_id, - 'Downloading embed page') - sid = self._search_regex(r'sid=(\d+)', embed_page, 'sid') - query = compat_urllib_parse.urlencode({ - 'func': 'GetResults', - 'playlist': video_id, - 'sid': sid, - 'isPlayerSeed': 'true', - 'url': embed_url, - }) + if not sid: + embed_page = self._download_webpage(embed_url, video_id, + 'Downloading embed page') + sid = self._search_regex(r'sid=(\d+)', embed_page, 'sid') + response = self._download_json( - 'https://syn.5min.com/handlers/SenseHandler.ashx?' + query, + 'https://syn.5min.com/handlers/SenseHandler.ashx?' + + compat_urllib_parse_urlencode({ + 'func': 'GetResults', + 'playlist': video_id, + 'sid': sid, + 'isPlayerSeed': 'true', + 'url': embed_url, + }), video_id) if not response['success']: raise ExtractorError( @@ -118,9 +129,7 @@ class FiveMinIE(InfoExtractor): parsed_video_url = compat_urllib_parse_urlparse(compat_parse_qs( compat_urllib_parse_urlparse(info['EmbededURL']).query)['videoUrl'][0]) for rendition in info['Renditions']: - if rendition['RenditionType'] == 'm3u8': - formats.extend(self._extract_m3u8_formats(rendition['Url'], video_id, m3u8_id='hls')) - elif rendition['RenditionType'] == 'aac': + if rendition['RenditionType'] == 'aac' or rendition['RenditionType'] == 'm3u8': continue else: rendition_url = compat_urlparse.urlunparse(parsed_video_url._replace(path=replace_extension(parsed_video_url.path.replace('//', '/%s/' % rendition['ID']), rendition['RenditionType']))) diff --git a/youtube_dl/extractor/fktv.py b/youtube_dl/extractor/fktv.py index 5f6e65dae..a3a291599 100644 --- a/youtube_dl/extractor/fktv.py +++ b/youtube_dl/extractor/fktv.py @@ -10,7 +10,7 @@ from ..utils import ( class FKTVIE(InfoExtractor): IE_NAME = 'fernsehkritik.tv' - _VALID_URL = r'http://(?:www\.)?fernsehkritik\.tv/folge-(?P[0-9]+)(?:/.*)?' + _VALID_URL = r'https?://(?:www\.)?fernsehkritik\.tv/folge-(?P[0-9]+)(?:/.*)?' _TEST = { 'url': 'http://fernsehkritik.tv/folge-1', diff --git a/youtube_dl/extractor/flickr.py b/youtube_dl/extractor/flickr.py index 18f439df9..0a3de1498 100644 --- a/youtube_dl/extractor/flickr.py +++ b/youtube_dl/extractor/flickr.py @@ -1,7 +1,7 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import compat_urllib_parse +from ..compat import compat_urllib_parse_urlencode from ..utils import ( ExtractorError, int_or_none, @@ -42,7 +42,7 @@ class FlickrIE(InfoExtractor): } if secret: query['secret'] = secret - data = self._download_json(self._API_BASE_URL + compat_urllib_parse.urlencode(query), video_id, note) + data = self._download_json(self._API_BASE_URL + compat_urllib_parse_urlencode(query), video_id, note) if data['stat'] != 'ok': raise ExtractorError(data['message']) return data diff --git a/youtube_dl/extractor/footyroom.py b/youtube_dl/extractor/footyroom.py index 370fd006f..d2503ae2e 100644 --- a/youtube_dl/extractor/footyroom.py +++ b/youtube_dl/extractor/footyroom.py @@ -5,7 +5,7 @@ from .common import InfoExtractor class FootyRoomIE(InfoExtractor): - _VALID_URL = r'http://footyroom\.com/(?P[^/]+)' + _VALID_URL = r'https?://footyroom\.com/(?P[^/]+)' _TESTS = [{ 'url': 'http://footyroom.com/schalke-04-0-2-real-madrid-2015-02/', 'info_dict': { diff --git a/youtube_dl/extractor/fox.py b/youtube_dl/extractor/fox.py index fa05af50d..95c1abf94 100644 --- a/youtube_dl/extractor/fox.py +++ b/youtube_dl/extractor/fox.py @@ -16,6 +16,9 @@ class FOXIE(InfoExtractor): 'title': 'Official Trailer: Gotham', 'description': 'Tracing the rise of the great DC Comics Super-Villains and vigilantes, Gotham reveals an entirely new chapter that has never been told.', 'duration': 129, + 'timestamp': 1400020798, + 'upload_date': '20140513', + 'uploader': 'NEWA-FNG-FOXCOM', }, 'add_ie': ['ThePlatform'], } diff --git a/youtube_dl/extractor/foxgay.py b/youtube_dl/extractor/foxgay.py index 08b8ea362..70c1a815d 100644 --- a/youtube_dl/extractor/foxgay.py +++ b/youtube_dl/extractor/foxgay.py @@ -4,7 +4,7 @@ from .common import InfoExtractor class FoxgayIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?foxgay\.com/videos/(?:\S+-)?(?P\d+)\.shtml' + _VALID_URL = r'https?://(?:www\.)?foxgay\.com/videos/(?:\S+-)?(?P\d+)\.shtml' _TEST = { 'url': 'http://foxgay.com/videos/fuck-turkish-style-2582.shtml', 'md5': '80d72beab5d04e1655a56ad37afe6841', diff --git a/youtube_dl/extractor/foxnews.py b/youtube_dl/extractor/foxnews.py index 318ac013d..b04da2415 100644 --- a/youtube_dl/extractor/foxnews.py +++ b/youtube_dl/extractor/foxnews.py @@ -18,8 +18,8 @@ class FoxNewsIE(AMPIE): 'title': 'Frozen in Time', 'description': '16-year-old girl is size of toddler', 'duration': 265, - # 'timestamp': 1304411491, - # 'upload_date': '20110503', + 'timestamp': 1304411491, + 'upload_date': '20110503', 'thumbnail': 're:^https?://.*\.jpg$', }, }, @@ -32,10 +32,14 @@ class FoxNewsIE(AMPIE): 'title': "Rep. Luis Gutierrez on if Obama's immigration plan is legal", 'description': "Congressman discusses president's plan", 'duration': 292, - # 'timestamp': 1417662047, - # 'upload_date': '20141204', + 'timestamp': 1417662047, + 'upload_date': '20141204', 'thumbnail': 're:^https?://.*\.jpg$', }, + 'params': { + # m3u8 download + 'skip_download': True, + }, }, { 'url': 'http://video.foxnews.com/v/video-embed.html?video_id=3937480&d=video.foxnews.com', diff --git a/youtube_dl/extractor/franceinter.py b/youtube_dl/extractor/franceinter.py index 0388ba00c..2369f868d 100644 --- a/youtube_dl/extractor/franceinter.py +++ b/youtube_dl/extractor/franceinter.py @@ -6,7 +6,7 @@ from ..utils import int_or_none class FranceInterIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?franceinter\.fr/player/reecouter\?play=(?P[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?franceinter\.fr/player/reecouter\?play=(?P[0-9]+)' _TEST = { 'url': 'http://www.franceinter.fr/player/reecouter?play=793962', 'md5': '4764932e466e6f6c79c317d2e74f6884', diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index 3f4ac3093..ad94e31f3 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -60,28 +60,31 @@ class FranceTVBaseInfoExtractor(InfoExtractor): video_id, 'Downloading f4m manifest token', fatal=False) if f4m_url: formats.extend(self._extract_f4m_formats( - f4m_url + '&hdcore=3.7.0&plugin=aasp-3.7.0.39.44', video_id, 1, format_id)) + f4m_url + '&hdcore=3.7.0&plugin=aasp-3.7.0.39.44', + video_id, f4m_id=format_id, fatal=False)) elif ext == 'm3u8': - formats.extend(self._extract_m3u8_formats(video_url, video_id, 'mp4', m3u8_id=format_id)) + formats.extend(self._extract_m3u8_formats( + video_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id=format_id, fatal=False)) elif video_url.startswith('rtmp'): formats.append({ 'url': video_url, 'format_id': 'rtmp-%s' % format_id, 'ext': 'flv', - 'preference': 1, }) else: - formats.append({ - 'url': video_url, - 'format_id': format_id, - 'preference': -1, - }) + if self._is_valid_url(video_url, video_id, format_id): + formats.append({ + 'url': video_url, + 'format_id': format_id, + }) self._sort_formats(formats) title = info['titre'] subtitle = info.get('sous_titre') if subtitle: title += ' - %s' % subtitle + title = title.strip() subtitles = {} subtitles_list = [{ @@ -125,13 +128,13 @@ class PluzzIE(FranceTVBaseInfoExtractor): class FranceTvInfoIE(FranceTVBaseInfoExtractor): IE_NAME = 'francetvinfo.fr' - _VALID_URL = r'https?://(?:www|mobile)\.francetvinfo\.fr/.*/(?P.+)\.html' + _VALID_URL = r'https?://(?:www|mobile|france3-regions)\.francetvinfo\.fr/.*/(?P<title>.+)\.html' _TESTS = [{ 'url': 'http://www.francetvinfo.fr/replay-jt/france-3/soir-3/jt-grand-soir-3-lundi-26-aout-2013_393427.html', 'info_dict': { 'id': '84981923', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Soir 3', 'upload_date': '20130826', 'timestamp': 1377548400, @@ -139,6 +142,10 @@ class FranceTvInfoIE(FranceTVBaseInfoExtractor): 'fr': 'mincount:2', }, }, + 'params': { + # m3u8 downloads + 'skip_download': True, + }, }, { 'url': 'http://www.francetvinfo.fr/elections/europeennes/direct-europeennes-regardez-le-debat-entre-les-candidats-a-la-presidence-de-la-commission_600639.html', 'info_dict': { @@ -155,11 +162,32 @@ class FranceTvInfoIE(FranceTVBaseInfoExtractor): 'url': 'http://www.francetvinfo.fr/economie/entreprises/les-entreprises-familiales-le-secret-de-la-reussite_933271.html', 'md5': 'f485bda6e185e7d15dbc69b72bae993e', 'info_dict': { - 'id': '556e03339473995ee145930c', + 'id': 'NI_173343', 'ext': 'mp4', 'title': 'Les entreprises familiales : le secret de la réussite', 'thumbnail': 're:^https?://.*\.jpe?g$', - } + 'timestamp': 1433273139, + 'upload_date': '20150602', + }, + 'params': { + # m3u8 downloads + 'skip_download': True, + }, + }, { + 'url': 'http://france3-regions.francetvinfo.fr/bretagne/cotes-d-armor/thalassa-echappee-breizh-ce-venredi-dans-les-cotes-d-armor-954961.html', + 'md5': 'f485bda6e185e7d15dbc69b72bae993e', + 'info_dict': { + 'id': 'NI_657393', + 'ext': 'mp4', + 'title': 'Olivier Monthus, réalisateur de "Bretagne, le choix de l’Armor"', + 'description': 'md5:a3264114c9d29aeca11ced113c37b16c', + 'thumbnail': 're:^https?://.*\.jpe?g$', + 'timestamp': 1458300695, + 'upload_date': '20160318', + }, + 'params': { + 'skip_download': True, + }, }] def _real_extract(self, url): @@ -172,7 +200,9 @@ class FranceTvInfoIE(FranceTVBaseInfoExtractor): return self.url_result(dmcloud_url, 'DailymotionCloud') video_id, catalogue = self._search_regex( - r'id-video=([^@]+@[^"]+)', webpage, 'video id').split('@') + (r'id-video=([^@]+@[^"]+)', + r'<a[^>]+href="(?:https?:)?//videos\.francetv\.fr/video/([^@]+@[^"]+)"'), + webpage, 'video id').split('@') return self._extract_video(video_id, catalogue) diff --git a/youtube_dl/extractor/freespeech.py b/youtube_dl/extractor/freespeech.py index c210177f7..1477708bb 100644 --- a/youtube_dl/extractor/freespeech.py +++ b/youtube_dl/extractor/freespeech.py @@ -14,7 +14,7 @@ class FreespeechIE(InfoExtractor): 'url': 'https://www.freespeech.org/video/obama-romney-campaign-colorado-ahead-debate-0', 'info_dict': { 'id': 'poKsVCZ64uU', - 'ext': 'mp4', + 'ext': 'webm', 'title': 'Obama, Romney Campaign in Colorado Ahead of Debate', 'description': 'Obama, Romney Campaign in Colorado Ahead of Debate', 'uploader': 'freespeechtv', diff --git a/youtube_dl/extractor/freevideo.py b/youtube_dl/extractor/freevideo.py index c7bec027b..cd8423a6f 100644 --- a/youtube_dl/extractor/freevideo.py +++ b/youtube_dl/extractor/freevideo.py @@ -5,7 +5,7 @@ from ..utils import ExtractorError class FreeVideoIE(InfoExtractor): - _VALID_URL = r'^http://www.freevideo.cz/vase-videa/(?P<id>[^.]+)\.html(?:$|[?#])' + _VALID_URL = r'^https?://www.freevideo.cz/vase-videa/(?P<id>[^.]+)\.html(?:$|[?#])' _TEST = { 'url': 'http://www.freevideo.cz/vase-videa/vysukany-zadecek-22033.html', diff --git a/youtube_dl/extractor/funimation.py b/youtube_dl/extractor/funimation.py index 0f37ed786..1eb528f31 100644 --- a/youtube_dl/extractor/funimation.py +++ b/youtube_dl/extractor/funimation.py @@ -5,7 +5,6 @@ from .common import InfoExtractor from ..utils import ( clean_html, determine_ext, - encode_dict, int_or_none, sanitized_Request, ExtractorError, @@ -54,10 +53,10 @@ class FunimationIE(InfoExtractor): (username, password) = self._get_login_info() if username is None: return - data = urlencode_postdata(encode_dict({ + data = urlencode_postdata({ 'email_field': username, 'password_field': password, - })) + }) login_request = sanitized_Request('http://www.funimation.com/login', data, headers={ 'User-Agent': 'Mozilla/5.0 (Windows NT 5.2; WOW64; rv:42.0) Gecko/20100101 Firefox/42.0', 'Content-Type': 'application/x-www-form-urlencoded' diff --git a/youtube_dl/extractor/funnyordie.py b/youtube_dl/extractor/funnyordie.py index 4c4a87e2a..8c5ffc9e8 100644 --- a/youtube_dl/extractor/funnyordie.py +++ b/youtube_dl/extractor/funnyordie.py @@ -46,8 +46,8 @@ class FunnyOrDieIE(InfoExtractor): links.sort(key=lambda link: 1 if link[1] == 'mp4' else 0) m3u8_url = self._search_regex( - r'<source[^>]+src=(["\'])(?P<url>.+?/master\.m3u8)\1', - webpage, 'm3u8 url', default=None, group='url') + r'<source[^>]+src=(["\'])(?P<url>.+?/master\.m3u8[^"\']*)\1', + webpage, 'm3u8 url', group='url') formats = [] diff --git a/youtube_dl/extractor/gameinformer.py b/youtube_dl/extractor/gameinformer.py index 25870c131..a66e309de 100644 --- a/youtube_dl/extractor/gameinformer.py +++ b/youtube_dl/extractor/gameinformer.py @@ -2,42 +2,27 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import compat_str -from ..utils import int_or_none class GameInformerIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?gameinformer\.com/(?:[^/]+/)*(?P<id>.+)\.aspx' _TEST = { 'url': 'http://www.gameinformer.com/b/features/archive/2015/09/26/replay-animal-crossing.aspx', + 'md5': '292f26da1ab4beb4c9099f1304d2b071', 'info_dict': { 'id': '4515472681001', - 'ext': 'm3u8', + 'ext': 'mp4', 'title': 'Replay - Animal Crossing', 'description': 'md5:2e211891b215c85d061adc7a4dd2d930', - 'timestamp': 1443457610706, - }, - 'params': { - # m3u8 download - 'skip_download': True, + 'timestamp': 1443457610, + 'upload_date': '20150928', + 'uploader_id': '694940074001', }, } + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/694940074001/default_default/index.html?videoId=%s' def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - - bc_api_url = self._search_regex(r"getVideo\('([^']+)'", webpage, 'brightcove api url') - json_data = self._download_json( - bc_api_url + '&video_fields=id,name,shortDescription,publishedDate,videoStillURL,length,IOSRenditions', - display_id) - - return { - 'id': compat_str(json_data['id']), - 'display_id': display_id, - 'url': json_data['IOSRenditions'][0]['url'], - 'title': json_data['name'], - 'description': json_data.get('shortDescription'), - 'timestamp': int_or_none(json_data.get('publishedDate')), - 'duration': int_or_none(json_data.get('length')), - } + brightcove_id = self._search_regex(r"getVideo\('[^']+video_id=(\d+)", webpage, 'brightcove id') + return self.url_result(self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, 'BrightcoveNew', brightcove_id) diff --git a/youtube_dl/extractor/gamekings.py b/youtube_dl/extractor/gamekings.py index f6b9046f9..cbcddcb7c 100644 --- a/youtube_dl/extractor/gamekings.py +++ b/youtube_dl/extractor/gamekings.py @@ -10,7 +10,7 @@ from .youtube import YoutubeIE class GamekingsIE(InfoExtractor): - _VALID_URL = r'http://www\.gamekings\.nl/(?:videos|nieuws)/(?P<id>[^/]+)' + _VALID_URL = r'https?://www\.gamekings\.nl/(?:videos|nieuws)/(?P<id>[^/]+)' _TESTS = [{ # YouTube embed video 'url': 'http://www.gamekings.nl/videos/phoenix-wright-ace-attorney-dual-destinies-review/', diff --git a/youtube_dl/extractor/gamespot.py b/youtube_dl/extractor/gamespot.py index b3f1bafcc..4ffdd7515 100644 --- a/youtube_dl/extractor/gamespot.py +++ b/youtube_dl/extractor/gamespot.py @@ -14,7 +14,7 @@ from ..utils import ( class GameSpotIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?gamespot\.com/.*-(?P<id>\d+)/?' + _VALID_URL = r'https?://(?:www\.)?gamespot\.com/.*-(?P<id>\d+)/?' _TESTS = [{ 'url': 'http://www.gamespot.com/videos/arma-3-community-guide-sitrep-i/2300-6410818/', 'md5': 'b2a30deaa8654fcccd43713a6b6a4825', diff --git a/youtube_dl/extractor/gamestar.py b/youtube_dl/extractor/gamestar.py index 590ccf526..69058a583 100644 --- a/youtube_dl/extractor/gamestar.py +++ b/youtube_dl/extractor/gamestar.py @@ -13,7 +13,7 @@ from ..utils import ( class GameStarIE(InfoExtractor): - _VALID_URL = r'http://www\.gamestar\.de/videos/.*,(?P<id>[0-9]+)\.html' + _VALID_URL = r'https?://www\.gamestar\.de/videos/.*,(?P<id>[0-9]+)\.html' _TEST = { 'url': 'http://www.gamestar.de/videos/trailer,3/hobbit-3-die-schlacht-der-fuenf-heere,76110.html', 'md5': '96974ecbb7fd8d0d20fca5a00810cea7', diff --git a/youtube_dl/extractor/gametrailers.py b/youtube_dl/extractor/gametrailers.py index c3f031d9c..1e7948ab8 100644 --- a/youtube_dl/extractor/gametrailers.py +++ b/youtube_dl/extractor/gametrailers.py @@ -9,7 +9,7 @@ from ..utils import ( class GametrailersIE(InfoExtractor): - _VALID_URL = r'http://www\.gametrailers\.com/videos/view/[^/]+/(?P<id>.+)' + _VALID_URL = r'https?://www\.gametrailers\.com/videos/view/[^/]+/(?P<id>.+)' _TEST = { 'url': 'http://www.gametrailers.com/videos/view/gametrailers-com/116437-Just-Cause-3-Review', diff --git a/youtube_dl/extractor/gdcvault.py b/youtube_dl/extractor/gdcvault.py index 3befd3e7b..25e93c9a4 100644 --- a/youtube_dl/extractor/gdcvault.py +++ b/youtube_dl/extractor/gdcvault.py @@ -3,11 +3,11 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_urllib_parse from ..utils import ( remove_end, HEADRequest, sanitized_Request, + urlencode_postdata, ) @@ -123,7 +123,7 @@ class GDCVaultIE(InfoExtractor): 'password': password, } - request = sanitized_Request(login_url, compat_urllib_parse.urlencode(login_form)) + request = sanitized_Request(login_url, urlencode_postdata(login_form)) request.add_header('Content-Type', 'application/x-www-form-urlencoded') self._download_webpage(request, display_id, 'Logging in') start_page = self._download_webpage(webpage_url, display_id, 'Getting authenticated video page') @@ -159,9 +159,10 @@ class GDCVaultIE(InfoExtractor): 'title': title, } + PLAYER_REGEX = r'<iframe src="(?P<xml_root>.+?)/player.*?\.html.*?".*?</iframe>' + xml_root = self._html_search_regex( - r'<iframe src="(?P<xml_root>.*?)player.html.*?".*?</iframe>', - start_page, 'xml root', default=None) + PLAYER_REGEX, start_page, 'xml root', default=None) if xml_root is None: # Probably need to authenticate login_res = self._login(webpage_url, display_id) @@ -171,18 +172,19 @@ class GDCVaultIE(InfoExtractor): start_page = login_res # Grab the url from the authenticated page xml_root = self._html_search_regex( - r'<iframe src="(.*?)player.html.*?".*?</iframe>', - start_page, 'xml root') + PLAYER_REGEX, start_page, 'xml root') xml_name = self._html_search_regex( r'<iframe src=".*?\?xml=(.+?\.xml).*?".*?</iframe>', start_page, 'xml filename', default=None) if xml_name is None: # Fallback to the older format - xml_name = self._html_search_regex(r'<iframe src=".*?\?xmlURL=xml/(?P<xml_file>.+?\.xml).*?".*?</iframe>', start_page, 'xml filename') + xml_name = self._html_search_regex( + r'<iframe src=".*?\?xmlURL=xml/(?P<xml_file>.+?\.xml).*?".*?</iframe>', + start_page, 'xml filename') - xml_description_url = xml_root + 'xml/' + xml_name - xml_description = self._download_xml(xml_description_url, display_id) + xml_description = self._download_xml( + '%s/xml/%s' % (xml_root, xml_name), display_id) video_title = xml_description.find('./metadata/title').text video_formats = self._parse_mp4(xml_description) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index c6bf8d270..589d1e152 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -47,6 +47,7 @@ from .senateisvp import SenateISVPIE from .svt import SVTIE from .pornhub import PornHubIE from .xhamster import XHamsterEmbedIE +from .tnaflix import TNAFlixNetworkEmbedIE from .vimeo import VimeoIE from .dailymotion import DailymotionCloudIE from .onionstudios import OnionStudiosIE @@ -58,6 +59,7 @@ from .videomore import VideomoreIE from .googledrive import GoogleDriveIE from .jwplatform import JWPlatformIE from .digiteka import DigitekaIE +from .instagram import InstagramIE class GenericIE(InfoExtractor): @@ -238,6 +240,35 @@ class GenericIE(InfoExtractor): 'format': 'bestvideo', }, }, + # m3u8 served with Content-Type: audio/x-mpegURL; charset=utf-8 + { + 'url': 'http://once.unicornmedia.com/now/master/playlist/bb0b18ba-64f5-4b1b-a29f-0ac252f06b68/77a785f3-5188-4806-b788-0893a61634ed/93677179-2d99-4ef4-9e17-fe70d49abfbf/content.m3u8', + 'info_dict': { + 'id': 'content', + 'ext': 'mp4', + 'title': 'content', + 'formats': 'mincount:8', + }, + 'params': { + # m3u8 downloads + 'skip_download': True, + } + }, + # m3u8 served with Content-Type: text/plain + { + 'url': 'http://www.nacentapps.com/m3u8/index.m3u8', + 'info_dict': { + 'id': 'index', + 'ext': 'mp4', + 'title': 'index', + 'upload_date': '20140720', + 'formats': 'mincount:11', + }, + 'params': { + # m3u8 downloads + 'skip_download': True, + } + }, # google redirect { 'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE', @@ -375,19 +406,6 @@ class GenericIE(InfoExtractor): 'skip_download': True, }, }, - # multiple ooyala embeds on SBN network websites - { - 'url': 'http://www.sbnation.com/college-football-recruiting/2015/2/3/7970291/national-signing-day-rationalizations-itll-be-ok-itll-be-ok', - 'info_dict': { - 'id': 'national-signing-day-rationalizations-itll-be-ok-itll-be-ok', - 'title': '25 lies you will tell yourself on National Signing Day - SBNation.com', - }, - 'playlist_mincount': 3, - 'params': { - 'skip_download': True, - }, - 'add_ie': ['Ooyala'], - }, # embed.ly video { 'url': 'http://www.tested.com/science/weird/460206-tested-grinding-coffee-2000-frames-second/', @@ -1093,7 +1111,23 @@ class GenericIE(InfoExtractor): # m3u8 downloads 'skip_download': True, } - } + }, + # Brightcove embed, with no valid 'renditions' but valid 'IOSRenditions' + # This video can't be played in browsers if Flash disabled and UA set to iPhone, which is actually a false alarm + { + 'url': 'https://dl.dropboxusercontent.com/u/29092637/interview.html', + 'info_dict': { + 'id': '4785848093001', + 'ext': 'mp4', + 'title': 'The Cardinal Pell Interview', + 'description': 'Sky News Contributor Andrew Bolt interviews George Pell in Rome, following the Cardinal\'s evidence before the Royal Commission into Child Abuse. ', + 'uploader': 'GlobeCast Australia - GlobeStream', + }, + 'params': { + # m3u8 downloads + 'skip_download': True, + }, + }, ] def report_following_redirect(self, new_url): @@ -1241,28 +1275,31 @@ class GenericIE(InfoExtractor): full_response = self._request_webpage(request, video_id) head_response = full_response + info_dict = { + 'id': video_id, + 'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]), + 'upload_date': unified_strdate(head_response.headers.get('Last-Modified')) + } + # Check for direct link to a video - content_type = head_response.headers.get('Content-Type', '') - m = re.match(r'^(?P<type>audio|video|application(?=/(?:ogg$|(?:vnd\.apple\.|x-)?mpegurl)))/(?P<format_id>.+)$', content_type) + content_type = head_response.headers.get('Content-Type', '').lower() + m = re.match(r'^(?P<type>audio|video|application(?=/(?:ogg$|(?:vnd\.apple\.|x-)?mpegurl)))/(?P<format_id>[^;\s]+)', content_type) if m: - upload_date = unified_strdate( - head_response.headers.get('Last-Modified')) - formats = [] - if m.group('format_id').endswith('mpegurl'): + format_id = m.group('format_id') + if format_id.endswith('mpegurl'): formats = self._extract_m3u8_formats(url, video_id, 'mp4') + elif format_id == 'f4m': + formats = self._extract_f4m_formats(url, video_id) else: formats = [{ 'format_id': m.group('format_id'), 'url': url, 'vcodec': 'none' if m.group('type') == 'audio' else None }] - return { - 'id': video_id, - 'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]), - 'direct': True, - 'formats': formats, - 'upload_date': upload_date, - } + info_dict['direct'] = True + self._sort_formats(formats) + info_dict['formats'] = formats + return info_dict if not self._downloader.params.get('test', False) and not is_intentional: force = self._downloader.params.get('force_generic_extractor', False) @@ -1282,21 +1319,24 @@ class GenericIE(InfoExtractor): request.add_header('Accept-Encoding', '*') full_response = self._request_webpage(request, video_id) + first_bytes = full_response.read(512) + + # Is it an M3U playlist? + if first_bytes.startswith(b'#EXTM3U'): + info_dict['formats'] = self._extract_m3u8_formats(url, video_id, 'mp4') + self._sort_formats(info_dict['formats']) + return info_dict + # Maybe it's a direct link to a video? # Be careful not to download the whole thing! - first_bytes = full_response.read(512) if not is_html(first_bytes): self._downloader.report_warning( 'URL could be a direct video link, returning it as such.') - upload_date = unified_strdate( - head_response.headers.get('Last-Modified')) - return { - 'id': video_id, - 'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]), + info_dict.update({ 'direct': True, 'url': url, - 'upload_date': upload_date, - } + }) + return info_dict webpage = self._webpage_read_content( full_response, url, video_id, prefix=first_bytes) @@ -1309,16 +1349,20 @@ class GenericIE(InfoExtractor): if doc.tag == 'rss': return self._extract_rss(url, video_id, doc) elif re.match(r'^(?:{[^}]+})?smil$', doc.tag): - return self._parse_smil(doc, url, video_id) + smil = self._parse_smil(doc, url, video_id) + self._sort_formats(smil['formats']) + return smil elif doc.tag == '{http://xspf.org/ns/0/}playlist': return self.playlist_result(self._parse_xspf(doc, video_id), video_id) elif re.match(r'(?i)^(?:{[^}]+})?MPD$', doc.tag): - return { - 'id': video_id, - 'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]), - 'formats': self._parse_mpd_formats( - doc, video_id, mpd_base_url=url.rpartition('/')[0]), - } + info_dict['formats'] = self._parse_mpd_formats( + doc, video_id, mpd_base_url=url.rpartition('/')[0]) + self._sort_formats(info_dict['formats']) + return info_dict + elif re.match(r'^{http://ns\.adobe\.com/f4m/[12]\.0}manifest$', doc.tag): + info_dict['formats'] = self._parse_f4m_formats(doc, url, video_id) + self._sort_formats(info_dict['formats']) + return info_dict except compat_xml_parse_error: pass @@ -1633,6 +1677,11 @@ class GenericIE(InfoExtractor): if xhamster_urls: return _playlist_from_matches(xhamster_urls, ie='XHamsterEmbed') + # Look for embedded TNAFlixNetwork player + tnaflix_urls = TNAFlixNetworkEmbedIE._extract_urls(webpage) + if tnaflix_urls: + return _playlist_from_matches(tnaflix_urls, ie=TNAFlixNetworkEmbedIE.ie_key()) + # Look for embedded Tvigle player mobj = re.search( r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//cloud\.tvigle\.ru/video/.+?)\1', webpage) @@ -1870,6 +1919,19 @@ class GenericIE(InfoExtractor): self._proto_relative_url(unescapeHTML(mobj.group(1))), 'AdobeTVVideo') + # Look for Vine embeds + mobj = re.search( + r'<iframe[^>]+src=[\'"]((?:https?:)?//(?:www\.)?vine\.co/v/[^/]+/embed/(?:simple|postcard))', + webpage) + if mobj is not None: + return self.url_result( + self._proto_relative_url(unescapeHTML(mobj.group(1))), 'Vine') + + # Look for Instagram embeds + instagram_embed_url = InstagramIE._extract_embed_url(webpage) + if instagram_embed_url is not None: + return self.url_result(instagram_embed_url, InstagramIE.ie_key()) + def check_video(vurl): if YoutubeIE.suitable(vurl): return True @@ -1979,9 +2041,14 @@ class GenericIE(InfoExtractor): entry_info_dict['formats'] = self._extract_m3u8_formats(video_url, video_id, ext='mp4') elif ext == 'mpd': entry_info_dict['formats'] = self._extract_mpd_formats(video_url, video_id) + elif ext == 'f4m': + entry_info_dict['formats'] = self._extract_f4m_formats(video_url, video_id) else: entry_info_dict['url'] = video_url + if entry_info_dict.get('formats'): + self._sort_formats(entry_info_dict['formats']) + entries.append(entry_info_dict) if len(entries) == 1: diff --git a/youtube_dl/extractor/glide.py b/youtube_dl/extractor/glide.py index 9561ed5fb..62ff84835 100644 --- a/youtube_dl/extractor/glide.py +++ b/youtube_dl/extractor/glide.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..utils import unified_strdate class GlideIE(InfoExtractor): @@ -15,26 +16,38 @@ class GlideIE(InfoExtractor): 'ext': 'mp4', 'title': 'Damon Timm\'s Glide message', 'thumbnail': 're:^https?://.*?\.cloudfront\.net/.*\.jpg$', + 'uploader': 'Damon Timm', + 'upload_date': '20140919', } } def _real_extract(self, url): video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + title = self._html_search_regex( - r'<title>(.*?)', webpage, 'title') - video_url = self.http_scheme() + self._search_regex( - r'', webpage, 'video URL') - thumbnail_url = self._search_regex( - r'(.+?)', webpage, 'title') + video_url = self._proto_relative_url(self._search_regex( + r']+src=(["\'])(?P.+?)\1', + webpage, 'video URL', default=None, + group='url')) or self._og_search_video_url(webpage) + thumbnail = self._proto_relative_url(self._search_regex( + r']+id=["\']video-thumbnail["\'][^>]+src=(["\'])(?P.+?)\1', + webpage, 'thumbnail url', default=None, + group='url')) or self._og_search_thumbnail(webpage) + uploader = self._search_regex( + r']+class=["\']info-name["\'][^>]*>([^<]+)', + webpage, 'uploader', fatal=False) + upload_date = unified_strdate(self._search_regex( + r']+class="info-date"[^>]*>([^<]+)', + webpage, 'upload date', fatal=False)) return { 'id': video_id, 'title': title, 'url': video_url, 'thumbnail': thumbnail, + 'uploader': uploader, + 'upload_date': upload_date, } diff --git a/youtube_dl/extractor/googledrive.py b/youtube_dl/extractor/googledrive.py index 37be34091..766fc26d0 100644 --- a/youtube_dl/extractor/googledrive.py +++ b/youtube_dl/extractor/googledrive.py @@ -10,8 +10,8 @@ from ..utils import ( class GoogleDriveIE(InfoExtractor): - _VALID_URL = r'https?://(?:(?:docs|drive)\.google\.com/(?:uc\?.*?id=|file/d/)|video\.google\.com/get_player\?.*?docid=)(?P[a-zA-Z0-9_-]{28})' - _TEST = { + _VALID_URL = r'https?://(?:(?:docs|drive)\.google\.com/(?:uc\?.*?id=|file/d/)|video\.google\.com/get_player\?.*?docid=)(?P[a-zA-Z0-9_-]{28,})' + _TESTS = [{ 'url': 'https://drive.google.com/file/d/0ByeS4oOUV-49Zzh4R1J6R09zazQ/edit?pli=1', 'md5': '881f7700aec4f538571fa1e0eed4a7b6', 'info_dict': { @@ -20,7 +20,11 @@ class GoogleDriveIE(InfoExtractor): 'title': 'Big Buck Bunny.mp4', 'duration': 46, } - } + }, { + # video id is longer than 28 characters + 'url': 'https://drive.google.com/file/d/1ENcQ_jeCuj7y19s66_Ou9dRP4GKGsodiDQ/edit', + 'only_matching': True, + }] _FORMATS_EXT = { '5': 'flv', '6': 'flv', @@ -43,7 +47,7 @@ class GoogleDriveIE(InfoExtractor): @staticmethod def _extract_url(webpage): mobj = re.search( - r']+src="https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P[a-zA-Z0-9_-]{28})', + r']+src="https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P[a-zA-Z0-9_-]{28,})', webpage) if mobj: return 'https://drive.google.com/file/d/%s' % mobj.group('id') diff --git a/youtube_dl/extractor/hbo.py b/youtube_dl/extractor/hbo.py new file mode 100644 index 000000000..dad0f3994 --- /dev/null +++ b/youtube_dl/extractor/hbo.py @@ -0,0 +1,122 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + xpath_text, + xpath_element, + int_or_none, + parse_duration, +) + + +class HBOIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?hbo\.com/video/video\.html\?.*vid=(?P[0-9]+)' + _TEST = { + 'url': 'http://www.hbo.com/video/video.html?autoplay=true&g=u&vid=1437839', + 'md5': '1c33253f0c7782142c993c0ba62a8753', + 'info_dict': { + 'id': '1437839', + 'ext': 'mp4', + 'title': 'Ep. 64 Clip: Encryption', + } + } + _FORMATS_INFO = { + '1920': { + 'width': 1280, + 'height': 720, + }, + '640': { + 'width': 768, + 'height': 432, + }, + 'highwifi': { + 'width': 640, + 'height': 360, + }, + 'high3g': { + 'width': 640, + 'height': 360, + }, + 'medwifi': { + 'width': 400, + 'height': 224, + }, + 'med3g': { + 'width': 400, + 'height': 224, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + video_data = self._download_xml( + 'http://render.lv3.hbo.com/data/content/global/videos/data/%s.xml' % video_id, video_id) + title = xpath_text(video_data, 'title', 'title', True) + + formats = [] + for source in xpath_element(video_data, 'videos', 'sources', True): + if source.tag == 'size': + path = xpath_text(source, './/path') + if not path: + continue + width = source.attrib.get('width') + format_info = self._FORMATS_INFO.get(width, {}) + height = format_info.get('height') + fmt = { + 'url': path, + 'format_id': 'http%s' % ('-%dp' % height if height else ''), + 'width': format_info.get('width'), + 'height': height, + } + rtmp = re.search(r'^(?Prtmpe?://[^/]+/(?P.+))/(?Pmp4:.+)$', path) + if rtmp: + fmt.update({ + 'url': rtmp.group('url'), + 'play_path': rtmp.group('playpath'), + 'app': rtmp.group('app'), + 'ext': 'flv', + 'format_id': fmt['format_id'].replace('http', 'rtmp'), + }) + formats.append(fmt) + else: + video_url = source.text + if not video_url: + continue + if source.tag == 'tarball': + formats.extend(self._extract_m3u8_formats( + video_url.replace('.tar', '/base_index_w8.m3u8'), + video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) + else: + format_info = self._FORMATS_INFO.get(source.tag, {}) + formats.append({ + 'format_id': 'http-%s' % source.tag, + 'url': video_url, + 'width': format_info.get('width'), + 'height': format_info.get('height'), + }) + self._sort_formats(formats, ('width', 'height', 'tbr', 'format_id')) + + thumbnails = [] + card_sizes = xpath_element(video_data, 'titleCardSizes') + if card_sizes is not None: + for size in card_sizes: + path = xpath_text(size, 'path') + if not path: + continue + width = int_or_none(size.get('width')) + thumbnails.append({ + 'id': width, + 'url': path, + 'width': width, + }) + + return { + 'id': video_id, + 'title': title, + 'duration': parse_duration(xpath_element(video_data, 'duration/tv14')), + 'formats': formats, + 'thumbnails': thumbnails, + } diff --git a/youtube_dl/extractor/hotnewhiphop.py b/youtube_dl/extractor/hotnewhiphop.py index 31e219945..9db565209 100644 --- a/youtube_dl/extractor/hotnewhiphop.py +++ b/youtube_dl/extractor/hotnewhiphop.py @@ -3,16 +3,16 @@ from __future__ import unicode_literals import base64 from .common import InfoExtractor -from ..compat import compat_urllib_parse from ..utils import ( ExtractorError, HEADRequest, sanitized_Request, + urlencode_postdata, ) class HotNewHipHopIE(InfoExtractor): - _VALID_URL = r'http://www\.hotnewhiphop\.com/.*\.(?P.*)\.html' + _VALID_URL = r'https?://www\.hotnewhiphop\.com/.*\.(?P.*)\.html' _TEST = { 'url': 'http://www.hotnewhiphop.com/freddie-gibbs-lay-it-down-song.1435540.html', 'md5': '2c2cd2f76ef11a9b3b581e8b232f3d96', @@ -35,7 +35,7 @@ class HotNewHipHopIE(InfoExtractor): r'"contentUrl" content="(.*?)"', webpage, 'content URL') return self.url_result(video_url, ie='Youtube') - reqdata = compat_urllib_parse.urlencode([ + reqdata = urlencode_postdata([ ('mediaType', 's'), ('mediaId', video_id), ]) diff --git a/youtube_dl/extractor/howstuffworks.py b/youtube_dl/extractor/howstuffworks.py index 663e6632a..76b74c51d 100644 --- a/youtube_dl/extractor/howstuffworks.py +++ b/youtube_dl/extractor/howstuffworks.py @@ -6,6 +6,7 @@ from ..utils import ( int_or_none, js_to_json, unescapeHTML, + determine_ext, ) @@ -39,7 +40,7 @@ class HowStuffWorksIE(InfoExtractor): 'url': 'http://entertainment.howstuffworks.com/arts/2706-sword-swallowing-1-by-dan-meyer-video.htm', 'info_dict': { 'id': '440011', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Sword Swallowing #1 by Dan Meyer', 'description': 'Video footage (1 of 3) used by permission of the owner Dan Meyer through Sword Swallowers Association International ', 'display_id': 'sword-swallowing-1-by-dan-meyer', @@ -63,13 +64,19 @@ class HowStuffWorksIE(InfoExtractor): video_id = clip_info['content_id'] formats = [] m3u8_url = clip_info.get('m3u8') - if m3u8_url: - formats += self._extract_m3u8_formats(m3u8_url, video_id, 'mp4') + if m3u8_url and determine_ext(m3u8_url) == 'm3u8': + formats.extend(self._extract_m3u8_formats(m3u8_url, video_id, 'mp4', format_id='hls', fatal=True)) + flv_url = clip_info.get('flv_url') + if flv_url: + formats.append({ + 'url': flv_url, + 'format_id': 'flv', + }) for video in clip_info.get('mp4', []): formats.append({ 'url': video['src'], - 'format_id': video['bitrate'], - 'vbr': int(video['bitrate'].rstrip('k')), + 'format_id': 'mp4-%s' % video['bitrate'], + 'vbr': int_or_none(video['bitrate'].rstrip('k')), }) if not formats: @@ -102,6 +109,6 @@ class HowStuffWorksIE(InfoExtractor): 'title': unescapeHTML(clip_info['clip_title']), 'description': unescapeHTML(clip_info.get('caption')), 'thumbnail': clip_info.get('video_still_url'), - 'duration': clip_info.get('duration'), + 'duration': int_or_none(clip_info.get('duration')), 'formats': formats, } diff --git a/youtube_dl/extractor/hypem.py b/youtube_dl/extractor/hypem.py index b3706fe6d..f7c913054 100644 --- a/youtube_dl/extractor/hypem.py +++ b/youtube_dl/extractor/hypem.py @@ -4,7 +4,7 @@ import json import time from .common import InfoExtractor -from ..compat import compat_urllib_parse +from ..compat import compat_urllib_parse_urlencode from ..utils import ( ExtractorError, sanitized_Request, @@ -12,7 +12,7 @@ from ..utils import ( class HypemIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?hypem\.com/track/(?P[^/]+)/' + _VALID_URL = r'https?://(?:www\.)?hypem\.com/track/(?P[^/]+)/' _TEST = { 'url': 'http://hypem.com/track/1v6ga/BODYWORK+-+TAME', 'md5': 'b9cc91b5af8995e9f0c1cee04c575828', @@ -28,7 +28,7 @@ class HypemIE(InfoExtractor): track_id = self._match_id(url) data = {'ax': 1, 'ts': time.time()} - request = sanitized_Request(url + '?' + compat_urllib_parse.urlencode(data)) + request = sanitized_Request(url + '?' + compat_urllib_parse_urlencode(data)) response, urlh = self._download_webpage_handle( request, track_id, 'Downloading webpage with the url') diff --git a/youtube_dl/extractor/imdb.py b/youtube_dl/extractor/imdb.py index 02e1e428e..8bed8ccd0 100644 --- a/youtube_dl/extractor/imdb.py +++ b/youtube_dl/extractor/imdb.py @@ -12,7 +12,7 @@ from ..utils import ( class ImdbIE(InfoExtractor): IE_NAME = 'imdb' IE_DESC = 'Internet Movie Database trailers' - _VALID_URL = r'http://(?:www|m)\.imdb\.com/video/imdb/vi(?P\d+)' + _VALID_URL = r'https?://(?:www|m)\.imdb\.com/video/imdb/vi(?P\d+)' _TEST = { 'url': 'http://www.imdb.com/video/imdb/vi2524815897', @@ -42,7 +42,7 @@ class ImdbIE(InfoExtractor): for f_url, f_name in extra_formats] format_pages.append(player_page) - quality = qualities(['SD', '480p', '720p']) + quality = qualities(('SD', '480p', '720p', '1080p')) formats = [] for format_page in format_pages: json_data = self._search_regex( @@ -70,7 +70,7 @@ class ImdbIE(InfoExtractor): class ImdbListIE(InfoExtractor): IE_NAME = 'imdb:list' IE_DESC = 'Internet Movie Database lists' - _VALID_URL = r'http://www\.imdb\.com/list/(?P[\da-zA-Z_-]{11})' + _VALID_URL = r'https?://www\.imdb\.com/list/(?P[\da-zA-Z_-]{11})' _TEST = { 'url': 'http://www.imdb.com/list/JFs9NWw6XI0', 'info_dict': { diff --git a/youtube_dl/extractor/indavideo.py b/youtube_dl/extractor/indavideo.py index 12fb5e8e1..9622f198a 100644 --- a/youtube_dl/extractor/indavideo.py +++ b/youtube_dl/extractor/indavideo.py @@ -73,7 +73,7 @@ class IndavideoEmbedIE(InfoExtractor): 'url': self._proto_relative_url(thumbnail) } for thumbnail in video.get('thumbnails', [])] - tags = [tag['title'] for tag in video.get('tags', [])] + tags = [tag['title'] for tag in video.get('tags') or []] return { 'id': video.get('id') or video_id, diff --git a/youtube_dl/extractor/infoq.py b/youtube_dl/extractor/infoq.py index 016af2084..cca0b8a93 100644 --- a/youtube_dl/extractor/infoq.py +++ b/youtube_dl/extractor/infoq.py @@ -4,15 +4,12 @@ from __future__ import unicode_literals import base64 -from .common import InfoExtractor -from ..compat import ( - compat_urllib_parse_unquote, - compat_parse_qs, -) +from ..compat import compat_urllib_parse_unquote from ..utils import determine_ext +from .bokecc import BokeCCBaseIE -class InfoQIE(InfoExtractor): +class InfoQIE(BokeCCBaseIE): _VALID_URL = r'https?://(?:www\.)?infoq\.com/(?:[^/]+/)+(?P[^/]+)' _TESTS = [{ @@ -38,26 +35,6 @@ class InfoQIE(InfoExtractor): }, }] - def _extract_bokecc_videos(self, webpage, video_id): - # TODO: bokecc.com is a Chinese video cloud platform - # It should have an independent extractor but I don't have other - # examples using bokecc - player_params_str = self._html_search_regex( - r']+src="http://p\.bokecc\.com/player\?([^"]+)', - webpage, 'player params', default=None) - - player_params = compat_parse_qs(player_params_str) - - info_xml = self._download_xml( - 'http://p.bokecc.com/servlet/playinfo?uid=%s&vid=%s&m=1' % ( - player_params['siteid'][0], player_params['vid'][0]), video_id) - - return [{ - 'format_id': 'bokecc', - 'url': quality.find('./copy').attrib['playurl'], - 'preference': int(quality.attrib['value']), - } for quality in info_xml.findall('./video/quality')] - def _extract_rtmp_videos(self, webpage): # The server URL is hardcoded video_url = 'rtmpe://video.infoq.com/cfx/st/' @@ -101,7 +78,7 @@ class InfoQIE(InfoExtractor): if '/cn/' in url: # for China videos, HTTP video URL exists but always fails with 403 - formats = self._extract_bokecc_videos(webpage, video_id) + formats = self._extract_bokecc_formats(webpage, video_id) else: formats = self._extract_rtmp_videos(webpage) + self._extract_http_videos(webpage) diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py index ed3e07118..11bb58d8a 100644 --- a/youtube_dl/extractor/instagram.py +++ b/youtube_dl/extractor/instagram.py @@ -4,8 +4,10 @@ import re from .common import InfoExtractor from ..utils import ( + get_element_by_attribute, int_or_none, limit_length, + lowercase_escape, ) @@ -38,6 +40,18 @@ class InstagramIE(InfoExtractor): 'only_matching': True, }] + @staticmethod + def _extract_embed_url(webpage): + blockquote_el = get_element_by_attribute( + 'class', 'instagram-media', webpage) + if blockquote_el is None: + return + + mobj = re.search( + r']+href=([\'"])(?P[^\'"]+)\1', blockquote_el) + if mobj: + return mobj.group('link') + def _real_extract(self, url): video_id = self._match_id(url) @@ -46,6 +60,8 @@ class InstagramIE(InfoExtractor): webpage, 'uploader id', fatal=False) desc = self._search_regex( r'"caption":"(.+?)"', webpage, 'description', default=None) + if desc is not None: + desc = lowercase_escape(desc) return { 'id': video_id, @@ -136,7 +152,7 @@ class InstagramUserIE(InfoExtractor): if not page['items']: break - max_id = page['items'][-1]['id'] + max_id = page['items'][-1]['id'].split('_')[0] media_url = ( 'http://instagram.com/%s/media?max_id=%s' % ( uploader_id, max_id)) diff --git a/youtube_dl/extractor/internetvideoarchive.py b/youtube_dl/extractor/internetvideoarchive.py index 483cc6f9e..45add007f 100644 --- a/youtube_dl/extractor/internetvideoarchive.py +++ b/youtube_dl/extractor/internetvideoarchive.py @@ -1,93 +1,91 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..compat import ( + compat_parse_qs, compat_urlparse, - compat_urllib_parse, ) from ..utils import ( - xpath_with_ns, + determine_ext, + int_or_none, + xpath_text, ) class InternetVideoArchiveIE(InfoExtractor): - _VALID_URL = r'https?://video\.internetvideoarchive\.net/flash/players/.*?\?.*?publishedid.*?' + _VALID_URL = r'https?://video\.internetvideoarchive\.net/(?:player|flash/players)/.*?\?.*?publishedid.*?' _TEST = { - 'url': 'http://video.internetvideoarchive.net/flash/players/flashconfiguration.aspx?customerid=69249&publishedid=452693&playerid=247', + 'url': 'http://video.internetvideoarchive.net/player/6/configuration.ashx?customerid=69249&publishedid=194487&reporttag=vdbetatitle&playerid=641&autolist=0&domain=www.videodetective.com&maxrate=high&minrate=low&socialplayer=false', 'info_dict': { - 'id': '452693', + 'id': '194487', 'ext': 'mp4', - 'title': 'SKYFALL', - 'description': 'In SKYFALL, Bond\'s loyalty to M is tested as her past comes back to haunt her. As MI6 comes under attack, 007 must track down and destroy the threat, no matter how personal the cost.', - 'duration': 152, + 'title': 'KICK-ASS 2', + 'description': 'md5:c189d5b7280400630a1d3dd17eaa8d8a', + }, + 'params': { + # m3u8 download + 'skip_download': True, }, } @staticmethod - def _build_url(query): - return 'http://video.internetvideoarchive.net/flash/players/flashconfiguration.aspx?' + query + def _build_json_url(query): + return 'http://video.internetvideoarchive.net/player/6/configuration.ashx?' + query @staticmethod - def _clean_query(query): - NEEDED_ARGS = ['publishedid', 'customerid'] - query_dic = compat_urlparse.parse_qs(query) - cleaned_dic = dict((k, v[0]) for (k, v) in query_dic.items() if k in NEEDED_ARGS) - # Other player ids return m3u8 urls - cleaned_dic['playerid'] = '247' - cleaned_dic['videokbrate'] = '100000' - return compat_urllib_parse.urlencode(cleaned_dic) + def _build_xml_url(query): + return 'http://video.internetvideoarchive.net/flash/players/flashconfiguration.aspx?' + query def _real_extract(self, url): query = compat_urlparse.urlparse(url).query - query_dic = compat_urlparse.parse_qs(query) + query_dic = compat_parse_qs(query) video_id = query_dic['publishedid'][0] - url = self._build_url(query) - flashconfiguration = self._download_xml(url, video_id, - 'Downloading flash configuration') - file_url = flashconfiguration.find('file').text - file_url = file_url.replace('/playlist.aspx', '/mrssplaylist.aspx') - # Replace some of the parameters in the query to get the best quality - # and http links (no m3u8 manifests) - file_url = re.sub(r'(?<=\?)(.+)$', - lambda m: self._clean_query(m.group()), - file_url) - info = self._download_xml(file_url, video_id, - 'Downloading video info') - item = info.find('channel/item') + if '/player/' in url: + configuration = self._download_json(url, video_id) - def _bp(p): - return xpath_with_ns( - p, - { - 'media': 'http://search.yahoo.com/mrss/', - 'jwplayer': 'http://developer.longtailvideo.com/trac/wiki/FlashFormats', - } - ) - formats = [] - for content in item.findall(_bp('media:group/media:content')): - attr = content.attrib - f_url = attr['url'] - width = int(attr['width']) - bitrate = int(attr['bitrate']) - format_id = '%d-%dk' % (width, bitrate) - formats.append({ - 'format_id': format_id, - 'url': f_url, - 'width': width, - 'tbr': bitrate, - }) + # There are multiple videos in the playlist whlie only the first one + # matches the video played in browsers + video_info = configuration['playlist'][0] - self._sort_formats(formats) + formats = [] + for source in video_info['sources']: + file_url = source['file'] + if determine_ext(file_url) == 'm3u8': + formats.extend(self._extract_m3u8_formats( + file_url, video_id, ext='mp4', m3u8_id='hls')) + else: + a_format = { + 'url': file_url, + } + + if source.get('label') and source['label'][-4:] == ' kbs': + tbr = int_or_none(source['label'][:-4]) + a_format.update({ + 'tbr': tbr, + 'format_id': 'http-%d' % tbr, + }) + formats.append(a_format) + + self._sort_formats(formats) + + title = video_info['title'] + description = video_info.get('description') + thumbnail = video_info.get('image') + else: + configuration = self._download_xml(url, video_id) + formats = [{ + 'url': xpath_text(configuration, './file', 'file URL', fatal=True), + }] + thumbnail = xpath_text(configuration, './image', 'thumbnail') + title = 'InternetVideoArchive video %s' % video_id + description = None return { 'id': video_id, - 'title': item.find('title').text, + 'title': title, 'formats': formats, - 'thumbnail': item.find(_bp('media:thumbnail')).attrib['url'], - 'description': item.find('description').text, - 'duration': int(attr['duration']), + 'thumbnail': thumbnail, + 'description': description, } diff --git a/youtube_dl/extractor/iprima.py b/youtube_dl/extractor/iprima.py index 61a0de472..788bbe0d5 100644 --- a/youtube_dl/extractor/iprima.py +++ b/youtube_dl/extractor/iprima.py @@ -1,4 +1,4 @@ -# -*- coding: utf-8 -*- +# coding: utf-8 from __future__ import unicode_literals import re @@ -6,6 +6,8 @@ import time from .common import InfoExtractor from ..utils import ( + determine_ext, + js_to_json, sanitized_Request, ) @@ -30,8 +32,7 @@ class IPrimaIE(InfoExtractor): }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) @@ -43,9 +44,42 @@ class IPrimaIE(InfoExtractor): req.add_header('Referer', url) playerpage = self._download_webpage(req, video_id, note='Downloading player') - m3u8_url = self._search_regex(r"'src': '([^']+\.m3u8)'", playerpage, 'm3u8 url') + formats = [] - formats = self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4') + def extract_formats(format_url, format_key=None, lang=None): + ext = determine_ext(format_url) + new_formats = [] + if format_key == 'hls' or ext == 'm3u8': + new_formats = self._extract_m3u8_formats( + format_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False) + elif format_key == 'dash' or ext == 'mpd': + return + new_formats = self._extract_mpd_formats( + format_url, video_id, mpd_id='dash', fatal=False) + if lang: + for f in new_formats: + if not f.get('language'): + f['language'] = lang + formats.extend(new_formats) + + options = self._parse_json( + self._search_regex( + r'(?s)var\s+playerOptions\s*=\s*({.+?});', + playerpage, 'player options', default='{}'), + video_id, transform_source=js_to_json, fatal=False) + if options: + for key, tracks in options.get('tracks', {}).items(): + if not isinstance(tracks, list): + continue + for track in tracks: + src = track.get('src') + if src: + extract_formats(src, key.lower(), track.get('lang')) + + if not formats: + for _, src in re.findall(r'src["\']\s*:\s*(["\'])(.+?)\1', playerpage): + extract_formats(src) self._sort_formats(formats) diff --git a/youtube_dl/extractor/iqiyi.py b/youtube_dl/extractor/iqiyi.py index c3e33009a..88570f261 100644 --- a/youtube_dl/extractor/iqiyi.py +++ b/youtube_dl/extractor/iqiyi.py @@ -14,10 +14,11 @@ from .common import InfoExtractor from ..compat import ( compat_parse_qs, compat_str, - compat_urllib_parse, + compat_urllib_parse_urlencode, compat_urllib_parse_urlparse, ) from ..utils import ( + decode_packed_codes, ExtractorError, ohdave_rsa_encrypt, remove_start, @@ -126,43 +127,11 @@ class IqiyiSDK(object): class IqiyiSDKInterpreter(object): - BASE62_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ' - def __init__(self, sdk_code): self.sdk_code = sdk_code - @classmethod - def base62(cls, num): - if num == 0: - return '0' - ret = '' - while num: - ret = cls.BASE62_TABLE[num % 62] + ret - num = num // 62 - return ret - - def decode_eval_codes(self): - self.sdk_code = self.sdk_code[5:-3] - - mobj = re.search( - r"'([^']+)',62,(\d+),'([^']+)'\.split\('\|'\),[^,]+,{}", - self.sdk_code) - obfucasted_code, count, symbols = mobj.groups() - count = int(count) - symbols = symbols.split('|') - symbol_table = {} - - while count: - count -= 1 - b62count = self.base62(count) - symbol_table[b62count] = symbols[count] or b62count - - self.sdk_code = re.sub( - r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)], - obfucasted_code) - def run(self, target, ip, timestamp): - self.decode_eval_codes() + self.sdk_code = decode_packed_codes(self.sdk_code) functions = re.findall(r'input=([a-zA-Z0-9]+)\(input', self.sdk_code) @@ -196,7 +165,7 @@ class IqiyiIE(InfoExtractor): IE_NAME = 'iqiyi' IE_DESC = '爱奇艺' - _VALID_URL = r'http://(?:[^.]+\.)?iqiyi\.com/.+\.html' + _VALID_URL = r'https?://(?:[^.]+\.)?iqiyi\.com/.+\.html' _NETRC_MACHINE = 'iqiyi' @@ -353,7 +322,7 @@ class IqiyiIE(InfoExtractor): 'bird_t': timestamp, } validation_result = self._download_json( - 'http://kylin.iqiyi.com/validate?' + compat_urllib_parse.urlencode(validation_params), None, + 'http://kylin.iqiyi.com/validate?' + compat_urllib_parse_urlencode(validation_params), None, note='Validate credentials', errnote='Unable to validate credentials') MSG_MAP = { @@ -399,7 +368,10 @@ class IqiyiIE(InfoExtractor): auth_req, video_id, note='Downloading video authentication JSON', errnote='Unable to download video authentication JSON') - if auth_result['code'] == 'Q00506': # requires a VIP account + + if auth_result['code'] == 'Q00505': # No preview available (不允许试看鉴权失败) + raise ExtractorError('This video requires a VIP account', expected=True) + if auth_result['code'] == 'Q00506': # End of preview time (试看结束鉴权失败) if do_report_warning: self.report_warning('Needs a VIP account for full video') return False @@ -487,7 +459,7 @@ class IqiyiIE(InfoExtractor): 'QY00001': auth_result['data']['u'], }) api_video_url += '?' if '?' not in api_video_url else '&' - api_video_url += compat_urllib_parse.urlencode(param) + api_video_url += compat_urllib_parse_urlencode(param) js = self._download_json( api_video_url, video_id, note='Download video info of segment %d for format %s' % (segment_index + 1, format_id)) @@ -525,14 +497,14 @@ class IqiyiIE(InfoExtractor): } api_url = 'http://cache.video.qiyi.com/vms' + '?' + \ - compat_urllib_parse.urlencode(param) + compat_urllib_parse_urlencode(param) raw_data = self._download_json(api_url, video_id) return raw_data - def get_enc_key(self, swf_url, video_id): + def get_enc_key(self, video_id): # TODO: automatic key extraction # last update at 2016-01-22 for Zombie::bite - enc_key = '6ab6d0280511493ba85594779759d4ed' + enc_key = '4a1caba4b4465345366f28da7c117d20' return enc_key def _extract_playlist(self, webpage): @@ -582,11 +554,9 @@ class IqiyiIE(InfoExtractor): r'data-player-tvid\s*=\s*[\'"](\d+)', webpage, 'tvid') video_id = self._search_regex( r'data-player-videoid\s*=\s*[\'"]([a-f\d]+)', webpage, 'video_id') - swf_url = self._search_regex( - r'(http://[^\'"]+MainPlayer[^.]+\.swf)', webpage, 'swf player URL') _uuid = uuid.uuid4().hex - enc_key = self.get_enc_key(swf_url, video_id) + enc_key = self.get_enc_key(video_id) raw_data = self.get_raw_data(tvid, video_id, enc_key, _uuid) diff --git a/youtube_dl/extractor/ivideon.py b/youtube_dl/extractor/ivideon.py index 617dc8c07..3ca824f79 100644 --- a/youtube_dl/extractor/ivideon.py +++ b/youtube_dl/extractor/ivideon.py @@ -5,7 +5,7 @@ import re from .common import InfoExtractor from ..compat import ( - compat_urllib_parse, + compat_urllib_parse_urlencode, compat_urlparse, ) from ..utils import qualities @@ -62,7 +62,7 @@ class IvideonIE(InfoExtractor): quality = qualities(self._QUALITIES) formats = [{ - 'url': 'https://streaming.ivideon.com/flv/live?%s' % compat_urllib_parse.urlencode({ + 'url': 'https://streaming.ivideon.com/flv/live?%s' % compat_urllib_parse_urlencode({ 'server': server_id, 'camera': camera_id, 'sessionId': 'demo', diff --git a/youtube_dl/extractor/jadorecettepub.py b/youtube_dl/extractor/jadorecettepub.py index 063e86de4..158c09a33 100644 --- a/youtube_dl/extractor/jadorecettepub.py +++ b/youtube_dl/extractor/jadorecettepub.py @@ -9,7 +9,7 @@ from .youtube import YoutubeIE class JadoreCettePubIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?jadorecettepub\.com/[0-9]{4}/[0-9]{2}/(?P.*?)\.html' + _VALID_URL = r'https?://(?:www\.)?jadorecettepub\.com/[0-9]{4}/[0-9]{2}/(?P.*?)\.html' _TEST = { 'url': 'http://www.jadorecettepub.com/2010/12/star-wars-massacre-par-les-japonais.html', diff --git a/youtube_dl/extractor/jeuxvideo.py b/youtube_dl/extractor/jeuxvideo.py index eef7daa29..1a4227f6b 100644 --- a/youtube_dl/extractor/jeuxvideo.py +++ b/youtube_dl/extractor/jeuxvideo.py @@ -8,7 +8,7 @@ from .common import InfoExtractor class JeuxVideoIE(InfoExtractor): - _VALID_URL = r'http://.*?\.jeuxvideo\.com/.*/(.*?)\.htm' + _VALID_URL = r'https?://.*?\.jeuxvideo\.com/.*/(.*?)\.htm' _TESTS = [{ 'url': 'http://www.jeuxvideo.com/reportages-videos-jeux/0004/00046170/tearaway-playstation-vita-gc-2013-tearaway-nous-presente-ses-papiers-d-identite-00115182.htm', @@ -30,7 +30,7 @@ class JeuxVideoIE(InfoExtractor): webpage = self._download_webpage(url, title) title = self._html_search_meta('name', webpage) or self._og_search_title(webpage) config_url = self._html_search_regex( - r'data-src="(/contenu/medias/video.php.*?)"', + r'data-src(?:set-video)?="(/contenu/medias/video.php.*?)"', webpage, 'config URL') config_url = 'http://www.jeuxvideo.com' + config_url diff --git a/youtube_dl/extractor/jwplatform.py b/youtube_dl/extractor/jwplatform.py index 8e90d5986..8a5e562db 100644 --- a/youtube_dl/extractor/jwplatform.py +++ b/youtube_dl/extractor/jwplatform.py @@ -4,10 +4,58 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import int_or_none +from ..utils import ( + float_or_none, + int_or_none, +) -class JWPlatformIE(InfoExtractor): +class JWPlatformBaseIE(InfoExtractor): + def _parse_jwplayer_data(self, jwplayer_data, video_id, require_title=True): + video_data = jwplayer_data['playlist'][0] + + formats = [] + for source in video_data['sources']: + source_url = self._proto_relative_url(source['file']) + source_type = source.get('type') or '' + if source_type in ('application/vnd.apple.mpegurl', 'hls'): + formats.extend(self._extract_m3u8_formats( + source_url, video_id, 'mp4', 'm3u8_native', fatal=False)) + elif source_type.startswith('audio'): + formats.append({ + 'url': source_url, + 'vcodec': 'none', + }) + else: + formats.append({ + 'url': source_url, + 'width': int_or_none(source.get('width')), + 'height': int_or_none(source.get('height')), + }) + self._sort_formats(formats) + + subtitles = {} + tracks = video_data.get('tracks') + if tracks and isinstance(tracks, list): + for track in tracks: + if track.get('file') and track.get('kind') == 'captions': + subtitles.setdefault(track.get('label') or 'en', []).append({ + 'url': self._proto_relative_url(track['file']) + }) + + return { + 'id': video_id, + 'title': video_data['title'] if require_title else video_data.get('title'), + 'description': video_data.get('description'), + 'thumbnail': self._proto_relative_url(video_data.get('image')), + 'timestamp': int_or_none(video_data.get('pubdate')), + 'duration': float_or_none(jwplayer_data.get('duration')), + 'subtitles': subtitles, + 'formats': formats, + } + + +class JWPlatformIE(JWPlatformBaseIE): _VALID_URL = r'(?:https?://content\.jwplatform\.com/(?:feeds|players|jw6)/|jwplatform:)(?P[a-zA-Z0-9]{8})' _TEST = { 'url': 'http://content.jwplatform.com/players/nPripu9l-ALJ3XQCI.js', @@ -33,38 +81,4 @@ class JWPlatformIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) json_data = self._download_json('http://content.jwplatform.com/feeds/%s.json' % video_id, video_id) - video_data = json_data['playlist'][0] - subtitles = {} - for track in video_data['tracks']: - if track['kind'] == 'captions': - subtitles[track['label']] = [{'url': self._proto_relative_url(track['file'])}] - - formats = [] - for source in video_data['sources']: - source_url = self._proto_relative_url(source['file']) - source_type = source.get('type') or '' - if source_type == 'application/vnd.apple.mpegurl': - formats.extend(self._extract_m3u8_formats( - source_url, video_id, 'mp4', 'm3u8_native', fatal=False)) - elif source_type.startswith('audio'): - formats.append({ - 'url': source_url, - 'vcodec': 'none', - }) - else: - formats.append({ - 'url': source_url, - 'width': int_or_none(source.get('width')), - 'height': int_or_none(source.get('height')), - }) - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': video_data['title'], - 'description': video_data.get('description'), - 'thumbnail': self._proto_relative_url(video_data.get('image')), - 'timestamp': int_or_none(video_data.get('pubdate')), - 'subtitles': subtitles, - 'formats': formats, - } + return self._parse_jwplayer_data(json_data, video_id) diff --git a/youtube_dl/extractor/kaltura.py b/youtube_dl/extractor/kaltura.py index ccbc39c66..a65697ff5 100644 --- a/youtube_dl/extractor/kaltura.py +++ b/youtube_dl/extractor/kaltura.py @@ -6,8 +6,9 @@ import base64 from .common import InfoExtractor from ..compat import ( - compat_urllib_parse, + compat_urllib_parse_urlencode, compat_urlparse, + compat_parse_qs, ) from ..utils import ( clean_html, @@ -20,21 +21,17 @@ from ..utils import ( class KalturaIE(InfoExtractor): _VALID_URL = r'''(?x) (?: - kaltura:(?P\d+):(?P[0-9a-z_]+)| + kaltura:(?P\d+):(?P[0-9a-z_]+)| https?:// (:?(?:www|cdnapi(?:sec)?)\.)?kaltura\.com/ (?: (?: # flash player - index\.php/kwidget/ - (?:[^/]+/)*?wid/_(?P\d+)/ - (?:[^/]+/)*?entry_id/(?P[0-9a-z_]+)| + index\.php/kwidget| # html5 player - html5/html5lib/ - (?:[^/]+/)*?entry_id/(?P[0-9a-z_]+) - .*\?.*\bwid=_(?P\d+) + html5/html5lib/[^/]+/mwEmbedFrame\.php ) - ) + )(?:/(?P[^?]+))?(?:\?(?P.*))? ) ''' _API_BASE = 'http://cdnapi.kaltura.com/api_v3/index.php?' @@ -74,7 +71,7 @@ class KalturaIE(InfoExtractor): for k, v in a.items(): params['%d:%s' % (i, k)] = v - query = compat_urllib_parse.urlencode(params) + query = compat_urllib_parse_urlencode(params) url = self._API_BASE + query data = self._download_json(url, video_id, *args, **kwargs) @@ -127,10 +124,41 @@ class KalturaIE(InfoExtractor): url, smuggled_data = unsmuggle_url(url, {}) mobj = re.match(self._VALID_URL, url) - partner_id = mobj.group('partner_id_s') or mobj.group('partner_id') or mobj.group('partner_id_html5') - entry_id = mobj.group('id_s') or mobj.group('id') or mobj.group('id_html5') - - info, flavor_assets = self._get_video_info(entry_id, partner_id) + partner_id, entry_id = mobj.group('partner_id', 'id') + ks = None + if partner_id and entry_id: + info, flavor_assets = self._get_video_info(entry_id, partner_id) + else: + path, query = mobj.group('path', 'query') + if not path and not query: + raise ExtractorError('Invalid URL', expected=True) + params = {} + if query: + params = compat_parse_qs(query) + if path: + splitted_path = path.split('/') + params.update(dict((zip(splitted_path[::2], [[v] for v in splitted_path[1::2]])))) + if 'wid' in params: + partner_id = params['wid'][0][1:] + elif 'p' in params: + partner_id = params['p'][0] + else: + raise ExtractorError('Invalid URL', expected=True) + if 'entry_id' in params: + entry_id = params['entry_id'][0] + info, flavor_assets = self._get_video_info(entry_id, partner_id) + elif 'uiconf_id' in params and 'flashvars[referenceId]' in params: + reference_id = params['flashvars[referenceId]'][0] + webpage = self._download_webpage(url, reference_id) + entry_data = self._parse_json(self._search_regex( + r'window\.kalturaIframePackageData\s*=\s*({.*});', + webpage, 'kalturaIframePackageData'), + reference_id)['entryResult'] + info, flavor_assets = entry_data['meta'], entry_data['contextData']['flavorAssets'] + entry_id = info['id'] + else: + raise ExtractorError('Invalid URL', expected=True) + ks = params.get('flashvars[ks]', [None])[0] source_url = smuggled_data.get('source_url') if source_url: @@ -140,14 +168,19 @@ class KalturaIE(InfoExtractor): else: referrer = None + def sign_url(unsigned_url): + if ks: + unsigned_url += '/ks/%s' % ks + if referrer: + unsigned_url += '?referrer=%s' % referrer + return unsigned_url + formats = [] for f in flavor_assets: # Continue if asset is not ready if f['status'] != 2: continue - video_url = '%s/flavorId/%s' % (info['dataUrl'], f['id']) - if referrer: - video_url += '?referrer=%s' % referrer + video_url = sign_url('%s/flavorId/%s' % (info['dataUrl'], f['id'])) formats.append({ 'format_id': '%(fileExt)s-%(bitrate)s' % f, 'ext': f.get('fileExt'), @@ -160,9 +193,7 @@ class KalturaIE(InfoExtractor): 'width': int_or_none(f.get('width')), 'url': video_url, }) - m3u8_url = info['dataUrl'].replace('format/url', 'format/applehttp') - if referrer: - m3u8_url += '?referrer=%s' % referrer + m3u8_url = sign_url(info['dataUrl'].replace('format/url', 'format/applehttp')) formats.extend(self._extract_m3u8_formats( m3u8_url, entry_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) diff --git a/youtube_dl/extractor/karaoketv.py b/youtube_dl/extractor/karaoketv.py index 06daf5a89..b4c30b7f3 100644 --- a/youtube_dl/extractor/karaoketv.py +++ b/youtube_dl/extractor/karaoketv.py @@ -9,7 +9,7 @@ from ..utils import ( class KaraoketvIE(InfoExtractor): - _VALID_URL = r'http://karaoketv\.co\.il/\?container=songs&id=(?P[0-9]+)' + _VALID_URL = r'https?://karaoketv\.co\.il/\?container=songs&id=(?P[0-9]+)' _TEST = { 'url': 'http://karaoketv.co.il/?container=songs&id=171568', 'info_dict': { diff --git a/youtube_dl/extractor/karrierevideos.py b/youtube_dl/extractor/karrierevideos.py index bed94bc93..2cb04e533 100644 --- a/youtube_dl/extractor/karrierevideos.py +++ b/youtube_dl/extractor/karrierevideos.py @@ -12,7 +12,7 @@ from ..utils import ( class KarriereVideosIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?karrierevideos\.at(?:/[^/]+)+/(?P[^/]+)' + _VALID_URL = r'https?://(?:www\.)?karrierevideos\.at(?:/[^/]+)+/(?P[^/]+)' _TESTS = [{ 'url': 'http://www.karrierevideos.at/berufsvideos/mittlere-hoehere-schulen/altenpflegerin', 'info_dict': { diff --git a/youtube_dl/extractor/khanacademy.py b/youtube_dl/extractor/khanacademy.py index 08a671fa8..61739efa7 100644 --- a/youtube_dl/extractor/khanacademy.py +++ b/youtube_dl/extractor/khanacademy.py @@ -14,10 +14,10 @@ class KhanAcademyIE(InfoExtractor): _TESTS = [{ 'url': 'http://www.khanacademy.org/video/one-time-pad', - 'md5': '7021db7f2d47d4fff89b13177cb1e8f4', + 'md5': '7b391cce85e758fb94f763ddc1bbb979', 'info_dict': { 'id': 'one-time-pad', - 'ext': 'mp4', + 'ext': 'webm', 'title': 'The one-time pad', 'description': 'The perfect cipher', 'duration': 176, diff --git a/youtube_dl/extractor/kontrtube.py b/youtube_dl/extractor/kontrtube.py index a59c529f4..704bd7b34 100644 --- a/youtube_dl/extractor/kontrtube.py +++ b/youtube_dl/extractor/kontrtube.py @@ -13,7 +13,7 @@ from ..utils import ( class KontrTubeIE(InfoExtractor): IE_NAME = 'kontrtube' IE_DESC = 'KontrTube.ru - Труба зовёт' - _VALID_URL = r'http://(?:www\.)?kontrtube\.ru/videos/(?P\d+)/(?P[^/]+)/' + _VALID_URL = r'https?://(?:www\.)?kontrtube\.ru/videos/(?P\d+)/(?P[^/]+)/' _TEST = { 'url': 'http://www.kontrtube.ru/videos/2678/nad-olimpiyskoy-derevney-v-sochi-podnyat-rossiyskiy-flag/', diff --git a/youtube_dl/extractor/ku6.py b/youtube_dl/extractor/ku6.py index a602980a1..a574408e5 100644 --- a/youtube_dl/extractor/ku6.py +++ b/youtube_dl/extractor/ku6.py @@ -4,7 +4,7 @@ from .common import InfoExtractor class Ku6IE(InfoExtractor): - _VALID_URL = r'http://v\.ku6\.com/show/(?P[a-zA-Z0-9\-\_]+)(?:\.)*html' + _VALID_URL = r'https?://v\.ku6\.com/show/(?P[a-zA-Z0-9\-\_]+)(?:\.)*html' _TEST = { 'url': 'http://v.ku6.com/show/JG-8yS14xzBr4bCn1pu0xw...html', 'md5': '01203549b9efbb45f4b87d55bdea1ed1', diff --git a/youtube_dl/extractor/kusi.py b/youtube_dl/extractor/kusi.py new file mode 100644 index 000000000..12cc56e44 --- /dev/null +++ b/youtube_dl/extractor/kusi.py @@ -0,0 +1,99 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import random +import re + +from .common import InfoExtractor +from ..compat import compat_urllib_parse_unquote_plus +from ..utils import ( + int_or_none, + float_or_none, + timeconvert, + update_url_query, + xpath_text, +) + + +class KUSIIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?kusi\.com/(?Pstory/.+|video\?clipId=(?P\d+))' + _TESTS = [{ + 'url': 'http://www.kusi.com/story/31183873/turko-files-case-closed-put-on-hold', + 'md5': 'f926e7684294cf8cb7bdf8858e1b3988', + 'info_dict': { + 'id': '12203019', + 'ext': 'mp4', + 'title': 'Turko Files: Case Closed! & Put On Hold!', + 'duration': 231.0, + 'upload_date': '20160210', + 'timestamp': 1455087571, + 'thumbnail': 're:^https?://.*\.jpg$' + }, + }, { + 'url': 'http://kusi.com/video?clipId=12203019', + 'info_dict': { + 'id': '12203019', + 'ext': 'mp4', + 'title': 'Turko Files: Case Closed! & Put On Hold!', + 'duration': 231.0, + 'upload_date': '20160210', + 'timestamp': 1455087571, + 'thumbnail': 're:^https?://.*\.jpg$' + }, + 'params': { + 'skip_download': True, # Same as previous one + }, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + clip_id = mobj.group('clipId') + video_id = clip_id or mobj.group('path') + + webpage = self._download_webpage(url, video_id) + + if clip_id is None: + video_id = clip_id = self._html_search_regex( + r'"clipId"\s*,\s*"(\d+)"', webpage, 'clip id') + + affiliate_id = self._search_regex( + r'affiliateId\s*:\s*\'([^\']+)\'', webpage, 'affiliate id') + + # See __Packages/worldnow/model/GalleryModel.as of WNGallery.swf + xml_url = update_url_query('http://www.kusi.com/build.asp', { + 'buildtype': 'buildfeaturexmlrequest', + 'featureType': 'Clip', + 'featureid': clip_id, + 'affiliateno': affiliate_id, + 'clientgroupid': '1', + 'rnd': int(round(random.random() * 1000000)), + }) + + doc = self._download_xml(xml_url, video_id) + + video_title = xpath_text(doc, 'HEADLINE', fatal=True) + duration = float_or_none(xpath_text(doc, 'DURATION'), scale=1000) + description = xpath_text(doc, 'ABSTRACT') + thumbnail = xpath_text(doc, './THUMBNAILIMAGE/FILENAME') + createtion_time = timeconvert(xpath_text(doc, 'rfc822creationdate')) + + quality_options = doc.find('{http://search.yahoo.com/mrss/}group').findall('{http://search.yahoo.com/mrss/}content') + formats = [] + for quality in quality_options: + formats.append({ + 'url': compat_urllib_parse_unquote_plus(quality.attrib['url']), + 'height': int_or_none(quality.attrib.get('height')), + 'width': int_or_none(quality.attrib.get('width')), + 'vbr': float_or_none(quality.attrib.get('bitratebits'), scale=1000), + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': video_title, + 'description': description, + 'duration': duration, + 'formats': formats, + 'thumbnail': thumbnail, + 'timestamp': createtion_time, + } diff --git a/youtube_dl/extractor/kuwo.py b/youtube_dl/extractor/kuwo.py index f641edef8..86c17c931 100644 --- a/youtube_dl/extractor/kuwo.py +++ b/youtube_dl/extractor/kuwo.py @@ -2,13 +2,13 @@ from __future__ import unicode_literals import re -import itertools from .common import InfoExtractor from ..utils import ( get_element_by_id, clean_html, ExtractorError, + InAdvancePagedList, remove_start, ) @@ -23,16 +23,29 @@ class KuwoBaseIE(InfoExtractor): {'format': 'aac', 'ext': 'aac', 'abr': 48, 'preference': 10} ] - def _get_formats(self, song_id): + def _get_formats(self, song_id, tolerate_ip_deny=False): formats = [] for file_format in self._FORMATS: + headers = {} + cn_verification_proxy = self._downloader.params.get('cn_verification_proxy') + if cn_verification_proxy: + headers['Ytdl-request-proxy'] = cn_verification_proxy + + query = { + 'format': file_format['ext'], + 'br': file_format.get('br', ''), + 'rid': 'MUSIC_%s' % song_id, + 'type': 'convert_url', + 'response': 'url' + } + song_url = self._download_webpage( - 'http://antiserver.kuwo.cn/anti.s?format=%s&br=%s&rid=MUSIC_%s&type=convert_url&response=url' % - (file_format['ext'], file_format.get('br', ''), song_id), + 'http://antiserver.kuwo.cn/anti.s', song_id, note='Download %s url info' % file_format['format'], + query=query, headers=headers, ) - if song_url == 'IPDeny': + if song_url == 'IPDeny' and not tolerate_ip_deny: raise ExtractorError('This song is blocked in this region', expected=True) if song_url.startswith('http://') or song_url.startswith('https://'): @@ -43,14 +56,14 @@ class KuwoBaseIE(InfoExtractor): 'preference': file_format['preference'], 'abr': file_format.get('abr'), }) - self._sort_formats(formats) + return formats class KuwoIE(KuwoBaseIE): IE_NAME = 'kuwo:song' IE_DESC = '酷我音乐' - _VALID_URL = r'http://www\.kuwo\.cn/yinyue/(?P\d+?)/' + _VALID_URL = r'https?://www\.kuwo\.cn/yinyue/(?P\d+)' _TESTS = [{ 'url': 'http://www.kuwo.cn/yinyue/635632/', 'info_dict': { @@ -68,12 +81,16 @@ class KuwoIE(KuwoBaseIE): 'id': '6446136', 'ext': 'mp3', 'title': '心', + 'description': 'md5:b2ab6295d014005bfc607525bfc1e38a', 'creator': 'IU', 'upload_date': '20150518', }, 'params': { 'format': 'mp3-320' }, + }, { + 'url': 'http://www.kuwo.cn/yinyue/3197154?catalog=yueku2016', + 'only_matching': True, }] def _real_extract(self, url): @@ -94,6 +111,7 @@ class KuwoIE(KuwoBaseIE): lrc_content = None formats = self._get_formats(song_id) + self._sort_formats(formats) album_id = self._html_search_regex( r']+class="album"[^<]+]+href="http://www\.kuwo\.cn/album/(\d+)/"', @@ -125,7 +143,7 @@ class KuwoIE(KuwoBaseIE): class KuwoAlbumIE(InfoExtractor): IE_NAME = 'kuwo:album' IE_DESC = '酷我音乐 - 专辑' - _VALID_URL = r'http://www\.kuwo\.cn/album/(?P\d+?)/' + _VALID_URL = r'https?://www\.kuwo\.cn/album/(?P\d+?)/' _TEST = { 'url': 'http://www.kuwo.cn/album/502294/', 'info_dict': { @@ -161,13 +179,11 @@ class KuwoAlbumIE(InfoExtractor): class KuwoChartIE(InfoExtractor): IE_NAME = 'kuwo:chart' IE_DESC = '酷我音乐 - 排行榜' - _VALID_URL = r'http://yinyue\.kuwo\.cn/billboard_(?P[^.]+).htm' + _VALID_URL = r'https?://yinyue\.kuwo\.cn/billboard_(?P[^.]+).htm' _TEST = { 'url': 'http://yinyue.kuwo.cn/billboard_香港中文龙虎榜.htm', 'info_dict': { 'id': '香港中文龙虎榜', - 'title': '香港中文龙虎榜', - 'description': 're:\d{4}第\d{2}期', }, 'playlist_mincount': 10, } @@ -178,30 +194,24 @@ class KuwoChartIE(InfoExtractor): url, chart_id, note='Download chart info', errnote='Unable to get chart info') - chart_name = self._html_search_regex( - r']+class="unDis">([^<]+)

', webpage, 'chart name') - - chart_desc = self._html_search_regex( - r']+class="tabDef">(\d{4}第\d{2}期)

', webpage, 'chart desc') - entries = [ self.url_result(song_url, 'Kuwo') for song_url in re.findall( - r']+href="(http://www\.kuwo\.cn/yinyue/\d+)/"', webpage) + r']+href="(http://www\.kuwo\.cn/yinyue/\d+)', webpage) ] - return self.playlist_result(entries, chart_id, chart_name, chart_desc) + return self.playlist_result(entries, chart_id) class KuwoSingerIE(InfoExtractor): IE_NAME = 'kuwo:singer' IE_DESC = '酷我音乐 - 歌手' - _VALID_URL = r'http://www\.kuwo\.cn/mingxing/(?P[^/]+)' + _VALID_URL = r'https?://www\.kuwo\.cn/mingxing/(?P[^/]+)' _TESTS = [{ 'url': 'http://www.kuwo.cn/mingxing/bruno+mars/', 'info_dict': { 'id': 'bruno+mars', 'title': 'Bruno Mars', }, - 'playlist_count': 10, + 'playlist_mincount': 329, }, { 'url': 'http://www.kuwo.cn/mingxing/Ali/music.htm', 'info_dict': { @@ -212,6 +222,8 @@ class KuwoSingerIE(InfoExtractor): 'skip': 'Regularly stalls travis build', # See https://travis-ci.org/rg3/youtube-dl/jobs/78878540 }] + PAGE_SIZE = 15 + def _real_extract(self, url): singer_id = self._match_id(url) webpage = self._download_webpage( @@ -219,25 +231,28 @@ class KuwoSingerIE(InfoExtractor): errnote='Unable to get singer info') singer_name = self._html_search_regex( - r'
\s*

([^<]+)([^<]+)

', webpage, 'singer name') - entries = [] - first_page_only = False if re.search(r'/music(?:_\d+)?\.htm', url) else True - for page_num in itertools.count(1): + artist_id = self._html_search_regex( + r'data-artistid="(\d+)"', webpage, 'artist id') + + page_count = int(self._html_search_regex( + r'data-page="(\d+)"', webpage, 'page count')) + + def page_func(page_num): webpage = self._download_webpage( - 'http://www.kuwo.cn/mingxing/%s/music_%d.htm' % (singer_id, page_num), - singer_id, note='Download song list page #%d' % page_num, - errnote='Unable to get song list page #%d' % page_num) + 'http://www.kuwo.cn/artist/contentMusicsAjax', + singer_id, note='Download song list page #%d' % (page_num + 1), + errnote='Unable to get song list page #%d' % (page_num + 1), + query={'artistId': artist_id, 'pn': page_num, 'rn': self.PAGE_SIZE}) - entries.extend([ + return [ self.url_result(song_url, 'Kuwo') for song_url in re.findall( - r']+class="m_name">]+href="(http://www\.kuwo\.cn/yinyue/\d+)/', + r']+class="name">]+href="(http://www\.kuwo\.cn/yinyue/\d+)', webpage) - ][:10 if first_page_only else None]) + ] - if first_page_only or not re.search(r']+href="[^"]+">下一页', webpage): - break + entries = InAdvancePagedList(page_func, page_count, self.PAGE_SIZE) return self.playlist_result(entries, singer_id, singer_name) @@ -245,7 +260,7 @@ class KuwoSingerIE(InfoExtractor): class KuwoCategoryIE(InfoExtractor): IE_NAME = 'kuwo:category' IE_DESC = '酷我音乐 - 分类' - _VALID_URL = r'http://yinyue\.kuwo\.cn/yy/cinfo_(?P\d+?).htm' + _VALID_URL = r'https?://yinyue\.kuwo\.cn/yy/cinfo_(?P\d+?).htm' _TEST = { 'url': 'http://yinyue.kuwo.cn/yy/cinfo_86375.htm', 'info_dict': { @@ -282,15 +297,21 @@ class KuwoCategoryIE(InfoExtractor): class KuwoMvIE(KuwoBaseIE): IE_NAME = 'kuwo:mv' IE_DESC = '酷我音乐 - MV' - _VALID_URL = r'http://www\.kuwo\.cn/mv/(?P\d+?)/' + _VALID_URL = r'https?://www\.kuwo\.cn/mv/(?P\d+?)/' _TEST = { 'url': 'http://www.kuwo.cn/mv/6480076/', 'info_dict': { 'id': '6480076', - 'ext': 'mkv', - 'title': '我们家MV', + 'ext': 'mp4', + 'title': 'My HouseMV', 'creator': '2PM', }, + # In this video, music URLs (anti.s) are blocked outside China and + # USA, while the MV URL (mvurl) is available globally, so force the MV + # URL for consistent results in different countries + 'params': { + 'format': 'mv', + }, } _FORMATS = KuwoBaseIE._FORMATS + [ {'format': 'mkv', 'ext': 'mkv', 'preference': 250}, @@ -312,7 +333,17 @@ class KuwoMvIE(KuwoBaseIE): else: raise ExtractorError('Unable to find song or singer names') - formats = self._get_formats(song_id) + formats = self._get_formats(song_id, tolerate_ip_deny=True) + + mv_url = self._download_webpage( + 'http://www.kuwo.cn/yy/st/mvurl?rid=MUSIC_%s' % song_id, + song_id, note='Download %s MV URL' % song_id) + formats.append({ + 'url': mv_url, + 'format_id': 'mv', + }) + + self._sort_formats(formats) return { 'id': song_id, diff --git a/youtube_dl/extractor/laola1tv.py b/youtube_dl/extractor/laola1tv.py index 5d8ebbeb3..d4fbafece 100644 --- a/youtube_dl/extractor/laola1tv.py +++ b/youtube_dl/extractor/laola1tv.py @@ -5,7 +5,7 @@ import re from .common import InfoExtractor from ..compat import ( - compat_urllib_parse, + compat_urllib_parse_urlencode, compat_urlparse, ) from ..utils import ( @@ -19,7 +19,7 @@ from ..utils import ( class Laola1TvIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?laola1\.tv/(?P[a-z]+)-(?P[a-z]+)/[^/]+/(?P[^/?#&]+)' + _VALID_URL = r'https?://(?:www\.)?laola1\.tv/(?P[a-z]+)-(?P[a-z]+)/(?P[^/]+)/(?P[^/?#&]+)' _TESTS = [{ 'url': 'http://www.laola1.tv/de-de/video/straubing-tigers-koelner-haie/227883.html', 'info_dict': { @@ -33,7 +33,7 @@ class Laola1TvIE(InfoExtractor): }, 'params': { 'skip_download': True, - } + }, }, { 'url': 'http://www.laola1.tv/de-de/video/straubing-tigers-koelner-haie', 'info_dict': { @@ -47,12 +47,28 @@ class Laola1TvIE(InfoExtractor): }, 'params': { 'skip_download': True, - } + }, + }, { + 'url': 'http://www.laola1.tv/de-de/livestream/2016-03-22-belogorie-belgorod-trentino-diatec-lde', + 'info_dict': { + 'id': '487850', + 'display_id': '2016-03-22-belogorie-belgorod-trentino-diatec-lde', + 'ext': 'flv', + 'title': 'Belogorie BELGOROD - TRENTINO Diatec', + 'upload_date': '20160322', + 'uploader': 'CEV - Europäischer Volleyball Verband', + 'is_live': True, + 'categories': ['Volleyball'], + }, + 'params': { + 'skip_download': True, + }, }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) display_id = mobj.group('slug') + kind = mobj.group('kind') lang = mobj.group('lang') portal = mobj.group('portal') @@ -74,7 +90,7 @@ class Laola1TvIE(InfoExtractor): hd_doc = self._download_xml( 'http://www.laola1.tv/server/hd_video.php?%s' - % compat_urllib_parse.urlencode({ + % compat_urllib_parse_urlencode({ 'play': video_id, 'partner': partner_id, 'portal': portal, @@ -85,12 +101,17 @@ class Laola1TvIE(InfoExtractor): _v = lambda x, **k: xpath_text(hd_doc, './/video/' + x, **k) title = _v('title', fatal=True) + VS_TARGETS = { + 'video': '2', + 'livestream': '17', + } + req = sanitized_Request( 'https://club.laola1.tv/sp/laola1/api/v3/user/session/premium/player/stream-access?%s' % - compat_urllib_parse.urlencode({ + compat_urllib_parse_urlencode({ 'videoId': video_id, - 'target': '2', - 'label': 'laola1tv', + 'target': VS_TARGETS.get(kind, '2'), + 'label': _v('label'), 'area': _v('area'), }), urlencode_postdata( @@ -109,6 +130,7 @@ class Laola1TvIE(InfoExtractor): formats = self._extract_f4m_formats( '%s?hdnea=%s&hdcore=3.2.0' % (token_attrib['url'], token_auth), video_id, f4m_id='hds') + self._sort_formats(formats) categories_str = _v('meta_sports') categories = categories_str.split(',') if categories_str else [] diff --git a/youtube_dl/extractor/lcp.py b/youtube_dl/extractor/lcp.py index 38d7502df..c3ed9cf68 100644 --- a/youtube_dl/extractor/lcp.py +++ b/youtube_dl/extractor/lcp.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals from .common import InfoExtractor + class LcpIE(InfoExtractor): IE_NAME = 'LCP' _VALID_URL = r'https?://(?:www\.)?lcp\.fr/(?:[^\/]+/)*(?P[^/]+)' diff --git a/youtube_dl/extractor/letv.py b/youtube_dl/extractor/leeco.py similarity index 82% rename from youtube_dl/extractor/letv.py rename to youtube_dl/extractor/leeco.py index 9665ece89..375fdaed1 100644 --- a/youtube_dl/extractor/letv.py +++ b/youtube_dl/extractor/leeco.py @@ -1,36 +1,39 @@ # coding: utf-8 from __future__ import unicode_literals +import base64 import datetime +import hashlib import re import time -import base64 -import hashlib from .common import InfoExtractor from ..compat import ( - compat_urllib_parse, compat_ord, compat_str, + compat_urllib_parse_urlencode, ) from ..utils import ( determine_ext, + encode_data_uri, ExtractorError, + int_or_none, + orderedSet, parse_iso8601, sanitized_Request, - int_or_none, str_or_none, - encode_data_uri, url_basename, ) -class LetvIE(InfoExtractor): +class LeIE(InfoExtractor): IE_DESC = '乐视网' - _VALID_URL = r'http://www\.letv\.com/ptv/vplay/(?P\d+).html' + _VALID_URL = r'https?://www\.le\.com/ptv/vplay/(?P\d+)\.html' + + _URL_TEMPLATE = 'http://www.le.com/ptv/vplay/%s.html' _TESTS = [{ - 'url': 'http://www.letv.com/ptv/vplay/22005890.html', + 'url': 'http://www.le.com/ptv/vplay/22005890.html', 'md5': 'edadcfe5406976f42f9f266057ee5e40', 'info_dict': { 'id': '22005890', @@ -42,7 +45,7 @@ class LetvIE(InfoExtractor): 'hls_prefer_native': True, }, }, { - 'url': 'http://www.letv.com/ptv/vplay/1415246.html', + 'url': 'http://www.le.com/ptv/vplay/1415246.html', 'info_dict': { 'id': '1415246', 'ext': 'mp4', @@ -54,7 +57,7 @@ class LetvIE(InfoExtractor): }, }, { 'note': 'This video is available only in Mainland China, thus a proxy is needed', - 'url': 'http://www.letv.com/ptv/vplay/1118082.html', + 'url': 'http://www.le.com/ptv/vplay/1118082.html', 'md5': '2424c74948a62e5f31988438979c5ad1', 'info_dict': { 'id': '1118082', @@ -94,17 +97,16 @@ class LetvIE(InfoExtractor): return encrypted_data encrypted_data = encrypted_data[5:] - _loc4_ = bytearray() - while encrypted_data: - b = compat_ord(encrypted_data[0]) - _loc4_.extend([b // 16, b & 0x0f]) - encrypted_data = encrypted_data[1:] + _loc4_ = bytearray(2 * len(encrypted_data)) + for idx, val in enumerate(encrypted_data): + b = compat_ord(val) + _loc4_[2 * idx] = b // 16 + _loc4_[2 * idx + 1] = b % 16 idx = len(_loc4_) - 11 _loc4_ = _loc4_[idx:] + _loc4_[:idx] - _loc7_ = bytearray() - while _loc4_: - _loc7_.append(_loc4_[0] * 16 + _loc4_[1]) - _loc4_ = _loc4_[2:] + _loc7_ = bytearray(len(encrypted_data)) + for i in range(len(encrypted_data)): + _loc7_[i] = _loc4_[2 * i] * 16 + _loc4_[2 * i + 1] return bytes(_loc7_) @@ -117,10 +119,10 @@ class LetvIE(InfoExtractor): 'splatid': 101, 'format': 1, 'tkey': self.calc_time_key(int(time.time())), - 'domain': 'www.letv.com' + 'domain': 'www.le.com' } play_json_req = sanitized_Request( - 'http://api.letv.com/mms/out/video/playJson?' + compat_urllib_parse.urlencode(params) + 'http://api.le.com/mms/out/video/playJson?' + compat_urllib_parse_urlencode(params) ) cn_verification_proxy = self._downloader.params.get('cn_verification_proxy') if cn_verification_proxy: @@ -149,7 +151,7 @@ class LetvIE(InfoExtractor): for format_id in formats: if format_id in dispatch: media_url = playurl['domain'][0] + dispatch[format_id][0] - media_url += '&' + compat_urllib_parse.urlencode({ + media_url += '&' + compat_urllib_parse_urlencode({ 'm3v': 1, 'format': 1, 'expect': 3, @@ -193,26 +195,51 @@ class LetvIE(InfoExtractor): } -class LetvTvIE(InfoExtractor): - _VALID_URL = r'http://www.letv.com/tv/(?P\d+).html' +class LePlaylistIE(InfoExtractor): + _VALID_URL = r'https?://[a-z]+\.le\.com/[a-z]+/(?P[a-z0-9_]+)' + _TESTS = [{ - 'url': 'http://www.letv.com/tv/46177.html', + 'url': 'http://www.le.com/tv/46177.html', 'info_dict': { 'id': '46177', 'title': '美人天下', 'description': 'md5:395666ff41b44080396e59570dbac01c' }, 'playlist_count': 35 + }, { + 'url': 'http://tv.le.com/izt/wuzetian/index.html', + 'info_dict': { + 'id': 'wuzetian', + 'title': '武媚娘传奇', + 'description': 'md5:e12499475ab3d50219e5bba00b3cb248' + }, + # This playlist contains some extra videos other than the drama itself + 'playlist_mincount': 96 + }, { + 'url': 'http://tv.le.com/pzt/lswjzzjc/index.shtml', + # This series is moved to http://www.le.com/tv/10005297.html + 'only_matching': True, + }, { + 'url': 'http://www.le.com/comic/92063.html', + 'only_matching': True, + }, { + 'url': 'http://list.le.com/listn/c1009_sc532002_d2_p1_o1.html', + 'only_matching': True, }] + @classmethod + def suitable(cls, url): + return False if LeIE.suitable(url) else super(LePlaylistIE, cls).suitable(url) + def _real_extract(self, url): playlist_id = self._match_id(url) page = self._download_webpage(url, playlist_id) - media_urls = list(set(re.findall( - r'http://www.letv.com/ptv/vplay/\d+.html', page))) - entries = [self.url_result(media_url, ie='Letv') - for media_url in media_urls] + # Currently old domain names are still used in playlists + media_ids = orderedSet(re.findall( + r']+href="http://www\.letv\.com/ptv/vplay/(\d+)\.html', page)) + entries = [self.url_result(LeIE._URL_TEMPLATE % media_id, ie='Le') + for media_id in media_ids] title = self._html_search_meta('keywords', page, fatal=False).split(',')[0] @@ -222,31 +249,9 @@ class LetvTvIE(InfoExtractor): playlist_description=description) -class LetvPlaylistIE(LetvTvIE): - _VALID_URL = r'http://tv.letv.com/[a-z]+/(?P[a-z]+)/index.s?html' - _TESTS = [{ - 'url': 'http://tv.letv.com/izt/wuzetian/index.html', - 'info_dict': { - 'id': 'wuzetian', - 'title': '武媚娘传奇', - 'description': 'md5:e12499475ab3d50219e5bba00b3cb248' - }, - # This playlist contains some extra videos other than the drama itself - 'playlist_mincount': 96 - }, { - 'url': 'http://tv.letv.com/pzt/lswjzzjc/index.shtml', - 'info_dict': { - 'id': 'lswjzzjc', - # The title should be "劲舞青春", but I can't find a simple way to - # determine the playlist title - 'title': '乐视午间自制剧场', - 'description': 'md5:b1eef244f45589a7b5b1af9ff25a4489' - }, - 'playlist_mincount': 7 - }] - - class LetvCloudIE(InfoExtractor): + # Most of *.letv.com is changed to *.le.com on 2016/01/02 + # but yuntv.letv.com is kept, so also keep the extractor name IE_DESC = '乐视云' _VALID_URL = r'https?://yuntv\.letv\.com/bcloud.html\?.+' @@ -300,7 +305,7 @@ class LetvCloudIE(InfoExtractor): } self.sign_data(data) return self._download_json( - 'http://api.letvcloud.com/gpc.php?' + compat_urllib_parse.urlencode(data), + 'http://api.letvcloud.com/gpc.php?' + compat_urllib_parse_urlencode(data), media_id, 'Downloading playJson data for type %s' % cf) play_json = get_play_json(cf, time.time()) @@ -327,7 +332,7 @@ class LetvCloudIE(InfoExtractor): formats.append({ 'url': url, 'ext': determine_ext(decoded_url), - 'format_id': int_or_none(play_url.get('vtype')), + 'format_id': str_or_none(play_url.get('vtype')), 'format_note': str_or_none(play_url.get('definition')), 'width': int_or_none(play_url.get('vwidth')), 'height': int_or_none(play_url.get('vheight')), diff --git a/youtube_dl/extractor/lifenews.py b/youtube_dl/extractor/lifenews.py index f8cbca7b3..ba2f80a75 100644 --- a/youtube_dl/extractor/lifenews.py +++ b/youtube_dl/extractor/lifenews.py @@ -17,21 +17,21 @@ from ..utils import ( class LifeNewsIE(InfoExtractor): IE_NAME = 'lifenews' IE_DESC = 'LIFE | NEWS' - _VALID_URL = r'http://lifenews\.ru/(?:mobile/)?(?P
news|video)/(?P\d+)' + _VALID_URL = r'https?://lifenews\.ru/(?:mobile/)?(?P
news|video)/(?P\d+)' _TESTS = [{ - 'url': 'http://lifenews.ru/news/126342', - 'md5': 'e1b50a5c5fb98a6a544250f2e0db570a', + # single video embedded via video/source + 'url': 'http://lifenews.ru/news/98736', + 'md5': '77c95eaefaca216e32a76a343ad89d23', 'info_dict': { - 'id': '126342', + 'id': '98736', 'ext': 'mp4', - 'title': 'МВД разыскивает мужчин, оставивших в IKEA сумку с автоматом', - 'description': 'Камеры наблюдения гипермаркета зафиксировали троих мужчин, спрятавших оружейный арсенал в камере хранения.', - 'thumbnail': 're:http://.*\.jpg', - 'upload_date': '20140130', + 'title': 'Мужчина нашел дома архив оборонного завода', + 'description': 'md5:3b06b1b39b5e2bea548e403d99b8bf26', + 'upload_date': '20120805', } }, { - # video in