diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index d5be8003b..ff626883d 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.06.11*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.06.11** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2019.02.18*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2019.02.18** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2018.06.11 +[debug] youtube-dl version 2019.02.18 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.gitignore b/.gitignore index f064a0d9e..c4870a6ba 100644 --- a/.gitignore +++ b/.gitignore @@ -48,3 +48,6 @@ youtube-dl.zsh tmp/ venv/ + +# VS Code related files +.vscode diff --git a/.travis.yml b/.travis.yml index 92f326860..79287ccf6 100644 --- a/.travis.yml +++ b/.travis.yml @@ -15,6 +15,18 @@ env: - YTDL_TEST_SET=download matrix: include: + - python: 3.7 + dist: xenial + env: YTDL_TEST_SET=core + - python: 3.7 + dist: xenial + env: YTDL_TEST_SET=download + - python: 3.8-dev + dist: xenial + env: YTDL_TEST_SET=core + - python: 3.8-dev + dist: xenial + env: YTDL_TEST_SET=download - env: JYTHON=true; YTDL_TEST_SET=core - env: JYTHON=true; YTDL_TEST_SET=download fast_finish: true diff --git a/AUTHORS b/AUTHORS index eaf96d79d..b507cb8df 100644 --- a/AUTHORS +++ b/AUTHORS @@ -239,3 +239,10 @@ Martin Weinelt Surya Oktafendri TingPing Alexandre Macabies +Bastian de Groot +Niklas Haas +András Veres-Szentkirályi +Enes Solak +Nathan Rossi +Thomas van der Berg +Luca Cherubin diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 333acee80..6c1739860 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -152,16 +152,20 @@ After you have ensured this site is distributing its content legally, you can fo ``` 5. Add an import in [`youtube_dl/extractor/extractors.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/extractors.py). 6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc. Note that tests with `only_matching` key in test's dict are not counted in. -7. Have a look at [`youtube_dl/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L74-L252). Add tests and code for as many as you want. -8. Make sure your code follows [youtube-dl coding conventions](#youtube-dl-coding-conventions) and check the code with [flake8](https://pypi.python.org/pypi/flake8). Also make sure your code works under all [Python](https://www.python.org/) versions claimed supported by youtube-dl, namely 2.6, 2.7, and 3.2+. -9. When the tests pass, [add](https://git-scm.com/docs/git-add) the new files and [commit](https://git-scm.com/docs/git-commit) them and [push](https://git-scm.com/docs/git-push) the result, like this: +7. Have a look at [`youtube_dl/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/rg3/youtube-dl/blob/7f41a598b3fba1bcab2817de64a08941200aa3c8/youtube_dl/extractor/common.py#L94-L303). Add tests and code for as many as you want. +8. Make sure your code follows [youtube-dl coding conventions](#youtube-dl-coding-conventions) and check the code with [flake8](http://flake8.pycqa.org/en/latest/index.html#quickstart): + + $ flake8 youtube_dl/extractor/yourextractor.py + +9. Make sure your code works under all [Python](https://www.python.org/) versions claimed supported by youtube-dl, namely 2.6, 2.7, and 3.2+. +10. When the tests pass, [add](https://git-scm.com/docs/git-add) the new files and [commit](https://git-scm.com/docs/git-commit) them and [push](https://git-scm.com/docs/git-push) the result, like this: $ git add youtube_dl/extractor/extractors.py $ git add youtube_dl/extractor/yourextractor.py $ git commit -m '[yourextractor] Add new extractor' $ git push origin yourextractor -10. Finally, [create a pull request](https://help.github.com/articles/creating-a-pull-request). We'll then review and merge it. +11. Finally, [create a pull request](https://help.github.com/articles/creating-a-pull-request). We'll then review and merge it. In any case, thank you very much for your contributions! @@ -173,7 +177,7 @@ Extractors are very fragile by nature since they depend on the layout of the sou ### Mandatory and optional metafields -For extraction to work youtube-dl relies on metadata your extractor extracts and provides to youtube-dl expressed by an [information dictionary](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L75-L257) or simply *info dict*. Only the following meta fields in the *info dict* are considered mandatory for a successful extraction process by youtube-dl: +For extraction to work youtube-dl relies on metadata your extractor extracts and provides to youtube-dl expressed by an [information dictionary](https://github.com/rg3/youtube-dl/blob/7f41a598b3fba1bcab2817de64a08941200aa3c8/youtube_dl/extractor/common.py#L94-L303) or simply *info dict*. Only the following meta fields in the *info dict* are considered mandatory for a successful extraction process by youtube-dl: - `id` (media identifier) - `title` (media title) @@ -181,7 +185,7 @@ For extraction to work youtube-dl relies on metadata your extractor extracts and In fact only the last option is technically mandatory (i.e. if you can't figure out the download location of the media the extraction does not make any sense). But by convention youtube-dl also treats `id` and `title` as mandatory. Thus the aforementioned metafields are the critical data that the extraction does not make any sense without and if any of them fail to be extracted then the extractor is considered completely broken. -[Any field](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L149-L257) apart from the aforementioned ones are considered **optional**. That means that extraction should be **tolerant** to situations when sources for these fields can potentially be unavailable (even if they are always available at the moment) and **future-proof** in order not to break the extraction of general purpose mandatory fields. +[Any field](https://github.com/rg3/youtube-dl/blob/7f41a598b3fba1bcab2817de64a08941200aa3c8/youtube_dl/extractor/common.py#L188-L303) apart from the aforementioned ones are considered **optional**. That means that extraction should be **tolerant** to situations when sources for these fields can potentially be unavailable (even if they are always available at the moment) and **future-proof** in order not to break the extraction of general purpose mandatory fields. #### Example @@ -257,11 +261,33 @@ title = meta.get('title') or self._og_search_title(webpage) This code will try to extract from `meta` first and if it fails it will try extracting `og:title` from a `webpage`. -### Make regular expressions flexible +### Regular expressions -When using regular expressions try to write them fuzzy and flexible. +#### Don't capture groups you don't use + +Capturing group must be an indication that it's used somewhere in the code. Any group that is not used must be non capturing. + +##### Example + +Don't capture id attribute name here since you can't use it for anything anyway. + +Correct: + +```python +r'(?:id|ID)=(?P\d+)' +``` + +Incorrect: +```python +r'(id|ID)=(?P\d+)' +``` + + +#### Make regular expressions relaxed and flexible + +When using regular expressions try to write them fuzzy, relaxed and flexible, skipping insignificant parts that are more likely to change, allowing both single and double quotes for quoted values and so on. -#### Example +##### Example Say you need to extract `title` from the following HTML code: @@ -294,7 +320,49 @@ title = self._search_regex( webpage, 'title', group='title') ``` -### Use safe conversion functions +### Long lines policy -Wrap all extracted numeric data into safe functions from `utils`: `int_or_none`, `float_or_none`. Use them for string to number conversions as well. +There is a soft limit to keep lines of code under 80 characters long. This means it should be respected if possible and if it does not make readability and code maintenance worse. + +For example, you should **never** split long string literals like URLs or some other often copied entities over multiple lines to fit this limit: + +Correct: + +```python +'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4' +``` + +Incorrect: + +```python +'https://www.youtube.com/watch?v=FqZTN594JQw&list=' +'PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4' +``` + +### Use convenience conversion and parsing functions + +Wrap all extracted numeric data into safe functions from [`youtube_dl/utils.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/utils.py): `int_or_none`, `float_or_none`. Use them for string to number conversions as well. + +Use `url_or_none` for safe URL processing. + +Use `try_get` for safe metadata extraction from parsed JSON. + +Use `unified_strdate` for uniform `upload_date` or any `YYYYMMDD` meta field extraction, `unified_timestamp` for uniform `timestamp` extraction, `parse_filesize` for `filesize` extraction, `parse_count` for count meta fields extraction, `parse_resolution`, `parse_duration` for `duration` extraction, `parse_age_limit` for `age_limit` extraction. + +Explore [`youtube_dl/utils.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/utils.py) for more useful convenience functions. + +#### More examples + +##### Safely extract optional description from parsed JSON +```python +description = try_get(response, lambda x: x['result']['video'][0]['summary'], compat_str) +``` + +##### Safely extract more optional metadata +```python +video = try_get(response, lambda x: x['result']['video'][0], dict) or {} +description = video.get('summary') +duration = float_or_none(video.get('durationMs'), scale=1000) +view_count = int_or_none(video.get('views')) +``` diff --git a/ChangeLog b/ChangeLog index b808ad6ba..f9dd7928f 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,657 @@ +version 2019.02.18 + +Extractors +* [tvp:website] Fix and improve extraction ++ [tvp] Detect unavailable videos +* [tvp] Fix description extraction and make thumbnail optional ++ [linuxacademy] Add support for linuxacademy.com (#12207) +* [bilibili] Update keys (#19233) +* [udemy] Extend URL regular expressions (#14330, #15883) +* [udemy] Update User-Agent and detect captcha (#14713, #15839, #18126) +* [noovo] Fix extraction (#19230) +* [rai] Relax URL regular expression (#19232) ++ [vshare] Pass Referer to download request (#19205, #19221) ++ [openload] Add support for oload.live (#19222) +* [imgur] Use video id as title fallback (#18590) ++ [twitch] Add new source format detection approach (#19193) +* [tvplayhome] Fix video id extraction (#19190) +* [tvplayhome] Fix episode metadata extraction (#19190) +* [rutube:embed] Fix extraction (#19163) ++ [rutube:embed] Add support private videos (#19163) ++ [soundcloud] Extract more metadata ++ [trunews] Add support for trunews.com (#19153) ++ [linkedin:learning] Extract chapter_number and chapter_id (#19162) + + +version 2019.02.08 + +Core +* [utils] Improve JSON-LD regular expression (#18058) +* [YoutubeDL] Fallback to ie_key of matching extractor while making + download archive id when no explicit ie_key is provided (#19022) + +Extractors ++ [malltv] Add support for mall.tv (#18058, #17856) ++ [spankbang:playlist] Add support for playlists (#19145) +* [spankbang] Extend URL regular expression +* [trutv] Fix extraction (#17336) +* [toutv] Fix authentication (#16398, #18700) +* [pornhub] Fix tags and categories extraction (#13720, #19135) +* [pornhd] Fix formats extraction ++ [pornhd] Extract like count (#19123, #19125) +* [radiocanada] Switch to the new media requests (#19115) ++ [teachable] Add support for courses.workitdaily.com (#18871) +- [vporn] Remove extractor (#16276) ++ [soundcloud:pagedplaylist] Add ie and title to entries (#19022, #19086) ++ [drtuber] Extract duration (#19078) +* [soundcloud] Fix paged playlists extraction, add support for albums and update client id +* [soundcloud] Update client id +* [drtv] Improve preference (#19079) ++ [openload] Add support for openload.pw and oload.pw (#18930) ++ [openload] Add support for oload.info (#19073) +* [crackle] Authorize media detail request (#16931) + + +version 2019.01.30.1 + +Core +* [postprocessor/ffmpeg] Fix avconv processing broken in #19025 (#19067) + + +version 2019.01.30 + +Core +* [postprocessor/ffmpeg] Do not copy Apple TV chapter tracks while embedding + subtitles (#19024, #19042) +* [postprocessor/ffmpeg] Disable "Last message repeated" messages (#19025) + +Extractors +* [yourporn] Fix extraction and extract duration (#18815, #18852, #19061) +* [drtv] Improve extraction (#19039) + + Add support for EncryptedUri videos + + Extract more metadata + * Fix subtitles extraction ++ [fox] Add support for locked videos using cookies (#19060) +* [fox] Fix extraction for free videos (#19060) ++ [zattoo] Add support for tv.salt.ch (#19059) + + +version 2019.01.27 + +Core ++ [extractor/common] Extract season in _json_ld +* [postprocessor/ffmpeg] Fallback to ffmpeg/avconv for audio codec detection + (#681) + +Extractors +* [vice] Fix extraction for locked videos (#16248) ++ [wakanim] Detect DRM protected videos ++ [wakanim] Add support for wakanim.tv (#14374) +* [usatoday] Fix extraction for videos with custom brightcove partner id + (#18990) +* [drtv] Fix extraction (#18989) +* [nhk] Extend URL regular expression (#18968) +* [go] Fix Adobe Pass requests for Disney Now (#18901) ++ [openload] Add support for oload.club (#18969) + + +version 2019.01.24 + +Core +* [YoutubeDL] Fix negation for string operators in format selection (#18961) + + +version 2019.01.23 + +Core +* [utils] Fix urljoin for paths with non-http(s) schemes +* [extractor/common] Improve jwplayer relative URL handling (#18892) ++ [YoutubeDL] Add negation support for string comparisons in format selection + expressions (#18600, #18805) +* [extractor/common] Improve HLS video-only format detection (#18923) + +Extractors +* [crunchyroll] Extend URL regular expression (#18955) +* [pornhub] Bypass scrape detection (#4822, #5930, #7074, #10175, #12722, + #17197, #18338 #18842, #18899) ++ [vrv] Add support for authentication (#14307) +* [videomore:season] Fix extraction +* [videomore] Improve extraction (#18908) ++ [tnaflix] Pass Referer in metadata request (#18925) +* [radiocanada] Relax DRM check (#18608, #18609) +* [vimeo] Fix video password verification for videos protected by + Referer HTTP header ++ [hketv] Add support for hkedcity.net (#18696) ++ [streamango] Add support for fruithosts.net (#18710) ++ [instagram] Add support for tags (#18757) ++ [odnoklassniki] Detect paid videos (#18876) +* [ted] Correct acodec for HTTP formats (#18923) +* [cartoonnetwork] Fix extraction (#15664, #17224) +* [vimeo] Fix extraction for password protected player URLs (#18889) + + +version 2019.01.17 + +Extractors +* [youtube] Extend JS player signature function name regular expressions + (#18890, #18891, #18893) + + +version 2019.01.16 + +Core ++ [test/helper] Add support for maxcount and count collection len checkers +* [downloader/hls] Fix uplynk ad skipping (#18824) +* [postprocessor/ffmpeg] Improve ffmpeg version parsing (#18813) + +Extractors +* [youtube] Skip unsupported adaptive stream type (#18804) ++ [youtube] Extract DASH formats from player response (#18804) +* [funimation] Fix extraction (#14089) +* [skylinewebcams] Fix extraction (#18853) ++ [curiositystream] Add support for non app URLs ++ [bitchute] Check formats (#18833) +* [wistia] Extend URL regular expression (#18823) ++ [playplustv] Add support for playplus.com (#18789) + + +version 2019.01.10 + +Core +* [extractor/common] Use episode name as title in _json_ld ++ [extractor/common] Add support for movies in _json_ld +* [postprocessor/ffmpeg] Embed subtitles with non-standard language codes + (#18765) ++ [utils] Add language codes replaced in 1989 revision of ISO 639 + to ISO639Utils (#18765) + +Extractors +* [youtube] Extract live HLS URL from player response (#18799) ++ [outsidetv] Add support for outsidetv.com (#18774) +* [jwplatform] Use JW Platform Delivery API V2 and add support for more URLs ++ [fox] Add support National Geographic (#17985, #15333, #14698) ++ [playplustv] Add support for playplus.tv (#18789) +* [globo] Set GLBID cookie manually (#17346) ++ [gaia] Add support for gaia.com (#14605) +* [youporn] Fix title and description extraction (#18748) ++ [hungama] Add support for hungama.com (#17402, #18771) +* [dtube] Fix extraction (#18741) +* [tvnow] Fix and rework extractors and prepare for a switch to the new API + (#17245, #18499) +* [carambatv:page] Fix extraction (#18739) + + +version 2019.01.02 + +Extractors +* [discovery] Use geo verification headers (#17838) ++ [packtpub] Add support for subscription.packtpub.com (#18718) +* [yourporn] Fix extraction (#18583) ++ [acast:channel] Add support for play.acast.com (#18587) ++ [extractors] Add missing age limits (#18621) ++ [rmcdecouverte] Add support for live stream +* [rmcdecouverte] Bypass geo restriction +* [rmcdecouverte] Update URL regular expression (#18595, 18697) +* [manyvids] Fix extraction (#18604, #18614) +* [bitchute] Fix extraction (#18567) + + +version 2018.12.31 + +Extractors ++ [bbc] Add support for another embed pattern (#18643) ++ [npo:live] Add support for npostart.nl (#18644) +* [beeg] Fix extraction (#18610, #18626) +* [youtube] Unescape HTML for series (#18641) ++ [youtube] Extract more format metadata +* [youtube] Detect DRM protected videos (#1774) +* [youtube] Relax HTML5 player regular expressions (#18465, #18466) +* [youtube] Extend HTML5 player regular expression (#17516) ++ [liveleak] Add support for another embed type and restore original + format extraction ++ [crackle] Extract ISM and HTTP formats ++ [twitter] Pass Referer with card request (#18579) +* [mediasite] Extend URL regular expression (#18558) ++ [lecturio] Add support for lecturio.de (#18562) ++ [discovery] Add support for Scripps Networks watch domains (#17947) + + +version 2018.12.17 + +Extractors +* [ard:beta] Improve geo restricted videos extraction +* [ard:beta] Fix subtitles extraction +* [ard:beta] Improve extraction robustness +* [ard:beta] Relax URL regular expression (#18441) +* [acast] Add support for embed.acast.com and play.acast.com (#18483) +* [iprima] Relax URL regular expression (#18515, #18540) +* [vrv] Fix initial state extraction (#18553) +* [youtube] Fix mark watched (#18546) ++ [safari] Add support for learning.oreilly.com (#18510) +* [youtube] Fix multifeed extraction (#18531) +* [lecturio] Improve subtitles extraction (#18488) +* [uol] Fix format URL extraction (#18480) ++ [ard:mediathek] Add support for classic.ardmediathek.de (#18473) + + +version 2018.12.09 + +Core +* [YoutubeDL] Keep session cookies in cookie file between runs +* [YoutubeDL] Recognize session cookies with expired set to 0 (#12929) + +Extractors ++ [teachable] Add support for teachable platform sites (#5451, #18150, #18272) ++ [aenetworks] Add support for historyvault.com (#18460) +* [imgur] Improve gallery and album detection and extraction (#9133, #16577, + #17223, #18404) +* [iprima] Relax URL regular expression (#18453) +* [hotstar] Fix video data extraction (#18386) +* [ard:mediathek] Fix title and description extraction (#18349, #18371) +* [xvideos] Switch to HTTPS (#18422, #18427) ++ [lecturio] Add support for lecturio.com (#18405) ++ [nrktv:series] Add support for extra materials +* [nrktv:season,series] Fix extraction (#17159, #17258) +* [nrktv] Relax URL regular expression (#18304, #18387) +* [yourporn] Fix extraction (#18424, #18425) +* [tbs] Fix info extraction (#18403) ++ [gamespot] Add support for review URLs + + +version 2018.12.03 + +Core +* [utils] Fix random_birthday to generate existing dates only (#18284) + +Extractors ++ [tiktok] Add support for tiktok.com (#18108, #18135) +* [pornhub] Use actual URL host for requests (#18359) +* [lynda] Fix authentication (#18158, #18217) +* [gfycat] Update API endpoint (#18333, #18343) ++ [hotstar] Add support for alternative app state layout (#18320) +* [azmedien] Fix extraction (#18334, #18336) ++ [vimeo] Add support for VHX (Vimeo OTT) (#14835) +* [joj] Fix extraction (#18280, #18281) ++ [wistia] Add support for fast.wistia.com (#18287) + + +version 2018.11.23 + +Core ++ [setup.py] Add more relevant classifiers + +Extractors +* [mixcloud] Fallback to hardcoded decryption key (#18016) +* [nbc:news] Fix article extraction (#16194) +* [foxsports] Fix extraction (#17543) +* [loc] Relax regular expression and improve formats extraction ++ [ciscolive] Add support for ciscolive.cisco.com (#17984) +* [nzz] Relax kaltura regex (#18228) +* [sixplay] Fix formats extraction +* [bitchute] Improve title extraction +* [kaltura] Limit requested MediaEntry fields ++ [americastestkitchen] Add support for zype embeds (#18225) ++ [pornhub] Add pornhub.net alias +* [nova:embed] Fix extraction (#18222) + + +version 2018.11.18 + +Extractors ++ [wwe] Extract subtitles ++ [wwe] Add support for playlistst (#14781) ++ [wwe] Add support for wwe.com (#14781, #17450) +* [vk] Detect geo restriction (#17767) +* [openload] Use original host during extraction (#18211) +* [atvat] Fix extraction (#18041) ++ [rte] Add support for new API endpoint (#18206) +* [tnaflixnetwork:embed] Fix extraction (#18205) +* [picarto] Use API and add token support (#16518) ++ [zype] Add support for player.zype.com (#18143) +* [vivo] Fix extraction (#18139) +* [ruutu] Update API endpoint (#18138) + + +version 2018.11.07 + +Extractors ++ [youtube] Add another JS signature function name regex (#18091, #18093, + #18094) +* [facebook] Fix tahoe request (#17171) +* [cliphunter] Fix extraction (#18083) ++ [youtube:playlist] Add support for invidio.us (#18077) +* [zattoo] Arrange API hosts for derived extractors (#18035) ++ [youtube] Add fallback metadata extraction from videoDetails (#18052) + + +version 2018.11.03 + +Core +* [extractor/common] Ensure response handle is not prematurely closed before + it can be read if it matches expected_status (#17195, #17846, #17447) + +Extractors +* [laola1tv:embed] Set correct stream access URL scheme (#16341) ++ [ehftv] Add support for ehftv.com (#15408) +* [azmedien] Adopt to major site redesign (#17745, #17746) ++ [twitcasting] Add support for twitcasting.tv (#17981) +* [orf:tvthek] Fix extraction (#17737, #17956, #18024) ++ [openload] Add support for oload.fun (#18045) +* [njpwworld] Fix authentication (#17427) ++ [linkedin:learning] Add support for linkedin.com/learning (#13545) +* [theplatform] Improve error detection (#13222) +* [cnbc] Simplify extraction (#14280, #17110) ++ [cbnc] Add support for new URL schema (#14193) +* [aparat] Improve extraction and extract more metadata (#17445, #18008) +* [aparat] Fix extraction + + +version 2018.10.29 + +Core ++ [extractor/common] Add validation for JSON-LD URLs + +Extractors ++ [sportbox] Add support for matchtv.ru +* [sportbox] Fix extraction (#17978) +* [screencast] Fix extraction (#14590, #14617, #17990) ++ [openload] Add support for oload.icu ++ [ivi] Add support for ivi.tv +* [crunchyroll] Improve extraction failsafeness (#17991) +* [dailymail] Fix formats extraction (#17976) +* [viewster] Reduce format requests +* [cwtv] Handle API errors (#17905) ++ [rutube] Use geo verification headers (#17897) ++ [brightcove:legacy] Add fallbacks to brightcove:new (#13912) +- [tv3] Remove extractor (#10461, #15339) +* [ted] Fix extraction for HTTP and RTMP formats (#5941, #17572, #17894) ++ [openload] Add support for oload.cc (#17823) ++ [patreon] Extract post_file URL (#17792) +* [patreon] Fix extraction (#14502, #10471) + + +version 2018.10.05 + +Extractors +* [pluralsight] Improve authentication (#17762) +* [dailymotion] Fix extraction (#17699) +* [crunchyroll] Switch to HTTPS for RpcApi (#17749) ++ [philharmoniedeparis] Add support for pad.philharmoniedeparis.fr (#17705) +* [philharmoniedeparis] Fix extraction (#17705) ++ [jamendo] Add support for licensing.jamendo.com (#17724) ++ [openload] Add support for oload.cloud (#17710) +* [pluralsight] Fix subtitles extraction (#17726, #17728) ++ [vimeo] Add another config regular expression (#17690) +* [spike] Fix Paramount Network extraction (#17677) +* [hotstar] Fix extraction (#14694, #14931, #17637) + + +version 2018.09.26 + +Extractors +* [pluralsight] Fix subtitles extraction (#17671) +* [mediaset] Improve embed support (#17668) ++ [youtube] Add support for invidio.us (#17613) ++ [zattoo] Add support for more zattoo platform sites +* [zattoo] Fix extraction (#17175, #17542) + + +version 2018.09.18 + +Core ++ [extractor/common] Introduce channel meta fields + +Extractors +* [adobepass] Don't pollute default headers dict +* [udemy] Don't pollute default headers dict +* [twitch] Don't pollute default headers dict +* [youtube] Don't pollute default query dict (#17593) +* [crunchyroll] Prefer hardsubless formats and formats in locale language +* [vrv] Make format ids deterministic +* [vimeo] Fix ondemand playlist extraction (#14591) ++ [pornhub] Extract upload date (#17574) ++ [porntube] Extract channel meta fields ++ [vimeo] Extract channel meta fields ++ [youtube] Extract channel meta fields (#9676, #12939) +* [porntube] Fix extraction (#17541) +* [asiancrush] Fix extraction (#15630) ++ [twitch:clips] Extend URL regular expression (closes #17559) ++ [vzaar] Add support for HLS +* [tube8] Fix metadata extraction (#17520) +* [eporner] Extract JSON-LD (#17519) + + +version 2018.09.10 + +Core ++ [utils] Properly recognize AV1 codec (#17506) + +Extractors ++ [iprima] Add support for prima.iprima.cz (#17514) ++ [tele5] Add support for tele5.de (#7805, #7922, #17331, #17414) +* [nbc] Fix extraction of percent encoded URLs (#17374) + + +version 2018.09.08 + +Extractors +* [youtube] Fix extraction (#17457, #17464) ++ [pornhub:uservideos] Add support for new URLs (#17388) +* [iprima] Confirm adult check (#17437) +* [slideslive] Make check for video service name case-insensitive (#17429) +* [radiojavan] Fix extraction (#17151) +* [generic] Skip unsuccessful jwplayer extraction (#16735) + + +version 2018.09.01 + +Core +* [utils] Skip remote IP addresses non matching to source address' IP version + when creating a connection (#13422, #17362) + +Extractors ++ [ard] Add support for one.ard.de (#17397) +* [niconico] Fix extraction on python3 (#17393, #17407) +* [ard] Extract f4m formats +* [crunchyroll] Parse vilos media data (#17343) ++ [ard] Add support for Beta ARD Mediathek ++ [bandcamp] Extract more metadata (#13197) +* [internazionale] Fix extraction of non-available-abroad videos (#17386) + + +version 2018.08.28 + +Extractors ++ [youtube:playlist] Add support for music album playlists (OLAK5uy_ prefix) + (#17361) +* [bitchute] Fix extraction by pass custom User-Agent (#17360) +* [webofstories:playlist] Fix extraction (#16914) ++ [tvplayhome] Add support for new tvplay URLs (#17344) ++ [generic] Allow relative src for videojs embeds (#17324) ++ [xfileshare] Add support for vidto.se (#17317) ++ [vidzi] Add support for vidzi.nu (#17316) ++ [nova:embed] Add support for media.cms.nova.cz (#17282) + + +version 2018.08.22 + +Core +* [utils] Use pure browser header for User-Agent (#17236) + +Extractors ++ [kinopoisk] Add support for kinopoisk.ru (#17283) ++ [yourporn] Add support for yourporn.sexy (#17298) ++ [go] Add support for disneynow.go.com (#16299, #17264) ++ [6play] Add support for play.rtl.hr (#17249) +* [anvato] Fallback to generic API key for access-key-to-API-key lookup + (#16788, #17254) +* [lci] Fix extraction (#17274) +* [bbccouk] Extend id URL regular expression (#17270) +* [cwtv] Fix extraction (#17256) +* [nova] Fix extraction (#17241) ++ [generic] Add support for expressen embeds +* [raywenderlich] Adapt to site redesign (#17225) ++ [redbulltv] Add support redbull.com tv URLs (#17218) ++ [bitchute] Add support for bitchute.com (#14052) ++ [clyp] Add support for token protected media (#17184) +* [imdb] Fix extension extraction (#17167) + + +version 2018.08.04 + +Extractors +* [funk:channel] Improve byChannelAlias extraction (#17142) +* [twitch] Fix authentication (#17024, #17126) +* [twitch:vod] Improve URL regular expression (#17135) +* [watchbox] Fix extraction (#17107) +* [pbs] Fix extraction (#17109) +* [theplatform] Relax URL regular expression (#16181, #17097) ++ [viqeo] Add support for viqeo.tv (#17066) + + +version 2018.07.29 + +Extractors +* [crunchyroll:playlist] Restrict URL regular expression (#17069, #17076) ++ [pornhub] Add support for subtitles (#16924, #17088) +* [ceskatelevize] Use https for API call (#16997, #16999) +* [dailymotion:playlist] Fix extraction (#16894) +* [ted] Improve extraction +* [ted] Fix extraction for videos without nativeDownloads (#16756, #17085) +* [telecinco] Fix extraction (#17080) +* [mitele] Reduce number of requests +* [rai] Return non HTTP relinker URL intact (#17055) +* [vk] Fix extraction for inline only videos (#16923) +* [streamcloud] Fix extraction (#17054) +* [facebook] Fix tahoe player extraction with authentication (#16655) ++ [puhutv] Add support for puhutv.com (#12712, #16010, #16269) + + +version 2018.07.21 + +Core ++ [utils] Introduce url_or_none +* [utils] Allow JSONP without function name (#17028) ++ [extractor/common] Extract DASH and MSS formats from SMIL manifests + +Extractors ++ [bbc] Add support for BBC Radio Play pages (#17022) +* [iwara] Fix download URLs (#17026) +* [vrtnu] Relax title extraction and extract JSON-LD (#17018) ++ [viu] Pass Referer and Origin headers and area id (#16992) ++ [vimeo] Add another config regular expression (#17013) ++ [facebook] Extract view count (#16942) +* [dailymotion] Improve description extraction (#16984) +* [slutload] Fix and improve extraction (#17001) +* [mediaset] Fix extraction (#16977) ++ [theplatform] Add support for theplatform TLD customization (#16977) +* [imgur] Relax URL regular expression (#16987) +* [pornhub] Improve extraction and extract all formats (#12166, #15891, #16262, + #16959) + + +version 2018.07.10 + +Core +* [utils] Share JSON-LD regular expression +* [downloader/dash] Improve error handling (#16927) + +Extractors ++ [nrktv] Add support for new season and serie URL schema ++ [nrktv] Add support for new episode URL schema (#16909) ++ [frontendmasters] Add support for frontendmasters.com (#3661, #16328) +* [funk] Fix extraction (#16918) +* [watchbox] Fix extraction (#16904) +* [dplayit] Sort formats +* [dplayit] Fix extraction (#16901) +* [youtube] Improve login error handling (#13822) + + +version 2018.07.04 + +Core +* [extractor/common] Properly escape % in MPD templates (#16867) +* [extractor/common] Use source URL as Referer for HTML5 entries (16849) +* Prefer ffmpeg over avconv by default (#8622) + +Extractors +* [pluralsight] Switch to graphql (#16889, #16895, #16896, #16899) +* [lynda] Simplify login and improve error capturing (#16891) ++ [go90] Add support for embed URLs (#16873) +* [go90] Detect geo restriction error and pass geo verification headers + (#16874) +* [vlive] Fix live streams extraction (#16871) +* [npo] Fix typo (#16872) ++ [mediaset] Add support for new videos and extract all formats (#16568) +* [dctptv] Restore extraction based on REST API (#16850) +* [svt] Improve extraction and add support for pages (#16802) +* [porncom] Fix extraction (#16808) + + +version 2018.06.25 + +Extractors +* [joj] Relax URL regular expression (#16771) +* [brightcove] Workaround sonyliv DRM protected videos (#16807) +* [motherless] Fix extraction (#16786) +* [itv] Make SOAP request non fatal and extract metadata from webpage (#16780) +- [foxnews:insider] Remove extractor (#15810) ++ [foxnews] Add support for iframe embeds (#15810, #16711) + + +version 2018.06.19 + +Core ++ [extractor/common] Introduce expected_status in _download_* methods + for convenient accept of HTTP requests failed with non 2xx status codes ++ [compat] Introduce compat_integer_types + +Extractors +* [peertube] Improve generic support (#16733) ++ [6play] Use geo verification headers +* [rtbf] Fix extraction for python 3.2 +* [vgtv] Improve HLS formats extraction ++ [vgtv] Add support for www.aftonbladet.se/tv URLs +* [bbccouk] Use expected_status +* [markiza] Expect 500 HTTP status code +* [tvnow] Try all clear manifest URLs (#15361) + + +version 2018.06.18 + +Core +* [downloader/rtmp] Fix downloading in verbose mode (#16736) + +Extractors ++ [markiza] Add support for markiza.sk (#16750) +* [wat] Try all supported adaptive URLs ++ [6play] Add support for rtlplay.be and extract hd usp formats ++ [rtbf] Add support for audio and live streams (#9638, #11923) ++ [rtbf] Extract HLS, DASH and all HTTP formats ++ [rtbf] Extract subtitles ++ [rtbf] Fixup specific HTTP URLs (#16101) ++ [expressen] Add support for expressen.se +* [vidzi] Fix extraction (#16678) +* [pbs] Improve extraction (#16623, #16684) +* [bilibili] Restrict cid regular expression (#16638, #16734) + + +version 2018.06.14 + +Core +* [downloader/http] Fix retry on error when streaming to stdout (#16699) + +Extractors ++ [discoverynetworks] Add support for disco-api videos (#16724) ++ [dailymotion] Add support for password protected videos (#9789) ++ [abc:iview] Add support for livestreams (#12354) +* [abc:iview] Fix extraction (#16704) ++ [crackle] Add support for sonycrackle.com (#16698) ++ [tvnet] Add support for tvnet.gov.vn (#15462) +* [nrk] Update API hosts and try all previously known ones (#16690) +* [wimp] Fix Youtube embeds extraction + + version 2018.06.11 Extractors diff --git a/README.md b/README.md index 499a0c206..c1572f771 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,7 @@ youtube-dl - download videos from youtube.com or other video platforms # INSTALLATION -To install it right away for all UNIX users (Linux, OS X, etc.), type: +To install it right away for all UNIX users (Linux, macOS, etc.), type: sudo curl -L https://yt-dl.org/downloads/latest/youtube-dl -o /usr/local/bin/youtube-dl sudo chmod a+rx /usr/local/bin/youtube-dl @@ -35,7 +35,7 @@ You can also use pip: This command will update youtube-dl if you have already installed it. See the [pypi page](https://pypi.python.org/pypi/youtube_dl) for more information. -OS X users can install youtube-dl with [Homebrew](https://brew.sh/): +macOS users can install youtube-dl with [Homebrew](https://brew.sh/): brew install youtube-dl @@ -427,9 +427,9 @@ Alternatively, refer to the [developer instructions](#developer-instructions) fo default; fix file if we can, warn otherwise) --prefer-avconv Prefer avconv over ffmpeg for running the - postprocessors (default) - --prefer-ffmpeg Prefer ffmpeg over avconv for running the postprocessors + --prefer-ffmpeg Prefer ffmpeg over avconv for running the + postprocessors (default) --ffmpeg-location PATH Location of the ffmpeg/avconv binary; either the path to the binary or its containing directory. @@ -442,7 +442,7 @@ Alternatively, refer to the [developer instructions](#developer-instructions) fo # CONFIGURATION -You can configure youtube-dl by placing any supported command line option to a configuration file. On Linux and OS X, the system wide configuration file is located at `/etc/youtube-dl.conf` and the user wide configuration file at `~/.config/youtube-dl/config`. On Windows, the user wide configuration file locations are `%APPDATA%\youtube-dl\config.txt` or `C:\Users\\youtube-dl.conf`. Note that by default configuration file may not exist so you may need to create it yourself. +You can configure youtube-dl by placing any supported command line option to a configuration file. On Linux and macOS, the system wide configuration file is located at `/etc/youtube-dl.conf` and the user wide configuration file at `~/.config/youtube-dl/config`. On Windows, the user wide configuration file locations are `%APPDATA%\youtube-dl\config.txt` or `C:\Users\\youtube-dl.conf`. Note that by default configuration file may not exist so you may need to create it yourself. For example, with the following configuration file youtube-dl will always extract the audio, not copy the mtime, use a proxy and save all videos under `Movies` directory in your home directory: ``` @@ -496,7 +496,7 @@ The `-o` option allows users to indicate a template for the output file names. **tl;dr:** [navigate me to examples](#output-template-examples). -The basic usage is not to set any template arguments when downloading a single file, like in `youtube-dl -o funny_video.flv "https://some/video"`. However, it may contain special sequences that will be replaced when downloading each video. The special sequences may be formatted according to [python string formatting operations](https://docs.python.org/2/library/stdtypes.html#string-formatting). For example, `%(NAME)s` or `%(NAME)05d`. To clarify, that is a percent symbol followed by a name in parentheses, followed by a formatting operations. Allowed names along with sequence type are: +The basic usage is not to set any template arguments when downloading a single file, like in `youtube-dl -o funny_video.flv "https://some/video"`. However, it may contain special sequences that will be replaced when downloading each video. The special sequences may be formatted according to [python string formatting operations](https://docs.python.org/2/library/stdtypes.html#string-formatting). For example, `%(NAME)s` or `%(NAME)05d`. To clarify, that is a percent symbol followed by a name in parentheses, followed by formatting operations. Allowed names along with sequence type are: - `id` (string): Video identifier - `title` (string): Video title @@ -511,6 +511,8 @@ The basic usage is not to set any template arguments when downloading a single f - `timestamp` (numeric): UNIX timestamp of the moment the video became available - `upload_date` (string): Video upload date (YYYYMMDD) - `uploader_id` (string): Nickname or id of the video uploader + - `channel` (string): Full name of the channel the video is uploaded on + - `channel_id` (string): Id of the channel - `location` (string): Physical location where the video was filmed - `duration` (numeric): Length of the video in seconds - `view_count` (numeric): How many users have watched the video on the platform @@ -665,7 +667,7 @@ The following numeric meta fields can be used with comparisons `<`, `<=`, `>`, ` - `asr`: Audio sampling rate in Hertz - `fps`: Frame rate -Also filtering work for comparisons `=` (equals), `!=` (not equals), `^=` (begins with), `$=` (ends with), `*=` (contains) and following string meta fields: +Also filtering work for comparisons `=` (equals), `^=` (starts with), `$=` (ends with), `*=` (contains) and following string meta fields: - `ext`: File extension - `acodec`: Name of the audio codec in use - `vcodec`: Name of the video codec in use @@ -673,6 +675,8 @@ Also filtering work for comparisons `=` (equals), `!=` (not equals), `^=` (begin - `protocol`: The protocol that will be used for the actual download, lower-case (`http`, `https`, `rtsp`, `rtmp`, `rtmpe`, `mms`, `f4m`, `ism`, `http_dash_segments`, `m3u8`, or `m3u8_native`) - `format_id`: A short description of the format +Any string comparison may be prefixed with negation `!` in order to produce an opposite comparison, e.g. `!*=` (does not contain). + Note that none of the aforementioned meta fields are guaranteed to be present since this solely depends on the metadata obtained by particular extractor, i.e. the metadata offered by the video hoster. Formats for which the value is not known are excluded unless you put a question mark (`?`) after the operator. You can combine format filters, so `-f "[height <=? 720][tbr>500]"` selects up to 720p videos (or videos where the height is not known) with a bitrate of at least 500 KBit/s. @@ -870,7 +874,7 @@ Either prepend `https://www.youtube.com/watch?v=` or separate the ID from the op Use the `--cookies` option, for example `--cookies /path/to/cookies/file.txt`. -In order to extract cookies from browser use any conforming browser extension for exporting cookies. For example, [cookies.txt](https://chrome.google.com/webstore/detail/cookiestxt/njabckikapfpffapmjgojcnbfjonfjfg) (for Chrome) or [Export Cookies](https://addons.mozilla.org/en-US/firefox/addon/export-cookies/) (for Firefox). +In order to extract cookies from browser use any conforming browser extension for exporting cookies. For example, [cookies.txt](https://chrome.google.com/webstore/detail/cookiestxt/njabckikapfpffapmjgojcnbfjonfjfg) (for Chrome) or [cookies.txt](https://addons.mozilla.org/en-US/firefox/addon/cookies-txt/) (for Firefox). Note that the cookies file must be in Mozilla/Netscape format and the first line of the cookies file must be either `# HTTP Cookie File` or `# Netscape HTTP Cookie File`. Make sure you have correct [newline format](https://en.wikipedia.org/wiki/Newline) in the cookies file and convert newlines if necessary to correspond with your OS, namely `CRLF` (`\r\n`) for Windows and `LF` (`\n`) for Unix and Unix-like systems (Linux, macOS, etc.). `HTTP Error 400: Bad Request` when using `--cookies` is a good sign of invalid newline format. @@ -1022,16 +1026,20 @@ After you have ensured this site is distributing its content legally, you can fo ``` 5. Add an import in [`youtube_dl/extractor/extractors.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/extractors.py). 6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc. Note that tests with `only_matching` key in test's dict are not counted in. -7. Have a look at [`youtube_dl/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L74-L252). Add tests and code for as many as you want. -8. Make sure your code follows [youtube-dl coding conventions](#youtube-dl-coding-conventions) and check the code with [flake8](https://pypi.python.org/pypi/flake8). Also make sure your code works under all [Python](https://www.python.org/) versions claimed supported by youtube-dl, namely 2.6, 2.7, and 3.2+. -9. When the tests pass, [add](https://git-scm.com/docs/git-add) the new files and [commit](https://git-scm.com/docs/git-commit) them and [push](https://git-scm.com/docs/git-push) the result, like this: +7. Have a look at [`youtube_dl/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/rg3/youtube-dl/blob/7f41a598b3fba1bcab2817de64a08941200aa3c8/youtube_dl/extractor/common.py#L94-L303). Add tests and code for as many as you want. +8. Make sure your code follows [youtube-dl coding conventions](#youtube-dl-coding-conventions) and check the code with [flake8](http://flake8.pycqa.org/en/latest/index.html#quickstart): + + $ flake8 youtube_dl/extractor/yourextractor.py + +9. Make sure your code works under all [Python](https://www.python.org/) versions claimed supported by youtube-dl, namely 2.6, 2.7, and 3.2+. +10. When the tests pass, [add](https://git-scm.com/docs/git-add) the new files and [commit](https://git-scm.com/docs/git-commit) them and [push](https://git-scm.com/docs/git-push) the result, like this: $ git add youtube_dl/extractor/extractors.py $ git add youtube_dl/extractor/yourextractor.py $ git commit -m '[yourextractor] Add new extractor' $ git push origin yourextractor -10. Finally, [create a pull request](https://help.github.com/articles/creating-a-pull-request). We'll then review and merge it. +11. Finally, [create a pull request](https://help.github.com/articles/creating-a-pull-request). We'll then review and merge it. In any case, thank you very much for your contributions! @@ -1043,7 +1051,7 @@ Extractors are very fragile by nature since they depend on the layout of the sou ### Mandatory and optional metafields -For extraction to work youtube-dl relies on metadata your extractor extracts and provides to youtube-dl expressed by an [information dictionary](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L75-L257) or simply *info dict*. Only the following meta fields in the *info dict* are considered mandatory for a successful extraction process by youtube-dl: +For extraction to work youtube-dl relies on metadata your extractor extracts and provides to youtube-dl expressed by an [information dictionary](https://github.com/rg3/youtube-dl/blob/7f41a598b3fba1bcab2817de64a08941200aa3c8/youtube_dl/extractor/common.py#L94-L303) or simply *info dict*. Only the following meta fields in the *info dict* are considered mandatory for a successful extraction process by youtube-dl: - `id` (media identifier) - `title` (media title) @@ -1051,7 +1059,7 @@ For extraction to work youtube-dl relies on metadata your extractor extracts and In fact only the last option is technically mandatory (i.e. if you can't figure out the download location of the media the extraction does not make any sense). But by convention youtube-dl also treats `id` and `title` as mandatory. Thus the aforementioned metafields are the critical data that the extraction does not make any sense without and if any of them fail to be extracted then the extractor is considered completely broken. -[Any field](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L149-L257) apart from the aforementioned ones are considered **optional**. That means that extraction should be **tolerant** to situations when sources for these fields can potentially be unavailable (even if they are always available at the moment) and **future-proof** in order not to break the extraction of general purpose mandatory fields. +[Any field](https://github.com/rg3/youtube-dl/blob/7f41a598b3fba1bcab2817de64a08941200aa3c8/youtube_dl/extractor/common.py#L188-L303) apart from the aforementioned ones are considered **optional**. That means that extraction should be **tolerant** to situations when sources for these fields can potentially be unavailable (even if they are always available at the moment) and **future-proof** in order not to break the extraction of general purpose mandatory fields. #### Example @@ -1127,11 +1135,33 @@ title = meta.get('title') or self._og_search_title(webpage) This code will try to extract from `meta` first and if it fails it will try extracting `og:title` from a `webpage`. -### Make regular expressions flexible +### Regular expressions -When using regular expressions try to write them fuzzy and flexible. +#### Don't capture groups you don't use + +Capturing group must be an indication that it's used somewhere in the code. Any group that is not used must be non capturing. + +##### Example + +Don't capture id attribute name here since you can't use it for anything anyway. + +Correct: + +```python +r'(?:id|ID)=(?P\d+)' +``` + +Incorrect: +```python +r'(id|ID)=(?P\d+)' +``` + + +#### Make regular expressions relaxed and flexible + +When using regular expressions try to write them fuzzy, relaxed and flexible, skipping insignificant parts that are more likely to change, allowing both single and double quotes for quoted values and so on. -#### Example +##### Example Say you need to extract `title` from the following HTML code: @@ -1164,9 +1194,51 @@ title = self._search_regex( webpage, 'title', group='title') ``` -### Use safe conversion functions +### Long lines policy -Wrap all extracted numeric data into safe functions from `utils`: `int_or_none`, `float_or_none`. Use them for string to number conversions as well. +There is a soft limit to keep lines of code under 80 characters long. This means it should be respected if possible and if it does not make readability and code maintenance worse. + +For example, you should **never** split long string literals like URLs or some other often copied entities over multiple lines to fit this limit: + +Correct: + +```python +'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4' +``` + +Incorrect: + +```python +'https://www.youtube.com/watch?v=FqZTN594JQw&list=' +'PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4' +``` + +### Use convenience conversion and parsing functions + +Wrap all extracted numeric data into safe functions from [`youtube_dl/utils.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/utils.py): `int_or_none`, `float_or_none`. Use them for string to number conversions as well. + +Use `url_or_none` for safe URL processing. + +Use `try_get` for safe metadata extraction from parsed JSON. + +Use `unified_strdate` for uniform `upload_date` or any `YYYYMMDD` meta field extraction, `unified_timestamp` for uniform `timestamp` extraction, `parse_filesize` for `filesize` extraction, `parse_count` for count meta fields extraction, `parse_resolution`, `parse_duration` for `duration` extraction, `parse_age_limit` for `age_limit` extraction. + +Explore [`youtube_dl/utils.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/utils.py) for more useful convenience functions. + +#### More examples + +##### Safely extract optional description from parsed JSON +```python +description = try_get(response, lambda x: x['result']['video'][0]['summary'], compat_str) +``` + +##### Safely extract more optional metadata +```python +video = try_get(response, lambda x: x['result']['video'][0], dict) or {} +description = video.get('summary') +duration = float_or_none(video.get('durationMs'), scale=1000) +view_count = int_or_none(video.get('views')) +``` # EMBEDDING YOUTUBE-DL diff --git a/docs/supportedsites.md b/docs/supportedsites.md index e1a9f2236..d8a8d7ede 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -33,7 +33,7 @@ - **AdobeTVShow** - **AdobeTVVideo** - **AdultSwim** - - **aenetworks**: A+E Networks: A&E, Lifetime, History.com, FYI Network + - **aenetworks**: A+E Networks: A&E, Lifetime, History.com, FYI Network and History Vault - **afreecatv**: afreecatv.com - **AirMozilla** - **AliExpressLive** @@ -56,6 +56,7 @@ - **archive.org**: archive.org videos - **ARD** - **ARD:mediathek** + - **ARDBetaMediathek** - **Arkena** - **arte.tv** - **arte.tv:+7** @@ -83,8 +84,6 @@ - **awaan:season** - **awaan:video** - **AZMedien**: AZ Medien videos - - **AZMedienPlaylist**: AZ Medien playlists - - **AZMedienShowPlaylist**: AZ Medien show playlists - **BaiduVideo**: 百度视频 - **bambuser** - **bambuser:channel** @@ -97,6 +96,7 @@ - **bbc.co.uk:article**: BBC articles - **bbc.co.uk:iplayer:playlist** - **bbc.co.uk:playlist** + - **BBVTV** - **Beatport** - **Beeg** - **BehindKink** @@ -108,6 +108,8 @@ - **BiliBili** - **BioBioChileTV** - **BIQLE** + - **BitChute** + - **BitChuteChannel** - **BleacherReport** - **BleacherReportCMS** - **blinkx** @@ -161,6 +163,8 @@ - **chirbit** - **chirbit:profile** - **Cinchcast** + - **CiscoLiveSearch** + - **CiscoLiveSession** - **CJSW** - **cliphunter** - **Clippit** @@ -174,6 +178,7 @@ - **Clyp** - **cmt.com** - **CNBC** + - **CNBCVideo** - **CNN** - **CNNArticle** - **CNNBlogs** @@ -189,7 +194,7 @@ - **Crackle** - **Criterion** - **CrooksAndLiars** - - **Crunchyroll** + - **crunchyroll** - **crunchyroll:playlist** - **CSNNE** - **CSpan**: C-SPAN @@ -247,7 +252,9 @@ - **EchoMsk** - **egghead:course**: egghead.io course - **egghead:lesson**: egghead.io lesson + - **ehftv** - **eHow** + - **EinsUndEinsTV** - **Einthusan** - **eitb.tv** - **EllenTube** @@ -265,7 +272,9 @@ - **EsriVideo** - **Europa** - **EveryonesMixtape** + - **EWETV** - **ExpoTV** + - **Expressen** - **ExtremeTube** - **EyedoTV** - **facebook** @@ -289,7 +298,6 @@ - **Foxgay** - **foxnews**: Fox News and Fox Business Video - **foxnews:article** - - **foxnews:insider** - **FoxSports** - **france2.fr:generation-what** - **FranceCulture** @@ -302,6 +310,9 @@ - **Freesound** - **freespeech.org** - **FreshLive** + - **FrontendMasters** + - **FrontendMastersCourse** + - **FrontendMastersLesson** - **Funimation** - **FunkChannel** - **FunkMix** @@ -309,6 +320,7 @@ - **Fusion** - **Fux** - **FXNetworks** + - **Gaia** - **GameInformer** - **GameOne** - **gameone:playlist** @@ -321,6 +333,7 @@ - **Gfycat** - **GiantBomb** - **Giga** + - **GlattvisionTV** - **Glide**: Glide mobile video messages (glide.me) - **Globo** - **GloboArticle** @@ -348,9 +361,10 @@ - **hitbox** - **hitbox:live** - **HitRecord** + - **hketv**: 香港教育局教育電視 (HKETV) Educational Television, Hong Kong Educational Bureau - **HornBunny** - **HotNewHipHop** - - **HotStar** + - **hotstar** - **hotstar:playlist** - **Howcast** - **HowStuffWorks** @@ -358,18 +372,22 @@ - **HRTiPlaylist** - **Huajiao**: 花椒直播 - **HuffPost**: Huffington Post + - **Hungama** + - **HungamaSong** - **Hypem** - **Iconosquare** - **ign.com** - **imdb**: Internet Movie Database trailers - **imdb:list**: Internet Movie Database lists - **Imgur** - - **ImgurAlbum** + - **imgur:album** + - **imgur:gallery** - **Ina** - **Inc** - **IndavideoEmbed** - **InfoQ** - **Instagram** + - **instagram:tag**: Instagram hashtag search - **instagram:user**: Instagram user profile - **Internazionale** - **InternetVideoArchive** @@ -402,6 +420,7 @@ - **Ketnet** - **KhanAcademy** - **KickStarter** + - **KinoPoisk** - **KonserthusetPlay** - **kontrtube**: KontrTube.ru - Труба зовёт - **KrasView**: Красвью @@ -422,6 +441,9 @@ - **Le**: 乐视网 - **Learnr** - **Lecture2Go** + - **Lecturio** + - **LecturioCourse** + - **LecturioDeCourse** - **LEGO** - **Lemonde** - **Lenta** @@ -434,6 +456,9 @@ - **limelight:channel** - **limelight:channel_list** - **LineTV** + - **linkedin:learning** + - **linkedin:learning:course** + - **LinuxAcademy** - **LiTV** - **LiveLeak** - **LiveLeakEmbed** @@ -452,9 +477,12 @@ - **mailru:music**: Музыка@Mail.Ru - **mailru:music:search**: Музыка@Mail.Ru - **MakerTV** + - **MallTV** - **mangomolo:live** - **mangomolo:video** - **ManyVids** + - **Markiza** + - **MarkizaPage** - **massengeschmack.tv** - **MatchTV** - **MDR**: MDR.DE and KiKA @@ -485,6 +513,7 @@ - **Mixer:vod** - **MLB** - **Mnet** + - **MNetTV** - **MoeVideo**: LetitBit video services: moevideo.net, playreplay.net and videochart.net - **Mofosex** - **Mojvideo** @@ -516,10 +545,10 @@ - **Myvi** - **MyVidster** - **MyviEmbed** + - **MyVisionTV** - **n-tv.de** - - **natgeo** - - **natgeo:episodeguide** - **natgeo:video** + - **NationalGeographicTV** - **Naver** - **NBA** - **NBC** @@ -541,6 +570,7 @@ - **netease:program**: 网易云音乐 - 电台节目 - **netease:singer**: 网易云音乐 - 歌手 - **netease:song**: 网易云音乐 + - **NetPlus** - **Netzkino** - **Newgrounds** - **NewgroundsPlaylist** @@ -572,6 +602,7 @@ - **Normalboots** - **NosVideo** - **Nova**: TN.cz, Prásk.tv, Nova.cz, Novaplus.cz, FANDA.tv, Krásná.cz and Doma.cz + - **NovaEmbed** - **nowness** - **nowness:playlist** - **nowness:series** @@ -587,7 +618,9 @@ - **NRKSkole**: NRK Skole - **NRKTV**: NRK TV and NRK Radio - **NRKTVDirekte**: NRK TV Direkte and NRK Radio Direkte + - **NRKTVEpisode** - **NRKTVEpisodes** + - **NRKTVSeason** - **NRKTVSeries** - **ntv.ru** - **Nuvid** @@ -614,6 +647,8 @@ - **orf:iptv**: iptv.ORF.at - **orf:oe1**: Radio Österreich 1 - **orf:tvthek**: ORF TVthek + - **OsnatelTV** + - **OutsideTV** - **PacktPub** - **PacktPubCourse** - **PandaTV**: 熊猫TV @@ -638,6 +673,7 @@ - **Pinkbike** - **Pladform** - **play.fm** + - **PlayPlusTV** - **PlaysTV** - **Playtvak**: Playtvak.cz, iDNES.cz and Lidovky.cz - **Playvid** @@ -665,6 +701,8 @@ - **PrimeShareTV** - **PromptFile** - **prosiebensat1**: ProSiebenSat.1 Digital + - **puhutv** + - **puhutv:serie** - **Puls4** - **Pyvideo** - **qqmusic**: QQ音乐 @@ -672,6 +710,7 @@ - **qqmusic:playlist**: QQ音乐 - 歌单 - **qqmusic:singer**: QQ音乐 - 歌手 - **qqmusic:toplist**: QQ音乐 - 排行榜 + - **QuantumTV** - **Quickline** - **QuicklineLive** - **R7** @@ -687,6 +726,7 @@ - **RaiPlayLive** - **RaiPlayPlaylist** - **RayWenderlich** + - **RayWenderlichCourse** - **RBMARadio** - **RDS**: RDS.ca - **RedBullTV** @@ -738,6 +778,8 @@ - **safari**: safaribooksonline.com online video - **safari:api** - **safari:course**: safaribooksonline.com online courses + - **SAKTV** + - **SaltTV** - **Sapo**: SAPO Vídeos - **savefrom.net** - **SBS**: sbs.com.au @@ -787,13 +829,14 @@ - **southpark.nl** - **southparkstudios.dk** - **SpankBang** + - **SpankBangPlaylist** - **Spankwire** - **Spiegel** - **Spiegel:Article**: Articles on spiegel.de - **Spiegeltv** - **sport.francetvinfo.fr** - **Sport5** - - **SportBoxEmbed** + - **SportBox** - **SportDeutschland** - **SpringboardPlatform** - **Sprout** @@ -811,6 +854,7 @@ - **StretchInternet** - **SunPorno** - **SVT** + - **SVTPage** - **SVTPlay**: SVT Play and Öppet arkiv - **SVTSeries** - **SWRMediathek** @@ -823,6 +867,8 @@ - **TastyTrade** - **TBS** - **TDSLifeway** + - **Teachable** + - **TeachableCourse** - **teachertube**: teachertube.com videos - **teachertube:user:collection**: teachertube.com user and collection videos - **TeachingChannel** @@ -831,6 +877,7 @@ - **techtv.mit.edu** - **ted** - **Tele13** + - **Tele5** - **TeleBruxelles** - **Telecinco**: telecinco.es, cuatro.com and mediaset.es - **Telegraaf** @@ -854,6 +901,8 @@ - **ThisAmericanLife** - **ThisAV** - **ThisOldHouse** + - **TikTok** + - **TikTokUser** - **tinypic**: tinypic.com videos - **TMZ** - **TMZArticle** @@ -867,6 +916,7 @@ - **ToypicsUser**: Toypics user profile - **TrailerAddict** (Currently broken) - **Trilulilu** + - **TruNews** - **TruTV** - **Tube8** - **TubiTv** @@ -882,7 +932,6 @@ - **TV2** - **tv2.hu** - **TV2Article** - - **TV3** - **TV4**: tv4.se and tv4play.se - **TV5MondePlus**: TV5MONDE+ - **TVA** @@ -893,15 +942,20 @@ - **tvigle**: Интернет-телевидение Tvigle.ru - **tvland.com** - **TVN24** + - **TVNet** - **TVNoe** - **TVNow** - - **TVNowList** + - **TVNowAnnual** + - **TVNowNew** + - **TVNowSeason** - **TVNowShow** - **tvp**: Telewizja Polska - **tvp:embed**: Telewizja Polska - **tvp:series** - **TVPlayer** + - **TVPlayHome** - **Tweakers** + - **TwitCasting** - **twitch:chapter** - **twitch:clips** - **twitch:profile** @@ -926,8 +980,6 @@ - **uol.com.br** - **uplynk** - **uplynk:preplay** - - **Upskill** - - **UpskillCourse** - **Urort**: NRK P3 Urørt - **URPlay** - **USANetwork** @@ -946,6 +998,7 @@ - **VevoPlaylist** - **VGTV**: VGTV, BTTV, FTV, Aftenposten and Aftonbladet - **vh1.com** + - **vhx:embed** - **Viafree** - **vice** - **vice:article** @@ -990,6 +1043,7 @@ - **Vimple**: Vimple - one-click video hosting - **Vine** - **vine:user** + - **Viqeo** - **Viu** - **viu:ott** - **viu:playlist** @@ -1007,7 +1061,6 @@ - **Voot** - **VoxMedia** - **VoxMediaVolume** - - **Vporn** - **vpro**: npo.nl, ntr.nl, omroepwnl.nl, zapp.nl and npo3.nl - **Vrak** - **VRT**: deredactie.be, sporza.be, cobra.be and cobra.canvas.be @@ -1015,12 +1068,15 @@ - **vrv** - **vrv:series** - **VShare** + - **VTXTV** - **vube**: Vube.com - **VuClip** - **VVVVID** - **VyboryMos** - **Vzaar** + - **Wakanim** - **Walla** + - **WalyTV** - **washingtonpost** - **washingtonpost:article** - **wat.tv** @@ -1046,6 +1102,7 @@ - **wrzuta.pl:playlist** - **WSJ**: Wall Street Journal - **WSJArticle** + - **WWE** - **XBef** - **XboxClips** - **XFileShare**: XFileShare based sites: DaClips, FileHoot, GorillaVid, MovPod, PowerWatch, Rapidvideo.ws, TheVideoBee, Vidto, Streamin.To, XVIDSTAGE, Vid ABC, VidBom, vidlo, RapidVideo.TV, FastVideo.me @@ -1081,6 +1138,7 @@ - **YouNowLive** - **YouNowMoment** - **YouPorn** + - **YourPorn** - **YourUpload** - **youtube**: YouTube.com - **youtube:channel**: YouTube.com channels @@ -1104,3 +1162,4 @@ - **ZDF** - **ZDFChannel** - **zingmp3**: mp3.zing.vn + - **Zype** diff --git a/setup.py b/setup.py index 7dbb5805f..dfb669ad2 100644 --- a/setup.py +++ b/setup.py @@ -124,6 +124,8 @@ setup( 'Development Status :: 5 - Production/Stable', 'Environment :: Console', 'License :: Public Domain', + 'Programming Language :: Python', + 'Programming Language :: Python :: 2', 'Programming Language :: Python :: 2.6', 'Programming Language :: Python :: 2.7', 'Programming Language :: Python :: 3', @@ -132,6 +134,13 @@ setup( 'Programming Language :: Python :: 3.4', 'Programming Language :: Python :: 3.5', 'Programming Language :: Python :: 3.6', + 'Programming Language :: Python :: 3.7', + 'Programming Language :: Python :: 3.8', + 'Programming Language :: Python :: Implementation', + 'Programming Language :: Python :: Implementation :: CPython', + 'Programming Language :: Python :: Implementation :: IronPython', + 'Programming Language :: Python :: Implementation :: Jython', + 'Programming Language :: Python :: Implementation :: PyPy', ], cmdclass={'build_lazy_extractors': build_lazy_extractors}, diff --git a/test/helper.py b/test/helper.py index dfee217a9..e62aab11e 100644 --- a/test/helper.py +++ b/test/helper.py @@ -7,6 +7,7 @@ import json import os.path import re import types +import ssl import sys import youtube_dl.extractor @@ -152,15 +153,27 @@ def expect_value(self, got, expected, field): isinstance(got, compat_str), 'Expected field %s to be a unicode object, but got value %r of type %r' % (field, got, type(got))) got = 'md5:' + md5(got) - elif isinstance(expected, compat_str) and expected.startswith('mincount:'): + elif isinstance(expected, compat_str) and re.match(r'^(?:min|max)?count:\d+', expected): self.assertTrue( isinstance(got, (list, dict)), 'Expected field %s to be a list or a dict, but it is of type %s' % ( field, type(got).__name__)) - expected_num = int(expected.partition(':')[2]) - assertGreaterEqual( + op, _, expected_num = expected.partition(':') + expected_num = int(expected_num) + if op == 'mincount': + assert_func = assertGreaterEqual + msg_tmpl = 'Expected %d items in field %s, but only got %d' + elif op == 'maxcount': + assert_func = assertLessEqual + msg_tmpl = 'Expected maximum %d items in field %s, but got %d' + elif op == 'count': + assert_func = assertEqual + msg_tmpl = 'Expected exactly %d items in field %s, but got %d' + else: + assert False + assert_func( self, len(got), expected_num, - 'Expected %d items in field %s, but only got %d' % (expected_num, field, len(got))) + msg_tmpl % (expected_num, field, len(got))) return self.assertEqual( expected, got, @@ -236,6 +249,20 @@ def assertGreaterEqual(self, got, expected, msg=None): self.assertTrue(got >= expected, msg) +def assertLessEqual(self, got, expected, msg=None): + if not (got <= expected): + if msg is None: + msg = '%r not less than or equal to %r' % (got, expected) + self.assertTrue(got <= expected, msg) + + +def assertEqual(self, got, expected, msg=None): + if not (got == expected): + if msg is None: + msg = '%r not equal to %r' % (got, expected) + self.assertTrue(got == expected, msg) + + def expect_warnings(ydl, warnings_re): real_warning = ydl.report_warning @@ -244,3 +271,12 @@ def expect_warnings(ydl, warnings_re): real_warning(w) ydl.report_warning = _report_warning + + +def http_server_port(httpd): + if os.name == 'java' and isinstance(httpd.socket, ssl.SSLSocket): + # In Jython SSLSocket is not a subclass of socket.socket + sock = httpd.socket.sock + else: + sock = httpd.socket + return sock.getsockname()[1] diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py index 4833396a5..f0aa8466b 100644 --- a/test/test_InfoExtractor.py +++ b/test/test_InfoExtractor.py @@ -9,11 +9,30 @@ import sys import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from test.helper import FakeYDL, expect_dict, expect_value -from youtube_dl.compat import compat_etree_fromstring +from test.helper import FakeYDL, expect_dict, expect_value, http_server_port +from youtube_dl.compat import compat_etree_fromstring, compat_http_server from youtube_dl.extractor.common import InfoExtractor from youtube_dl.extractor import YoutubeIE, get_info_extractor from youtube_dl.utils import encode_data_uri, strip_jsonp, ExtractorError, RegexNotFoundError +import threading + + +TEAPOT_RESPONSE_STATUS = 418 +TEAPOT_RESPONSE_BODY = "

418 I'm a teapot

" + + +class InfoExtractorTestRequestHandler(compat_http_server.BaseHTTPRequestHandler): + def log_message(self, format, *args): + pass + + def do_GET(self): + if self.path == '/teapot': + self.send_response(TEAPOT_RESPONSE_STATUS) + self.send_header('Content-Type', 'text/html; charset=utf-8') + self.end_headers() + self.wfile.write(TEAPOT_RESPONSE_BODY.encode()) + else: + assert False class TestIE(InfoExtractor): @@ -42,6 +61,7 @@ class TestInfoExtractor(unittest.TestCase): + ''' self.assertEqual(ie._og_search_title(html), 'Foo') self.assertEqual(ie._og_search_description(html), 'Some video\'s description ') @@ -50,6 +70,7 @@ class TestInfoExtractor(unittest.TestCase): self.assertEqual(ie._og_search_property('foobar', html), 'Foo') self.assertEqual(ie._og_search_property('test1', html), 'foo > < bar') self.assertEqual(ie._og_search_property('test2', html), 'foo >//< bar') + self.assertEqual(ie._og_search_property('test3', html), 'Ill-formatted opengraph') self.assertEqual(ie._og_search_property(('test0', 'test1'), html), 'foo > < bar') self.assertRaises(RegexNotFoundError, ie._og_search_property, 'test0', html, None, fatal=True) self.assertRaises(RegexNotFoundError, ie._og_search_property, ('test0', 'test00'), html, None, fatal=True) @@ -478,7 +499,64 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/ 'width': 1280, 'height': 720, }] - ) + ), + ( + # https://github.com/rg3/youtube-dl/issues/18923 + # https://www.ted.com/talks/boris_hesser_a_grassroots_healthcare_revolution_in_africa + 'ted_18923', + 'http://hls.ted.com/talks/31241.m3u8', + [{ + 'url': 'http://hls.ted.com/videos/BorisHesser_2018S/audio/600k.m3u8?nobumpers=true&uniqueId=76011e2b', + 'format_id': '600k-Audio', + 'vcodec': 'none', + }, { + 'url': 'http://hls.ted.com/videos/BorisHesser_2018S/audio/600k.m3u8?nobumpers=true&uniqueId=76011e2b', + 'format_id': '68', + 'vcodec': 'none', + }, { + 'url': 'http://hls.ted.com/videos/BorisHesser_2018S/video/64k.m3u8?nobumpers=true&uniqueId=76011e2b', + 'format_id': '163', + 'acodec': 'none', + 'width': 320, + 'height': 180, + }, { + 'url': 'http://hls.ted.com/videos/BorisHesser_2018S/video/180k.m3u8?nobumpers=true&uniqueId=76011e2b', + 'format_id': '481', + 'acodec': 'none', + 'width': 512, + 'height': 288, + }, { + 'url': 'http://hls.ted.com/videos/BorisHesser_2018S/video/320k.m3u8?nobumpers=true&uniqueId=76011e2b', + 'format_id': '769', + 'acodec': 'none', + 'width': 512, + 'height': 288, + }, { + 'url': 'http://hls.ted.com/videos/BorisHesser_2018S/video/450k.m3u8?nobumpers=true&uniqueId=76011e2b', + 'format_id': '984', + 'acodec': 'none', + 'width': 512, + 'height': 288, + }, { + 'url': 'http://hls.ted.com/videos/BorisHesser_2018S/video/600k.m3u8?nobumpers=true&uniqueId=76011e2b', + 'format_id': '1255', + 'acodec': 'none', + 'width': 640, + 'height': 360, + }, { + 'url': 'http://hls.ted.com/videos/BorisHesser_2018S/video/950k.m3u8?nobumpers=true&uniqueId=76011e2b', + 'format_id': '1693', + 'acodec': 'none', + 'width': 853, + 'height': 480, + }, { + 'url': 'http://hls.ted.com/videos/BorisHesser_2018S/video/1500k.m3u8?nobumpers=true&uniqueId=76011e2b', + 'format_id': '2462', + 'acodec': 'none', + 'width': 1280, + 'height': 720, + }] + ), ] for m3u8_file, m3u8_url, expected_formats in _TEST_CASES: @@ -743,6 +821,25 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/ for i in range(len(entries)): expect_dict(self, entries[i], expected_entries[i]) + def test_response_with_expected_status_returns_content(self): + # Checks for mitigations against the effects of + # that affect Python 3.4.1+, which + # manifest as `_download_webpage`, `_download_xml`, `_download_json`, + # or the underlying `_download_webpage_handle` returning no content + # when a response matches `expected_status`. + + httpd = compat_http_server.HTTPServer( + ('127.0.0.1', 0), InfoExtractorTestRequestHandler) + port = http_server_port(httpd) + server_thread = threading.Thread(target=httpd.serve_forever) + server_thread.daemon = True + server_thread.start() + + (content, urlh) = self.ie._download_webpage_handle( + 'http://127.0.0.1:%d/teapot' % port, None, + expected_status=TEAPOT_RESPONSE_STATUS) + self.assertEqual(content, TEAPOT_RESPONSE_BODY) + if __name__ == '__main__': unittest.main() diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index f0f5a8470..1d7452744 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -239,6 +239,76 @@ class TestFormatSelection(unittest.TestCase): downloaded = ydl.downloaded_info_dicts[0] self.assertEqual(downloaded['format_id'], 'vid-vcodec-dot') + def test_format_selection_string_ops(self): + formats = [ + {'format_id': 'abc-cba', 'ext': 'mp4', 'url': TEST_URL}, + {'format_id': 'zxc-cxz', 'ext': 'webm', 'url': TEST_URL}, + ] + info_dict = _make_result(formats) + + # equals (=) + ydl = YDL({'format': '[format_id=abc-cba]'}) + ydl.process_ie_result(info_dict.copy()) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded['format_id'], 'abc-cba') + + # does not equal (!=) + ydl = YDL({'format': '[format_id!=abc-cba]'}) + ydl.process_ie_result(info_dict.copy()) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded['format_id'], 'zxc-cxz') + + ydl = YDL({'format': '[format_id!=abc-cba][format_id!=zxc-cxz]'}) + self.assertRaises(ExtractorError, ydl.process_ie_result, info_dict.copy()) + + # starts with (^=) + ydl = YDL({'format': '[format_id^=abc]'}) + ydl.process_ie_result(info_dict.copy()) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded['format_id'], 'abc-cba') + + # does not start with (!^=) + ydl = YDL({'format': '[format_id!^=abc]'}) + ydl.process_ie_result(info_dict.copy()) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded['format_id'], 'zxc-cxz') + + ydl = YDL({'format': '[format_id!^=abc][format_id!^=zxc]'}) + self.assertRaises(ExtractorError, ydl.process_ie_result, info_dict.copy()) + + # ends with ($=) + ydl = YDL({'format': '[format_id$=cba]'}) + ydl.process_ie_result(info_dict.copy()) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded['format_id'], 'abc-cba') + + # does not end with (!$=) + ydl = YDL({'format': '[format_id!$=cba]'}) + ydl.process_ie_result(info_dict.copy()) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded['format_id'], 'zxc-cxz') + + ydl = YDL({'format': '[format_id!$=cba][format_id!$=cxz]'}) + self.assertRaises(ExtractorError, ydl.process_ie_result, info_dict.copy()) + + # contains (*=) + ydl = YDL({'format': '[format_id*=bc-cb]'}) + ydl.process_ie_result(info_dict.copy()) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded['format_id'], 'abc-cba') + + # does not contain (!*=) + ydl = YDL({'format': '[format_id!*=bc-cb]'}) + ydl.process_ie_result(info_dict.copy()) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded['format_id'], 'zxc-cxz') + + ydl = YDL({'format': '[format_id!*=abc][format_id!*=zxc]'}) + self.assertRaises(ExtractorError, ydl.process_ie_result, info_dict.copy()) + + ydl = YDL({'format': '[format_id!*=-]'}) + self.assertRaises(ExtractorError, ydl.process_ie_result, info_dict.copy()) + def test_youtube_format_selection(self): order = [ '38', '37', '46', '22', '45', '35', '44', '18', '34', '43', '6', '5', '17', '36', '13', diff --git a/test/test_YoutubeDLCookieJar.py b/test/test_YoutubeDLCookieJar.py new file mode 100644 index 000000000..6a8243590 --- /dev/null +++ b/test/test_YoutubeDLCookieJar.py @@ -0,0 +1,34 @@ +#!/usr/bin/env python +# coding: utf-8 + +from __future__ import unicode_literals + +import os +import re +import sys +import tempfile +import unittest +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from youtube_dl.utils import YoutubeDLCookieJar + + +class TestYoutubeDLCookieJar(unittest.TestCase): + def test_keep_session_cookies(self): + cookiejar = YoutubeDLCookieJar('./test/testdata/cookies/session_cookies.txt') + cookiejar.load(ignore_discard=True, ignore_expires=True) + tf = tempfile.NamedTemporaryFile(delete=False) + try: + cookiejar.save(filename=tf.name, ignore_discard=True, ignore_expires=True) + temp = tf.read().decode('utf-8') + self.assertTrue(re.search( + r'www\.foobar\.foobar\s+FALSE\s+/\s+TRUE\s+0\s+YoutubeDLExpiresEmpty\s+YoutubeDLExpiresEmptyValue', temp)) + self.assertTrue(re.search( + r'www\.foobar\.foobar\s+FALSE\s+/\s+TRUE\s+0\s+YoutubeDLExpires0\s+YoutubeDLExpires0Value', temp)) + finally: + tf.close() + os.remove(tf.name) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/test_compat.py b/test/test_compat.py index d6c54e135..51fe6aa0b 100644 --- a/test/test_compat.py +++ b/test/test_compat.py @@ -39,7 +39,7 @@ class TestCompat(unittest.TestCase): def test_compat_expanduser(self): old_home = os.environ.get('HOME') - test_str = 'C:\Documents and Settings\тест\Application Data' + test_str = r'C:\Documents and Settings\тест\Application Data' compat_setenv('HOME', test_str) self.assertEqual(compat_expanduser('~'), test_str) compat_setenv('HOME', old_home or '') diff --git a/test/test_downloader_http.py b/test/test_downloader_http.py index 5cf2bf1a5..750472281 100644 --- a/test/test_downloader_http.py +++ b/test/test_downloader_http.py @@ -9,26 +9,16 @@ import sys import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from test.helper import try_rm +from test.helper import http_server_port, try_rm from youtube_dl import YoutubeDL from youtube_dl.compat import compat_http_server from youtube_dl.downloader.http import HttpFD from youtube_dl.utils import encodeFilename -import ssl import threading TEST_DIR = os.path.dirname(os.path.abspath(__file__)) -def http_server_port(httpd): - if os.name == 'java' and isinstance(httpd.socket, ssl.SSLSocket): - # In Jython SSLSocket is not a subclass of socket.socket - sock = httpd.socket.sock - else: - sock = httpd.socket - return sock.getsockname()[1] - - TEST_SIZE = 10 * 1024 diff --git a/test/test_http.py b/test/test_http.py index 409fec9c8..3ee0a5dda 100644 --- a/test/test_http.py +++ b/test/test_http.py @@ -8,6 +8,7 @@ import sys import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from test.helper import http_server_port from youtube_dl import YoutubeDL from youtube_dl.compat import compat_http_server, compat_urllib_request import ssl @@ -16,15 +17,6 @@ import threading TEST_DIR = os.path.dirname(os.path.abspath(__file__)) -def http_server_port(httpd): - if os.name == 'java' and isinstance(httpd.socket, ssl.SSLSocket): - # In Jython SSLSocket is not a subclass of socket.socket - sock = httpd.socket.sock - else: - sock = httpd.socket - return sock.getsockname()[1] - - class HTTPTestRequestHandler(compat_http_server.BaseHTTPRequestHandler): def log_message(self, format, *args): pass diff --git a/test/test_postprocessors.py b/test/test_postprocessors.py index addb69d6f..4209d1d9a 100644 --- a/test/test_postprocessors.py +++ b/test/test_postprocessors.py @@ -14,4 +14,4 @@ from youtube_dl.postprocessor import MetadataFromTitlePP class TestMetadataFromTitle(unittest.TestCase): def test_format_to_regex(self): pp = MetadataFromTitlePP(None, '%(title)s - %(artist)s') - self.assertEqual(pp._titleregex, '(?P.+)\ \-\ (?P<artist>.+)') + self.assertEqual(pp._titleregex, r'(?P<title>.+)\ \-\ (?P<artist>.+)') diff --git a/test/test_utils.py b/test/test_utils.py index e63af0166..409482c3b 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -78,6 +78,7 @@ from youtube_dl.utils import ( uppercase_escape, lowercase_escape, url_basename, + url_or_none, base_url, urljoin, urlencode_postdata, @@ -506,6 +507,18 @@ class TestUtil(unittest.TestCase): self.assertEqual(urljoin('http://foo.de/', ''), None) self.assertEqual(urljoin('http://foo.de/', ['foobar']), None) self.assertEqual(urljoin('http://foo.de/a/b/c.txt', '.././../d.txt'), 'http://foo.de/d.txt') + self.assertEqual(urljoin('http://foo.de/a/b/c.txt', 'rtmp://foo.de'), 'rtmp://foo.de') + self.assertEqual(urljoin(None, 'rtmp://foo.de'), 'rtmp://foo.de') + + def test_url_or_none(self): + self.assertEqual(url_or_none(None), None) + self.assertEqual(url_or_none(''), None) + self.assertEqual(url_or_none('foo'), None) + self.assertEqual(url_or_none('http://foo.de'), 'http://foo.de') + self.assertEqual(url_or_none('https://foo.de'), 'https://foo.de') + self.assertEqual(url_or_none('http$://foo.de'), None) + self.assertEqual(url_or_none('http://foo.de'), 'http://foo.de') + self.assertEqual(url_or_none('//foo.de'), '//foo.de') def test_parse_age_limit(self): self.assertEqual(parse_age_limit(None), None) @@ -717,6 +730,10 @@ class TestUtil(unittest.TestCase): d = json.loads(stripped) self.assertEqual(d, {'status': 'success'}) + stripped = strip_jsonp('({"status": "success"});') + d = json.loads(stripped) + self.assertEqual(d, {'status': 'success'}) + def test_uppercase_escape(self): self.assertEqual(uppercase_escape('aä'), 'aä') self.assertEqual(uppercase_escape('\\U0001d550'), '𝕐') @@ -770,6 +787,10 @@ class TestUtil(unittest.TestCase): 'vcodec': 'h264', 'acodec': 'aac', }) + self.assertEqual(parse_codecs('av01.0.05M.08'), { + 'vcodec': 'av01.0.05M.08', + 'acodec': 'none', + }) def test_escape_rfc3986(self): reserved = "!*'();:@&=+$,/?#[]" diff --git a/test/testdata/cookies/session_cookies.txt b/test/testdata/cookies/session_cookies.txt new file mode 100644 index 000000000..f6996f031 --- /dev/null +++ b/test/testdata/cookies/session_cookies.txt @@ -0,0 +1,6 @@ +# Netscape HTTP Cookie File +# http://curl.haxx.se/rfc/cookie_spec.html +# This is a generated file! Do not edit. + +www.foobar.foobar FALSE / TRUE YoutubeDLExpiresEmpty YoutubeDLExpiresEmptyValue +www.foobar.foobar FALSE / TRUE 0 YoutubeDLExpires0 YoutubeDLExpires0Value diff --git a/test/testdata/m3u8/ted_18923.m3u8 b/test/testdata/m3u8/ted_18923.m3u8 new file mode 100644 index 000000000..52a27118b --- /dev/null +++ b/test/testdata/m3u8/ted_18923.m3u8 @@ -0,0 +1,28 @@ +#EXTM3U +#EXT-X-VERSION:4 +#EXT-X-STREAM-INF:AUDIO="600k",BANDWIDTH=1255659,PROGRAM-ID=1,CODECS="avc1.42c01e,mp4a.40.2",RESOLUTION=640x360 +/videos/BorisHesser_2018S/video/600k.m3u8?nobumpers=true&uniqueId=76011e2b +#EXT-X-STREAM-INF:AUDIO="600k",BANDWIDTH=163154,PROGRAM-ID=1,CODECS="avc1.42c00c,mp4a.40.2",RESOLUTION=320x180 +/videos/BorisHesser_2018S/video/64k.m3u8?nobumpers=true&uniqueId=76011e2b +#EXT-X-STREAM-INF:AUDIO="600k",BANDWIDTH=481701,PROGRAM-ID=1,CODECS="avc1.42c015,mp4a.40.2",RESOLUTION=512x288 +/videos/BorisHesser_2018S/video/180k.m3u8?nobumpers=true&uniqueId=76011e2b +#EXT-X-STREAM-INF:AUDIO="600k",BANDWIDTH=769968,PROGRAM-ID=1,CODECS="avc1.42c015,mp4a.40.2",RESOLUTION=512x288 +/videos/BorisHesser_2018S/video/320k.m3u8?nobumpers=true&uniqueId=76011e2b +#EXT-X-STREAM-INF:AUDIO="600k",BANDWIDTH=984037,PROGRAM-ID=1,CODECS="avc1.42c015,mp4a.40.2",RESOLUTION=512x288 +/videos/BorisHesser_2018S/video/450k.m3u8?nobumpers=true&uniqueId=76011e2b +#EXT-X-STREAM-INF:AUDIO="600k",BANDWIDTH=1693925,PROGRAM-ID=1,CODECS="avc1.4d401f,mp4a.40.2",RESOLUTION=853x480 +/videos/BorisHesser_2018S/video/950k.m3u8?nobumpers=true&uniqueId=76011e2b +#EXT-X-STREAM-INF:AUDIO="600k",BANDWIDTH=2462469,PROGRAM-ID=1,CODECS="avc1.640028,mp4a.40.2",RESOLUTION=1280x720 +/videos/BorisHesser_2018S/video/1500k.m3u8?nobumpers=true&uniqueId=76011e2b +#EXT-X-STREAM-INF:AUDIO="600k",BANDWIDTH=68101,PROGRAM-ID=1,CODECS="mp4a.40.2",DEFAULT=YES +/videos/BorisHesser_2018S/audio/600k.m3u8?nobumpers=true&uniqueId=76011e2b + +#EXT-X-I-FRAME-STREAM-INF:BANDWIDTH=74298,PROGRAM-ID=1,CODECS="avc1.42c00c",RESOLUTION=320x180,URI="/videos/BorisHesser_2018S/video/64k_iframe.m3u8?nobumpers=true&uniqueId=76011e2b" +#EXT-X-I-FRAME-STREAM-INF:BANDWIDTH=216200,PROGRAM-ID=1,CODECS="avc1.42c015",RESOLUTION=512x288,URI="/videos/BorisHesser_2018S/video/180k_iframe.m3u8?nobumpers=true&uniqueId=76011e2b" +#EXT-X-I-FRAME-STREAM-INF:BANDWIDTH=304717,PROGRAM-ID=1,CODECS="avc1.42c015",RESOLUTION=512x288,URI="/videos/BorisHesser_2018S/video/320k_iframe.m3u8?nobumpers=true&uniqueId=76011e2b" +#EXT-X-I-FRAME-STREAM-INF:BANDWIDTH=350933,PROGRAM-ID=1,CODECS="avc1.42c015",RESOLUTION=512x288,URI="/videos/BorisHesser_2018S/video/450k_iframe.m3u8?nobumpers=true&uniqueId=76011e2b" +#EXT-X-I-FRAME-STREAM-INF:BANDWIDTH=495850,PROGRAM-ID=1,CODECS="avc1.42c01e",RESOLUTION=640x360,URI="/videos/BorisHesser_2018S/video/600k_iframe.m3u8?nobumpers=true&uniqueId=76011e2b" +#EXT-X-I-FRAME-STREAM-INF:BANDWIDTH=810750,PROGRAM-ID=1,CODECS="avc1.4d401f",RESOLUTION=853x480,URI="/videos/BorisHesser_2018S/video/950k_iframe.m3u8?nobumpers=true&uniqueId=76011e2b" +#EXT-X-I-FRAME-STREAM-INF:BANDWIDTH=1273700,PROGRAM-ID=1,CODECS="avc1.640028",RESOLUTION=1280x720,URI="/videos/BorisHesser_2018S/video/1500k_iframe.m3u8?nobumpers=true&uniqueId=76011e2b" + +#EXT-X-MEDIA:TYPE=AUDIO,GROUP-ID="600k",LANGUAGE="en",NAME="Audio",AUTOSELECT=YES,DEFAULT=YES,URI="/videos/BorisHesser_2018S/audio/600k.m3u8?nobumpers=true&uniqueId=76011e2b",BANDWIDTH=614400 diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 2a405c5ca..bc9fc270c 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -82,12 +82,14 @@ from .utils import ( sanitize_url, sanitized_Request, std_headers, + str_or_none, subtitles_filename, UnavailableVideoError, url_basename, version_tuple, write_json_file, write_string, + YoutubeDLCookieJar, YoutubeDLCookieProcessor, YoutubeDLHandler, ) @@ -305,8 +307,8 @@ class YoutubeDL(object): http_chunk_size. The following options are used by the post processors: - prefer_ffmpeg: If True, use ffmpeg instead of avconv if both are available, - otherwise prefer avconv. + prefer_ffmpeg: If False, use avconv instead of ffmpeg if both are available, + otherwise prefer ffmpeg. postprocessor_args: A list of additional command-line arguments for the postprocessor. @@ -558,7 +560,7 @@ class YoutubeDL(object): self.restore_console_title() if self.params.get('cookiefile') is not None: - self.cookiejar.save() + self.cookiejar.save(ignore_discard=True, ignore_expires=True) def trouble(self, message=None, tb=None): """Determine action to take when a download problem appears. @@ -1062,21 +1064,24 @@ class YoutubeDL(object): if not m: STR_OPERATORS = { '=': operator.eq, - '!=': operator.ne, '^=': lambda attr, value: attr.startswith(value), '$=': lambda attr, value: attr.endswith(value), '*=': lambda attr, value: value in attr, } str_operator_rex = re.compile(r'''(?x) \s*(?P<key>ext|acodec|vcodec|container|protocol|format_id) - \s*(?P<op>%s)(?P<none_inclusive>\s*\?)? + \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)? \s*(?P<value>[a-zA-Z0-9._-]+) \s*$ ''' % '|'.join(map(re.escape, STR_OPERATORS.keys()))) m = str_operator_rex.search(filter_spec) if m: comparison_value = m.group('value') - op = STR_OPERATORS[m.group('op')] + str_op = STR_OPERATORS[m.group('op')] + if m.group('negation'): + op = lambda attr, value: not str_op(attr, value) + else: + op = str_op if not m: raise ValueError('Invalid filter specification %r' % filter_spec) @@ -2056,15 +2061,24 @@ class YoutubeDL(object): self.report_warning('Unable to remove downloaded original file') def _make_archive_id(self, info_dict): + video_id = info_dict.get('id') + if not video_id: + return # Future-proof against any change in case # and backwards compatibility with prior versions - extractor = info_dict.get('extractor_key') + extractor = info_dict.get('extractor_key') or info_dict.get('ie_key') # key in a playlist if extractor is None: - if 'id' in info_dict: - extractor = info_dict.get('ie_key') # key in a playlist - if extractor is None: - return None # Incomplete video information - return extractor.lower() + ' ' + info_dict['id'] + url = str_or_none(info_dict.get('url')) + if not url: + return + # Try to find matching extractor for the URL and take its ie_key + for ie in self._ies: + if ie.suitable(url): + extractor = ie.ie_key() + break + else: + return + return extractor.lower() + ' ' + video_id def in_download_archive(self, info_dict): fn = self.params.get('download_archive') @@ -2072,7 +2086,7 @@ class YoutubeDL(object): return False vid_id = self._make_archive_id(info_dict) - if vid_id is None: + if not vid_id: return False # Incomplete video information try: @@ -2297,10 +2311,9 @@ class YoutubeDL(object): self.cookiejar = compat_cookiejar.CookieJar() else: opts_cookiefile = expand_path(opts_cookiefile) - self.cookiejar = compat_cookiejar.MozillaCookieJar( - opts_cookiefile) + self.cookiejar = YoutubeDLCookieJar(opts_cookiefile) if os.access(opts_cookiefile, os.R_OK): - self.cookiejar.load() + self.cookiejar.load(ignore_discard=True, ignore_expires=True) cookie_processor = YoutubeDLCookieProcessor(self.cookiejar) if opts_proxy is not None: diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index 4a611f183..7b770340f 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -2787,6 +2787,12 @@ except NameError: # Python 3 compat_numeric_types = (int, float, complex) +try: + compat_integer_types = (int, long) +except NameError: # Python 3 + compat_integer_types = (int, ) + + if sys.version_info < (2, 7): def compat_socket_create_connection(address, timeout, source_address=None): host, port = address @@ -2974,6 +2980,7 @@ __all__ = [ 'compat_http_client', 'compat_http_server', 'compat_input', + 'compat_integer_types', 'compat_itertools_count', 'compat_kwargs', 'compat_numeric_types', diff --git a/youtube_dl/downloader/dash.py b/youtube_dl/downloader/dash.py index 576ece6db..eaa7adf7c 100644 --- a/youtube_dl/downloader/dash.py +++ b/youtube_dl/downloader/dash.py @@ -2,7 +2,10 @@ from __future__ import unicode_literals from .fragment import FragmentFD from ..compat import compat_urllib_error -from ..utils import urljoin +from ..utils import ( + DownloadError, + urljoin, +) class DashSegmentsFD(FragmentFD): @@ -57,6 +60,14 @@ class DashSegmentsFD(FragmentFD): count += 1 if count <= fragment_retries: self.report_retry_fragment(err, frag_index, count, fragment_retries) + except DownloadError: + # Don't retry fragment if error occurred during HTTP downloading + # itself since it has own retry settings + if not fatal: + self.report_skip_fragment(frag_index) + break + raise + if count > fragment_retries: if not fatal: self.report_skip_fragment(frag_index) diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py index fd304527e..4def8e2d5 100644 --- a/youtube_dl/downloader/hls.py +++ b/youtube_dl/downloader/hls.py @@ -75,10 +75,14 @@ class HlsFD(FragmentFD): fd.add_progress_hook(ph) return fd.real_download(filename, info_dict) - def is_ad_fragment(s): + def is_ad_fragment_start(s): return (s.startswith('#ANVATO-SEGMENT-INFO') and 'type=ad' in s or s.startswith('#UPLYNK-SEGMENT') and s.endswith(',ad')) + def is_ad_fragment_end(s): + return (s.startswith('#ANVATO-SEGMENT-INFO') and 'type=master' in s or + s.startswith('#UPLYNK-SEGMENT') and s.endswith(',segment')) + media_frags = 0 ad_frags = 0 ad_frag_next = False @@ -87,12 +91,13 @@ class HlsFD(FragmentFD): if not line: continue if line.startswith('#'): - if is_ad_fragment(line): - ad_frags += 1 + if is_ad_fragment_start(line): ad_frag_next = True + elif is_ad_fragment_end(line): + ad_frag_next = False continue if ad_frag_next: - ad_frag_next = False + ad_frags += 1 continue media_frags += 1 @@ -123,7 +128,6 @@ class HlsFD(FragmentFD): if line: if not line.startswith('#'): if ad_frag_next: - ad_frag_next = False continue frag_index += 1 if frag_index <= ctx['fragment_index']: @@ -196,8 +200,10 @@ class HlsFD(FragmentFD): 'start': sub_range_start, 'end': sub_range_start + int(splitted_byte_range[0]), } - elif is_ad_fragment(line): + elif is_ad_fragment_start(line): ad_frag_next = True + elif is_ad_fragment_end(line): + ad_frag_next = False self._finish_frag_download(ctx) diff --git a/youtube_dl/downloader/rtmp.py b/youtube_dl/downloader/rtmp.py index 9e0ddbb18..fbb7f51b0 100644 --- a/youtube_dl/downloader/rtmp.py +++ b/youtube_dl/downloader/rtmp.py @@ -24,13 +24,12 @@ class RtmpFD(FileDownloader): def real_download(self, filename, info_dict): def run_rtmpdump(args): start = time.time() + resume_percent = None + resume_downloaded_data_len = None proc = subprocess.Popen(args, stderr=subprocess.PIPE) cursor_in_new_line = True - - def dl(): - resume_percent = None - resume_downloaded_data_len = None - proc_stderr_closed = False + proc_stderr_closed = False + try: while not proc_stderr_closed: # read line from stderr line = '' @@ -90,12 +89,8 @@ class RtmpFD(FileDownloader): self.to_screen('') cursor_in_new_line = True self.to_screen('[rtmpdump] ' + line) - - try: - dl() finally: proc.wait() - if not cursor_in_new_line: self.to_screen('') return proc.returncode diff --git a/youtube_dl/extractor/acast.py b/youtube_dl/extractor/acast.py index 6d846ea7a..c4362be88 100644 --- a/youtube_dl/extractor/acast.py +++ b/youtube_dl/extractor/acast.py @@ -17,25 +17,15 @@ from ..utils import ( class ACastIE(InfoExtractor): IE_NAME = 'acast' - _VALID_URL = r'https?://(?:www\.)?acast\.com/(?P<channel>[^/]+)/(?P<id>[^/#?]+)' + _VALID_URL = r'''(?x) + https?:// + (?: + (?:(?:embed|www)\.)?acast\.com/| + play\.acast\.com/s/ + ) + (?P<channel>[^/]+)/(?P<id>[^/#?]+) + ''' _TESTS = [{ - # test with one bling - 'url': 'https://www.acast.com/condenasttraveler/-where-are-you-taipei-101-taiwan', - 'md5': 'ada3de5a1e3a2a381327d749854788bb', - 'info_dict': { - 'id': '57de3baa-4bb0-487e-9418-2692c1277a34', - 'ext': 'mp3', - 'title': '"Where Are You?": Taipei 101, Taiwan', - 'description': 'md5:a0b4ef3634e63866b542e5b1199a1a0e', - 'timestamp': 1196172000, - 'upload_date': '20071127', - 'duration': 211, - 'creator': 'Concierge', - 'series': 'Condé Nast Traveler Podcast', - 'episode': '"Where Are You?": Taipei 101, Taiwan', - } - }, { - # test with multiple blings 'url': 'https://www.acast.com/sparpodcast/2.raggarmordet-rosterurdetforflutna', 'md5': 'a02393c74f3bdb1801c3ec2695577ce0', 'info_dict': { @@ -50,6 +40,12 @@ class ACastIE(InfoExtractor): 'series': 'Spår', 'episode': '2. Raggarmordet - Röster ur det förflutna', } + }, { + 'url': 'http://embed.acast.com/adambuxton/ep.12-adam-joeschristmaspodcast2015', + 'only_matching': True, + }, { + 'url': 'https://play.acast.com/s/rattegangspodden/s04e09-styckmordet-i-helenelund-del-22', + 'only_matching': True, }] def _real_extract(self, url): @@ -83,17 +79,27 @@ class ACastIE(InfoExtractor): class ACastChannelIE(InfoExtractor): IE_NAME = 'acast:channel' - _VALID_URL = r'https?://(?:www\.)?acast\.com/(?P<id>[^/#?]+)' - _TEST = { - 'url': 'https://www.acast.com/condenasttraveler', + _VALID_URL = r'''(?x) + https?:// + (?: + (?:www\.)?acast\.com/| + play\.acast\.com/s/ + ) + (?P<id>[^/#?]+) + ''' + _TESTS = [{ + 'url': 'https://www.acast.com/todayinfocus', 'info_dict': { - 'id': '50544219-29bb-499e-a083-6087f4cb7797', - 'title': 'Condé Nast Traveler Podcast', - 'description': 'md5:98646dee22a5b386626ae31866638fbd', + 'id': '4efc5294-5385-4847-98bd-519799ce5786', + 'title': 'Today in Focus', + 'description': 'md5:9ba5564de5ce897faeb12963f4537a64', }, - 'playlist_mincount': 20, - } - _API_BASE_URL = 'https://www.acast.com/api/' + 'playlist_mincount': 35, + }, { + 'url': 'http://play.acast.com/s/ft-banking-weekly', + 'only_matching': True, + }] + _API_BASE_URL = 'https://play.acast.com/api/' _PAGE_SIZE = 10 @classmethod @@ -106,7 +112,7 @@ class ACastChannelIE(InfoExtractor): channel_slug, note='Download page %d of channel data' % page) for cast in casts: yield self.url_result( - 'https://www.acast.com/%s/%s' % (channel_slug, cast['url']), + 'https://play.acast.com/s/%s/%s' % (channel_slug, cast['url']), 'ACast', cast['id']) def _real_extract(self, url): diff --git a/youtube_dl/extractor/adobepass.py b/youtube_dl/extractor/adobepass.py index b83b51efb..1cf2dcbf3 100644 --- a/youtube_dl/extractor/adobepass.py +++ b/youtube_dl/extractor/adobepass.py @@ -1325,8 +1325,8 @@ class AdobePassIE(InfoExtractor): _DOWNLOADING_LOGIN_PAGE = 'Downloading Provider Login Page' def _download_webpage_handle(self, *args, **kwargs): - headers = kwargs.get('headers', {}) - headers.update(self.geo_verification_headers()) + headers = self.geo_verification_headers() + headers.update(kwargs.get('headers', {})) kwargs['headers'] = headers return super(AdobePassIE, self)._download_webpage_handle( *args, **compat_kwargs(kwargs)) diff --git a/youtube_dl/extractor/adultswim.py b/youtube_dl/extractor/adultswim.py index acc4ce38d..88c96a950 100644 --- a/youtube_dl/extractor/adultswim.py +++ b/youtube_dl/extractor/adultswim.py @@ -7,6 +7,7 @@ from .turner import TurnerBaseIE from ..utils import ( int_or_none, strip_or_none, + url_or_none, ) @@ -98,7 +99,7 @@ class AdultSwimIE(TurnerBaseIE): if not video_id: entries = [] for episode in video_data.get('archiveEpisodes', []): - episode_url = episode.get('url') + episode_url = url_or_none(episode.get('url')) if not episode_url: continue entries.append(self.url_result( diff --git a/youtube_dl/extractor/aenetworks.py b/youtube_dl/extractor/aenetworks.py index 398e56ea3..85ec6392d 100644 --- a/youtube_dl/extractor/aenetworks.py +++ b/youtube_dl/extractor/aenetworks.py @@ -22,18 +22,19 @@ class AENetworksBaseIE(ThePlatformIE): class AENetworksIE(AENetworksBaseIE): IE_NAME = 'aenetworks' - IE_DESC = 'A+E Networks: A&E, Lifetime, History.com, FYI Network' + IE_DESC = 'A+E Networks: A&E, Lifetime, History.com, FYI Network and History Vault' _VALID_URL = r'''(?x) https?:// (?:www\.)? (?P<domain> - (?:history|aetv|mylifetime|lifetimemovieclub)\.com| + (?:history(?:vault)?|aetv|mylifetime|lifetimemovieclub)\.com| fyi\.tv )/ (?: shows/(?P<show_path>[^/]+(?:/[^/]+){0,2})| movies/(?P<movie_display_id>[^/]+)(?:/full-movie)?| - specials/(?P<special_display_id>[^/]+)/full-special + specials/(?P<special_display_id>[^/]+)/full-special| + collections/[^/]+/(?P<collection_display_id>[^/]+) ) ''' _TESTS = [{ @@ -80,6 +81,9 @@ class AENetworksIE(AENetworksBaseIE): }, { 'url': 'http://www.history.com/specials/sniper-into-the-kill-zone/full-special', 'only_matching': True + }, { + 'url': 'https://www.historyvault.com/collections/america-the-story-of-us/westward', + 'only_matching': True }] _DOMAIN_TO_REQUESTOR_ID = { 'history.com': 'HISTORY', @@ -90,9 +94,9 @@ class AENetworksIE(AENetworksBaseIE): } def _real_extract(self, url): - domain, show_path, movie_display_id, special_display_id = re.match(self._VALID_URL, url).groups() - display_id = show_path or movie_display_id or special_display_id - webpage = self._download_webpage(url, display_id) + domain, show_path, movie_display_id, special_display_id, collection_display_id = re.match(self._VALID_URL, url).groups() + display_id = show_path or movie_display_id or special_display_id or collection_display_id + webpage = self._download_webpage(url, display_id, headers=self.geo_verification_headers()) if show_path: url_parts = show_path.split('/') url_parts_len = len(url_parts) diff --git a/youtube_dl/extractor/afreecatv.py b/youtube_dl/extractor/afreecatv.py index 4b3d97136..6275e5209 100644 --- a/youtube_dl/extractor/afreecatv.py +++ b/youtube_dl/extractor/afreecatv.py @@ -9,6 +9,7 @@ from ..utils import ( determine_ext, ExtractorError, int_or_none, + url_or_none, urlencode_postdata, xpath_text, ) @@ -304,7 +305,7 @@ class AfreecaTVIE(InfoExtractor): file_elements = video_element.findall(compat_xpath('./file')) one = len(file_elements) == 1 for file_num, file_element in enumerate(file_elements, start=1): - file_url = file_element.text + file_url = url_or_none(file_element.text) if not file_url: continue key = file_element.get('key', '') diff --git a/youtube_dl/extractor/americastestkitchen.py b/youtube_dl/extractor/americastestkitchen.py index 01736872d..8b32aa886 100644 --- a/youtube_dl/extractor/americastestkitchen.py +++ b/youtube_dl/extractor/americastestkitchen.py @@ -43,10 +43,6 @@ class AmericasTestKitchenIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - partner_id = self._search_regex( - r'src=["\'](?:https?:)?//(?:[^/]+\.)kaltura\.com/(?:[^/]+/)*(?:p|partner_id)/(\d+)', - webpage, 'kaltura partner id') - video_data = self._parse_json( self._search_regex( r'window\.__INITIAL_STATE__\s*=\s*({.+?})\s*;\s*</script>', @@ -58,7 +54,18 @@ class AmericasTestKitchenIE(InfoExtractor): (lambda x: x['episodeDetail']['content']['data'], lambda x: x['videoDetail']['content']['data']), dict) ep_meta = ep_data.get('full_video', {}) - external_id = ep_data.get('external_id') or ep_meta['external_id'] + + zype_id = ep_meta.get('zype_id') + if zype_id: + embed_url = 'https://player.zype.com/embed/%s.js?api_key=jZ9GUhRmxcPvX7M3SlfejB6Hle9jyHTdk2jVxG7wOHPLODgncEKVdPYBhuz9iWXQ' % zype_id + ie_key = 'Zype' + else: + partner_id = self._search_regex( + r'src=["\'](?:https?:)?//(?:[^/]+\.)kaltura\.com/(?:[^/]+/)*(?:p|partner_id)/(\d+)', + webpage, 'kaltura partner id') + external_id = ep_data.get('external_id') or ep_meta['external_id'] + embed_url = 'kaltura:%s:%s' % (partner_id, external_id) + ie_key = 'Kaltura' title = ep_data.get('title') or ep_meta.get('title') description = clean_html(ep_meta.get('episode_description') or ep_data.get( @@ -72,8 +79,8 @@ class AmericasTestKitchenIE(InfoExtractor): return { '_type': 'url_transparent', - 'url': 'kaltura:%s:%s' % (partner_id, external_id), - 'ie_key': 'Kaltura', + 'url': embed_url, + 'ie_key': ie_key, 'title': title, 'description': description, 'thumbnail': thumbnail, diff --git a/youtube_dl/extractor/amp.py b/youtube_dl/extractor/amp.py index fde1a8ff7..7ff098cfa 100644 --- a/youtube_dl/extractor/amp.py +++ b/youtube_dl/extractor/amp.py @@ -3,11 +3,12 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( - int_or_none, - parse_iso8601, - mimetype2ext, determine_ext, ExtractorError, + int_or_none, + mimetype2ext, + parse_iso8601, + url_or_none, ) @@ -35,7 +36,7 @@ class AMPIE(InfoExtractor): media_thumbnail = [media_thumbnail] for thumbnail_data in media_thumbnail: thumbnail = thumbnail_data.get('@attributes', {}) - thumbnail_url = thumbnail.get('url') + thumbnail_url = url_or_none(thumbnail.get('url')) if not thumbnail_url: continue thumbnails.append({ @@ -51,7 +52,7 @@ class AMPIE(InfoExtractor): media_subtitle = [media_subtitle] for subtitle_data in media_subtitle: subtitle = subtitle_data.get('@attributes', {}) - subtitle_href = subtitle.get('href') + subtitle_href = url_or_none(subtitle.get('href')) if not subtitle_href: continue subtitles.setdefault(subtitle.get('lang') or 'en', []).append({ @@ -65,7 +66,7 @@ class AMPIE(InfoExtractor): media_content = [media_content] for media_data in media_content: media = media_data.get('@attributes', {}) - media_url = media.get('url') + media_url = url_or_none(media.get('url')) if not media_url: continue ext = mimetype2ext(media.get('type')) or determine_ext(media_url) @@ -79,7 +80,7 @@ class AMPIE(InfoExtractor): else: formats.append({ 'format_id': media_data.get('media-category', {}).get('@attributes', {}).get('label'), - 'url': media['url'], + 'url': media_url, 'tbr': int_or_none(media.get('bitrate')), 'filesize': int_or_none(media.get('fileSize')), 'ext': ext, diff --git a/youtube_dl/extractor/animeondemand.py b/youtube_dl/extractor/animeondemand.py index 1fe5d5e56..00ce684d1 100644 --- a/youtube_dl/extractor/animeondemand.py +++ b/youtube_dl/extractor/animeondemand.py @@ -8,6 +8,7 @@ from ..utils import ( determine_ext, extract_attributes, ExtractorError, + url_or_none, urlencode_postdata, urljoin, ) @@ -165,7 +166,7 @@ class AnimeOnDemandIE(InfoExtractor): }, fatal=False) if not playlist: continue - stream_url = playlist.get('streamurl') + stream_url = url_or_none(playlist.get('streamurl')) if stream_url: rtmp = re.search( r'^(?P<url>rtmpe?://(?P<host>[^/]+)/(?P<app>.+/))(?P<playpath>mp[34]:.+)', diff --git a/youtube_dl/extractor/anvato.py b/youtube_dl/extractor/anvato.py index f6a78eb5d..84e841035 100644 --- a/youtube_dl/extractor/anvato.py +++ b/youtube_dl/extractor/anvato.py @@ -134,9 +134,33 @@ class AnvatoIE(InfoExtractor): 'telemundo': 'anvato_mcp_telemundo_web_prod_c5278d51ad46fda4b6ca3d0ea44a7846a054f582' } + _API_KEY = '3hwbSuqqT690uxjNYBktSQpa5ZrpYYR0Iofx7NcJHyA' + _ANVP_RE = r'<script[^>]+\bdata-anvp\s*=\s*(["\'])(?P<anvp>(?:(?!\1).)+)\1' _AUTH_KEY = b'\x31\xc2\x42\x84\x9e\x73\xa0\xce' + _TESTS = [{ + # from https://www.boston25news.com/news/watch-humpback-whale-breaches-right-next-to-fishing-boat-near-nh/817484874 + 'url': 'anvato:8v9BEynrwx8EFLYpgfOWcG1qJqyXKlRM:4465496', + 'info_dict': { + 'id': '4465496', + 'ext': 'mp4', + 'title': 'VIDEO: Humpback whale breaches right next to NH boat', + 'description': 'VIDEO: Humpback whale breaches right next to NH boat. Footage courtesy: Zach Fahey.', + 'duration': 22, + 'timestamp': 1534855680, + 'upload_date': '20180821', + 'uploader': 'ANV', + }, + 'params': { + 'skip_download': True, + }, + }, { + # from https://sanfrancisco.cbslocal.com/2016/06/17/source-oakland-cop-on-leave-for-having-girlfriend-help-with-police-reports/ + 'url': 'anvato:DVzl9QRzox3ZZsP9bNu5Li3X7obQOnqP:3417601', + 'only_matching': True, + }] + def __init__(self, *args, **kwargs): super(AnvatoIE, self).__init__(*args, **kwargs) self.__server_time = None @@ -169,7 +193,8 @@ class AnvatoIE(InfoExtractor): 'api': { 'anvrid': anvrid, 'anvstk': md5_text('%s|%s|%d|%s' % ( - access_key, anvrid, server_time, self._ANVACK_TABLE[access_key])), + access_key, anvrid, server_time, + self._ANVACK_TABLE.get(access_key, self._API_KEY))), 'anvts': server_time, }, } @@ -284,5 +309,6 @@ class AnvatoIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) access_key, video_id = mobj.group('access_key_or_mcp', 'id') if access_key not in self._ANVACK_TABLE: - access_key = self._MCP_TO_ACCESS_KEY_TABLE[access_key] + access_key = self._MCP_TO_ACCESS_KEY_TABLE.get( + access_key) or access_key return self._get_anvato_videos(access_key, video_id) diff --git a/youtube_dl/extractor/aol.py b/youtube_dl/extractor/aol.py index b50f454ee..cb9279193 100644 --- a/youtube_dl/extractor/aol.py +++ b/youtube_dl/extractor/aol.py @@ -7,6 +7,7 @@ from .common import InfoExtractor from ..utils import ( ExtractorError, int_or_none, + url_or_none, ) @@ -77,7 +78,7 @@ class AolIE(InfoExtractor): formats.extend(self._extract_m3u8_formats( m3u8_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) for rendition in video_data.get('renditions', []): - video_url = rendition.get('url') + video_url = url_or_none(rendition.get('url')) if not video_url: continue ext = rendition.get('format') diff --git a/youtube_dl/extractor/apa.py b/youtube_dl/extractor/apa.py index a30a935aa..98ccdaa4a 100644 --- a/youtube_dl/extractor/apa.py +++ b/youtube_dl/extractor/apa.py @@ -4,10 +4,10 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( determine_ext, js_to_json, + url_or_none, ) @@ -68,8 +68,8 @@ class APAIE(InfoExtractor): for source in sources: if not isinstance(source, dict): continue - source_url = source.get('file') - if not source_url or not isinstance(source_url, compat_str): + source_url = url_or_none(source.get('file')) + if not source_url: continue ext = determine_ext(source_url) if ext == 'm3u8': diff --git a/youtube_dl/extractor/aparat.py b/youtube_dl/extractor/aparat.py index e394cb661..883dcee7a 100644 --- a/youtube_dl/extractor/aparat.py +++ b/youtube_dl/extractor/aparat.py @@ -4,66 +4,92 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( int_or_none, + merge_dicts, mimetype2ext, + url_or_none, ) class AparatIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?aparat\.com/(?:v/|video/video/embed/videohash/)(?P<id>[a-zA-Z0-9]+)' - _TEST = { + _TESTS = [{ 'url': 'http://www.aparat.com/v/wP8On', 'md5': '131aca2e14fe7c4dcb3c4877ba300c89', 'info_dict': { 'id': 'wP8On', 'ext': 'mp4', 'title': 'تیم گلکسی 11 - زومیت', - 'age_limit': 0, + 'description': 'md5:096bdabcdcc4569f2b8a5e903a3b3028', + 'duration': 231, + 'timestamp': 1387394859, + 'upload_date': '20131218', + 'view_count': int, }, - # 'skip': 'Extremely unreliable', - } + }, { + # multiple formats + 'url': 'https://www.aparat.com/v/8dflw/', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) - # Note: There is an easier-to-parse configuration at - # http://www.aparat.com/video/video/config/videohash/%video_id - # but the URL in there does not work - webpage = self._download_webpage( - 'http://www.aparat.com/video/video/embed/vt/frame/showvideo/yes/videohash/' + video_id, - video_id) + # Provides more metadata + webpage = self._download_webpage(url, video_id, fatal=False) - title = self._search_regex(r'\s+title:\s*"([^"]+)"', webpage, 'title') + if not webpage: + # Note: There is an easier-to-parse configuration at + # http://www.aparat.com/video/video/config/videohash/%video_id + # but the URL in there does not work + webpage = self._download_webpage( + 'http://www.aparat.com/video/video/embed/vt/frame/showvideo/yes/videohash/' + video_id, + video_id) - file_list = self._parse_json( + options = self._parse_json( self._search_regex( - r'fileList\s*=\s*JSON\.parse\(\'([^\']+)\'\)', webpage, - 'file list'), + r'options\s*=\s*JSON\.parse\(\s*(["\'])(?P<value>(?:(?!\1).)+)\1\s*\)', + webpage, 'options', group='value'), video_id) + player = options['plugins']['sabaPlayerPlugin'] + formats = [] - for item in file_list[0]: - file_url = item.get('file') - if not file_url: - continue - ext = mimetype2ext(item.get('type')) - label = item.get('label') - formats.append({ - 'url': file_url, - 'ext': ext, - 'format_id': label or ext, - 'height': int_or_none(self._search_regex( - r'(\d+)[pP]', label or '', 'height', default=None)), - }) - self._sort_formats(formats) + for sources in player['multiSRC']: + for item in sources: + if not isinstance(item, dict): + continue + file_url = url_or_none(item.get('src')) + if not file_url: + continue + item_type = item.get('type') + if item_type == 'application/vnd.apple.mpegurl': + formats.extend(self._extract_m3u8_formats( + file_url, video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls', + fatal=False)) + else: + ext = mimetype2ext(item.get('type')) + label = item.get('label') + formats.append({ + 'url': file_url, + 'ext': ext, + 'format_id': 'http-%s' % (label or ext), + 'height': int_or_none(self._search_regex( + r'(\d+)[pP]', label or '', 'height', + default=None)), + }) + self._sort_formats( + formats, field_preference=('height', 'width', 'tbr', 'format_id')) - thumbnail = self._search_regex( - r'image:\s*"([^"]+)"', webpage, 'thumbnail', fatal=False) + info = self._search_json_ld(webpage, video_id, default={}) - return { + if not info.get('title'): + info['title'] = player['title'] + + return merge_dicts(info, { 'id': video_id, - 'title': title, - 'thumbnail': thumbnail, - 'age_limit': self._family_friendly_search(webpage), + 'thumbnail': url_or_none(options.get('poster')), + 'duration': int_or_none(player.get('duration')), 'formats': formats, - } + }) diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index 86951d975..8adae4644 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -5,23 +5,26 @@ import re from .common import InfoExtractor from .generic import GenericIE -from ..compat import compat_str from ..utils import ( determine_ext, ExtractorError, - qualities, int_or_none, parse_duration, + qualities, + str_or_none, + try_get, unified_strdate, - xpath_text, + unified_timestamp, update_url_query, + url_or_none, + xpath_text, ) from ..compat import compat_etree_fromstring class ARDMediathekIE(InfoExtractor): IE_NAME = 'ARD:mediathek' - _VALID_URL = r'^https?://(?:(?:www\.)?ardmediathek\.de|mediathek\.(?:daserste|rbb-online)\.de)/(?:.*/)(?P<video_id>[0-9]+|[^0-9][^/\?]+)[^/\?]*(?:\?.*)?' + _VALID_URL = r'^https?://(?:(?:(?:www|classic)\.)?ardmediathek\.de|mediathek\.(?:daserste|rbb-online)\.de|one\.ard\.de)/(?:.*/)(?P<video_id>[0-9]+|[^0-9][^/\?]+)[^/\?]*(?:\?.*)?' _TESTS = [{ # available till 26.07.2022 @@ -37,6 +40,9 @@ class ARDMediathekIE(InfoExtractor): # m3u8 download 'skip_download': True, } + }, { + 'url': 'https://one.ard.de/tv/Mord-mit-Aussicht/Mord-mit-Aussicht-6-39-T%C3%B6dliche-Nach/ONE/Video?bcastId=46384294&documentId=55586872', + 'only_matching': True, }, { # audio 'url': 'http://www.ardmediathek.de/tv/WDR-H%C3%B6rspiel-Speicher/Tod-eines-Fu%C3%9Fballers/WDR-3/Audio-Podcast?documentId=28488308&bcastId=23074086', @@ -48,8 +54,15 @@ class ARDMediathekIE(InfoExtractor): # audio 'url': 'http://mediathek.rbb-online.de/radio/Hörspiel/Vor-dem-Fest/kulturradio/Audio?documentId=30796318&topRessort=radio&bcastId=9839158', 'only_matching': True, + }, { + 'url': 'https://classic.ardmediathek.de/tv/Panda-Gorilla-Co/Panda-Gorilla-Co-Folge-274/Das-Erste/Video?bcastId=16355486&documentId=58234698', + 'only_matching': True, }] + @classmethod + def suitable(cls, url): + return False if ARDBetaMediathekIE.suitable(url) else super(ARDMediathekIE, cls).suitable(url) + def _extract_media_info(self, media_info_url, webpage, video_id): media_info = self._download_json( media_info_url, video_id, 'Downloading media JSON') @@ -100,7 +113,7 @@ class ARDMediathekIE(InfoExtractor): quality = stream.get('_quality') server = stream.get('_server') for stream_url in stream_urls: - if not isinstance(stream_url, compat_str) or '//' not in stream_url: + if not url_or_none(stream_url): continue ext = determine_ext(stream_url) if quality != 'auto' and ext in ('f4m', 'm3u8'): @@ -170,13 +183,18 @@ class ARDMediathekIE(InfoExtractor): title = self._html_search_regex( [r'<h1(?:\s+class="boxTopHeadline")?>(.*?)</h1>', r'<meta name="dcterms\.title" content="(.*?)"/>', - r'<h4 class="headline">(.*?)</h4>'], + r'<h4 class="headline">(.*?)</h4>', + r'<title[^>]*>(.*?)'], webpage, 'title') description = self._html_search_meta( 'dcterms.abstract', webpage, 'description', default=None) if description is None: description = self._html_search_meta( - 'description', webpage, 'meta description') + 'description', webpage, 'meta description', default=None) + if description is None: + description = self._html_search_regex( + r'(.+?)

', + webpage, 'teaser text', default=None) # Thumbnail is sometimes not present. # It is in the mobile version, but that seems to use a different URL @@ -282,3 +300,101 @@ class ARDIE(InfoExtractor): 'upload_date': upload_date, 'thumbnail': thumbnail, } + + +class ARDBetaMediathekIE(InfoExtractor): + _VALID_URL = r'https://(?:beta|www)\.ardmediathek\.de/[^/]+/(?:player|live)/(?P[a-zA-Z0-9]+)(?:/(?P[^/?#]+))?' + _TESTS = [{ + 'url': 'https://beta.ardmediathek.de/ard/player/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE/die-robuste-roswita', + 'md5': '2d02d996156ea3c397cfc5036b5d7f8f', + 'info_dict': { + 'display_id': 'die-robuste-roswita', + 'id': 'Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE', + 'title': 'Tatort: Die robuste Roswita', + 'description': r're:^Der Mord.*trüber ist als die Ilm.', + 'duration': 5316, + 'thumbnail': 'https://img.ardmediathek.de/standard/00/55/43/59/34/-1774185891/16x9/960?mandant=ard', + 'upload_date': '20180826', + 'ext': 'mp4', + }, + }, { + 'url': 'https://www.ardmediathek.de/ard/player/Y3JpZDovL3N3ci5kZS9hZXgvbzEwNzE5MTU/', + 'only_matching': True, + }, { + 'url': 'https://www.ardmediathek.de/swr/live/Y3JpZDovL3N3ci5kZS8xMzQ4MTA0Mg', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('video_id') + display_id = mobj.group('display_id') or video_id + + webpage = self._download_webpage(url, display_id) + data_json = self._search_regex(r'window\.__APOLLO_STATE__\s*=\s*(\{.*);\n', webpage, 'json') + data = self._parse_json(data_json, display_id) + + res = { + 'id': video_id, + 'display_id': display_id, + } + formats = [] + subtitles = {} + geoblocked = False + for widget in data.values(): + if widget.get('_geoblocked') is True: + geoblocked = True + if '_duration' in widget: + res['duration'] = int_or_none(widget['_duration']) + if 'clipTitle' in widget: + res['title'] = widget['clipTitle'] + if '_previewImage' in widget: + res['thumbnail'] = widget['_previewImage'] + if 'broadcastedOn' in widget: + res['timestamp'] = unified_timestamp(widget['broadcastedOn']) + if 'synopsis' in widget: + res['description'] = widget['synopsis'] + subtitle_url = url_or_none(widget.get('_subtitleUrl')) + if subtitle_url: + subtitles.setdefault('de', []).append({ + 'ext': 'ttml', + 'url': subtitle_url, + }) + if '_quality' in widget: + format_url = url_or_none(try_get( + widget, lambda x: x['_stream']['json'][0])) + if not format_url: + continue + ext = determine_ext(format_url) + if ext == 'f4m': + formats.extend(self._extract_f4m_formats( + format_url + '?hdcore=3.11.0', + video_id, f4m_id='hds', fatal=False)) + elif ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', m3u8_id='hls', + fatal=False)) + else: + # HTTP formats are not available when geoblocked is True, + # other formats are fine though + if geoblocked: + continue + quality = str_or_none(widget.get('_quality')) + formats.append({ + 'format_id': ('http-' + quality) if quality else 'http', + 'url': format_url, + 'preference': 10, # Plain HTTP, that's nice + }) + + if not formats and geoblocked: + self.raise_geo_restricted( + msg='This video is not available due to geoblocking', + countries=['DE']) + + self._sort_formats(formats) + res.update({ + 'subtitles': subtitles, + 'formats': formats, + }) + + return res diff --git a/youtube_dl/extractor/asiancrush.py b/youtube_dl/extractor/asiancrush.py index 594c88c9c..6d71c5ad5 100644 --- a/youtube_dl/extractor/asiancrush.py +++ b/youtube_dl/extractor/asiancrush.py @@ -8,7 +8,6 @@ from .kaltura import KalturaIE from ..utils import ( extract_attributes, remove_end, - urlencode_postdata, ) @@ -34,19 +33,40 @@ class AsianCrushIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - data = self._download_json( - 'https://www.asiancrush.com/wp-admin/admin-ajax.php', video_id, - data=urlencode_postdata({ - 'postid': video_id, - 'action': 'get_channel_kaltura_vars', - })) + webpage = self._download_webpage(url, video_id) - entry_id = data['entry_id'] + entry_id, partner_id, title = [None] * 3 + + vars = self._parse_json( + self._search_regex( + r'iEmbedVars\s*=\s*({.+?})', webpage, 'embed vars', + default='{}'), video_id, fatal=False) + if vars: + entry_id = vars.get('entry_id') + partner_id = vars.get('partner_id') + title = vars.get('vid_label') + + if not entry_id: + entry_id = self._search_regex( + r'\bentry_id["\']\s*:\s*["\'](\d+)', webpage, 'entry id') + + player = self._download_webpage( + 'https://api.asiancrush.com/embeddedVideoPlayer', video_id, + query={'id': entry_id}) + + kaltura_id = self._search_regex( + r'entry_id["\']\s*:\s*(["\'])(?P(?:(?!\1).)+)\1', player, + 'kaltura id', group='id') + + if not partner_id: + partner_id = self._search_regex( + r'/p(?:artner_id)?/(\d+)', player, 'partner id', + default='513551') return self.url_result( - 'kaltura:%s:%s' % (data['partner_id'], entry_id), - ie=KalturaIE.ie_key(), video_id=entry_id, - video_title=data.get('vid_label')) + 'kaltura:%s:%s' % (partner_id, kaltura_id), + ie=KalturaIE.ie_key(), video_id=kaltura_id, + video_title=title) class AsianCrushPlaylistIE(InfoExtractor): diff --git a/youtube_dl/extractor/atvat.py b/youtube_dl/extractor/atvat.py index 1584d53fc..95e572d70 100644 --- a/youtube_dl/extractor/atvat.py +++ b/youtube_dl/extractor/atvat.py @@ -28,8 +28,10 @@ class ATVAtIE(InfoExtractor): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) video_data = self._parse_json(unescapeHTML(self._search_regex( - r'class="[^"]*jsb_video/FlashPlayer[^"]*"[^>]+data-jsb="([^"]+)"', - webpage, 'player data')), display_id)['config']['initial_video'] + [r'flashPlayerOptions\s*=\s*(["\'])(?P(?:(?!\1).)+)\1', + r'class="[^"]*jsb_video/FlashPlayer[^"]*"[^>]+data-jsb="(?P[^"]+)"'], + webpage, 'player data', group='json')), + display_id)['config']['initial_video'] video_id = video_data['id'] video_title = video_data['title'] diff --git a/youtube_dl/extractor/audiomack.py b/youtube_dl/extractor/audiomack.py index 62049b921..cc7771354 100644 --- a/youtube_dl/extractor/audiomack.py +++ b/youtube_dl/extractor/audiomack.py @@ -62,7 +62,7 @@ class AudiomackIE(InfoExtractor): # Audiomack wraps a lot of soundcloud tracks in their branded wrapper # if so, pass the work off to the soundcloud extractor if SoundcloudIE.suitable(api_response['url']): - return {'_type': 'url', 'url': api_response['url'], 'ie_key': 'Soundcloud'} + return self.url_result(api_response['url'], SoundcloudIE.ie_key()) return { 'id': compat_str(api_response.get('id', album_url_tag)), diff --git a/youtube_dl/extractor/azmedien.py b/youtube_dl/extractor/azmedien.py index 68f26e2ca..fcbdc71b9 100644 --- a/youtube_dl/extractor/azmedien.py +++ b/youtube_dl/extractor/azmedien.py @@ -1,213 +1,86 @@ # coding: utf-8 from __future__ import unicode_literals +import json import re from .common import InfoExtractor from .kaltura import KalturaIE -from ..utils import ( - get_element_by_class, - get_element_by_id, - strip_or_none, - urljoin, -) -class AZMedienBaseIE(InfoExtractor): - def _kaltura_video(self, partner_id, entry_id): - return self.url_result( - 'kaltura:%s:%s' % (partner_id, entry_id), ie=KalturaIE.ie_key(), - video_id=entry_id) - - -class AZMedienIE(AZMedienBaseIE): +class AZMedienIE(InfoExtractor): IE_DESC = 'AZ Medien videos' _VALID_URL = r'''(?x) https?:// (?:www\.)? - (?: + (?P telezueri\.ch| telebaern\.tv| telem1\.ch )/ - [0-9]+-show-[^/\#]+ - (?: - /[0-9]+-episode-[^/\#]+ - (?: - /[0-9]+-segment-(?:[^/\#]+\#)?| - \# - )| - \# + [^/]+/ + (?P + [^/]+-(?P\d+) ) - (?P[^\#]+) + (?: + \#video= + (?P + [_0-9a-z]+ + ) + )? ''' _TESTS = [{ - # URL with 'segment' - 'url': 'http://www.telezueri.ch/62-show-zuerinews/13772-episode-sonntag-18-dezember-2016/32419-segment-massenabweisungen-beim-hiltl-club-wegen-pelzboom', + 'url': 'https://www.telezueri.ch/sonntalk/bundesrats-vakanzen-eu-rahmenabkommen-133214569', 'info_dict': { - 'id': '1_2444peh4', + 'id': '1_anruz3wy', 'ext': 'mp4', - 'title': 'Massenabweisungen beim Hiltl Club wegen Pelzboom', - 'description': 'md5:9ea9dd1b159ad65b36ddcf7f0d7c76a8', - 'uploader_id': 'TeleZ?ri', - 'upload_date': '20161218', - 'timestamp': 1482084490, + 'title': 'Bundesrats-Vakanzen / EU-Rahmenabkommen', + 'uploader_id': 'TVOnline', + 'upload_date': '20180930', + 'timestamp': 1538328802, }, 'params': { 'skip_download': True, }, }, { - # URL with 'segment' and fragment: - 'url': 'http://www.telebaern.tv/118-show-news/14240-episode-dienstag-17-januar-2017/33666-segment-achtung-gefahr#zu-wenig-pflegerinnen-und-pfleger', - 'only_matching': True - }, { - # URL with 'episode' and fragment: - 'url': 'http://www.telem1.ch/47-show-sonntalk/13986-episode-soldaten-fuer-grenzschutz-energiestrategie-obama-bilanz#soldaten-fuer-grenzschutz-energiestrategie-obama-bilanz', - 'only_matching': True - }, { - # URL with 'show' and fragment: - 'url': 'http://www.telezueri.ch/66-show-sonntalk#burka-plakate-trump-putin-china-besuch', + 'url': 'https://www.telebaern.tv/telebaern-news/montag-1-oktober-2018-ganze-sendung-133531189#video=0_7xjo9lf1', 'only_matching': True }] - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - partner_id = self._search_regex( - r']+src=["\'](?:https?:)?//(?:[^/]+\.)?kaltura\.com(?:/[^/]+)*/(?:p|partner_id)/([0-9]+)', - webpage, 'kaltura partner id') - entry_id = self._html_search_regex( - r']+data-id=(["\'])(?P(?:(?!\1).)+)\1[^>]+data-slug=["\']%s' - % re.escape(video_id), webpage, 'kaltura entry id', group='id') - - return self._kaltura_video(partner_id, entry_id) - - -class AZMedienPlaylistIE(AZMedienBaseIE): - IE_DESC = 'AZ Medien playlists' - _VALID_URL = r'''(?x) - https?:// - (?:www\.)? - (?: - telezueri\.ch| - telebaern\.tv| - telem1\.ch - )/ - (?P[0-9]+- - (?: - show| - topic| - themen - )-[^/\#]+ - (?: - /[0-9]+-episode-[^/\#]+ - )? - )$ - ''' - - _TESTS = [{ - # URL with 'episode' - 'url': 'http://www.telebaern.tv/118-show-news/13735-episode-donnerstag-15-dezember-2016', - 'info_dict': { - 'id': '118-show-news/13735-episode-donnerstag-15-dezember-2016', - 'title': 'News - Donnerstag, 15. Dezember 2016', - }, - 'playlist_count': 9, - }, { - # URL with 'themen' - 'url': 'http://www.telem1.ch/258-themen-tele-m1-classics', - 'info_dict': { - 'id': '258-themen-tele-m1-classics', - 'title': 'Tele M1 Classics', - }, - 'playlist_mincount': 15, - }, { - # URL with 'topic', contains nested playlists - 'url': 'http://www.telezueri.ch/219-topic-aera-trump-hat-offiziell-begonnen', - 'only_matching': True, - }, { - # URL with 'show' only - 'url': 'http://www.telezueri.ch/86-show-talktaeglich', - 'only_matching': True - }] + _PARTNER_ID = '1719221' def _real_extract(self, url): - show_id = self._match_id(url) - webpage = self._download_webpage(url, show_id) + mobj = re.match(self._VALID_URL, url) + host = mobj.group('host') + video_id = mobj.group('id') + entry_id = mobj.group('kaltura_id') - entries = [] + if not entry_id: + api_url = 'https://www.%s/api/pub/gql/%s' % (host, host.split('.')[0]) + payload = { + 'query': '''query VideoContext($articleId: ID!) { + article: node(id: $articleId) { + ... on Article { + mainAssetRelation { + asset { + ... on VideoAsset { + kalturaId + } + } + } + } + } + }''', + 'variables': {'articleId': 'Article:%s' % mobj.group('article_id')}, + } + json_data = self._download_json( + api_url, video_id, headers={ + 'Content-Type': 'application/json', + }, + data=json.dumps(payload).encode()) + entry_id = json_data['data']['article']['mainAssetRelation']['asset']['kalturaId'] - partner_id = self._search_regex( - r'src=["\'](?:https?:)?//(?:[^/]+\.)kaltura\.com/(?:[^/]+/)*(?:p|partner_id)/(\d+)', - webpage, 'kaltura partner id', default=None) - - if partner_id: - entries = [ - self._kaltura_video(partner_id, m.group('id')) - for m in re.finditer( - r'data-id=(["\'])(?P(?:(?!\1).)+)\1', webpage)] - - if not entries: - entries = [ - self.url_result(m.group('url'), ie=AZMedienIE.ie_key()) - for m in re.finditer( - r']+data-real=(["\'])(?Phttp.+?)\1', webpage)] - - if not entries: - entries = [ - # May contain nested playlists (e.g. [1]) thus no explicit - # ie_key - # 1. http://www.telezueri.ch/219-topic-aera-trump-hat-offiziell-begonnen) - self.url_result(urljoin(url, m.group('url'))) - for m in re.finditer( - r']+name=[^>]+href=(["\'])(?P/.+?)\1', webpage)] - - title = self._search_regex( - r'episodeShareTitle\s*=\s*(["\'])(?P(?:(?!\1).)+)\1', - webpage, 'title', - default=strip_or_none(get_element_by_id( - 'video-title', webpage)), group='title') - - return self.playlist_result(entries, show_id, title) - - -class AZMedienShowPlaylistIE(AZMedienBaseIE): - IE_DESC = 'AZ Medien show playlists' - _VALID_URL = r'''(?x) - https?:// - (?:www\.)? - (?: - telezueri\.ch| - telebaern\.tv| - telem1\.ch - )/ - (?: - all-episodes| - alle-episoden - )/ - (?P<id>[^/?#&]+) - ''' - - _TEST = { - 'url': 'http://www.telezueri.ch/all-episodes/astrotalk', - 'info_dict': { - 'id': 'astrotalk', - 'title': 'TeleZüri: AstroTalk - alle episoden', - 'description': 'md5:4c0f7e7d741d906004266e295ceb4a26', - }, - 'playlist_mincount': 13, - } - - def _real_extract(self, url): - playlist_id = self._match_id(url) - webpage = self._download_webpage(url, playlist_id) - episodes = get_element_by_class('search-mobile-box', webpage) - entries = [self.url_result( - urljoin(url, m.group('url'))) for m in re.finditer( - r'<a[^>]+href=(["\'])(?P<url>(?:(?!\1).)+)\1', episodes)] - title = self._og_search_title(webpage, fatal=False) - description = self._og_search_description(webpage) - return self.playlist_result(entries, playlist_id, title, description) + return self.url_result( + 'kaltura:%s:%s' % (self._PARTNER_ID, entry_id), + ie=KalturaIE.ie_key(), video_id=entry_id) diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py index be41bd5a2..f14b407dc 100644 --- a/youtube_dl/extractor/bandcamp.py +++ b/youtube_dl/extractor/bandcamp.py @@ -1,6 +1,5 @@ from __future__ import unicode_literals -import json import random import re import time @@ -16,14 +15,18 @@ from ..utils import ( int_or_none, KNOWN_EXTENSIONS, parse_filesize, + str_or_none, + try_get, unescapeHTML, update_url_query, unified_strdate, + unified_timestamp, + url_or_none, ) class BandcampIE(InfoExtractor): - _VALID_URL = r'https?://.*?\.bandcamp\.com/track/(?P<title>[^/?#&]+)' + _VALID_URL = r'https?://[^/]+\.bandcamp\.com/track/(?P<title>[^/?#&]+)' _TESTS = [{ 'url': 'http://youtube-dl.bandcamp.com/track/youtube-dl-test-song', 'md5': 'c557841d5e50261777a6585648adf439', @@ -35,13 +38,44 @@ class BandcampIE(InfoExtractor): }, '_skip': 'There is a limit of 200 free downloads / month for the test song' }, { + # free download 'url': 'http://benprunty.bandcamp.com/track/lanius-battle', - 'md5': '0369ace6b939f0927e62c67a1a8d9fa7', + 'md5': '853e35bf34aa1d6fe2615ae612564b36', 'info_dict': { 'id': '2650410135', 'ext': 'aiff', 'title': 'Ben Prunty - Lanius (Battle)', + 'thumbnail': r're:^https?://.*\.jpg$', 'uploader': 'Ben Prunty', + 'timestamp': 1396508491, + 'upload_date': '20140403', + 'release_date': '20140403', + 'duration': 260.877, + 'track': 'Lanius (Battle)', + 'track_number': 1, + 'track_id': '2650410135', + 'artist': 'Ben Prunty', + 'album': 'FTL: Advanced Edition Soundtrack', + }, + }, { + # no free download, mp3 128 + 'url': 'https://relapsealumni.bandcamp.com/track/hail-to-fire', + 'md5': 'fec12ff55e804bb7f7ebeb77a800c8b7', + 'info_dict': { + 'id': '2584466013', + 'ext': 'mp3', + 'title': 'Mastodon - Hail to Fire', + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': 'Mastodon', + 'timestamp': 1322005399, + 'upload_date': '20111122', + 'release_date': '20040207', + 'duration': 120.79, + 'track': 'Hail to Fire', + 'track_number': 5, + 'track_id': '2584466013', + 'artist': 'Mastodon', + 'album': 'Call of the Mastodon', }, }] @@ -50,19 +84,23 @@ class BandcampIE(InfoExtractor): title = mobj.group('title') webpage = self._download_webpage(url, title) thumbnail = self._html_search_meta('og:image', webpage, default=None) - m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage) - if not m_download: - m_trackinfo = re.search(r'trackinfo: (.+),\s*?\n', webpage) - if m_trackinfo: - json_code = m_trackinfo.group(1) - data = json.loads(json_code)[0] - track_id = compat_str(data['id']) - if not data.get('file'): - raise ExtractorError('Not streamable', video_id=track_id, expected=True) + track_id = None + track = None + track_number = None + duration = None - formats = [] - for format_id, format_url in data['file'].items(): + formats = [] + track_info = self._parse_json( + self._search_regex( + r'trackinfo\s*:\s*\[\s*({.+?})\s*\]\s*,\s*?\n', + webpage, 'track info', default='{}'), title) + if track_info: + file_ = track_info.get('file') + if isinstance(file_, dict): + for format_id, format_url in file_.items(): + if not url_or_none(format_url): + continue ext, abr_str = format_id.split('-', 1) formats.append({ 'format_id': format_id, @@ -72,85 +110,110 @@ class BandcampIE(InfoExtractor): 'acodec': ext, 'abr': int_or_none(abr_str), }) + track = track_info.get('title') + track_id = str_or_none(track_info.get('track_id') or track_info.get('id')) + track_number = int_or_none(track_info.get('track_num')) + duration = float_or_none(track_info.get('duration')) - self._sort_formats(formats) + def extract(key): + return self._search_regex( + r'\b%s\s*["\']?\s*:\s*(["\'])(?P<value>(?:(?!\1).)+)\1' % key, + webpage, key, default=None, group='value') - return { - 'id': track_id, - 'title': data['title'], - 'thumbnail': thumbnail, - 'formats': formats, - 'duration': float_or_none(data.get('duration')), - } - else: - raise ExtractorError('No free songs found') + artist = extract('artist') + album = extract('album_title') + timestamp = unified_timestamp( + extract('publish_date') or extract('album_publish_date')) + release_date = unified_strdate(extract('album_release_date')) - download_link = m_download.group(1) - video_id = self._search_regex( - r'(?ms)var TralbumData = .*?[{,]\s*id: (?P<id>\d+),?$', - webpage, 'video id') + download_link = self._search_regex( + r'freeDownloadPage\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, + 'download link', default=None, group='url') + if download_link: + track_id = self._search_regex( + r'(?ms)var TralbumData = .*?[{,]\s*id: (?P<id>\d+),?$', + webpage, 'track id') - download_webpage = self._download_webpage( - download_link, video_id, 'Downloading free downloads page') + download_webpage = self._download_webpage( + download_link, track_id, 'Downloading free downloads page') - blob = self._parse_json( - self._search_regex( - r'data-blob=(["\'])(?P<blob>{.+?})\1', download_webpage, - 'blob', group='blob'), - video_id, transform_source=unescapeHTML) + blob = self._parse_json( + self._search_regex( + r'data-blob=(["\'])(?P<blob>{.+?})\1', download_webpage, + 'blob', group='blob'), + track_id, transform_source=unescapeHTML) - info = blob['digital_items'][0] + info = try_get( + blob, (lambda x: x['digital_items'][0], + lambda x: x['download_items'][0]), dict) + if info: + downloads = info.get('downloads') + if isinstance(downloads, dict): + if not track: + track = info.get('title') + if not artist: + artist = info.get('artist') + if not thumbnail: + thumbnail = info.get('thumb_url') - downloads = info['downloads'] - track = info['title'] + download_formats = {} + download_formats_list = blob.get('download_formats') + if isinstance(download_formats_list, list): + for f in blob['download_formats']: + name, ext = f.get('name'), f.get('file_extension') + if all(isinstance(x, compat_str) for x in (name, ext)): + download_formats[name] = ext.strip('.') - artist = info.get('artist') - title = '%s - %s' % (artist, track) if artist else track + for format_id, f in downloads.items(): + format_url = f.get('url') + if not format_url: + continue + # Stat URL generation algorithm is reverse engineered from + # download_*_bundle_*.js + stat_url = update_url_query( + format_url.replace('/download/', '/statdownload/'), { + '.rand': int(time.time() * 1000 * random.random()), + }) + format_id = f.get('encoding_name') or format_id + stat = self._download_json( + stat_url, track_id, 'Downloading %s JSON' % format_id, + transform_source=lambda s: s[s.index('{'):s.rindex('}') + 1], + fatal=False) + if not stat: + continue + retry_url = url_or_none(stat.get('retry_url')) + if not retry_url: + continue + formats.append({ + 'url': self._proto_relative_url(retry_url, 'http:'), + 'ext': download_formats.get(format_id), + 'format_id': format_id, + 'format_note': f.get('description'), + 'filesize': parse_filesize(f.get('size_mb')), + 'vcodec': 'none', + }) - download_formats = {} - for f in blob['download_formats']: - name, ext = f.get('name'), f.get('file_extension') - if all(isinstance(x, compat_str) for x in (name, ext)): - download_formats[name] = ext.strip('.') - - formats = [] - for format_id, f in downloads.items(): - format_url = f.get('url') - if not format_url: - continue - # Stat URL generation algorithm is reverse engineered from - # download_*_bundle_*.js - stat_url = update_url_query( - format_url.replace('/download/', '/statdownload/'), { - '.rand': int(time.time() * 1000 * random.random()), - }) - format_id = f.get('encoding_name') or format_id - stat = self._download_json( - stat_url, video_id, 'Downloading %s JSON' % format_id, - transform_source=lambda s: s[s.index('{'):s.rindex('}') + 1], - fatal=False) - if not stat: - continue - retry_url = stat.get('retry_url') - if not isinstance(retry_url, compat_str): - continue - formats.append({ - 'url': self._proto_relative_url(retry_url, 'http:'), - 'ext': download_formats.get(format_id), - 'format_id': format_id, - 'format_note': f.get('description'), - 'filesize': parse_filesize(f.get('size_mb')), - 'vcodec': 'none', - }) self._sort_formats(formats) + title = '%s - %s' % (artist, track) if artist else track + + if not duration: + duration = float_or_none(self._html_search_meta( + 'duration', webpage, default=None)) + return { - 'id': video_id, + 'id': track_id, 'title': title, - 'thumbnail': info.get('thumb_url') or thumbnail, - 'uploader': info.get('artist'), - 'artist': artist, + 'thumbnail': thumbnail, + 'uploader': artist, + 'timestamp': timestamp, + 'release_date': release_date, + 'duration': duration, 'track': track, + 'track_number': track_number, + 'track_id': track_id, + 'artist': artist, + 'album': album, 'formats': formats, } @@ -306,7 +369,7 @@ class BandcampWeeklyIE(InfoExtractor): formats = [] for format_id, format_url in show['audio_stream'].items(): - if not isinstance(format_url, compat_str): + if not url_or_none(format_url): continue for known_ext in KNOWN_EXTENSIONS: if known_ext in format_id: diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 30a63a24e..eac9a5a46 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -21,7 +21,6 @@ from ..utils import ( urljoin, ) from ..compat import ( - compat_etree_fromstring, compat_HTTPError, compat_urlparse, ) @@ -30,7 +29,7 @@ from ..compat import ( class BBCCoUkIE(InfoExtractor): IE_NAME = 'bbc.co.uk' IE_DESC = 'BBC iPlayer' - _ID_REGEX = r'[pbw][\da-z]{7}' + _ID_REGEX = r'(?:[pbm][\da-z]{7}|w[\da-z]{7,14})' _VALID_URL = r'''(?x) https?:// (?:www\.)?bbc\.co\.uk/ @@ -237,6 +236,12 @@ class BBCCoUkIE(InfoExtractor): }, { 'url': 'http://www.bbc.co.uk/programmes/w3csv1y9', 'only_matching': True, + }, { + 'url': 'https://www.bbc.co.uk/programmes/m00005xn', + 'only_matching': True, + }, { + 'url': 'https://www.bbc.co.uk/programmes/w172w4dww1jqt5s', + 'only_matching': True, }] _USP_RE = r'/([^/]+?)\.ism(?:\.hlsv2\.ism)?/[^/]+\.m3u8' @@ -334,14 +339,9 @@ class BBCCoUkIE(InfoExtractor): self._raise_extractor_error(last_exception) def _download_media_selector_url(self, url, programme_id=None): - try: - media_selection = self._download_xml( - url, programme_id, 'Downloading media selection XML') - except ExtractorError as ee: - if isinstance(ee.cause, compat_HTTPError) and ee.cause.code in (403, 404): - media_selection = compat_etree_fromstring(ee.cause.read().decode('utf-8')) - else: - raise + media_selection = self._download_xml( + url, programme_id, 'Downloading media selection XML', + expected_status=(403, 404)) return self._process_media_selector(media_selection, programme_id) def _process_media_selector(self, media_selection, programme_id): @@ -784,6 +784,26 @@ class BBCIE(BBCCoUkIE): 'params': { 'skip_download': True, } + }, { + # window.__PRELOADED_STATE__ + 'url': 'https://www.bbc.co.uk/radio/play/b0b9z4yl', + 'info_dict': { + 'id': 'b0b9z4vz', + 'ext': 'mp4', + 'title': 'Prom 6: An American in Paris and Turangalila', + 'description': 'md5:51cf7d6f5c8553f197e58203bc78dff8', + 'uploader': 'Radio 3', + 'uploader_id': 'bbc_radio_three', + }, + }, { + 'url': 'http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227', + 'info_dict': { + 'id': 'p06w9tws', + 'ext': 'mp4', + 'title': 'md5:2fabf12a726603193a2879a055f72514', + 'description': 'Learn English words and phrases from this story', + }, + 'add_ie': [BBCCoUkIE.ie_key()], }] @classmethod @@ -934,6 +954,15 @@ class BBCIE(BBCCoUkIE): if entries: return self.playlist_result(entries, playlist_id, playlist_title, playlist_description) + # http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227 + group_id = self._search_regex( + r'<div[^>]+\bclass=["\']video["\'][^>]+\bdata-pid=["\'](%s)' % self._ID_REGEX, + webpage, 'group id', default=None) + if playlist_id: + return self.url_result( + 'https://www.bbc.co.uk/programmes/%s' % group_id, + ie=BBCCoUkIE.ie_key()) + # single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret) programme_id = self._search_regex( [r'data-(?:video-player|media)-vpid="(%s)"' % self._ID_REGEX, @@ -1006,6 +1035,36 @@ class BBCIE(BBCCoUkIE): 'subtitles': subtitles, } + preload_state = self._parse_json(self._search_regex( + r'window\.__PRELOADED_STATE__\s*=\s*({.+?});', webpage, + 'preload state', default='{}'), playlist_id, fatal=False) + if preload_state: + current_programme = preload_state.get('programmes', {}).get('current') or {} + programme_id = current_programme.get('id') + if current_programme and programme_id and current_programme.get('type') == 'playable_item': + title = current_programme.get('titles', {}).get('tertiary') or playlist_title + formats, subtitles = self._download_media_selector(programme_id) + self._sort_formats(formats) + synopses = current_programme.get('synopses') or {} + network = current_programme.get('network') or {} + duration = int_or_none( + current_programme.get('duration', {}).get('value')) + thumbnail = None + image_url = current_programme.get('image_url') + if image_url: + thumbnail = image_url.replace('{recipe}', '1920x1920') + return { + 'id': programme_id, + 'title': title, + 'description': dict_get(synopses, ('long', 'medium', 'short')), + 'thumbnail': thumbnail, + 'duration': duration, + 'uploader': network.get('short_title'), + 'uploader_id': network.get('id'), + 'formats': formats, + 'subtitles': subtitles, + } + bbc3_config = self._parse_json( self._search_regex( r'(?s)bbcthreeConfig\s*=\s*({.+?})\s*;\s*<', webpage, diff --git a/youtube_dl/extractor/beeg.py b/youtube_dl/extractor/beeg.py index bf22a41b7..1086d7632 100644 --- a/youtube_dl/extractor/beeg.py +++ b/youtube_dl/extractor/beeg.py @@ -1,15 +1,10 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import ( - compat_chr, - compat_ord, - compat_urllib_parse_unquote, -) +from ..compat import compat_str from ..utils import ( int_or_none, - parse_iso8601, - urljoin, + unified_timestamp, ) @@ -36,29 +31,9 @@ class BeegIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - cpl_url = self._search_regex( - r'<script[^>]+src=(["\'])(?P<url>(?:/static|(?:https?:)?//static\.beeg\.com)/cpl/\d+\.js.*?)\1', - webpage, 'cpl', default=None, group='url') - - cpl_url = urljoin(url, cpl_url) - - beeg_version, beeg_salt = [None] * 2 - - if cpl_url: - cpl = self._download_webpage( - self._proto_relative_url(cpl_url), video_id, - 'Downloading cpl JS', fatal=False) - if cpl: - beeg_version = int_or_none(self._search_regex( - r'beeg_version\s*=\s*([^\b]+)', cpl, - 'beeg version', default=None)) or self._search_regex( - r'/(\d+)\.js', cpl_url, 'beeg version', default=None) - beeg_salt = self._search_regex( - r'beeg_salt\s*=\s*(["\'])(?P<beeg_salt>.+?)\1', cpl, 'beeg salt', - default=None, group='beeg_salt') - - beeg_version = beeg_version or '2185' - beeg_salt = beeg_salt or 'pmweAkq8lAYKdfWcFCUj0yoVgoPlinamH5UE1CB3H' + beeg_version = self._search_regex( + r'beeg_version\s*=\s*([\da-zA-Z_-]+)', webpage, 'beeg version', + default='1546225636701') for api_path in ('', 'api.'): video = self._download_json( @@ -68,37 +43,6 @@ class BeegIE(InfoExtractor): if video: break - def split(o, e): - def cut(s, x): - n.append(s[:x]) - return s[x:] - n = [] - r = len(o) % e - if r > 0: - o = cut(o, r) - while len(o) > e: - o = cut(o, e) - n.append(o) - return n - - def decrypt_key(key): - # Reverse engineered from http://static.beeg.com/cpl/1738.js - a = beeg_salt - e = compat_urllib_parse_unquote(key) - o = ''.join([ - compat_chr(compat_ord(e[n]) - compat_ord(a[n % len(a)]) % 21) - for n in range(len(e))]) - return ''.join(split(o, 3)[::-1]) - - def decrypt_url(encrypted_url): - encrypted_url = self._proto_relative_url( - encrypted_url.replace('{DATA_MARKERS}', ''), 'https:') - key = self._search_regex( - r'/key=(.*?)%2Cend=', encrypted_url, 'key', default=None) - if not key: - return encrypted_url - return encrypted_url.replace(key, decrypt_key(key)) - formats = [] for format_id, video_url in video.items(): if not video_url: @@ -108,18 +52,20 @@ class BeegIE(InfoExtractor): if not height: continue formats.append({ - 'url': decrypt_url(video_url), + 'url': self._proto_relative_url( + video_url.replace('{DATA_MARKERS}', 'data=pc_XX__%s_0' % beeg_version), 'https:'), 'format_id': format_id, 'height': int(height), }) self._sort_formats(formats) title = video['title'] - video_id = video.get('id') or video_id + video_id = compat_str(video.get('id') or video_id) display_id = video.get('code') description = video.get('desc') + series = video.get('ps_name') - timestamp = parse_iso8601(video.get('date'), ' ') + timestamp = unified_timestamp(video.get('date')) duration = int_or_none(video.get('duration')) tags = [tag.strip() for tag in video['tags'].split(',')] if video.get('tags') else None @@ -129,6 +75,7 @@ class BeegIE(InfoExtractor): 'display_id': display_id, 'title': title, 'description': description, + 'series': series, 'timestamp': timestamp, 'duration': duration, 'tags': tags, diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py index 3e3348ef5..3746671d3 100644 --- a/youtube_dl/extractor/bilibili.py +++ b/youtube_dl/extractor/bilibili.py @@ -93,8 +93,8 @@ class BiliBiliIE(InfoExtractor): }] }] - _APP_KEY = '84956560bc028eb7' - _BILIBILI_KEY = '94aba54af9065f71de72f5508f1cd42e' + _APP_KEY = 'iVGUTjsxvpLeuDCf' + _BILIBILI_KEY = 'aHRmhWMLkdeMuILqORnYZocwMBpMEOdt' def _report_error(self, result): if 'message' in result: @@ -114,7 +114,7 @@ class BiliBiliIE(InfoExtractor): if 'anime/' not in url: cid = self._search_regex( - r'cid(?:["\']:|=)(\d+)', webpage, 'cid', + r'\bcid(?:["\']:|=)(\d+)', webpage, 'cid', default=None ) or compat_parse_qs(self._search_regex( [r'EmbedPlayer\([^)]+,\s*"([^"]+)"\)', diff --git a/youtube_dl/extractor/bitchute.py b/youtube_dl/extractor/bitchute.py new file mode 100644 index 000000000..4f39424f5 --- /dev/null +++ b/youtube_dl/extractor/bitchute.py @@ -0,0 +1,129 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import itertools +import re + +from .common import InfoExtractor +from ..utils import ( + orderedSet, + urlencode_postdata, +) + + +class BitChuteIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?bitchute\.com/(?:video|embed|torrent/[^/]+)/(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://www.bitchute.com/video/szoMrox2JEI/', + 'md5': '66c4a70e6bfc40dcb6be3eb1d74939eb', + 'info_dict': { + 'id': 'szoMrox2JEI', + 'ext': 'mp4', + 'title': 'Fuck bitches get money', + 'description': 'md5:3f21f6fb5b1d17c3dee9cf6b5fe60b3a', + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': 'Victoria X Rave', + }, + }, { + 'url': 'https://www.bitchute.com/embed/lbb5G1hjPhw/', + 'only_matching': True, + }, { + 'url': 'https://www.bitchute.com/torrent/Zee5BE49045h/szoMrox2JEI.webtorrent', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage( + 'https://www.bitchute.com/video/%s' % video_id, video_id, headers={ + 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.57 Safari/537.36', + }) + + title = self._html_search_regex( + (r'<[^>]+\bid=["\']video-title[^>]+>([^<]+)', r'<title>([^<]+)'), + webpage, 'title', default=None) or self._html_search_meta( + 'description', webpage, 'title', + default=None) or self._og_search_description(webpage) + + format_urls = [] + for mobj in re.finditer( + r'addWebSeed\s*\(\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage): + format_urls.append(mobj.group('url')) + format_urls.extend(re.findall(r'as=(https?://[^&"\']+)', webpage)) + + formats = [ + {'url': format_url} + for format_url in orderedSet(format_urls)] + self._check_formats(formats, video_id) + self._sort_formats(formats) + + description = self._html_search_regex( + r'(?s)<div\b[^>]+\bclass=["\']full hidden[^>]+>(.+?)</div>', + webpage, 'description', fatal=False) + thumbnail = self._og_search_thumbnail( + webpage, default=None) or self._html_search_meta( + 'twitter:image:src', webpage, 'thumbnail') + uploader = self._html_search_regex( + r'(?s)<p\b[^>]+\bclass=["\']video-author[^>]+>(.+?)</p>', webpage, + 'uploader', fatal=False) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'uploader': uploader, + 'formats': formats, + } + + +class BitChuteChannelIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?bitchute\.com/channel/(?P<id>[^/?#&]+)' + _TEST = { + 'url': 'https://www.bitchute.com/channel/victoriaxrave/', + 'playlist_mincount': 185, + 'info_dict': { + 'id': 'victoriaxrave', + }, + } + + _TOKEN = 'zyG6tQcGPE5swyAEFLqKUwMuMMuF6IO2DZ6ZDQjGfsL0e4dcTLwqkTTul05Jdve7' + + def _entries(self, channel_id): + channel_url = 'https://www.bitchute.com/channel/%s/' % channel_id + offset = 0 + for page_num in itertools.count(1): + data = self._download_json( + '%sextend/' % channel_url, channel_id, + 'Downloading channel page %d' % page_num, + data=urlencode_postdata({ + 'csrfmiddlewaretoken': self._TOKEN, + 'name': '', + 'offset': offset, + }), headers={ + 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', + 'Referer': channel_url, + 'X-Requested-With': 'XMLHttpRequest', + 'Cookie': 'csrftoken=%s' % self._TOKEN, + }) + if data.get('success') is False: + break + html = data.get('html') + if not html: + break + video_ids = re.findall( + r'class=["\']channel-videos-image-container[^>]+>\s*<a\b[^>]+\bhref=["\']/video/([^"\'/]+)', + html) + if not video_ids: + break + offset += len(video_ids) + for video_id in video_ids: + yield self.url_result( + 'https://www.bitchute.com/video/%s' % video_id, + ie=BitChuteIE.ie_key(), video_id=video_id) + + def _real_extract(self, url): + channel_id = self._match_id(url) + return self.playlist_result( + self._entries(channel_id), playlist_id=channel_id) diff --git a/youtube_dl/extractor/breakcom.py b/youtube_dl/extractor/breakcom.py index 70d16767f..68c7cf2bb 100644 --- a/youtube_dl/extractor/breakcom.py +++ b/youtube_dl/extractor/breakcom.py @@ -4,8 +4,10 @@ import re from .common import InfoExtractor from .youtube import YoutubeIE -from ..compat import compat_str -from ..utils import int_or_none +from ..utils import ( + int_or_none, + url_or_none, +) class BreakIE(InfoExtractor): @@ -55,8 +57,8 @@ class BreakIE(InfoExtractor): formats = [] for video in content: - video_url = video.get('url') - if not video_url or not isinstance(video_url, compat_str): + video_url = url_or_none(video.get('url')) + if not video_url: continue bitrate = int_or_none(self._search_regex( r'(\d+)_kbps', video_url, 'tbr', default=None)) diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index ab62e54d6..465ae396e 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -1,8 +1,10 @@ # coding: utf-8 from __future__ import unicode_literals -import re +import base64 import json +import re +import struct from .common import InfoExtractor from .adobepass import AdobePassIE @@ -310,6 +312,10 @@ class BrightcoveLegacyIE(InfoExtractor): 'Cannot find playerKey= variable. Did you forget quotes in a shell invocation?', expected=True) + def _brightcove_new_url_result(self, publisher_id, video_id): + brightcove_new_url = 'http://players.brightcove.net/%s/default_default/index.html?videoId=%s' % (publisher_id, video_id) + return self.url_result(brightcove_new_url, BrightcoveNewIE.ie_key(), video_id) + def _get_video_info(self, video_id, query, referer=None): headers = {} linkBase = query.get('linkBaseURL') @@ -323,6 +329,28 @@ class BrightcoveLegacyIE(InfoExtractor): r"<h1>We're sorry.</h1>([\s\n]*<p>.*?</p>)+", webpage, 'error message', default=None) if error_msg is not None: + publisher_id = query.get('publisherId') + if publisher_id and publisher_id[0].isdigit(): + publisher_id = publisher_id[0] + if not publisher_id: + player_key = query.get('playerKey') + if player_key and ',' in player_key[0]: + player_key = player_key[0] + else: + player_id = query.get('playerID') + if player_id and player_id[0].isdigit(): + player_page = self._download_webpage( + 'http://link.brightcove.com/services/player/bcpid' + player_id[0], + video_id, headers=headers, fatal=False) + if player_page: + player_key = self._search_regex( + r'<param\s+name="playerKey"\s+value="([\w~,-]+)"', + player_page, 'player key', fatal=False) + if player_key: + enc_pub_id = player_key.split(',')[1].replace('~', '=') + publisher_id = struct.unpack('>Q', base64.urlsafe_b64decode(enc_pub_id))[0] + if publisher_id: + return self._brightcove_new_url_result(publisher_id, video_id) raise ExtractorError( 'brightcove said: %s' % error_msg, expected=True) @@ -444,8 +472,12 @@ class BrightcoveLegacyIE(InfoExtractor): else: return ad_info - if 'url' not in info and not info.get('formats'): - raise ExtractorError('Unable to extract video url for %s' % video_id) + if not info.get('url') and not info.get('formats'): + uploader_id = info.get('uploader_id') + if uploader_id: + info.update(self._brightcove_new_url_result(uploader_id, video_id)) + else: + raise ExtractorError('Unable to extract video url for %s' % video_id) return info @@ -572,7 +604,8 @@ class BrightcoveNewIE(AdobePassIE): container = source.get('container') ext = mimetype2ext(source.get('type')) src = source.get('src') - if ext == 'ism' or container == 'WVM': + # https://support.brightcove.com/playback-api-video-fields-reference#key_systems_object + if ext == 'ism' or container == 'WVM' or source.get('key_systems'): continue elif ext == 'm3u8' or container == 'M2TS': if not src: @@ -629,6 +662,14 @@ class BrightcoveNewIE(AdobePassIE): 'format_id': build_format_id('rtmp'), }) formats.append(f) + if not formats: + # for sonyliv.com DRM protected videos + s3_source_url = json_data.get('custom_fields', {}).get('s3sourceurl') + if s3_source_url: + formats.append({ + 'url': s3_source_url, + 'format_id': 'source', + }) errors = json_data.get('errors') if not formats and errors: diff --git a/youtube_dl/extractor/cammodels.py b/youtube_dl/extractor/cammodels.py index ee0165dba..1eb81b75e 100644 --- a/youtube_dl/extractor/cammodels.py +++ b/youtube_dl/extractor/cammodels.py @@ -2,10 +2,10 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( ExtractorError, int_or_none, + url_or_none, ) @@ -14,6 +14,7 @@ class CamModelsIE(InfoExtractor): _TESTS = [{ 'url': 'https://www.cammodels.com/cam/AutumnKnight/', 'only_matching': True, + 'age_limit': 18 }] def _real_extract(self, url): @@ -56,8 +57,8 @@ class CamModelsIE(InfoExtractor): for media in encodings: if not isinstance(media, dict): continue - media_url = media.get('location') - if not media_url or not isinstance(media_url, compat_str): + media_url = url_or_none(media.get('location')) + if not media_url: continue format_id_list = [format_id] @@ -93,4 +94,5 @@ class CamModelsIE(InfoExtractor): 'title': self._live_title(user_id), 'is_live': True, 'formats': formats, + 'age_limit': 18 } diff --git a/youtube_dl/extractor/camtube.py b/youtube_dl/extractor/camtube.py index c7d40f849..b3be3bdcf 100644 --- a/youtube_dl/extractor/camtube.py +++ b/youtube_dl/extractor/camtube.py @@ -20,6 +20,7 @@ class CamTubeIE(InfoExtractor): 'duration': 1274, 'timestamp': 1528018608, 'upload_date': '20180603', + 'age_limit': 18 }, 'params': { 'skip_download': True, @@ -66,4 +67,5 @@ class CamTubeIE(InfoExtractor): 'like_count': like_count, 'creator': creator, 'formats': formats, + 'age_limit': 18 } diff --git a/youtube_dl/extractor/camwithher.py b/youtube_dl/extractor/camwithher.py index afbc5ea26..bbc5205fd 100644 --- a/youtube_dl/extractor/camwithher.py +++ b/youtube_dl/extractor/camwithher.py @@ -25,6 +25,7 @@ class CamWithHerIE(InfoExtractor): 'comment_count': int, 'uploader': 'MileenaK', 'upload_date': '20160322', + 'age_limit': 18, }, 'params': { 'skip_download': True, @@ -84,4 +85,5 @@ class CamWithHerIE(InfoExtractor): 'comment_count': comment_count, 'uploader': uploader, 'upload_date': upload_date, + 'age_limit': 18 } diff --git a/youtube_dl/extractor/canvas.py b/youtube_dl/extractor/canvas.py index 8ac62c1a6..174fd9e2b 100644 --- a/youtube_dl/extractor/canvas.py +++ b/youtube_dl/extractor/canvas.py @@ -11,6 +11,7 @@ from ..utils import ( strip_or_none, float_or_none, int_or_none, + merge_dicts, parse_iso8601, ) @@ -248,9 +249,13 @@ class VrtNUIE(GigyaBaseIE): webpage, urlh = self._download_webpage_handle(url, display_id) - title = self._html_search_regex( + info = self._search_json_ld(webpage, display_id, default={}) + + # title is optional here since it may be extracted by extractor + # that is delegated from here + title = strip_or_none(self._html_search_regex( r'(?ms)<h1 class="content__heading">(.+?)</h1>', - webpage, 'title').strip() + webpage, 'title', default=None)) description = self._html_search_regex( r'(?ms)<div class="content__description">(.+?)</div>', @@ -295,7 +300,7 @@ class VrtNUIE(GigyaBaseIE): # the first one video_id = list(video.values())[0].get('videoid') - return { + return merge_dicts(info, { '_type': 'url_transparent', 'url': 'https://mediazone.vrt.be/api/v1/vrtvideo/assets/%s' % video_id, 'ie_key': CanvasIE.ie_key(), @@ -307,4 +312,4 @@ class VrtNUIE(GigyaBaseIE): 'season_number': season_number, 'episode_number': episode_number, 'release_date': release_date, - } + }) diff --git a/youtube_dl/extractor/carambatv.py b/youtube_dl/extractor/carambatv.py index 9ba909a91..b57b86af7 100644 --- a/youtube_dl/extractor/carambatv.py +++ b/youtube_dl/extractor/carambatv.py @@ -82,6 +82,12 @@ class CarambaTVPageIE(InfoExtractor): webpage = self._download_webpage(url, video_id) videomore_url = VideomoreIE._extract_url(webpage) + if not videomore_url: + videomore_id = self._search_regex( + r'getVMCode\s*\(\s*["\']?(\d+)', webpage, 'videomore id', + default=None) + if videomore_id: + videomore_url = 'videomore:%s' % videomore_id if videomore_url: title = self._og_search_title(webpage) return { diff --git a/youtube_dl/extractor/cartoonnetwork.py b/youtube_dl/extractor/cartoonnetwork.py index 6aeebd7b3..48b33617f 100644 --- a/youtube_dl/extractor/cartoonnetwork.py +++ b/youtube_dl/extractor/cartoonnetwork.py @@ -1,20 +1,19 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .turner import TurnerBaseIE +from ..utils import int_or_none class CartoonNetworkIE(TurnerBaseIE): _VALID_URL = r'https?://(?:www\.)?cartoonnetwork\.com/video/(?:[^/]+/)+(?P<id>[^/?#]+)-(?:clip|episode)\.html' _TEST = { - 'url': 'http://www.cartoonnetwork.com/video/teen-titans-go/starfire-the-cat-lady-clip.html', + 'url': 'https://www.cartoonnetwork.com/video/ben-10/how-to-draw-upgrade-episode.html', 'info_dict': { - 'id': '8a250ab04ed07e6c014ef3f1e2f9016c', + 'id': '6e3375097f63874ebccec7ef677c1c3845fa850e', 'ext': 'mp4', - 'title': 'Starfire the Cat Lady', - 'description': 'Robin decides to become a cat so that Starfire will finally love him.', + 'title': 'How to Draw Upgrade', + 'description': 'md5:2061d83776db7e8be4879684eefe8c0f', }, 'params': { # m3u8 download @@ -25,18 +24,39 @@ class CartoonNetworkIE(TurnerBaseIE): def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - id_type, video_id = re.search(r"_cnglobal\.cvp(Video|Title)Id\s*=\s*'([^']+)';", webpage).groups() - query = ('id' if id_type == 'Video' else 'titleId') + '=' + video_id - return self._extract_cvp_info( - 'http://www.cartoonnetwork.com/video-seo-svc/episodeservices/getCvpPlaylist?networkName=CN2&' + query, video_id, { - 'secure': { - 'media_src': 'http://androidhls-secure.cdn.turner.com/toon/big', - 'tokenizer_src': 'https://token.vgtf.net/token/token_mobile', - }, - }, { + + def find_field(global_re, name, content_re=None, value_re='[^"]+', fatal=False): + metadata_re = '' + if content_re: + metadata_re = r'|video_metadata\.content_' + content_re + return self._search_regex( + r'(?:_cnglobal\.currentVideo\.%s%s)\s*=\s*"(%s)";' % (global_re, metadata_re, value_re), + webpage, name, fatal=fatal) + + media_id = find_field('mediaId', 'media id', 'id', '[0-9a-f]{40}', True) + title = find_field('episodeTitle', 'title', '(?:episodeName|name)', fatal=True) + + info = self._extract_ngtv_info( + media_id, {'networkId': 'cartoonnetwork'}, { 'url': url, 'site_name': 'CartoonNetwork', - 'auth_required': self._search_regex( - r'_cnglobal\.cvpFullOrPreviewAuth\s*=\s*(true|false);', - webpage, 'auth required', default='false') == 'true', + 'auth_required': find_field('authType', 'auth type') != 'unauth', }) + + series = find_field( + 'propertyName', 'series', 'showName') or self._html_search_meta('partOfSeries', webpage) + info.update({ + 'id': media_id, + 'display_id': display_id, + 'title': title, + 'description': self._html_search_meta('description', webpage), + 'series': series, + 'episode': title, + }) + + for field in ('season', 'episode'): + field_name = field + 'Number' + info[field + '_number'] = int_or_none(find_field( + field_name, field + ' number', value_re=r'\d+') or self._html_search_meta(field_name, webpage)) + + return info diff --git a/youtube_dl/extractor/ccma.py b/youtube_dl/extractor/ccma.py index 07f5206c1..544647f92 100644 --- a/youtube_dl/extractor/ccma.py +++ b/youtube_dl/extractor/ccma.py @@ -4,13 +4,13 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( clean_html, int_or_none, parse_duration, parse_iso8601, parse_resolution, + url_or_none, ) @@ -53,8 +53,8 @@ class CCMAIE(InfoExtractor): media_url = media['media']['url'] if isinstance(media_url, list): for format_ in media_url: - format_url = format_.get('file') - if not format_url or not isinstance(format_url, compat_str): + format_url = url_or_none(format_.get('file')) + if not format_url: continue label = format_.get('label') f = parse_resolution(label) diff --git a/youtube_dl/extractor/ceskatelevize.py b/youtube_dl/extractor/ceskatelevize.py index 6bad90859..46380430f 100644 --- a/youtube_dl/extractor/ceskatelevize.py +++ b/youtube_dl/extractor/ceskatelevize.py @@ -108,7 +108,7 @@ class CeskaTelevizeIE(InfoExtractor): for user_agent in (None, USER_AGENTS['Safari']): req = sanitized_Request( - 'http://www.ceskatelevize.cz/ivysilani/ajax/get-client-playlist', + 'https://www.ceskatelevize.cz/ivysilani/ajax/get-client-playlist', data=urlencode_postdata(data)) req.add_header('Content-type', 'application/x-www-form-urlencoded') diff --git a/youtube_dl/extractor/ciscolive.py b/youtube_dl/extractor/ciscolive.py new file mode 100644 index 000000000..c99b6ee58 --- /dev/null +++ b/youtube_dl/extractor/ciscolive.py @@ -0,0 +1,142 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import itertools + +from .common import InfoExtractor +from ..compat import ( + compat_parse_qs, + compat_urllib_parse_urlparse, +) +from ..utils import ( + clean_html, + float_or_none, + int_or_none, + try_get, + urlencode_postdata, +) + + +class CiscoLiveBaseIE(InfoExtractor): + # These appear to be constant across all Cisco Live presentations + # and are not tied to any user session or event + RAINFOCUS_API_URL = 'https://events.rainfocus.com/api/%s' + RAINFOCUS_API_PROFILE_ID = 'Na3vqYdAlJFSxhYTYQGuMbpafMqftalz' + RAINFOCUS_WIDGET_ID = 'n6l4Lo05R8fiy3RpUBm447dZN8uNWoye' + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/5647924234001/SyK2FdqjM_default/index.html?videoId=%s' + + HEADERS = { + 'Origin': 'https://ciscolive.cisco.com', + 'rfApiProfileId': RAINFOCUS_API_PROFILE_ID, + 'rfWidgetId': RAINFOCUS_WIDGET_ID, + } + + def _call_api(self, ep, rf_id, query, referrer, note=None): + headers = self.HEADERS.copy() + headers['Referer'] = referrer + return self._download_json( + self.RAINFOCUS_API_URL % ep, rf_id, note=note, + data=urlencode_postdata(query), headers=headers) + + def _parse_rf_item(self, rf_item): + event_name = rf_item.get('eventName') + title = rf_item['title'] + description = clean_html(rf_item.get('abstract')) + presenter_name = try_get(rf_item, lambda x: x['participants'][0]['fullName']) + bc_id = rf_item['videos'][0]['url'] + bc_url = self.BRIGHTCOVE_URL_TEMPLATE % bc_id + duration = float_or_none(try_get(rf_item, lambda x: x['times'][0]['length'])) + location = try_get(rf_item, lambda x: x['times'][0]['room']) + + if duration: + duration = duration * 60 + + return { + '_type': 'url_transparent', + 'url': bc_url, + 'ie_key': 'BrightcoveNew', + 'title': title, + 'description': description, + 'duration': duration, + 'creator': presenter_name, + 'location': location, + 'series': event_name, + } + + +class CiscoLiveSessionIE(CiscoLiveBaseIE): + _VALID_URL = r'https?://ciscolive\.cisco\.com/on-demand-library/\??[^#]*#/session/(?P<id>[^/?&]+)' + _TEST = { + 'url': 'https://ciscolive.cisco.com/on-demand-library/?#/session/1423353499155001FoSs', + 'md5': 'c98acf395ed9c9f766941c70f5352e22', + 'info_dict': { + 'id': '5803694304001', + 'ext': 'mp4', + 'title': '13 Smart Automations to Monitor Your Cisco IOS Network', + 'description': 'md5:ec4a436019e09a918dec17714803f7cc', + 'timestamp': 1530305395, + 'upload_date': '20180629', + 'uploader_id': '5647924234001', + 'location': '16B Mezz.', + }, + } + + def _real_extract(self, url): + rf_id = self._match_id(url) + rf_result = self._call_api('session', rf_id, {'id': rf_id}, url) + return self._parse_rf_item(rf_result['items'][0]) + + +class CiscoLiveSearchIE(CiscoLiveBaseIE): + _VALID_URL = r'https?://ciscolive\.cisco\.com/on-demand-library/' + _TESTS = [{ + 'url': 'https://ciscolive.cisco.com/on-demand-library/?search.event=ciscoliveus2018&search.technicallevel=scpsSkillLevel_aintroductory&search.focus=scpsSessionFocus_designAndDeployment#/', + 'info_dict': { + 'title': 'Search query', + }, + 'playlist_count': 5, + }, { + 'url': 'https://ciscolive.cisco.com/on-demand-library/?search.technology=scpsTechnology_applicationDevelopment&search.technology=scpsTechnology_ipv6&search.focus=scpsSessionFocus_troubleshootingTroubleshooting#/', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if CiscoLiveSessionIE.suitable(url) else super(CiscoLiveSearchIE, cls).suitable(url) + + @staticmethod + def _check_bc_id_exists(rf_item): + return int_or_none(try_get(rf_item, lambda x: x['videos'][0]['url'])) is not None + + def _entries(self, query, url): + query['size'] = 50 + query['from'] = 0 + for page_num in itertools.count(1): + results = self._call_api( + 'search', None, query, url, + 'Downloading search JSON page %d' % page_num) + sl = try_get(results, lambda x: x['sectionList'][0], dict) + if sl: + results = sl + items = results.get('items') + if not items or not isinstance(items, list): + break + for item in items: + if not isinstance(item, dict): + continue + if not self._check_bc_id_exists(item): + continue + yield self._parse_rf_item(item) + size = int_or_none(results.get('size')) + if size is not None: + query['size'] = size + total = int_or_none(results.get('total')) + if total is not None and query['from'] + query['size'] > total: + break + query['from'] += query['size'] + + def _real_extract(self, url): + query = compat_parse_qs(compat_urllib_parse_urlparse(url).query) + query['type'] = 'session' + return self.playlist_result( + self._entries(query, url), playlist_title='Search query') diff --git a/youtube_dl/extractor/cliphunter.py b/youtube_dl/extractor/cliphunter.py index ab651d1c8..f2ca7a337 100644 --- a/youtube_dl/extractor/cliphunter.py +++ b/youtube_dl/extractor/cliphunter.py @@ -1,19 +1,10 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import int_or_none - - -_translation_table = { - 'a': 'h', 'd': 'e', 'e': 'v', 'f': 'o', 'g': 'f', 'i': 'd', 'l': 'n', - 'm': 'a', 'n': 'm', 'p': 'u', 'q': 't', 'r': 's', 'v': 'p', 'x': 'r', - 'y': 'l', 'z': 'i', - '$': ':', '&': '.', '(': '=', '^': '&', '=': '/', -} - - -def _decode(s): - return ''.join(_translation_table.get(c, c) for c in s) +from ..utils import ( + int_or_none, + url_or_none, +) class CliphunterIE(InfoExtractor): @@ -60,14 +51,14 @@ class CliphunterIE(InfoExtractor): formats = [] for format_id, f in gexo_files.items(): - video_url = f.get('url') + video_url = url_or_none(f.get('url')) if not video_url: continue fmt = f.get('fmt') height = f.get('h') format_id = '%s_%sp' % (fmt, height) if fmt and height else format_id formats.append({ - 'url': _decode(video_url), + 'url': video_url, 'format_id': format_id, 'width': int_or_none(f.get('w')), 'height': int_or_none(height), diff --git a/youtube_dl/extractor/clyp.py b/youtube_dl/extractor/clyp.py index 57e643799..06d04de13 100644 --- a/youtube_dl/extractor/clyp.py +++ b/youtube_dl/extractor/clyp.py @@ -1,15 +1,19 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..compat import ( + compat_parse_qs, + compat_urllib_parse_urlparse, +) from ..utils import ( float_or_none, - parse_iso8601, + unified_timestamp, ) class ClypIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?clyp\.it/(?P<id>[a-z0-9]+)' - _TEST = { + _TESTS = [{ 'url': 'https://clyp.it/ojz2wfah', 'md5': '1d4961036c41247ecfdcc439c0cddcbb', 'info_dict': { @@ -21,13 +25,34 @@ class ClypIE(InfoExtractor): 'timestamp': 1443515251, 'upload_date': '20150929', }, - } + }, { + 'url': 'https://clyp.it/b04p1odi?token=b0078e077e15835845c528a44417719d', + 'info_dict': { + 'id': 'b04p1odi', + 'ext': 'mp3', + 'title': 'GJ! (Reward Edit)', + 'description': 'Metal Resistance (THE ONE edition)', + 'duration': 177.789, + 'timestamp': 1528241278, + 'upload_date': '20180605', + }, + 'params': { + 'skip_download': True, + }, + }] def _real_extract(self, url): audio_id = self._match_id(url) + qs = compat_parse_qs(compat_urllib_parse_urlparse(url).query) + token = qs.get('token', [None])[0] + + query = {} + if token: + query['token'] = token + metadata = self._download_json( - 'https://api.clyp.it/%s' % audio_id, audio_id) + 'https://api.clyp.it/%s' % audio_id, audio_id, query=query) formats = [] for secure in ('', 'Secure'): @@ -45,7 +70,7 @@ class ClypIE(InfoExtractor): title = metadata['Title'] description = metadata.get('Description') duration = float_or_none(metadata.get('Duration')) - timestamp = parse_iso8601(metadata.get('DateCreated')) + timestamp = unified_timestamp(metadata.get('DateCreated')) return { 'id': audio_id, diff --git a/youtube_dl/extractor/cnbc.py b/youtube_dl/extractor/cnbc.py index d354d9f95..6889b0f40 100644 --- a/youtube_dl/extractor/cnbc.py +++ b/youtube_dl/extractor/cnbc.py @@ -1,6 +1,7 @@ # coding: utf-8 from __future__ import unicode_literals + from .common import InfoExtractor from ..utils import smuggle_url @@ -34,3 +35,32 @@ class CNBCIE(InfoExtractor): {'force_smil_url': True}), 'id': video_id, } + + +class CNBCVideoIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?cnbc\.com/video/(?:[^/]+/)+(?P<id>[^./?#&]+)' + _TEST = { + 'url': 'https://www.cnbc.com/video/2018/07/19/trump-i-dont-necessarily-agree-with-raising-rates.html', + 'info_dict': { + 'id': '7000031301', + 'ext': 'mp4', + 'title': "Trump: I don't necessarily agree with raising rates", + 'description': 'md5:878d8f0b4ebb5bb1dda3514b91b49de3', + 'timestamp': 1531958400, + 'upload_date': '20180719', + 'uploader': 'NBCU-CNBC', + }, + 'params': { + 'skip_download': True, + }, + } + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + video_id = self._search_regex( + r'content_id["\']\s*:\s*["\'](\d+)', webpage, display_id, + 'video id') + return self.url_result( + 'http://video.cnbc.com/gallery/?video=%s' % video_id, + CNBCIE.ie_key()) diff --git a/youtube_dl/extractor/cnn.py b/youtube_dl/extractor/cnn.py index 5fc311f53..774b71055 100644 --- a/youtube_dl/extractor/cnn.py +++ b/youtube_dl/extractor/cnn.py @@ -119,11 +119,7 @@ class CNNBlogsIE(InfoExtractor): def _real_extract(self, url): webpage = self._download_webpage(url, url_basename(url)) cnn_url = self._html_search_regex(r'data-url="(.+?)"', webpage, 'cnn url') - return { - '_type': 'url', - 'url': cnn_url, - 'ie_key': CNNIE.ie_key(), - } + return self.url_result(cnn_url, CNNIE.ie_key()) class CNNArticleIE(InfoExtractor): @@ -145,8 +141,4 @@ class CNNArticleIE(InfoExtractor): def _real_extract(self, url): webpage = self._download_webpage(url, url_basename(url)) cnn_url = self._html_search_regex(r"video:\s*'([^']+)'", webpage, 'cnn url') - return { - '_type': 'url', - 'url': 'http://cnn.com/video/?/video/' + cnn_url, - 'ie_key': CNNIE.ie_key(), - } + return self.url_result('http://cnn.com/video/?/video/' + cnn_url, CNNIE.ie_key()) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index a2548dba3..c3b0586a0 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -19,6 +19,7 @@ from ..compat import ( compat_cookies, compat_etree_fromstring, compat_getpass, + compat_integer_types, compat_http_client, compat_os_name, compat_str, @@ -51,6 +52,7 @@ from ..utils import ( GeoUtils, int_or_none, js_to_json, + JSON_LD_RE, mimetype2ext, orderedSet, parse_codecs, @@ -67,6 +69,7 @@ from ..utils import ( update_url_query, urljoin, url_basename, + url_or_none, xpath_element, xpath_text, xpath_with_ns, @@ -209,6 +212,11 @@ class InfoExtractor(object): If not explicitly set, calculated from timestamp. uploader_id: Nickname or id of the video uploader. uploader_url: Full URL to a personal webpage of the video uploader. + channel: Full name of the channel the video is uploaded on. + Note that channel fields may or may not repeat uploader + fields. This depends on a particular extractor. + channel_id: Id of the channel. + channel_url: Full URL to a channel webpage. location: Physical location where the video was filmed. subtitles: The available subtitles as a dictionary in the format {tag: subformats}. "tag" is usually a language code, and @@ -548,8 +556,26 @@ class InfoExtractor(object): def IE_NAME(self): return compat_str(type(self).__name__[:-2]) - def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}): - """ Returns the response handle """ + @staticmethod + def __can_accept_status_code(err, expected_status): + assert isinstance(err, compat_urllib_error.HTTPError) + if expected_status is None: + return False + if isinstance(expected_status, compat_integer_types): + return err.code == expected_status + elif isinstance(expected_status, (list, tuple)): + return err.code in expected_status + elif callable(expected_status): + return expected_status(err.code) is True + else: + assert False + + def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None): + """ + Return the response handle. + + See _download_webpage docstring for arguments specification. + """ if note is None: self.report_download_webpage(video_id) elif note is not False: @@ -578,6 +604,15 @@ class InfoExtractor(object): try: return self._downloader.urlopen(url_or_request) except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + if isinstance(err, compat_urllib_error.HTTPError): + if self.__can_accept_status_code(err, expected_status): + # Retain reference to error to prevent file object from + # being closed before it can be read. Works around the + # effects of <https://bugs.python.org/issue15002> + # introduced in Python 3.4.1. + err.fp._error = err + return err.fp + if errnote is False: return False if errnote is None: @@ -590,13 +625,17 @@ class InfoExtractor(object): self._downloader.report_warning(errmsg) return False - def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}): - """ Returns a tuple (page content as string, URL handle) """ + def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None): + """ + Return a tuple (page content as string, URL handle). + + See _download_webpage docstring for arguments specification. + """ # Strip hashes from the URL (#1038) if isinstance(url_or_request, (compat_str, str)): url_or_request = url_or_request.partition('#')[0] - urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query) + urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status) if urlh is False: assert not fatal return False @@ -685,13 +724,52 @@ class InfoExtractor(object): return content - def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None, data=None, headers={}, query={}): - """ Returns the data of the page as a string """ + def _download_webpage( + self, url_or_request, video_id, note=None, errnote=None, + fatal=True, tries=1, timeout=5, encoding=None, data=None, + headers={}, query={}, expected_status=None): + """ + Return the data of the page as a string. + + Arguments: + url_or_request -- plain text URL as a string or + a compat_urllib_request.Requestobject + video_id -- Video/playlist/item identifier (string) + + Keyword arguments: + note -- note printed before downloading (string) + errnote -- note printed in case of an error (string) + fatal -- flag denoting whether error should be considered fatal, + i.e. whether it should cause ExtractionError to be raised, + otherwise a warning will be reported and extraction continued + tries -- number of tries + timeout -- sleep interval between tries + encoding -- encoding for a page content decoding, guessed automatically + when not explicitly specified + data -- POST data (bytes) + headers -- HTTP headers (dict) + query -- URL query (dict) + expected_status -- allows to accept failed HTTP requests (non 2xx + status code) by explicitly specifying a set of accepted status + codes. Can be any of the following entities: + - an integer type specifying an exact failed status code to + accept + - a list or a tuple of integer types specifying a list of + failed status codes to accept + - a callable accepting an actual failed status code and + returning True if it should be accepted + Note that this argument does not affect success status codes (2xx) + which are always accepted. + """ + success = False try_count = 0 while success is False: try: - res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding, data=data, headers=headers, query=query) + res = self._download_webpage_handle( + url_or_request, video_id, note, errnote, fatal, + encoding=encoding, data=data, headers=headers, query=query, + expected_status=expected_status) success = True except compat_http_client.IncompleteRead as e: try_count += 1 @@ -707,11 +785,17 @@ class InfoExtractor(object): def _download_xml_handle( self, url_or_request, video_id, note='Downloading XML', errnote='Unable to download XML', transform_source=None, - fatal=True, encoding=None, data=None, headers={}, query={}): - """Return a tuple (xml as an xml.etree.ElementTree.Element, URL handle)""" + fatal=True, encoding=None, data=None, headers={}, query={}, + expected_status=None): + """ + Return a tuple (xml as an xml.etree.ElementTree.Element, URL handle). + + See _download_webpage docstring for arguments specification. + """ res = self._download_webpage_handle( url_or_request, video_id, note, errnote, fatal=fatal, - encoding=encoding, data=data, headers=headers, query=query) + encoding=encoding, data=data, headers=headers, query=query, + expected_status=expected_status) if res is False: return res xml_string, urlh = res @@ -719,15 +803,21 @@ class InfoExtractor(object): xml_string, video_id, transform_source=transform_source, fatal=fatal), urlh - def _download_xml(self, url_or_request, video_id, - note='Downloading XML', errnote='Unable to download XML', - transform_source=None, fatal=True, encoding=None, - data=None, headers={}, query={}): - """Return the xml as an xml.etree.ElementTree.Element""" + def _download_xml( + self, url_or_request, video_id, + note='Downloading XML', errnote='Unable to download XML', + transform_source=None, fatal=True, encoding=None, + data=None, headers={}, query={}, expected_status=None): + """ + Return the xml as an xml.etree.ElementTree.Element. + + See _download_webpage docstring for arguments specification. + """ res = self._download_xml_handle( url_or_request, video_id, note=note, errnote=errnote, transform_source=transform_source, fatal=fatal, encoding=encoding, - data=data, headers=headers, query=query) + data=data, headers=headers, query=query, + expected_status=expected_status) return res if res is False else res[0] def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True): @@ -745,11 +835,17 @@ class InfoExtractor(object): def _download_json_handle( self, url_or_request, video_id, note='Downloading JSON metadata', errnote='Unable to download JSON metadata', transform_source=None, - fatal=True, encoding=None, data=None, headers={}, query={}): - """Return a tuple (JSON object, URL handle)""" + fatal=True, encoding=None, data=None, headers={}, query={}, + expected_status=None): + """ + Return a tuple (JSON object, URL handle). + + See _download_webpage docstring for arguments specification. + """ res = self._download_webpage_handle( url_or_request, video_id, note, errnote, fatal=fatal, - encoding=encoding, data=data, headers=headers, query=query) + encoding=encoding, data=data, headers=headers, query=query, + expected_status=expected_status) if res is False: return res json_string, urlh = res @@ -760,11 +856,18 @@ class InfoExtractor(object): def _download_json( self, url_or_request, video_id, note='Downloading JSON metadata', errnote='Unable to download JSON metadata', transform_source=None, - fatal=True, encoding=None, data=None, headers={}, query={}): + fatal=True, encoding=None, data=None, headers={}, query={}, + expected_status=None): + """ + Return the JSON object as a dict. + + See _download_webpage docstring for arguments specification. + """ res = self._download_json_handle( url_or_request, video_id, note=note, errnote=errnote, transform_source=transform_source, fatal=fatal, encoding=encoding, - data=data, headers=headers, query=query) + data=data, headers=headers, query=query, + expected_status=expected_status) return res if res is False else res[0] def _parse_json(self, json_string, video_id, transform_source=None, fatal=True): @@ -955,7 +1058,7 @@ class InfoExtractor(object): @staticmethod def _og_regexes(prop): content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))' - property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)' + property_re = (r'(?:name|property)=(?:\'og[:-]%(prop)s\'|"og[:-]%(prop)s"|\s*og[:-]%(prop)s\b)' % {'prop': re.escape(prop)}) template = r'<meta[^>]+?%s[^>]+?%s' return [ @@ -1058,8 +1161,7 @@ class InfoExtractor(object): def _search_json_ld(self, html, video_id, expected_type=None, **kwargs): json_ld = self._search_regex( - r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>', - html, 'JSON-LD', group='json_ld', **kwargs) + JSON_LD_RE, html, 'JSON-LD', group='json_ld', **kwargs) default = kwargs.get('default', NO_DEFAULT) if not json_ld: return default if default is not NO_DEFAULT else {} @@ -1117,10 +1219,10 @@ class InfoExtractor(object): def extract_video_object(e): assert e['@type'] == 'VideoObject' info.update({ - 'url': e.get('contentUrl'), + 'url': url_or_none(e.get('contentUrl')), 'title': unescapeHTML(e.get('name')), 'description': unescapeHTML(e.get('description')), - 'thumbnail': e.get('thumbnailUrl') or e.get('thumbnailURL'), + 'thumbnail': url_or_none(e.get('thumbnailUrl') or e.get('thumbnailURL')), 'duration': parse_duration(e.get('duration')), 'timestamp': unified_timestamp(e.get('uploadDate')), 'filesize': float_or_none(e.get('contentSize')), @@ -1137,17 +1239,30 @@ class InfoExtractor(object): if expected_type is not None and expected_type != item_type: return info if item_type in ('TVEpisode', 'Episode'): + episode_name = unescapeHTML(e.get('name')) info.update({ - 'episode': unescapeHTML(e.get('name')), + 'episode': episode_name, 'episode_number': int_or_none(e.get('episodeNumber')), 'description': unescapeHTML(e.get('description')), }) + if not info.get('title') and episode_name: + info['title'] = episode_name part_of_season = e.get('partOfSeason') if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'): - info['season_number'] = int_or_none(part_of_season.get('seasonNumber')) + info.update({ + 'season': unescapeHTML(part_of_season.get('name')), + 'season_number': int_or_none(part_of_season.get('seasonNumber')), + }) part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries') if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'): info['series'] = unescapeHTML(part_of_series.get('name')) + elif item_type == 'Movie': + info.update({ + 'title': unescapeHTML(e.get('name')), + 'description': unescapeHTML(e.get('description')), + 'duration': parse_duration(e.get('duration')), + 'timestamp': unified_timestamp(e.get('dateCreated')), + }) elif item_type in ('Article', 'NewsArticle'): info.update({ 'timestamp': parse_iso8601(e.get('datePublished')), @@ -1484,6 +1599,7 @@ class InfoExtractor(object): # References: # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21 # 2. https://github.com/rg3/youtube-dl/issues/12211 + # 3. https://github.com/rg3/youtube-dl/issues/18923 # We should try extracting formats only from master playlists [1, 4.3.4], # i.e. playlists that describe available qualities. On the other hand @@ -1555,11 +1671,16 @@ class InfoExtractor(object): rendition = stream_group[0] return rendition.get('NAME') or stream_group_id + # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the + # chance to detect video only formats when EXT-X-STREAM-INF tags + # precede EXT-X-MEDIA tags in HLS manifest such as [3]. + for line in m3u8_doc.splitlines(): + if line.startswith('#EXT-X-MEDIA:'): + extract_media(line) + for line in m3u8_doc.splitlines(): if line.startswith('#EXT-X-STREAM-INF:'): last_stream_inf = parse_m3u8_attributes(line) - elif line.startswith('#EXT-X-MEDIA:'): - extract_media(line) elif line.startswith('#') or not line.strip(): continue else: @@ -1610,9 +1731,9 @@ class InfoExtractor(object): # However, this is not always respected, for example, [2] # contains EXT-X-STREAM-INF tag which references AUDIO # rendition group but does not have CODECS and despite - # referencing audio group an audio group, it represents - # a complete (with audio and video) format. So, for such cases - # we will ignore references to rendition groups and treat them + # referencing an audio group it represents a complete + # (with audio and video) format. So, for such cases we will + # ignore references to rendition groups and treat them # as complete formats. if audio_group_id and codecs and f.get('vcodec') != 'none': audio_group = groups.get(audio_group_id) @@ -1768,9 +1889,7 @@ class InfoExtractor(object): 'height': height, }) formats.extend(m3u8_formats) - continue - - if src_ext == 'f4m': + elif src_ext == 'f4m': f4m_url = src_url if not f4m_params: f4m_params = { @@ -1780,9 +1899,13 @@ class InfoExtractor(object): f4m_url += '&' if '?' in f4m_url else '?' f4m_url += compat_urllib_parse_urlencode(f4m_params) formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False)) - continue - - if src_url.startswith('http') and self._is_valid_url(src, video_id): + elif src_ext == 'mpd': + formats.extend(self._extract_mpd_formats( + src_url, video_id, mpd_id='dash', fatal=False)) + elif re.search(r'\.ism/[Mm]anifest', src_url): + formats.extend(self._extract_ism_formats( + src_url, video_id, ism_id='mss', fatal=False)) + elif src_url.startswith('http') and self._is_valid_url(src, video_id): http_count += 1 formats.append({ 'url': src_url, @@ -1793,7 +1916,6 @@ class InfoExtractor(object): 'width': width, 'height': height, }) - continue return formats @@ -2015,7 +2137,21 @@ class InfoExtractor(object): representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info) def prepare_template(template_name, identifiers): - t = representation_ms_info[template_name] + tmpl = representation_ms_info[template_name] + # First of, % characters outside $...$ templates + # must be escaped by doubling for proper processing + # by % operator string formatting used further (see + # https://github.com/rg3/youtube-dl/issues/16867). + t = '' + in_template = False + for c in tmpl: + t += c + if c == '$': + in_template = not in_template + elif c == '%' and not in_template: + t += c + # Next, $...$ templates are translated to their + # %(...) counterparts to be used with % operator t = t.replace('$RepresentationID$', representation_id) t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t) t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t) @@ -2346,6 +2482,8 @@ class InfoExtractor(object): media_info['subtitles'].setdefault(lang, []).append({ 'url': absolute_url(src), }) + for f in media_info['formats']: + f.setdefault('http_headers', {})['Referer'] = base_url if media_info['formats'] or media_info['subtitles']: entries.append(media_info) return entries @@ -2495,7 +2633,7 @@ class InfoExtractor(object): 'id': this_video_id, 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')), 'description': video_data.get('description'), - 'thumbnail': self._proto_relative_url(video_data.get('image')), + 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))), 'timestamp': int_or_none(video_data.get('pubdate')), 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')), 'subtitles': subtitles, @@ -2522,12 +2660,9 @@ class InfoExtractor(object): for source in jwplayer_sources_data: if not isinstance(source, dict): continue - source_url = self._proto_relative_url(source.get('file')) - if not source_url: - continue - if base_url: - source_url = compat_urlparse.urljoin(base_url, source_url) - if source_url in urls: + source_url = urljoin( + base_url, self._proto_relative_url(source.get('file'))) + if not source_url or source_url in urls: continue urls.append(source_url) source_type = source.get('type') or '' diff --git a/youtube_dl/extractor/crackle.py b/youtube_dl/extractor/crackle.py index f4a616455..49bf3a4f9 100644 --- a/youtube_dl/extractor/crackle.py +++ b/youtube_dl/extractor/crackle.py @@ -1,19 +1,20 @@ # coding: utf-8 from __future__ import unicode_literals, division +import hashlib +import hmac import re +import time from .common import InfoExtractor -from ..compat import ( - compat_str, - compat_HTTPError, -) +from ..compat import compat_HTTPError from ..utils import ( determine_ext, float_or_none, int_or_none, parse_age_limit, parse_duration, + url_or_none, ExtractorError ) @@ -50,6 +51,21 @@ class CrackleIE(InfoExtractor): 'only_matching': True, }] + _MEDIA_FILE_SLOTS = { + '360p.mp4': { + 'width': 640, + 'height': 360, + }, + '480p.mp4': { + 'width': 768, + 'height': 432, + }, + '480p_1mbps.mp4': { + 'width': 852, + 'height': 480, + }, + } + def _real_extract(self, url): video_id = self._match_id(url) @@ -61,13 +77,16 @@ class CrackleIE(InfoExtractor): for country in countries: try: + # Authorization generation algorithm is reverse engineered from: + # https://www.sonycrackle.com/static/js/main.ea93451f.chunk.js + media_detail_url = 'https://web-api-us.crackle.com/Service.svc/details/media/%s/%s?disableProtocols=true' % (video_id, country) + timestamp = time.strftime('%Y%m%d%H%M', time.gmtime()) + h = hmac.new(b'IGSLUQCBDFHEOIFM', '|'.join([media_detail_url, timestamp]).encode(), hashlib.sha1).hexdigest().upper() media = self._download_json( - 'https://web-api-us.crackle.com/Service.svc/details/media/%s/%s' - % (video_id, country), video_id, - 'Downloading media JSON as %s' % country, - 'Unable to download media JSON', query={ - 'disableProtocols': 'true', - 'format': 'json' + media_detail_url, video_id, 'Downloading media JSON as %s' % country, + 'Unable to download media JSON', headers={ + 'Accept': 'application/json', + 'Authorization': '|'.join([h, timestamp, '117', '1']), }) except ExtractorError as e: # 401 means geo restriction, trying next country @@ -86,8 +105,8 @@ class CrackleIE(InfoExtractor): for e in media['MediaURLs']: if e.get('UseDRM') is True: continue - format_url = e.get('Path') - if not format_url or not isinstance(format_url, compat_str): + format_url = url_or_none(e.get('Path')) + if not format_url: continue ext = determine_ext(format_url) if ext == 'm3u8': @@ -97,6 +116,20 @@ class CrackleIE(InfoExtractor): elif ext == 'mpd': formats.extend(self._extract_mpd_formats( format_url, video_id, mpd_id='dash', fatal=False)) + elif format_url.endswith('.ism/Manifest'): + formats.extend(self._extract_ism_formats( + format_url, video_id, ism_id='mss', fatal=False)) + else: + mfs_path = e.get('Type') + mfs_info = self._MEDIA_FILE_SLOTS.get(mfs_path) + if not mfs_info: + continue + formats.append({ + 'url': format_url, + 'format_id': 'http-' + mfs_path.split('.')[0], + 'width': mfs_info['width'], + 'height': mfs_info['height'], + }) self._sort_formats(formats) description = media.get('Description') @@ -124,8 +157,8 @@ class CrackleIE(InfoExtractor): for cc_file in cc_files: if not isinstance(cc_file, dict): continue - cc_url = cc_file.get('Path') - if not cc_url or not isinstance(cc_url, compat_str): + cc_url = url_or_none(cc_file.get('Path')) + if not cc_url: continue lang = cc_file.get('Locale') or 'en' subtitles.setdefault(lang, []).append({'url': cc_url}) diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index 311da515d..5e2cbe41d 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -3,11 +3,13 @@ from __future__ import unicode_literals import re import json +import xml.etree.ElementTree as etree import zlib from hashlib import sha1 from math import pow, sqrt, floor from .common import InfoExtractor +from .vrv import VRVIE from ..compat import ( compat_b64decode, compat_etree_fromstring, @@ -18,6 +20,8 @@ from ..compat import ( from ..utils import ( ExtractorError, bytes_to_intlist, + extract_attributes, + float_or_none, intlist_to_bytes, int_or_none, lowercase_escape, @@ -26,7 +30,6 @@ from ..utils import ( unified_strdate, urlencode_postdata, xpath_text, - extract_attributes, ) from ..aes import ( aes_cbc_decrypt, @@ -43,7 +46,7 @@ class CrunchyrollBaseIE(InfoExtractor): data['req'] = 'RpcApi' + method data = compat_urllib_parse_urlencode(data).encode('utf-8') return self._download_xml( - 'http://www.crunchyroll.com/xml/', + 'https://www.crunchyroll.com/xml/', video_id, note, fatal=False, data=data, headers={ 'Content-Type': 'application/x-www-form-urlencoded', }) @@ -139,8 +142,9 @@ class CrunchyrollBaseIE(InfoExtractor): parsed_url._replace(query=compat_urllib_parse_urlencode(qs, True))) -class CrunchyrollIE(CrunchyrollBaseIE): - _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.(?:com|fr)/(?:media(?:-|/\?id=)|[^/]*/[^/?&]*?)(?P<video_id>[0-9]+))(?:[/?&]|$)' +class CrunchyrollIE(CrunchyrollBaseIE, VRVIE): + IE_NAME = 'crunchyroll' + _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.(?:com|fr)/(?:media(?:-|/\?id=)|(?:[^/]*/){1,2}[^/?&]*?)(?P<video_id>[0-9]+))(?:[/?&]|$)' _TESTS = [{ 'url': 'http://www.crunchyroll.com/wanna-be-the-strongest-in-the-world/episode-1-an-idol-wrestler-is-born-645513', 'info_dict': { @@ -148,7 +152,7 @@ class CrunchyrollIE(CrunchyrollBaseIE): 'ext': 'mp4', 'title': 'Wanna be the Strongest in the World Episode 1 – An Idol-Wrestler is Born!', 'description': 'md5:2d17137920c64f2f49981a7797d275ef', - 'thumbnail': 'http://img1.ak.crunchyroll.com/i/spire1-tmb/20c6b5e10f1a47b10516877d3c039cae1380951166_full.jpg', + 'thumbnail': r're:^https?://.*\.jpg$', 'uploader': 'Yomiuri Telecasting Corporation (YTV)', 'upload_date': '20131013', 'url': 're:(?!.*&)', @@ -221,7 +225,7 @@ class CrunchyrollIE(CrunchyrollBaseIE): 'info_dict': { 'id': '535080', 'ext': 'mp4', - 'title': '11eyes Episode 1 – Piros éjszaka - Red Night', + 'title': '11eyes Episode 1 – Red Night ~ Piros éjszaka', 'description': 'Kakeru and Yuka are thrown into an alternate nightmarish world they call "Red Night".', 'uploader': 'Marvelous AQL Inc.', 'upload_date': '20091021', @@ -262,6 +266,12 @@ class CrunchyrollIE(CrunchyrollBaseIE): # Just test metadata extraction 'skip_download': True, }, + }, { + 'url': 'http://www.crunchyroll.com/media-723735', + 'only_matching': True, + }, { + 'url': 'https://www.crunchyroll.com/en-gb/mob-psycho-100/episode-2-urban-legends-encountering-rumors-780921', + 'only_matching': True, }] _FORMAT_IDS = { @@ -392,7 +402,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text 'Downloading subtitles for ' + sub_name, data={ 'subtitle_script_id': sub_id, }) - if sub_doc is None: + if not isinstance(sub_doc, etree.Element): continue sid = sub_doc.get('id') iv = xpath_text(sub_doc, 'iv', 'subtitle iv') @@ -434,13 +444,22 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text if 'To view this, please log in to verify you are 18 or older.' in webpage: self.raise_login_required() + media = self._parse_json(self._search_regex( + r'vilos\.config\.media\s*=\s*({.+?});', + webpage, 'vilos media', default='{}'), video_id) + media_metadata = media.get('metadata') or {} + + language = self._search_regex( + r'(?:vilos\.config\.player\.language|LOCALE)\s*=\s*(["\'])(?P<lang>(?:(?!\1).)+)\1', + webpage, 'language', default=None, group='lang') + video_title = self._html_search_regex( r'(?s)<h1[^>]*>((?:(?!<h1).)*?<span[^>]+itemprop=["\']title["\'][^>]*>(?:(?!<h1).)+?)</h1>', webpage, 'video_title') video_title = re.sub(r' {2,}', ' ', video_title) - video_description = self._parse_json(self._html_search_regex( + video_description = (self._parse_json(self._html_search_regex( r'<script[^>]*>\s*.+?\[media_id=%s\].+?({.+?"description"\s*:.+?})\);' % video_id, - webpage, 'description', default='{}'), video_id).get('description') + webpage, 'description', default='{}'), video_id) or media_metadata).get('description') if video_description: video_description = lowercase_escape(video_description.replace(r'\r\n', '\n')) video_upload_date = self._html_search_regex( @@ -453,92 +472,113 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text [r'<a[^>]+href="/publisher/[^"]+"[^>]*>([^<]+)</a>', r'<div>\s*Publisher:\s*<span>\s*(.+?)\s*</span>\s*</div>'], webpage, 'video_uploader', fatal=False) - available_fmts = [] - for a, fmt in re.findall(r'(<a[^>]+token=["\']showmedia\.([0-9]{3,4})p["\'][^>]+>)', webpage): - attrs = extract_attributes(a) - href = attrs.get('href') - if href and '/freetrial' in href: - continue - available_fmts.append(fmt) - if not available_fmts: - for p in (r'token=["\']showmedia\.([0-9]{3,4})p"', r'showmedia\.([0-9]{3,4})p'): - available_fmts = re.findall(p, webpage) - if available_fmts: - break - video_encode_ids = [] formats = [] - for fmt in available_fmts: - stream_quality, stream_format = self._FORMAT_IDS[fmt] - video_format = fmt + 'p' - stream_infos = [] - streamdata = self._call_rpc_api( - 'VideoPlayer_GetStandardConfig', video_id, - 'Downloading media info for %s' % video_format, data={ - 'media_id': video_id, - 'video_format': stream_format, - 'video_quality': stream_quality, - 'current_page': url, - }) - if streamdata is not None: - stream_info = streamdata.find('./{default}preload/stream_info') - if stream_info is not None: + for stream in media.get('streams', []): + audio_lang = stream.get('audio_lang') + hardsub_lang = stream.get('hardsub_lang') + vrv_formats = self._extract_vrv_formats( + stream.get('url'), video_id, stream.get('format'), + audio_lang, hardsub_lang) + for f in vrv_formats: + if not hardsub_lang: + f['preference'] = 1 + language_preference = 0 + if audio_lang == language: + language_preference += 1 + if hardsub_lang == language: + language_preference += 1 + if language_preference: + f['language_preference'] = language_preference + formats.extend(vrv_formats) + if not formats: + available_fmts = [] + for a, fmt in re.findall(r'(<a[^>]+token=["\']showmedia\.([0-9]{3,4})p["\'][^>]+>)', webpage): + attrs = extract_attributes(a) + href = attrs.get('href') + if href and '/freetrial' in href: + continue + available_fmts.append(fmt) + if not available_fmts: + for p in (r'token=["\']showmedia\.([0-9]{3,4})p"', r'showmedia\.([0-9]{3,4})p'): + available_fmts = re.findall(p, webpage) + if available_fmts: + break + if not available_fmts: + available_fmts = self._FORMAT_IDS.keys() + video_encode_ids = [] + + for fmt in available_fmts: + stream_quality, stream_format = self._FORMAT_IDS[fmt] + video_format = fmt + 'p' + stream_infos = [] + streamdata = self._call_rpc_api( + 'VideoPlayer_GetStandardConfig', video_id, + 'Downloading media info for %s' % video_format, data={ + 'media_id': video_id, + 'video_format': stream_format, + 'video_quality': stream_quality, + 'current_page': url, + }) + if isinstance(streamdata, etree.Element): + stream_info = streamdata.find('./{default}preload/stream_info') + if stream_info is not None: + stream_infos.append(stream_info) + stream_info = self._call_rpc_api( + 'VideoEncode_GetStreamInfo', video_id, + 'Downloading stream info for %s' % video_format, data={ + 'media_id': video_id, + 'video_format': stream_format, + 'video_encode_quality': stream_quality, + }) + if isinstance(stream_info, etree.Element): stream_infos.append(stream_info) - stream_info = self._call_rpc_api( - 'VideoEncode_GetStreamInfo', video_id, - 'Downloading stream info for %s' % video_format, data={ - 'media_id': video_id, - 'video_format': stream_format, - 'video_encode_quality': stream_quality, - }) - if stream_info is not None: - stream_infos.append(stream_info) - for stream_info in stream_infos: - video_encode_id = xpath_text(stream_info, './video_encode_id') - if video_encode_id in video_encode_ids: - continue - video_encode_ids.append(video_encode_id) + for stream_info in stream_infos: + video_encode_id = xpath_text(stream_info, './video_encode_id') + if video_encode_id in video_encode_ids: + continue + video_encode_ids.append(video_encode_id) - video_file = xpath_text(stream_info, './file') - if not video_file: - continue - if video_file.startswith('http'): - formats.extend(self._extract_m3u8_formats( - video_file, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False)) - continue - - video_url = xpath_text(stream_info, './host') - if not video_url: - continue - metadata = stream_info.find('./metadata') - format_info = { - 'format': video_format, - 'height': int_or_none(xpath_text(metadata, './height')), - 'width': int_or_none(xpath_text(metadata, './width')), - } - - if '.fplive.net/' in video_url: - video_url = re.sub(r'^rtmpe?://', 'http://', video_url.strip()) - parsed_video_url = compat_urlparse.urlparse(video_url) - direct_video_url = compat_urlparse.urlunparse(parsed_video_url._replace( - netloc='v.lvlt.crcdn.net', - path='%s/%s' % (remove_end(parsed_video_url.path, '/'), video_file.split(':')[-1]))) - if self._is_valid_url(direct_video_url, video_id, video_format): - format_info.update({ - 'format_id': 'http-' + video_format, - 'url': direct_video_url, - }) - formats.append(format_info) + video_file = xpath_text(stream_info, './file') + if not video_file: + continue + if video_file.startswith('http'): + formats.extend(self._extract_m3u8_formats( + video_file, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) continue - format_info.update({ - 'format_id': 'rtmp-' + video_format, - 'url': video_url, - 'play_path': video_file, - 'ext': 'flv', - }) - formats.append(format_info) - self._sort_formats(formats, ('height', 'width', 'tbr', 'fps')) + video_url = xpath_text(stream_info, './host') + if not video_url: + continue + metadata = stream_info.find('./metadata') + format_info = { + 'format': video_format, + 'height': int_or_none(xpath_text(metadata, './height')), + 'width': int_or_none(xpath_text(metadata, './width')), + } + + if '.fplive.net/' in video_url: + video_url = re.sub(r'^rtmpe?://', 'http://', video_url.strip()) + parsed_video_url = compat_urlparse.urlparse(video_url) + direct_video_url = compat_urlparse.urlunparse(parsed_video_url._replace( + netloc='v.lvlt.crcdn.net', + path='%s/%s' % (remove_end(parsed_video_url.path, '/'), video_file.split(':')[-1]))) + if self._is_valid_url(direct_video_url, video_id, video_format): + format_info.update({ + 'format_id': 'http-' + video_format, + 'url': direct_video_url, + }) + formats.append(format_info) + continue + + format_info.update({ + 'format_id': 'rtmp-' + video_format, + 'url': video_url, + 'play_path': video_file, + 'ext': 'flv', + }) + formats.append(format_info) + self._sort_formats(formats, ('preference', 'language_preference', 'height', 'width', 'tbr', 'fps')) metadata = self._call_rpc_api( 'VideoPlayer_GetMediaMetadata', video_id, @@ -546,16 +586,38 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text 'media_id': video_id, }) - subtitles = self.extract_subtitles(video_id, webpage) + subtitles = {} + for subtitle in media.get('subtitles', []): + subtitle_url = subtitle.get('url') + if not subtitle_url: + continue + subtitles.setdefault(subtitle.get('language', 'enUS'), []).append({ + 'url': subtitle_url, + 'ext': subtitle.get('format', 'ass'), + }) + if not subtitles: + subtitles = self.extract_subtitles(video_id, webpage) # webpage provide more accurate data than series_title from XML series = self._html_search_regex( r'(?s)<h\d[^>]+\bid=["\']showmedia_about_episode_num[^>]+>(.+?)</h\d', webpage, 'series', fatal=False) - season = xpath_text(metadata, 'series_title') - episode = xpath_text(metadata, 'episode_title') - episode_number = int_or_none(xpath_text(metadata, 'episode_number')) + season = episode = episode_number = duration = thumbnail = None + + if isinstance(metadata, etree.Element): + season = xpath_text(metadata, 'series_title') + episode = xpath_text(metadata, 'episode_title') + episode_number = int_or_none(xpath_text(metadata, 'episode_number')) + duration = float_or_none(media_metadata.get('duration'), 1000) + thumbnail = xpath_text(metadata, 'episode_image_url') + + if not episode: + episode = media_metadata.get('title') + if not episode_number: + episode_number = int_or_none(media_metadata.get('episode_number')) + if not thumbnail: + thumbnail = media_metadata.get('thumbnail', {}).get('url') season_number = int_or_none(self._search_regex( r'(?s)<h\d[^>]+id=["\']showmedia_about_episode_num[^>]+>.+?</h\d>\s*<h4>\s*Season (\d+)', @@ -565,7 +627,8 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text 'id': video_id, 'title': video_title, 'description': video_description, - 'thumbnail': xpath_text(metadata, 'episode_image_url'), + 'duration': duration, + 'thumbnail': thumbnail, 'uploader': video_uploader, 'upload_date': video_upload_date, 'series': series, @@ -580,7 +643,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text class CrunchyrollShowPlaylistIE(CrunchyrollBaseIE): IE_NAME = 'crunchyroll:playlist' - _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.com/(?!(?:news|anime-news|library|forum|launchcalendar|lineup|store|comics|freetrial|login))(?P<id>[\w\-]+))/?(?:\?|$)' + _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.com/(?!(?:news|anime-news|library|forum|launchcalendar|lineup|store|comics|freetrial|login|media-\d+))(?P<id>[\w\-]+))/?(?:\?|$)' _TESTS = [{ 'url': 'http://www.crunchyroll.com/a-bridge-to-the-starry-skies-hoshizora-e-kakaru-hashi', diff --git a/youtube_dl/extractor/curiositystream.py b/youtube_dl/extractor/curiositystream.py index 35b1e7a34..e4a7fca6c 100644 --- a/youtube_dl/extractor/curiositystream.py +++ b/youtube_dl/extractor/curiositystream.py @@ -46,8 +46,24 @@ class CuriosityStreamBaseIE(InfoExtractor): self._handle_errors(result) self._auth_token = result['message']['auth_token'] - def _extract_media_info(self, media): - video_id = compat_str(media['id']) + +class CuriosityStreamIE(CuriosityStreamBaseIE): + IE_NAME = 'curiositystream' + _VALID_URL = r'https?://(?:app\.)?curiositystream\.com/video/(?P<id>\d+)' + _TEST = { + 'url': 'https://app.curiositystream.com/video/2', + 'md5': '262bb2f257ff301115f1973540de8983', + 'info_dict': { + 'id': '2', + 'ext': 'mp4', + 'title': 'How Did You Develop The Internet?', + 'description': 'Vint Cerf, Google\'s Chief Internet Evangelist, describes how he and Bob Kahn created the internet.', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + media = self._call_api('media/' + video_id, video_id) title = media['title'] formats = [] @@ -114,38 +130,21 @@ class CuriosityStreamBaseIE(InfoExtractor): } -class CuriosityStreamIE(CuriosityStreamBaseIE): - IE_NAME = 'curiositystream' - _VALID_URL = r'https?://app\.curiositystream\.com/video/(?P<id>\d+)' - _TEST = { - 'url': 'https://app.curiositystream.com/video/2', - 'md5': '262bb2f257ff301115f1973540de8983', - 'info_dict': { - 'id': '2', - 'ext': 'mp4', - 'title': 'How Did You Develop The Internet?', - 'description': 'Vint Cerf, Google\'s Chief Internet Evangelist, describes how he and Bob Kahn created the internet.', - } - } - - def _real_extract(self, url): - video_id = self._match_id(url) - media = self._call_api('media/' + video_id, video_id) - return self._extract_media_info(media) - - class CuriosityStreamCollectionIE(CuriosityStreamBaseIE): IE_NAME = 'curiositystream:collection' - _VALID_URL = r'https?://app\.curiositystream\.com/collection/(?P<id>\d+)' - _TEST = { + _VALID_URL = r'https?://(?:app\.)?curiositystream\.com/(?:collection|series)/(?P<id>\d+)' + _TESTS = [{ 'url': 'https://app.curiositystream.com/collection/2', 'info_dict': { 'id': '2', 'title': 'Curious Minds: The Internet', 'description': 'How is the internet shaping our lives in the 21st Century?', }, - 'playlist_mincount': 12, - } + 'playlist_mincount': 17, + }, { + 'url': 'https://curiositystream.com/series/2', + 'only_matching': True, + }] def _real_extract(self, url): collection_id = self._match_id(url) @@ -153,7 +152,10 @@ class CuriosityStreamCollectionIE(CuriosityStreamBaseIE): 'collections/' + collection_id, collection_id) entries = [] for media in collection.get('media', []): - entries.append(self._extract_media_info(media)) + media_id = compat_str(media.get('id')) + entries.append(self.url_result( + 'https://curiositystream.com/video/' + media_id, + CuriosityStreamIE.ie_key(), media_id)) return self.playlist_result( entries, collection_id, collection.get('title'), collection.get('description')) diff --git a/youtube_dl/extractor/cwtv.py b/youtube_dl/extractor/cwtv.py index f4cf0f1c5..f9bd535f6 100644 --- a/youtube_dl/extractor/cwtv.py +++ b/youtube_dl/extractor/cwtv.py @@ -3,8 +3,12 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( + ExtractorError, int_or_none, + parse_age_limit, parse_iso8601, + smuggle_url, + str_or_none, ) @@ -40,10 +44,15 @@ class CWTVIE(InfoExtractor): 'duration': 1263, 'series': 'Whose Line Is It Anyway?', 'season_number': 11, - 'season': '11', 'episode_number': 20, 'upload_date': '20151006', 'timestamp': 1444107300, + 'age_limit': 14, + 'uploader': 'CWTV', + }, + 'params': { + # m3u8 download + 'skip_download': True, }, }, { 'url': 'http://cwtv.com/thecw/chroniclesofcisco/?play=8adebe35-f447-465f-ab52-e863506ff6d6', @@ -58,60 +67,31 @@ class CWTVIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - video_data = None - formats = [] - for partner in (154, 213): - vdata = self._download_json( - 'http://metaframe.digitalsmiths.tv/v2/CWtv/assets/%s/partner/%d?format=json' % (video_id, partner), video_id, fatal=False) - if not vdata: - continue - video_data = vdata - for quality, quality_data in vdata.get('videos', {}).items(): - quality_url = quality_data.get('uri') - if not quality_url: - continue - if quality == 'variantplaylist': - formats.extend(self._extract_m3u8_formats( - quality_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) - else: - tbr = int_or_none(quality_data.get('bitrate')) - format_id = 'http' + ('-%d' % tbr if tbr else '') - if self._is_valid_url(quality_url, video_id, format_id): - formats.append({ - 'format_id': format_id, - 'url': quality_url, - 'tbr': tbr, - }) - video_metadata = video_data['assetFields'] - ism_url = video_metadata.get('smoothStreamingUrl') - if ism_url: - formats.extend(self._extract_ism_formats( - ism_url, video_id, ism_id='mss', fatal=False)) - self._sort_formats(formats) + data = self._download_json( + 'http://images.cwtv.com/feed/mobileapp/video-meta/apiversion_8/guid_' + video_id, + video_id) + if data.get('result') != 'ok': + raise ExtractorError(data['msg'], expected=True) + video_data = data['video'] + title = video_data['title'] + mpx_url = video_data.get('mpx_url') or 'http://link.theplatform.com/s/cwtv/media/guid/2703454149/%s?formats=M3U' % video_id - thumbnails = [{ - 'url': image['uri'], - 'width': image.get('width'), - 'height': image.get('height'), - } for image_id, image in video_data['images'].items() if image.get('uri')] if video_data.get('images') else None - - subtitles = { - 'en': [{ - 'url': video_metadata['UnicornCcUrl'], - }], - } if video_metadata.get('UnicornCcUrl') else None + season = str_or_none(video_data.get('season')) + episode = str_or_none(video_data.get('episode')) + if episode and season: + episode = episode.lstrip(season) return { + '_type': 'url_transparent', 'id': video_id, - 'title': video_metadata['title'], - 'description': video_metadata.get('description'), - 'duration': int_or_none(video_metadata.get('duration')), - 'series': video_metadata.get('seriesName'), - 'season_number': int_or_none(video_metadata.get('seasonNumber')), - 'season': video_metadata.get('seasonName'), - 'episode_number': int_or_none(video_metadata.get('episodeNumber')), - 'timestamp': parse_iso8601(video_data.get('startTime')), - 'thumbnails': thumbnails, - 'formats': formats, - 'subtitles': subtitles, + 'title': title, + 'url': smuggle_url(mpx_url, {'force_smil_url': True}), + 'description': video_data.get('description_long'), + 'duration': int_or_none(video_data.get('duration_secs')), + 'series': video_data.get('series_name'), + 'season_number': int_or_none(season), + 'episode_number': int_or_none(episode), + 'timestamp': parse_iso8601(video_data.get('start_time')), + 'age_limit': parse_age_limit(video_data.get('rating')), + 'ie_key': 'ThePlatform', } diff --git a/youtube_dl/extractor/dailymail.py b/youtube_dl/extractor/dailymail.py index af3978035..4f75a2a30 100644 --- a/youtube_dl/extractor/dailymail.py +++ b/youtube_dl/extractor/dailymail.py @@ -49,6 +49,9 @@ class DailyMailIE(InfoExtractor): 'http://www.dailymail.co.uk/api/player/%s/video-sources.json' % video_id) video_sources = self._download_json(sources_url, video_id) + body = video_sources.get('body') + if body: + video_sources = body formats = [] for rendition in video_sources['renditions']: diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index de27fffd4..1816c559e 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -1,22 +1,32 @@ # coding: utf-8 from __future__ import unicode_literals -import re -import json +import base64 +import functools +import hashlib import itertools +import json +import random +import re +import string from .common import InfoExtractor - +from ..compat import compat_struct_pack from ..utils import ( determine_ext, error_to_compat_str, ExtractorError, int_or_none, + mimetype2ext, + OnDemandPagedList, parse_iso8601, sanitized_Request, str_to_int, + try_get, unescapeHTML, - mimetype2ext, + update_url_query, + url_or_none, + urlencode_postdata, ) @@ -64,7 +74,6 @@ class DailymotionIE(DailymotionBaseInfoExtractor): 'uploader': 'Deadline', 'uploader_id': 'x1xm8ri', 'age_limit': 0, - 'view_count': int, }, }, { 'url': 'https://www.dailymotion.com/video/x2iuewm_steam-machine-models-pricing-listed-on-steam-store-ign-news_videogames', @@ -141,7 +150,8 @@ class DailymotionIE(DailymotionBaseInfoExtractor): age_limit = self._rta_search(webpage) - description = self._og_search_description(webpage) or self._html_search_meta( + description = self._og_search_description( + webpage, default=None) or self._html_search_meta( 'description', webpage, 'description') view_count_str = self._search_regex( @@ -164,8 +174,34 @@ class DailymotionIE(DailymotionBaseInfoExtractor): r'__PLAYER_CONFIG__\s*=\s*({.+?});'], webpage, 'player v5', default=None) if player_v5: - player = self._parse_json(player_v5, video_id) - metadata = player['metadata'] + player = self._parse_json(player_v5, video_id, fatal=False) or {} + metadata = try_get(player, lambda x: x['metadata'], dict) + if not metadata: + metadata_url = url_or_none(try_get( + player, lambda x: x['context']['metadata_template_url1'])) + if metadata_url: + metadata_url = metadata_url.replace(':videoId', video_id) + else: + metadata_url = update_url_query( + 'https://www.dailymotion.com/player/metadata/video/%s' + % video_id, { + 'embedder': url, + 'integration': 'inline', + 'GK_PV5_NEON': '1', + }) + metadata = self._download_json( + metadata_url, video_id, 'Downloading metadata JSON') + + if try_get(metadata, lambda x: x['error']['type']) == 'password_protected': + password = self._downloader.params.get('videopassword') + if password: + r = int(metadata['id'][1:], 36) + us64e = lambda x: base64.urlsafe_b64encode(x).decode().strip('=') + t = ''.join(random.choice(string.ascii_letters) for i in range(10)) + n = us64e(compat_struct_pack('I', r)) + i = us64e(hashlib.md5(('%s%d%s' % (password, r, t)).encode()).digest()) + metadata = self._download_json( + 'http://www.dailymotion.com/player/metadata/video/p' + i + t + n, video_id) self._check_error(metadata) @@ -302,8 +338,8 @@ class DailymotionIE(DailymotionBaseInfoExtractor): def _check_error(self, info): error = info.get('error') - if info.get('error') is not None: - title = error['title'] + if error: + title = error.get('title') or error['message'] # See https://developer.dailymotion.com/api#access-error if error.get('code') == 'DM007': self.raise_geo_restricted(msg=title) @@ -328,17 +364,93 @@ class DailymotionIE(DailymotionBaseInfoExtractor): class DailymotionPlaylistIE(DailymotionBaseInfoExtractor): IE_NAME = 'dailymotion:playlist' - _VALID_URL = r'(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/playlist/(?P<id>[^/?#&]+)' - _MORE_PAGES_INDICATOR = r'(?s)<div class="pages[^"]*">.*?<a\s+class="[^"]*?icon-arrow_right[^"]*?"' - _PAGE_TEMPLATE = 'https://www.dailymotion.com/playlist/%s/%s' + _VALID_URL = r'(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/playlist/(?P<id>x[0-9a-z]+)' _TESTS = [{ 'url': 'http://www.dailymotion.com/playlist/xv4bw_nqtv_sport/1#video=xl8v3q', 'info_dict': { 'title': 'SPORT', - 'id': 'xv4bw_nqtv_sport', + 'id': 'xv4bw', }, 'playlist_mincount': 20, }] + _PAGE_SIZE = 100 + + def _fetch_page(self, playlist_id, authorizaion, page): + page += 1 + videos = self._download_json( + 'https://graphql.api.dailymotion.com', + playlist_id, 'Downloading page %d' % page, + data=json.dumps({ + 'query': '''{ + collection(xid: "%s") { + videos(first: %d, page: %d) { + pageInfo { + hasNextPage + nextPage + } + edges { + node { + xid + url + } + } + } + } +}''' % (playlist_id, self._PAGE_SIZE, page) + }).encode(), headers={ + 'Authorization': authorizaion, + 'Origin': 'https://www.dailymotion.com', + })['data']['collection']['videos'] + for edge in videos['edges']: + node = edge['node'] + yield self.url_result( + node['url'], DailymotionIE.ie_key(), node['xid']) + + def _real_extract(self, url): + playlist_id = self._match_id(url) + webpage = self._download_webpage(url, playlist_id) + api = self._parse_json(self._search_regex( + r'__PLAYER_CONFIG__\s*=\s*({.+?});', + webpage, 'player config'), playlist_id)['context']['api'] + auth = self._download_json( + api.get('auth_url', 'https://graphql.api.dailymotion.com/oauth/token'), + playlist_id, data=urlencode_postdata({ + 'client_id': api.get('client_id', 'f1a362d288c1b98099c7'), + 'client_secret': api.get('client_secret', 'eea605b96e01c796ff369935357eca920c5da4c5'), + 'grant_type': 'client_credentials', + })) + authorizaion = '%s %s' % (auth.get('token_type', 'Bearer'), auth['access_token']) + entries = OnDemandPagedList(functools.partial( + self._fetch_page, playlist_id, authorizaion), self._PAGE_SIZE) + return self.playlist_result( + entries, playlist_id, + self._og_search_title(webpage)) + + +class DailymotionUserIE(DailymotionBaseInfoExtractor): + IE_NAME = 'dailymotion:user' + _VALID_URL = r'https?://(?:www\.)?dailymotion\.[a-z]{2,3}/(?!(?:embed|swf|#|video|playlist)/)(?:(?:old/)?user/)?(?P<user>[^/]+)' + _MORE_PAGES_INDICATOR = r'(?s)<div class="pages[^"]*">.*?<a\s+class="[^"]*?icon-arrow_right[^"]*?"' + _PAGE_TEMPLATE = 'http://www.dailymotion.com/user/%s/%s' + _TESTS = [{ + 'url': 'https://www.dailymotion.com/user/nqtv', + 'info_dict': { + 'id': 'nqtv', + 'title': 'Rémi Gaillard', + }, + 'playlist_mincount': 100, + }, { + 'url': 'http://www.dailymotion.com/user/UnderProject', + 'info_dict': { + 'id': 'UnderProject', + 'title': 'UnderProject', + }, + 'playlist_mincount': 1800, + 'expected_warnings': [ + 'Stopped at duplicated page', + ], + 'skip': 'Takes too long time', + }] def _extract_entries(self, id): video_ids = set() @@ -364,43 +476,6 @@ class DailymotionPlaylistIE(DailymotionBaseInfoExtractor): if re.search(self._MORE_PAGES_INDICATOR, webpage) is None: break - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - playlist_id = mobj.group('id') - webpage = self._download_webpage(url, playlist_id) - - return { - '_type': 'playlist', - 'id': playlist_id, - 'title': self._og_search_title(webpage), - 'entries': self._extract_entries(playlist_id), - } - - -class DailymotionUserIE(DailymotionPlaylistIE): - IE_NAME = 'dailymotion:user' - _VALID_URL = r'https?://(?:www\.)?dailymotion\.[a-z]{2,3}/(?!(?:embed|swf|#|video|playlist)/)(?:(?:old/)?user/)?(?P<user>[^/]+)' - _PAGE_TEMPLATE = 'http://www.dailymotion.com/user/%s/%s' - _TESTS = [{ - 'url': 'https://www.dailymotion.com/user/nqtv', - 'info_dict': { - 'id': 'nqtv', - 'title': 'Rémi Gaillard', - }, - 'playlist_mincount': 100, - }, { - 'url': 'http://www.dailymotion.com/user/UnderProject', - 'info_dict': { - 'id': 'UnderProject', - 'title': 'UnderProject', - }, - 'playlist_mincount': 1800, - 'expected_warnings': [ - 'Stopped at duplicated page', - ], - 'skip': 'Takes too long time', - }] - def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) user = mobj.group('user') diff --git a/youtube_dl/extractor/dctp.py b/youtube_dl/extractor/dctp.py index 3a6d0560e..769a219df 100644 --- a/youtube_dl/extractor/dctp.py +++ b/youtube_dl/extractor/dctp.py @@ -5,13 +5,16 @@ from .common import InfoExtractor from ..compat import compat_str from ..utils import ( float_or_none, - unified_strdate, + int_or_none, + unified_timestamp, + url_or_none, ) class DctpTvIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?dctp\.tv/(?:#/)?filme/(?P<id>[^/?#&]+)' - _TEST = { + _TESTS = [{ + # 4x3 'url': 'http://www.dctp.tv/filme/videoinstallation-fuer-eine-kaufhausfassade/', 'info_dict': { 'id': '95eaa4f33dad413aa17b4ee613cccc6c', @@ -19,37 +22,55 @@ class DctpTvIE(InfoExtractor): 'ext': 'flv', 'title': 'Videoinstallation für eine Kaufhausfassade', 'description': 'Kurzfilm', - 'upload_date': '20110407', 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 71.24, + 'timestamp': 1302172322, + 'upload_date': '20110407', }, 'params': { # rtmp download 'skip_download': True, }, - } + }, { + # 16x9 + 'url': 'http://www.dctp.tv/filme/sind-youtuber-die-besseren-lehrer/', + 'only_matching': True, + }] + + _BASE_URL = 'http://dctp-ivms2-restapi.s3.amazonaws.com' def _real_extract(self, url): display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) + version = self._download_json( + '%s/version.json' % self._BASE_URL, display_id, + 'Downloading version JSON') - video_id = self._html_search_meta( - 'DC.identifier', webpage, 'video id', - default=None) or self._search_regex( - r'id=["\']uuid[^>]+>([^<]+)<', webpage, 'video id') + restapi_base = '%s/%s/restapi' % ( + self._BASE_URL, version['version_name']) - title = self._og_search_title(webpage) + info = self._download_json( + '%s/slugs/%s.json' % (restapi_base, display_id), display_id, + 'Downloading video info JSON') + + media = self._download_json( + '%s/media/%s.json' % (restapi_base, compat_str(info['object_id'])), + display_id, 'Downloading media JSON') + + uuid = media['uuid'] + title = media['title'] + ratio = '16x9' if media.get('is_wide') else '4x3' + play_path = 'mp4:%s_dctp_0500_%s.m4v' % (uuid, ratio) servers = self._download_json( 'http://www.dctp.tv/streaming_servers/', display_id, - note='Downloading server list', fatal=False) + note='Downloading server list JSON', fatal=False) if servers: endpoint = next( server['endpoint'] for server in servers - if isinstance(server.get('endpoint'), compat_str) and + if url_or_none(server.get('endpoint')) and 'cloudfront' in server['endpoint']) else: endpoint = 'rtmpe://s2pqqn4u96e4j8.cloudfront.net/cfx/st/' @@ -60,27 +81,35 @@ class DctpTvIE(InfoExtractor): formats = [{ 'url': endpoint, 'app': app, - 'play_path': 'mp4:%s_dctp_0500_4x3.m4v' % video_id, + 'play_path': play_path, 'page_url': url, - 'player_url': 'http://svm-prod-dctptv-static.s3.amazonaws.com/dctptv-relaunch2012-109.swf', + 'player_url': 'http://svm-prod-dctptv-static.s3.amazonaws.com/dctptv-relaunch2012-110.swf', 'ext': 'flv', }] - description = self._html_search_meta('DC.description', webpage) - upload_date = unified_strdate( - self._html_search_meta('DC.date.created', webpage)) - thumbnail = self._og_search_thumbnail(webpage) - duration = float_or_none(self._search_regex( - r'id=["\']duration_in_ms[^+]>(\d+)', webpage, 'duration', - default=None), scale=1000) + thumbnails = [] + images = media.get('images') + if isinstance(images, list): + for image in images: + if not isinstance(image, dict): + continue + image_url = url_or_none(image.get('url')) + if not image_url: + continue + thumbnails.append({ + 'url': image_url, + 'width': int_or_none(image.get('width')), + 'height': int_or_none(image.get('height')), + }) return { - 'id': video_id, - 'title': title, - 'formats': formats, + 'id': uuid, 'display_id': display_id, - 'description': description, - 'upload_date': upload_date, - 'thumbnail': thumbnail, - 'duration': duration, + 'title': title, + 'alt_title': media.get('subtitle'), + 'description': media.get('description') or media.get('teaser'), + 'timestamp': unified_timestamp(media.get('created')), + 'duration': float_or_none(media.get('duration_in_ms'), scale=1000), + 'thumbnails': thumbnails, + 'formats': formats, } diff --git a/youtube_dl/extractor/discovery.py b/youtube_dl/extractor/discovery.py index 3589bd428..b70c307a7 100644 --- a/youtube_dl/extractor/discovery.py +++ b/youtube_dl/extractor/discovery.py @@ -17,16 +17,29 @@ from ..compat import compat_HTTPError class DiscoveryIE(DiscoveryGoBaseIE): - _VALID_URL = r'''(?x)https?://(?:www\.)?(?P<site> - discovery| - investigationdiscovery| - discoverylife| - animalplanet| - ahctv| - destinationamerica| - sciencechannel| - tlc| - velocity + _VALID_URL = r'''(?x)https?:// + (?P<site> + (?:www\.)? + (?: + discovery| + investigationdiscovery| + discoverylife| + animalplanet| + ahctv| + destinationamerica| + sciencechannel| + tlc| + velocity + )| + watch\. + (?: + hgtv| + foodnetwork| + travelchannel| + diynetwork| + cookingchanneltv| + motortrend + ) )\.com(?P<path>/tv-shows/[^/]+/(?:video|full-episode)s/(?P<id>[^./?#]+))''' _TESTS = [{ 'url': 'https://www.discovery.com/tv-shows/cash-cab/videos/dave-foley', @@ -71,7 +84,7 @@ class DiscoveryIE(DiscoveryGoBaseIE): if not access_token: access_token = self._download_json( - 'https://www.%s.com/anonymous' % site, display_id, query={ + 'https://%s.com/anonymous' % site, display_id, query={ 'authRel': 'authorization', 'client_id': try_get( react_data, lambda x: x['application']['apiClientId'], @@ -81,11 +94,12 @@ class DiscoveryIE(DiscoveryGoBaseIE): })['access_token'] try: + headers = self.geo_verification_headers() + headers['Authorization'] = 'Bearer ' + access_token + stream = self._download_json( 'https://api.discovery.com/v1/streaming/video/' + video_id, - display_id, headers={ - 'Authorization': 'Bearer ' + access_token, - }) + display_id, headers=headers) except ExtractorError as e: if isinstance(e.cause, compat_HTTPError) and e.cause.code in (401, 403): e_description = self._parse_json( diff --git a/youtube_dl/extractor/discoverygo.py b/youtube_dl/extractor/discoverygo.py index 3368c4c07..9e7b14a7d 100644 --- a/youtube_dl/extractor/discoverygo.py +++ b/youtube_dl/extractor/discoverygo.py @@ -3,7 +3,6 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( determine_ext, extract_attributes, @@ -12,6 +11,7 @@ from ..utils import ( parse_age_limit, remove_end, unescapeHTML, + url_or_none, ) @@ -69,9 +69,8 @@ class DiscoveryGoBaseIE(InfoExtractor): captions = stream.get('captions') if isinstance(captions, list): for caption in captions: - subtitle_url = caption.get('fileUrl') - if (not subtitle_url or not isinstance(subtitle_url, compat_str) or - not subtitle_url.startswith('http')): + subtitle_url = url_or_none(caption.get('fileUrl')) + if not subtitle_url or not subtitle_url.startswith('http'): continue lang = caption.get('fileLang', 'en') ext = determine_ext(subtitle_url) diff --git a/youtube_dl/extractor/discoverynetworks.py b/youtube_dl/extractor/discoverynetworks.py index b6653784c..fba1ef221 100644 --- a/youtube_dl/extractor/discoverynetworks.py +++ b/youtube_dl/extractor/discoverynetworks.py @@ -3,8 +3,8 @@ from __future__ import unicode_literals import re -from .common import InfoExtractor from .brightcove import BrightcoveLegacyIE +from .dplay import DPlayIE from ..compat import ( compat_parse_qs, compat_urlparse, @@ -12,8 +12,13 @@ from ..compat import ( from ..utils import smuggle_url -class DiscoveryNetworksDeIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?:discovery|tlc|animalplanet|dmax)\.de/(?:.*#(?P<id>\d+)|(?:[^/]+/)*videos/(?P<title>[^/?#]+))' +class DiscoveryNetworksDeIE(DPlayIE): + _VALID_URL = r'''(?x)https?://(?:www\.)?(?P<site>discovery|tlc|animalplanet|dmax)\.de/ + (?: + .*\#(?P<id>\d+)| + (?:[^/]+/)*videos/(?P<display_id>[^/?#]+)| + programme/(?P<programme>[^/]+)/video/(?P<alternate_id>[^/]+) + )''' _TESTS = [{ 'url': 'http://www.tlc.de/sendungen/breaking-amish/videos/#3235167922001', @@ -40,6 +45,14 @@ class DiscoveryNetworksDeIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) + alternate_id = mobj.group('alternate_id') + if alternate_id: + self._initialize_geo_bypass({ + 'countries': ['DE'], + }) + return self._get_disco_api_info( + url, '%s/%s' % (mobj.group('programme'), alternate_id), + 'sonic-eu1-prod.disco-api.com', mobj.group('site') + 'de') brightcove_id = mobj.group('id') if not brightcove_id: title = mobj.group('title') diff --git a/youtube_dl/extractor/dplay.py b/youtube_dl/extractor/dplay.py index 8e0374320..ebf59512c 100644 --- a/youtube_dl/extractor/dplay.py +++ b/youtube_dl/extractor/dplay.py @@ -21,6 +21,7 @@ from ..utils import ( unified_strdate, unified_timestamp, update_url_query, + urljoin, USER_AGENTS, ) @@ -97,6 +98,75 @@ class DPlayIE(InfoExtractor): 'only_matching': True, }] + def _get_disco_api_info(self, url, display_id, disco_host, realm): + disco_base = 'https://' + disco_host + token = self._download_json( + '%s/token' % disco_base, display_id, 'Downloading token', + query={ + 'realm': realm, + })['data']['attributes']['token'] + headers = { + 'Referer': url, + 'Authorization': 'Bearer ' + token, + } + video = self._download_json( + '%s/content/videos/%s' % (disco_base, display_id), display_id, + headers=headers, query={ + 'include': 'show' + }) + video_id = video['data']['id'] + info = video['data']['attributes'] + title = info['name'] + formats = [] + for format_id, format_dict in self._download_json( + '%s/playback/videoPlaybackInfo/%s' % (disco_base, video_id), + display_id, headers=headers)['data']['attributes']['streaming'].items(): + if not isinstance(format_dict, dict): + continue + format_url = format_dict.get('url') + if not format_url: + continue + ext = determine_ext(format_url) + if format_id == 'dash' or ext == 'mpd': + formats.extend(self._extract_mpd_formats( + format_url, display_id, mpd_id='dash', fatal=False)) + elif format_id == 'hls' or ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, display_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls', + fatal=False)) + else: + formats.append({ + 'url': format_url, + 'format_id': format_id, + }) + self._sort_formats(formats) + + series = None + try: + included = video.get('included') + if isinstance(included, list): + show = next(e for e in included if e.get('type') == 'show') + series = try_get( + show, lambda x: x['attributes']['name'], compat_str) + except StopIteration: + pass + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': info.get('description'), + 'duration': float_or_none( + info.get('videoDuration'), scale=1000), + 'timestamp': unified_timestamp(info.get('publishStart')), + 'series': series, + 'season_number': int_or_none(info.get('seasonNumber')), + 'episode_number': int_or_none(info.get('episodeNumber')), + 'age_limit': int_or_none(info.get('minimum_age')), + 'formats': formats, + } + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) display_id = mobj.group('id') @@ -113,72 +183,8 @@ class DPlayIE(InfoExtractor): if not video_id: host = mobj.group('host') - disco_base = 'https://disco-api.%s' % host - self._download_json( - '%s/token' % disco_base, display_id, 'Downloading token', - query={ - 'realm': host.replace('.', ''), - }) - video = self._download_json( - '%s/content/videos/%s' % (disco_base, display_id), display_id, - headers={ - 'Referer': url, - 'x-disco-client': 'WEB:UNKNOWN:dplay-client:0.0.1', - }, query={ - 'include': 'show' - }) - video_id = video['data']['id'] - info = video['data']['attributes'] - title = info['name'] - formats = [] - for format_id, format_dict in self._download_json( - '%s/playback/videoPlaybackInfo/%s' % (disco_base, video_id), - display_id)['data']['attributes']['streaming'].items(): - if not isinstance(format_dict, dict): - continue - format_url = format_dict.get('url') - if not format_url: - continue - ext = determine_ext(format_url) - if format_id == 'dash' or ext == 'mpd': - formats.extend(self._extract_mpd_formats( - format_url, display_id, mpd_id='dash', fatal=False)) - elif format_id == 'hls' or ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - format_url, display_id, 'mp4', - entry_protocol='m3u8_native', m3u8_id='hls', - fatal=False)) - else: - formats.append({ - 'url': format_url, - 'format_id': format_id, - }) - self._sort_formats(formats) - - series = None - try: - included = video.get('included') - if isinstance(included, list): - show = next(e for e in included if e.get('type') == 'show') - series = try_get( - show, lambda x: x['attributes']['name'], compat_str) - except StopIteration: - pass - - return { - 'id': video_id, - 'display_id': display_id, - 'title': title, - 'description': info.get('description'), - 'duration': float_or_none( - info.get('videoDuration'), scale=1000), - 'timestamp': unified_timestamp(info.get('publishStart')), - 'series': series, - 'season_number': int_or_none(info.get('seasonNumber')), - 'episode_number': int_or_none(info.get('episodeNumber')), - 'age_limit': int_or_none(info.get('minimum_age')), - 'formats': formats, - } + return self._get_disco_api_info( + url, display_id, 'disco-api.' + host, host.replace('.', '')) info = self._download_json( 'http://%s/api/v2/ajax/videos?video_id=%s' % (domain, video_id), @@ -305,9 +311,11 @@ class DPlayItIE(InfoExtractor): if not info: info_url = self._search_regex( - r'url\s*[:=]\s*["\']((?:https?:)?//[^/]+/playback/videoPlaybackInfo/\d+)', - webpage, 'info url') + (r'playback_json_url\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', + r'url\s*[:=]\s*["\'](?P<url>(?:https?:)?//[^/]+/playback/videoPlaybackInfo/\d+)'), + webpage, 'info url', group='url') + info_url = urljoin(url, info_url) video_id = info_url.rpartition('/')[-1] try: @@ -317,6 +325,8 @@ class DPlayItIE(InfoExtractor): 'dplayit_token').value, 'Referer': url, }) + if isinstance(info, compat_str): + info = self._parse_json(info, display_id) except ExtractorError as e: if isinstance(e.cause, compat_HTTPError) and e.cause.code in (400, 403): info = self._parse_json(e.cause.read().decode('utf-8'), display_id) @@ -332,6 +342,7 @@ class DPlayItIE(InfoExtractor): formats = self._extract_m3u8_formats( hls_url, display_id, ext='mp4', entry_protocol='m3u8_native', m3u8_id='hls') + self._sort_formats(formats) series = self._html_search_regex( r'(?s)<h1[^>]+class=["\'].*?\bshow_title\b.*?["\'][^>]*>(.+?)</h1>', diff --git a/youtube_dl/extractor/dramafever.py b/youtube_dl/extractor/dramafever.py index ab32ba4ff..db1de699f 100644 --- a/youtube_dl/extractor/dramafever.py +++ b/youtube_dl/extractor/dramafever.py @@ -7,7 +7,6 @@ import json from .common import InfoExtractor from ..compat import ( compat_HTTPError, - compat_str, compat_urlparse, ) from ..utils import ( @@ -17,6 +16,7 @@ from ..utils import ( parse_age_limit, parse_duration, unified_timestamp, + url_or_none, ) @@ -139,8 +139,8 @@ class DramaFeverIE(DramaFeverBaseIE): for sub in subs: if not isinstance(sub, dict): continue - sub_url = sub.get('url') - if not sub_url or not isinstance(sub_url, compat_str): + sub_url = url_or_none(sub.get('url')) + if not sub_url: continue subtitles.setdefault( sub.get('code') or sub.get('language') or 'en', []).append({ @@ -163,8 +163,8 @@ class DramaFeverIE(DramaFeverBaseIE): for format_id, format_dict in download_assets.items(): if not isinstance(format_dict, dict): continue - format_url = format_dict.get('url') - if not format_url or not isinstance(format_url, compat_str): + format_url = url_or_none(format_dict.get('url')) + if not format_url: continue formats.append({ 'url': format_url, diff --git a/youtube_dl/extractor/drtuber.py b/youtube_dl/extractor/drtuber.py index 5c41c8022..2baea585b 100644 --- a/youtube_dl/extractor/drtuber.py +++ b/youtube_dl/extractor/drtuber.py @@ -4,7 +4,9 @@ import re from .common import InfoExtractor from ..utils import ( + int_or_none, NO_DEFAULT, + parse_duration, str_to_int, ) @@ -65,6 +67,9 @@ class DrTuberIE(InfoExtractor): }) self._sort_formats(formats) + duration = int_or_none(video_data.get('duration')) or parse_duration( + video_data.get('duration_format')) + title = self._html_search_regex( (r'<h1[^>]+class=["\']title[^>]+>([^<]+)', r'<title>([^<]+)\s*@\s+DrTuber', @@ -103,4 +108,5 @@ class DrTuberIE(InfoExtractor): 'comment_count': comment_count, 'categories': categories, 'age_limit': self._rta_search(webpage), + 'duration': duration, } diff --git a/youtube_dl/extractor/drtv.py b/youtube_dl/extractor/drtv.py index f757745ba..0c7e350f0 100644 --- a/youtube_dl/extractor/drtv.py +++ b/youtube_dl/extractor/drtv.py @@ -1,15 +1,25 @@ # coding: utf-8 from __future__ import unicode_literals +import binascii +import hashlib +import re + + from .common import InfoExtractor +from ..aes import aes_cbc_decrypt +from ..compat import compat_urllib_parse_unquote from ..utils import ( + bytes_to_intlist, ExtractorError, int_or_none, + intlist_to_bytes, float_or_none, mimetype2ext, - parse_iso8601, - remove_end, + str_or_none, + unified_timestamp, update_url_query, + url_or_none, ) @@ -20,23 +30,31 @@ class DRTVIE(InfoExtractor): IE_NAME = 'drtv' _TESTS = [{ 'url': 'https://www.dr.dk/tv/se/boern/ultra/klassen-ultra/klassen-darlig-taber-10', - 'md5': '7ae17b4e18eb5d29212f424a7511c184', + 'md5': '25e659cccc9a2ed956110a299fdf5983', 'info_dict': { 'id': 'klassen-darlig-taber-10', 'ext': 'mp4', 'title': 'Klassen - Dårlig taber (10)', 'description': 'md5:815fe1b7fa656ed80580f31e8b3c79aa', - 'timestamp': 1471991907, - 'upload_date': '20160823', + 'timestamp': 1539085800, + 'upload_date': '20181009', 'duration': 606.84, + 'series': 'Klassen', + 'season': 'Klassen I', + 'season_number': 1, + 'season_id': 'urn:dr:mu:bundle:57d7e8216187a4031cfd6f6b', + 'episode': 'Episode 10', + 'episode_number': 10, + 'release_year': 2016, }, + 'expected_warnings': ['Unable to download f4m manifest'], }, { # embed 'url': 'https://www.dr.dk/nyheder/indland/live-christianias-rydning-af-pusher-street-er-i-gang', 'info_dict': { - 'id': 'christiania-pusher-street-ryddes-drdkrjpo', + 'id': 'urn:dr:mu:programcard:57c926176187a50a9c6e83c6', 'ext': 'mp4', - 'title': 'LIVE Christianias rydning af Pusher Street er i gang', + 'title': 'christiania pusher street ryddes drdkrjpo', 'description': 'md5:2a71898b15057e9b97334f61d04e6eb5', 'timestamp': 1472800279, 'upload_date': '20160902', @@ -45,17 +63,18 @@ class DRTVIE(InfoExtractor): 'params': { 'skip_download': True, }, + 'expected_warnings': ['Unable to download f4m manifest'], }, { # with SignLanguage formats 'url': 'https://www.dr.dk/tv/se/historien-om-danmark/-/historien-om-danmark-stenalder', 'info_dict': { 'id': 'historien-om-danmark-stenalder', 'ext': 'mp4', - 'title': 'Historien om Danmark: Stenalder (1)', + 'title': 'Historien om Danmark: Stenalder', 'description': 'md5:8c66dcbc1669bbc6f873879880f37f2a', - 'timestamp': 1490401996, - 'upload_date': '20170325', - 'duration': 3502.04, + 'timestamp': 1546628400, + 'upload_date': '20190104', + 'duration': 3502.56, 'formats': 'mincount:20', }, 'params': { @@ -74,20 +93,26 @@ class DRTVIE(InfoExtractor): video_id = self._search_regex( (r'data-(?:material-identifier|episode-slug)="([^"]+)"', - r'data-resource="[^>"]+mu/programcard/expanded/([^"]+)"'), - webpage, 'video id') + r'data-resource="[^>"]+mu/programcard/expanded/([^"]+)"'), + webpage, 'video id', default=None) - programcard = self._download_json( - 'http://www.dr.dk/mu/programcard/expanded/%s' % video_id, - video_id, 'Downloading video JSON') - data = programcard['Data'][0] + if not video_id: + video_id = compat_urllib_parse_unquote(self._search_regex( + r'(urn(?:%3A|:)dr(?:%3A|:)mu(?:%3A|:)programcard(?:%3A|:)[\da-f]+)', + webpage, 'urn')) - title = remove_end(self._og_search_title( - webpage, default=None), ' | TV | DR') or data['Title'] + data = self._download_json( + 'https://www.dr.dk/mu-online/api/1.4/programcard/%s' % video_id, + video_id, 'Downloading video JSON', query={'expanded': 'true'}) + + title = str_or_none(data.get('Title')) or re.sub( + r'\s*\|\s*(?:TV\s*\|\s*DR|DRTV)$', '', + self._og_search_title(webpage)) description = self._og_search_description( webpage, default=None) or data.get('Description') - timestamp = parse_iso8601(data.get('CreatedTime')) + timestamp = unified_timestamp( + data.get('PrimaryBroadcastStartTime') or data.get('SortDateTime')) thumbnail = None duration = None @@ -97,24 +122,62 @@ class DRTVIE(InfoExtractor): formats = [] subtitles = {} - for asset in data['Assets']: + assets = [] + primary_asset = data.get('PrimaryAsset') + if isinstance(primary_asset, dict): + assets.append(primary_asset) + secondary_assets = data.get('SecondaryAssets') + if isinstance(secondary_assets, list): + for secondary_asset in secondary_assets: + if isinstance(secondary_asset, dict): + assets.append(secondary_asset) + + def hex_to_bytes(hex): + return binascii.a2b_hex(hex.encode('ascii')) + + def decrypt_uri(e): + n = int(e[2:10], 16) + a = e[10 + n:] + data = bytes_to_intlist(hex_to_bytes(e[10:10 + n])) + key = bytes_to_intlist(hashlib.sha256( + ('%s:sRBzYNXBzkKgnjj8pGtkACch' % a).encode('utf-8')).digest()) + iv = bytes_to_intlist(hex_to_bytes(a)) + decrypted = aes_cbc_decrypt(data, key, iv) + return intlist_to_bytes( + decrypted[:-decrypted[-1]]).decode('utf-8').split('?')[0] + + for asset in assets: kind = asset.get('Kind') if kind == 'Image': - thumbnail = asset.get('Uri') + thumbnail = url_or_none(asset.get('Uri')) elif kind in ('VideoResource', 'AudioResource'): duration = float_or_none(asset.get('DurationInMilliseconds'), 1000) restricted_to_denmark = asset.get('RestrictedToDenmark') asset_target = asset.get('Target') for link in asset.get('Links', []): uri = link.get('Uri') + if not uri: + encrypted_uri = link.get('EncryptedUri') + if not encrypted_uri: + continue + try: + uri = decrypt_uri(encrypted_uri) + except Exception: + self.report_warning( + 'Unable to decrypt EncryptedUri', video_id) + continue + uri = url_or_none(uri) if not uri: continue target = link.get('Target') format_id = target or '' - preference = None - if asset_target in ('SpokenSubtitles', 'SignLanguage'): + if asset_target in ('SpokenSubtitles', 'SignLanguage', 'VisuallyInterpreted'): preference = -1 format_id += '-%s' % asset_target + elif asset_target == 'Default': + preference = 1 + else: + preference = None if target == 'HDS': f4m_formats = self._extract_f4m_formats( uri + '?hdcore=3.3.0&plugin=aasp-3.3.0.99.43', @@ -140,19 +203,22 @@ class DRTVIE(InfoExtractor): 'vcodec': 'none' if kind == 'AudioResource' else None, 'preference': preference, }) - subtitles_list = asset.get('SubtitlesList') - if isinstance(subtitles_list, list): - LANGS = { - 'Danish': 'da', - } - for subs in subtitles_list: - if not subs.get('Uri'): - continue - lang = subs.get('Language') or 'da' - subtitles.setdefault(LANGS.get(lang, lang), []).append({ - 'url': subs['Uri'], - 'ext': mimetype2ext(subs.get('MimeType')) or 'vtt' - }) + subtitles_list = asset.get('SubtitlesList') or asset.get('Subtitleslist') + if isinstance(subtitles_list, list): + LANGS = { + 'Danish': 'da', + } + for subs in subtitles_list: + if not isinstance(subs, dict): + continue + sub_uri = url_or_none(subs.get('Uri')) + if not sub_uri: + continue + lang = subs.get('Language') or 'da' + subtitles.setdefault(LANGS.get(lang, lang), []).append({ + 'url': sub_uri, + 'ext': mimetype2ext(subs.get('MimeType')) or 'vtt' + }) if not formats and restricted_to_denmark: self.raise_geo_restricted( @@ -170,6 +236,13 @@ class DRTVIE(InfoExtractor): 'duration': duration, 'formats': formats, 'subtitles': subtitles, + 'series': str_or_none(data.get('SeriesTitle')), + 'season': str_or_none(data.get('SeasonTitle')), + 'season_number': int_or_none(data.get('SeasonNumber')), + 'season_id': str_or_none(data.get('SeasonUrn')), + 'episode': str_or_none(data.get('EpisodeTitle')), + 'episode_number': int_or_none(data.get('EpisodeNumber')), + 'release_year': int_or_none(data.get('ProductionYear')), } diff --git a/youtube_dl/extractor/dtube.py b/youtube_dl/extractor/dtube.py index 4ca97f860..114d2dbe3 100644 --- a/youtube_dl/extractor/dtube.py +++ b/youtube_dl/extractor/dtube.py @@ -15,16 +15,16 @@ from ..utils import ( class DTubeIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?d\.tube/(?:#!/)?v/(?P<uploader_id>[0-9a-z.-]+)/(?P<id>[0-9a-z]{8})' _TEST = { - 'url': 'https://d.tube/#!/v/benswann/zqd630em', - 'md5': 'a03eaa186618ffa7a3145945543a251e', + 'url': 'https://d.tube/#!/v/broncnutz/x380jtr1', + 'md5': '9f29088fa08d699a7565ee983f56a06e', 'info_dict': { - 'id': 'zqd630em', + 'id': 'x380jtr1', 'ext': 'mp4', - 'title': 'Reality Check: FDA\'s Disinformation Campaign on Kratom', - 'description': 'md5:700d164e066b87f9eac057949e4227c2', - 'uploader_id': 'benswann', - 'upload_date': '20180222', - 'timestamp': 1519328958, + 'title': 'Lefty 3-Rings is Back Baby!! NCAA Picks', + 'description': 'md5:60be222088183be3a42f196f34235776', + 'uploader_id': 'broncnutz', + 'upload_date': '20190107', + 'timestamp': 1546854054, }, 'params': { 'format': '480p', @@ -48,7 +48,7 @@ class DTubeIE(InfoExtractor): def canonical_url(h): if not h: return None - return 'https://ipfs.io/ipfs/' + h + return 'https://video.dtube.top/ipfs/' + h formats = [] for q in ('240', '480', '720', '1080', ''): @@ -59,7 +59,7 @@ class DTubeIE(InfoExtractor): try: self.to_screen('%s: Checking %s video format URL' % (video_id, format_id)) self._downloader._opener.open(video_url, timeout=5).close() - except timeout as e: + except timeout: self.to_screen( '%s: %s URL is invalid, skipping' % (video_id, format_id)) continue diff --git a/youtube_dl/extractor/eagleplatform.py b/youtube_dl/extractor/eagleplatform.py index 42789278e..36fef07b7 100644 --- a/youtube_dl/extractor/eagleplatform.py +++ b/youtube_dl/extractor/eagleplatform.py @@ -4,14 +4,12 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import ( - compat_HTTPError, - compat_str, -) +from ..compat import compat_HTTPError from ..utils import ( ExtractorError, int_or_none, unsmuggle_url, + url_or_none, ) @@ -177,7 +175,7 @@ class EaglePlatformIE(InfoExtractor): video_id, 'Downloading mp4 JSON', fatal=False) if mp4_data: for format_id, format_url in mp4_data.get('data', {}).items(): - if not isinstance(format_url, compat_str): + if not url_or_none(format_url): continue height = int_or_none(format_id) if height is not None and m3u8_formats_dict.get(height): diff --git a/youtube_dl/extractor/egghead.py b/youtube_dl/extractor/egghead.py index edabaafe6..df11dc206 100644 --- a/youtube_dl/extractor/egghead.py +++ b/youtube_dl/extractor/egghead.py @@ -8,6 +8,7 @@ from ..utils import ( int_or_none, try_get, unified_timestamp, + url_or_none, ) @@ -34,8 +35,8 @@ class EggheadCourseIE(InfoExtractor): entries = [] for lesson in lessons: - lesson_url = lesson.get('http_url') - if not lesson_url or not isinstance(lesson_url, compat_str): + lesson_url = url_or_none(lesson.get('http_url')) + if not lesson_url: continue lesson_id = lesson.get('id') if lesson_id: @@ -95,7 +96,8 @@ class EggheadLessonIE(InfoExtractor): formats = [] for _, format_url in lesson['media_urls'].items(): - if not format_url or not isinstance(format_url, compat_str): + format_url = url_or_none(format_url) + if not format_url: continue ext = determine_ext(format_url) if ext == 'm3u8': diff --git a/youtube_dl/extractor/eporner.py b/youtube_dl/extractor/eporner.py index 81f2e2ee1..c050bf9df 100644 --- a/youtube_dl/extractor/eporner.py +++ b/youtube_dl/extractor/eporner.py @@ -9,8 +9,10 @@ from ..utils import ( encode_base_n, ExtractorError, int_or_none, + merge_dicts, parse_duration, str_to_int, + url_or_none, ) @@ -24,10 +26,16 @@ class EpornerIE(InfoExtractor): 'display_id': 'Infamous-Tiffany-Teen-Strip-Tease-Video', 'ext': 'mp4', 'title': 'Infamous Tiffany Teen Strip Tease Video', + 'description': 'md5:764f39abf932daafa37485eb46efa152', + 'timestamp': 1232520922, + 'upload_date': '20090121', 'duration': 1838, 'view_count': int, 'age_limit': 18, }, + 'params': { + 'proxy': '127.0.0.1:8118' + } }, { # New (May 2016) URL layout 'url': 'http://www.eporner.com/hd-porn/3YRUtzMcWn0/Star-Wars-XXX-Parody/', @@ -82,8 +90,8 @@ class EpornerIE(InfoExtractor): for format_id, format_dict in formats_dict.items(): if not isinstance(format_dict, dict): continue - src = format_dict.get('src') - if not isinstance(src, compat_str) or not src.startswith('http'): + src = url_or_none(format_dict.get('src')) + if not src or not src.startswith('http'): continue if kind == 'hls': formats.extend(self._extract_m3u8_formats( @@ -103,12 +111,15 @@ class EpornerIE(InfoExtractor): }) self._sort_formats(formats) - duration = parse_duration(self._html_search_meta('duration', webpage)) + json_ld = self._search_json_ld(webpage, display_id, default={}) + + duration = parse_duration(self._html_search_meta( + 'duration', webpage, default=None)) view_count = str_to_int(self._search_regex( r'id="cinemaviews">\s*([0-9,]+)\s*<small>views', webpage, 'view count', fatal=False)) - return { + return merge_dicts(json_ld, { 'id': video_id, 'display_id': display_id, 'title': title, @@ -116,4 +127,4 @@ class EpornerIE(InfoExtractor): 'view_count': view_count, 'formats': formats, 'age_limit': 18, - } + }) diff --git a/youtube_dl/extractor/expressen.py b/youtube_dl/extractor/expressen.py new file mode 100644 index 000000000..934571472 --- /dev/null +++ b/youtube_dl/extractor/expressen.py @@ -0,0 +1,98 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + int_or_none, + unescapeHTML, + unified_timestamp, +) + + +class ExpressenIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?:// + (?:www\.)?expressen\.se/ + (?:(?:tvspelare/video|videoplayer/embed)/)? + tv/(?:[^/]+/)* + (?P<id>[^/?#&]+) + ''' + _TESTS = [{ + 'url': 'https://www.expressen.se/tv/ledare/ledarsnack/ledarsnack-om-arbetslosheten-bland-kvinnor-i-speciellt-utsatta-omraden/', + 'md5': '2fbbe3ca14392a6b1b36941858d33a45', + 'info_dict': { + 'id': '8690962', + 'ext': 'mp4', + 'title': 'Ledarsnack: Om arbetslösheten bland kvinnor i speciellt utsatta områden', + 'description': 'md5:f38c81ff69f3de4d269bbda012fcbbba', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 788, + 'timestamp': 1526639109, + 'upload_date': '20180518', + }, + }, { + 'url': 'https://www.expressen.se/tv/kultur/kulturdebatt-med-expressens-karin-olsson/', + 'only_matching': True, + }, { + 'url': 'https://www.expressen.se/tvspelare/video/tv/ditv/ekonomistudion/experterna-har-ar-fragorna-som-avgor-valet/?embed=true&external=true&autoplay=true&startVolume=0&partnerId=di', + 'only_matching': True, + }, { + 'url': 'https://www.expressen.se/videoplayer/embed/tv/ditv/ekonomistudion/experterna-har-ar-fragorna-som-avgor-valet/?embed=true&external=true&autoplay=true&startVolume=0&partnerId=di', + 'only_matching': True, + }] + + @staticmethod + def _extract_urls(webpage): + return [ + mobj.group('url') for mobj in re.finditer( + r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//(?:www\.)?expressen\.se/(?:tvspelare/video|videoplayer/embed)/tv/.+?)\1', + webpage)] + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + def extract_data(name): + return self._parse_json( + self._search_regex( + r'data-%s=(["\'])(?P<value>(?:(?!\1).)+)\1' % name, + webpage, 'info', group='value'), + display_id, transform_source=unescapeHTML) + + info = extract_data('video-tracking-info') + video_id = info['videoId'] + + data = extract_data('article-data') + stream = data['stream'] + + if determine_ext(stream) == 'm3u8': + formats = self._extract_m3u8_formats( + stream, display_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls') + else: + formats = [{ + 'url': stream, + }] + self._sort_formats(formats) + + title = info.get('titleRaw') or data['title'] + description = info.get('descriptionRaw') + thumbnail = info.get('socialMediaImage') or data.get('image') + duration = int_or_none(info.get('videoTotalSecondsDuration') or + data.get('totalSecondsDuration')) + timestamp = unified_timestamp(info.get('publishDate')) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'timestamp': timestamp, + 'formats': formats, + } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index ad81c26e8..e91cf708a 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -55,6 +55,7 @@ from .appletrailers import ( from .archiveorg import ArchiveOrgIE from .arkena import ArkenaIE from .ard import ( + ARDBetaMediathekIE, ARDIE, ARDMediathekIE, ) @@ -88,11 +89,7 @@ from .awaan import ( AWAANLiveIE, AWAANSeasonIE, ) -from .azmedien import ( - AZMedienIE, - AZMedienPlaylistIE, - AZMedienShowPlaylistIE, -) +from .azmedien import AZMedienIE from .baidu import BaiduVideoIE from .bambuser import BambuserIE, BambuserChannelIE from .bandcamp import BandcampIE, BandcampAlbumIE, BandcampWeeklyIE @@ -119,6 +116,10 @@ from .bilibili import ( BiliBiliBangumiIE, ) from .biobiochiletv import BioBioChileTVIE +from .bitchute import ( + BitChuteIE, + BitChuteChannelIE, +) from .biqle import BIQLEIE from .bleacherreport import ( BleacherReportIE, @@ -194,6 +195,10 @@ from .chirbit import ( ChirbitProfileIE, ) from .cinchcast import CinchcastIE +from .ciscolive import ( + CiscoLiveSessionIE, + CiscoLiveSearchIE, +) from .cjsw import CJSWIE from .cliphunter import CliphunterIE from .clippit import ClippitIE @@ -205,7 +210,10 @@ from .cloudy import CloudyIE from .clubic import ClubicIE from .clyp import ClypIE from .cmt import CMTIE -from .cnbc import CNBCIE +from .cnbc import ( + CNBCIE, + CNBCVideoIE, +) from .cnn import ( CNNIE, CNNBlogsIE, @@ -336,6 +344,7 @@ from .esri import EsriVideoIE from .europa import EuropaIE from .everyonesmixtape import EveryonesMixtapeIE from .expotv import ExpoTVIE +from .expressen import ExpressenIE from .extremetube import ExtremeTubeIE from .eyedotv import EyedoTVIE from .facebook import ( @@ -373,7 +382,6 @@ from .foxgay import FoxgayIE from .foxnews import ( FoxNewsIE, FoxNewsArticleIE, - FoxNewsInsiderIE, ) from .foxsports import FoxSportsIE from .franceculture import FranceCultureIE @@ -391,6 +399,11 @@ from .francetv import ( from .freesound import FreesoundIE from .freespeech import FreespeechIE from .freshlive import FreshLiveIE +from .frontendmasters import ( + FrontendMastersIE, + FrontendMastersLessonIE, + FrontendMastersCourseIE +) from .funimation import FunimationIE from .funk import ( FunkMixIE, @@ -399,6 +412,7 @@ from .funk import ( from .funnyordie import FunnyOrDieIE from .fusion import FusionIE from .fxnetworks import FXNetworksIE +from .gaia import GaiaIE from .gameinformer import GameInformerIE from .gameone import ( GameOneIE, @@ -439,6 +453,7 @@ from .hellporno import HellPornoIE from .helsinki import HelsinkiIE from .hentaistigma import HentaiStigmaIE from .hgtv import HGTVComShowIE +from .hketv import HKETVIE from .hidive import HiDiveIE from .historicfilms import HistoricFilmsIE from .hitbox import HitboxIE, HitboxLiveIE @@ -457,6 +472,10 @@ from .hrti import ( ) from .huajiao import HuajiaoIE from .huffpost import HuffPostIE +from .hungama import ( + HungamaIE, + HungamaSongIE, +) from .hypem import HypemIE from .iconosquare import IconosquareIE from .ign import ( @@ -471,12 +490,17 @@ from .imdb import ( from .imgur import ( ImgurIE, ImgurAlbumIE, + ImgurGalleryIE, ) from .ina import InaIE from .inc import IncIE from .indavideo import IndavideoEmbedIE from .infoq import InfoQIE -from .instagram import InstagramIE, InstagramUserIE +from .instagram import ( + InstagramIE, + InstagramUserIE, + InstagramTagIE, +) from .internazionale import InternazionaleIE from .internetvideoarchive import InternetVideoArchiveIE from .iprima import IPrimaIE @@ -512,6 +536,7 @@ from .keezmovies import KeezMoviesIE from .ketnet import KetnetIE from .khanacademy import KhanAcademyIE from .kickstarter import KickStarterIE +from .kinopoisk import KinoPoiskIE from .keek import KeekIE from .konserthusetplay import KonserthusetPlayIE from .kontrtube import KontrTubeIE @@ -530,6 +555,7 @@ from .la7 import LA7IE from .laola1tv import ( Laola1TvEmbedIE, Laola1TvIE, + EHFTVIE, ITTFIE, ) from .lci import LCIIE @@ -539,6 +565,11 @@ from .lcp import ( ) from .learnr import LearnrIE from .lecture2go import Lecture2GoIE +from .lecturio import ( + LecturioIE, + LecturioCourseIE, + LecturioDeCourseIE, +) from .leeco import ( LeIE, LePlaylistIE, @@ -559,6 +590,11 @@ from .limelight import ( LimelightChannelListIE, ) from .line import LineTVIE +from .linkedin import ( + LinkedInLearningIE, + LinkedInLearningCourseIE, +) +from .linuxacademy import LinuxAcademyIE from .litv import LiTVIE from .liveleak import ( LiveLeakIE, @@ -585,11 +621,16 @@ from .mailru import ( MailRuMusicSearchIE, ) from .makertv import MakerTVIE +from .malltv import MallTVIE from .mangomolo import ( MangomoloVideoIE, MangomoloLiveIE, ) from .manyvids import ManyVidsIE +from .markiza import ( + MarkizaIE, + MarkizaPageIE, +) from .massengeschmacktv import MassengeschmackTVIE from .matchtv import MatchTVIE from .mdr import MDRIE @@ -656,8 +697,7 @@ from .myvi import ( from .myvidster import MyVidsterIE from .nationalgeographic import ( NationalGeographicVideoIE, - NationalGeographicIE, - NationalGeographicEpisodeGuideIE, + NationalGeographicTVIE, ) from .naver import NaverIE from .nba import NBAIE @@ -728,7 +768,10 @@ from .nonktube import NonkTubeIE from .noovo import NoovoIE from .normalboots import NormalbootsIE from .nosvideo import NosVideoIE -from .nova import NovaIE +from .nova import ( + NovaEmbedIE, + NovaIE, +) from .novamov import ( AuroraVidIE, CloudTimeIE, @@ -760,7 +803,9 @@ from .nrk import ( NRKSkoleIE, NRKTVIE, NRKTVDirekteIE, + NRKTVEpisodeIE, NRKTVEpisodesIE, + NRKTVSeasonIE, NRKTVSeriesIE, ) from .ntvde import NTVDeIE @@ -795,6 +840,7 @@ from .orf import ( ORFOE1IE, ORFIPTVIE, ) +from .outsidetv import OutsideTVIE from .packtpub import ( PacktPubIE, PacktPubCourseIE, @@ -823,6 +869,7 @@ from .piksel import PikselIE from .pinkbike import PinkbikeIE from .pladform import PladformIE from .playfm import PlayFMIE +from .playplustv import PlayPlusTVIE from .plays import PlaysTVIE from .playtvak import PlaytvakIE from .playvid import PlayvidIE @@ -850,6 +897,10 @@ from .pornhub import ( from .pornotube import PornotubeIE from .pornovoisines import PornoVoisinesIE from .pornoxo import PornoXOIE +from .puhutv import ( + PuhuTVIE, + PuhuTVSerieIE, +) from .presstv import PressTVIE from .primesharetv import PrimeShareTVIE from .promptfile import PromptFileIE @@ -881,7 +932,10 @@ from .rai import ( RaiPlayPlaylistIE, RaiIE, ) -from .raywenderlich import RayWenderlichIE +from .raywenderlich import ( + RayWenderlichIE, + RayWenderlichCourseIE, +) from .rbmaradio import RBMARadioIE from .rds import RDSIE from .redbulltv import RedBullTVIE @@ -1007,7 +1061,10 @@ from .southpark import ( SouthParkEsIE, SouthParkNlIE ) -from .spankbang import SpankBangIE +from .spankbang import ( + SpankBangIE, + SpankBangPlaylistIE, +) from .spankwire import SpankwireIE from .spiegel import SpiegelIE, SpiegelArticleIE from .spiegeltv import SpiegeltvIE @@ -1017,7 +1074,7 @@ from .spike import ( ) from .stitcher import StitcherIE from .sport5 import Sport5IE -from .sportbox import SportBoxEmbedIE +from .sportbox import SportBoxIE from .sportdeutschland import SportDeutschlandIE from .springboardplatform import SpringboardPlatformIE from .sprout import SproutIE @@ -1037,6 +1094,7 @@ from .stretchinternet import StretchInternetIE from .sunporno import SunPornoIE from .svt import ( SVTIE, + SVTPageIE, SVTPlayIE, SVTSeriesIE, ) @@ -1051,6 +1109,10 @@ from .tass import TassIE from .tastytrade import TastyTradeIE from .tbs import TBSIE from .tdslifeway import TDSLifewayIE +from .teachable import ( + TeachableIE, + TeachableCourseIE, +) from .teachertube import ( TeacherTubeIE, TeacherTubeUserIE, @@ -1059,6 +1121,7 @@ from .teachingchannel import TeachingChannelIE from .teamcoco import TeamcocoIE from .techtalks import TechTalksIE from .ted import TEDIE +from .tele5 import Tele5IE from .tele13 import Tele13IE from .telebruxelles import TeleBruxellesIE from .telecinco import TelecincoIE @@ -1088,6 +1151,10 @@ from .thisamericanlife import ThisAmericanLifeIE from .thisav import ThisAVIE from .thisoldhouse import ThisOldHouseIE from .threeqsdn import ThreeQSDNIE +from .tiktok import ( + TikTokIE, + TikTokUserIE, +) from .tinypic import TinyPicIE from .tmz import ( TMZIE, @@ -1106,6 +1173,7 @@ from .toutv import TouTvIE from .toypics import ToypicsUserIE, ToypicsIE from .traileraddict import TrailerAddictIE from .trilulilu import TriluliluIE +from .trunews import TruNewsIE from .trutv import TruTVIE from .tube8 import Tube8IE from .tubitv import TubiTvIE @@ -1125,7 +1193,6 @@ from .tv2 import ( TV2ArticleIE, ) from .tv2hu import TV2HuIE -from .tv3 import TV3IE from .tv4 import TV4IE from .tv5mondeplus import TV5MondePlusIE from .tva import TVAIE @@ -1144,23 +1211,27 @@ from .tvnet import TVNetIE from .tvnoe import TVNoeIE from .tvnow import ( TVNowIE, - TVNowListIE, + TVNowNewIE, + TVNowSeasonIE, + TVNowAnnualIE, TVNowShowIE, ) from .tvp import ( TVPEmbedIE, TVPIE, - TVPSeriesIE, + TVPWebsiteIE, ) from .tvplay import ( TVPlayIE, ViafreeIE, + TVPlayHomeIE, ) from .tvplayer import TVPlayerIE from .tweakers import TweakersIE from .twentyfourvideo import TwentyFourVideoIE from .twentymin import TwentyMinutenIE from .twentythreevideo import TwentyThreeVideoIE +from .twitcasting import TwitCastingIE from .twitch import ( TwitchVideoIE, TwitchChapterIE, @@ -1194,10 +1265,6 @@ from .uplynk import ( UplynkIE, UplynkPreplayIE, ) -from .upskill import ( - UpskillIE, - UpskillCourseIE, -) from .urort import UrortIE from .urplay import URPlayIE from .usanetwork import USANetworkIE @@ -1266,6 +1333,7 @@ from .vimeo import ( VimeoReviewIE, VimeoUserIE, VimeoWatchLaterIE, + VHXEmbedIE, ) from .vimple import VimpleIE from .vine import ( @@ -1276,6 +1344,7 @@ from .viki import ( VikiIE, VikiChannelIE, ) +from .viqeo import ViqeoIE from .viu import ( ViuIE, ViuPlaylistIE, @@ -1300,7 +1369,6 @@ from .voxmedia import ( VoxMediaVolumeIE, VoxMediaIE, ) -from .vporn import VpornIE from .vrt import VRTIE from .vrak import VrakIE from .vrv import ( @@ -1314,6 +1382,7 @@ from .vuclip import VuClipIE from .vvvvid import VVVVIDIE from .vyborymos import VyboryMosIE from .vzaar import VzaarIE +from .wakanim import WakanimIE from .walla import WallaIE from .washingtonpost import ( WashingtonPostIE, @@ -1352,6 +1421,7 @@ from .wsj import ( WSJIE, WSJArticleIE, ) +from .wwe import WWEIE from .xbef import XBefIE from .xboxclips import XboxClipsIE from .xfileshare import XFileShareIE @@ -1401,6 +1471,7 @@ from .younow import ( YouNowMomentIE, ) from .youporn import YouPornIE +from .yourporn import YourPornIE from .yourupload import YourUploadIE from .youtube import ( YoutubeIE, @@ -1424,10 +1495,24 @@ from .youtube import ( from .zapiks import ZapiksIE from .zaq1 import Zaq1IE from .zattoo import ( + BBVTVIE, + EinsUndEinsTVIE, + EWETVIE, + GlattvisionTVIE, + MNetTVIE, + MyVisionTVIE, + NetPlusIE, + OsnatelTVIE, + QuantumTVIE, QuicklineIE, QuicklineLiveIE, + SaltTVIE, + SAKTVIE, + VTXTVIE, + WalyTVIE, ZattooIE, ZattooLiveIE, ) from .zdf import ZDFIE, ZDFChannelIE from .zingmp3 import ZingMp3IE +from .zype import ZypeIE diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 8a9ed96c2..74954049d 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -20,6 +20,7 @@ from ..utils import ( int_or_none, js_to_json, limit_length, + parse_count, sanitized_Request, try_get, urlencode_postdata, @@ -56,7 +57,7 @@ class FacebookIE(InfoExtractor): _CHROME_USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.97 Safari/537.36' _VIDEO_PAGE_TEMPLATE = 'https://www.facebook.com/video/video.php?v=%s' - _VIDEO_PAGE_TAHOE_TEMPLATE = 'https://www.facebook.com/video/tahoe/async/%s/?chain=true&isvideo=true' + _VIDEO_PAGE_TAHOE_TEMPLATE = 'https://www.facebook.com/video/tahoe/async/%s/?chain=true&isvideo=true&payloadtype=primary' _TESTS = [{ 'url': 'https://www.facebook.com/video.php?v=637842556329505&fref=nf', @@ -75,7 +76,7 @@ class FacebookIE(InfoExtractor): 'info_dict': { 'id': '274175099429670', 'ext': 'mp4', - 'title': 'Asif Nawab Butt posted a video to his Timeline.', + 'title': 're:^Asif Nawab Butt posted a video', 'uploader': 'Asif Nawab Butt', 'upload_date': '20140506', 'timestamp': 1399398998, @@ -133,7 +134,7 @@ class FacebookIE(InfoExtractor): }, { # have 1080P, but only up to 720p in swf params 'url': 'https://www.facebook.com/cnn/videos/10155529876156509/', - 'md5': '0d9813160b146b3bc8744e006027fcc6', + 'md5': '9571fae53d4165bbbadb17a94651dcdc', 'info_dict': { 'id': '10155529876156509', 'ext': 'mp4', @@ -142,6 +143,7 @@ class FacebookIE(InfoExtractor): 'upload_date': '20161030', 'uploader': 'CNN', 'thumbnail': r're:^https?://.*', + 'view_count': int, }, }, { # bigPipe.onPageletArrive ... onPageletArrive pagelet_group_mall @@ -149,7 +151,7 @@ class FacebookIE(InfoExtractor): 'info_dict': { 'id': '1417995061575415', 'ext': 'mp4', - 'title': 'md5:a7b86ca673f51800cd54687b7f4012fe', + 'title': 'md5:1db063d6a8c13faa8da727817339c857', 'timestamp': 1486648217, 'upload_date': '20170209', 'uploader': 'Yaroslav Korpan', @@ -176,7 +178,7 @@ class FacebookIE(InfoExtractor): 'info_dict': { 'id': '1396382447100162', 'ext': 'mp4', - 'title': 'md5:e2d2700afdf84e121f5d0f999bad13a3', + 'title': 'md5:19a428bbde91364e3de815383b54a235', 'timestamp': 1486035494, 'upload_date': '20170202', 'uploader': 'Elisabeth Ahtn', @@ -353,7 +355,6 @@ class FacebookIE(InfoExtractor): tahoe_data = self._download_webpage( self._VIDEO_PAGE_TAHOE_TEMPLATE % video_id, video_id, data=urlencode_postdata({ - '__user': 0, '__a': 1, '__pc': self._search_regex( r'pkg_cohort["\']\s*:\s*["\'](.+?)["\']', webpage, @@ -361,6 +362,9 @@ class FacebookIE(InfoExtractor): '__rev': self._search_regex( r'client_revision["\']\s*:\s*(\d+),', webpage, 'client revision', default='3944515'), + 'fb_dtsg': self._search_regex( + r'"DTSGInitialData"\s*,\s*\[\]\s*,\s*{\s*"token"\s*:\s*"([^"]+)"', + webpage, 'dtsg token', default=''), }), headers={ 'Content-Type': 'application/x-www-form-urlencoded', @@ -426,6 +430,10 @@ class FacebookIE(InfoExtractor): 'timestamp', default=None)) thumbnail = self._og_search_thumbnail(webpage) + view_count = parse_count(self._search_regex( + r'\bviewCount\s*:\s*["\']([\d,.]+)', webpage, 'view count', + default=None)) + info_dict = { 'id': video_id, 'title': video_title, @@ -433,6 +441,7 @@ class FacebookIE(InfoExtractor): 'uploader': uploader, 'timestamp': timestamp, 'thumbnail': thumbnail, + 'view_count': view_count, } return webpage, info_dict diff --git a/youtube_dl/extractor/firsttv.py b/youtube_dl/extractor/firsttv.py index 4803a22c8..28617d83c 100644 --- a/youtube_dl/extractor/firsttv.py +++ b/youtube_dl/extractor/firsttv.py @@ -10,6 +10,7 @@ from ..utils import ( int_or_none, qualities, unified_strdate, + url_or_none, ) @@ -88,8 +89,8 @@ class FirstTVIE(InfoExtractor): formats = [] path = None for f in item.get('mbr', []): - src = f.get('src') - if not src or not isinstance(src, compat_str): + src = url_or_none(f.get('src')) + if not src: continue tbr = int_or_none(self._search_regex( r'_(\d{3,})\.mp4', src, 'tbr', default=None)) diff --git a/youtube_dl/extractor/fourtube.py b/youtube_dl/extractor/fourtube.py index ad273a0e7..a9a1f911e 100644 --- a/youtube_dl/extractor/fourtube.py +++ b/youtube_dl/extractor/fourtube.py @@ -3,15 +3,45 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_urlparse +from ..compat import ( + compat_b64decode, + compat_str, + compat_urllib_parse_unquote, + compat_urlparse, +) from ..utils import ( + int_or_none, parse_duration, parse_iso8601, + str_or_none, str_to_int, + try_get, + unified_timestamp, + url_or_none, ) class FourTubeBaseIE(InfoExtractor): + _TKN_HOST = 'tkn.kodicdn.com' + + def _extract_formats(self, url, video_id, media_id, sources): + token_url = 'https://%s/%s/desktop/%s' % ( + self._TKN_HOST, media_id, '+'.join(sources)) + + parsed_url = compat_urlparse.urlparse(url) + tokens = self._download_json(token_url, video_id, data=b'', headers={ + 'Origin': '%s://%s' % (parsed_url.scheme, parsed_url.hostname), + 'Referer': url, + }) + formats = [{ + 'url': tokens[format]['token'], + 'format_id': format + 'p', + 'resolution': format + 'p', + 'quality': int(format), + } for format in sources] + self._sort_formats(formats) + return formats + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) kind, video_id, display_id = mobj.group('kind', 'id', 'display_id') @@ -68,21 +98,7 @@ class FourTubeBaseIE(InfoExtractor): media_id = params[0] sources = ['%s' % p for p in params[2]] - token_url = 'https://tkn.kodicdn.com/{0}/desktop/{1}'.format( - media_id, '+'.join(sources)) - - parsed_url = compat_urlparse.urlparse(url) - tokens = self._download_json(token_url, video_id, data=b'', headers={ - 'Origin': '%s://%s' % (parsed_url.scheme, parsed_url.hostname), - 'Referer': url, - }) - formats = [{ - 'url': tokens[format]['token'], - 'format_id': format + 'p', - 'resolution': format + 'p', - 'quality': int(format), - } for format in sources] - self._sort_formats(formats) + formats = self._extract_formats(url, video_id, media_id, sources) return { 'id': video_id, @@ -164,6 +180,7 @@ class FuxIE(FourTubeBaseIE): class PornTubeIE(FourTubeBaseIE): _VALID_URL = r'https?://(?:(?P<kind>www|m)\.)?porntube\.com/(?:videos/(?P<display_id>[^/]+)_|embed/)(?P<id>\d+)' _URL_TEMPLATE = 'https://www.porntube.com/videos/video_%s' + _TKN_HOST = 'tkn.porntube.com' _TESTS = [{ 'url': 'https://www.porntube.com/videos/teen-couple-doing-anal_7089759', 'info_dict': { @@ -171,13 +188,32 @@ class PornTubeIE(FourTubeBaseIE): 'ext': 'mp4', 'title': 'Teen couple doing anal', 'uploader': 'Alexy', - 'uploader_id': 'Alexy', + 'uploader_id': '91488', 'upload_date': '20150606', 'timestamp': 1433595647, 'duration': 5052, 'view_count': int, 'like_count': int, - 'categories': list, + 'age_limit': 18, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://www.porntube.com/videos/squirting-teen-ballerina-ecg_1331406', + 'info_dict': { + 'id': '1331406', + 'ext': 'mp4', + 'title': 'Squirting Teen Ballerina on ECG', + 'uploader': 'Exploited College Girls', + 'uploader_id': '665', + 'channel': 'Exploited College Girls', + 'channel_id': '665', + 'upload_date': '20130920', + 'timestamp': 1379685485, + 'duration': 851, + 'view_count': int, + 'like_count': int, 'age_limit': 18, }, 'params': { @@ -191,6 +227,55 @@ class PornTubeIE(FourTubeBaseIE): 'only_matching': True, }] + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id, display_id = mobj.group('id', 'display_id') + + webpage = self._download_webpage(url, display_id) + + video = self._parse_json( + self._search_regex( + r'INITIALSTATE\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1', + webpage, 'data', group='value'), video_id, + transform_source=lambda x: compat_urllib_parse_unquote( + compat_b64decode(x).decode('utf-8')))['page']['video'] + + title = video['title'] + media_id = video['mediaId'] + sources = [compat_str(e['height']) + for e in video['encodings'] if e.get('height')] + formats = self._extract_formats(url, video_id, media_id, sources) + + thumbnail = url_or_none(video.get('masterThumb')) + uploader = try_get(video, lambda x: x['user']['username'], compat_str) + uploader_id = str_or_none(try_get( + video, lambda x: x['user']['id'], int)) + channel = try_get(video, lambda x: x['channel']['name'], compat_str) + channel_id = str_or_none(try_get( + video, lambda x: x['channel']['id'], int)) + like_count = int_or_none(video.get('likes')) + dislike_count = int_or_none(video.get('dislikes')) + view_count = int_or_none(video.get('playsQty')) + duration = int_or_none(video.get('durationInSeconds')) + timestamp = unified_timestamp(video.get('publishedAt')) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'thumbnail': thumbnail, + 'uploader': uploader or channel, + 'uploader_id': uploader_id or channel_id, + 'channel': channel, + 'channel_id': channel_id, + 'timestamp': timestamp, + 'like_count': like_count, + 'dislike_count': dislike_count, + 'view_count': view_count, + 'duration': duration, + 'age_limit': 18, + } + class PornerBrosIE(FourTubeBaseIE): _VALID_URL = r'https?://(?:(?P<kind>www|m)\.)?pornerbros\.com/(?:videos/(?P<display_id>[^/]+)_|embed/)(?P<id>\d+)' diff --git a/youtube_dl/extractor/fox.py b/youtube_dl/extractor/fox.py index 11d6c9c32..0ffceeb7c 100644 --- a/youtube_dl/extractor/fox.py +++ b/youtube_dl/extractor/fox.py @@ -1,17 +1,20 @@ # coding: utf-8 from __future__ import unicode_literals +import json +import uuid + from .adobepass import AdobePassIE -from .uplynk import UplynkPreplayIE -from ..compat import compat_str +from ..compat import ( + compat_str, + compat_urllib_parse_unquote, +) from ..utils import ( - HEADRequest, int_or_none, parse_age_limit, parse_duration, try_get, unified_timestamp, - update_url_query, ) @@ -31,6 +34,7 @@ class FOXIE(AdobePassIE): 'upload_date': '20170901', 'creator': 'FOX', 'series': 'Gotham', + 'age_limit': 14, }, 'params': { 'skip_download': True, @@ -44,48 +48,54 @@ class FOXIE(AdobePassIE): 'url': 'https://www.fox.com/watch/30056b295fb57f7452aeeb4920bc3024/', 'only_matching': True, }] + _HOME_PAGE_URL = 'https://www.fox.com/' + _API_KEY = 'abdcbed02c124d393b39e818a4312055' + _access_token = None + + def _call_api(self, path, video_id, data=None): + headers = { + 'X-Api-Key': self._API_KEY, + } + if self._access_token: + headers['Authorization'] = 'Bearer ' + self._access_token + return self._download_json( + 'https://api2.fox.com/v2.0/' + path, + video_id, data=data, headers=headers) + + def _real_initialize(self): + if not self._access_token: + mvpd_auth = self._get_cookies(self._HOME_PAGE_URL).get('mvpd-auth') + if mvpd_auth: + self._access_token = (self._parse_json(compat_urllib_parse_unquote( + mvpd_auth.value), None, fatal=False) or {}).get('accessToken') + if not self._access_token: + self._access_token = self._call_api( + 'login', None, json.dumps({ + 'deviceId': compat_str(uuid.uuid4()), + }).encode())['accessToken'] def _real_extract(self, url): video_id = self._match_id(url) - video = self._download_json( - 'https://api.fox.com/fbc-content/v1_4/video/%s' % video_id, - video_id, headers={ - 'apikey': 'abdcbed02c124d393b39e818a4312055', - 'Content-Type': 'application/json', - 'Referer': url, - }) + video = self._call_api('vodplayer/' + video_id, video_id) title = video['name'] - release_url = video['videoRelease']['url'] - - description = video.get('description') - duration = int_or_none(video.get('durationInSeconds')) or int_or_none( - video.get('duration')) or parse_duration(video.get('duration')) - timestamp = unified_timestamp(video.get('datePublished')) - rating = video.get('contentRating') - age_limit = parse_age_limit(rating) + release_url = video['url'] + m3u8_url = self._download_json(release_url, video_id)['playURL'] + formats = self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls') + self._sort_formats(formats) data = try_get( video, lambda x: x['trackingData']['properties'], dict) or {} + duration = int_or_none(video.get('durationInSeconds')) or int_or_none( + video.get('duration')) or parse_duration(video.get('duration')) + timestamp = unified_timestamp(video.get('datePublished')) creator = data.get('brand') or data.get('network') or video.get('network') - series = video.get('seriesName') or data.get( 'seriesName') or data.get('show') - season_number = int_or_none(video.get('seasonNumber')) - episode = video.get('name') - episode_number = int_or_none(video.get('episodeNumber')) - release_year = int_or_none(video.get('releaseYear')) - - if data.get('authRequired'): - resource = self._get_mvpd_resource( - 'fbc-fox', title, video.get('guid'), rating) - release_url = update_url_query( - release_url, { - 'auth': self._extract_mvpd_auth( - url, video_id, 'fbc-fox', resource) - }) subtitles = {} for doc_rel in video.get('documentReleases', []): @@ -98,36 +108,19 @@ class FOXIE(AdobePassIE): }] break - info = { + return { 'id': video_id, 'title': title, - 'description': description, + 'formats': formats, + 'description': video.get('description'), 'duration': duration, 'timestamp': timestamp, - 'age_limit': age_limit, + 'age_limit': parse_age_limit(video.get('contentRating')), 'creator': creator, 'series': series, - 'season_number': season_number, - 'episode': episode, - 'episode_number': episode_number, - 'release_year': release_year, + 'season_number': int_or_none(video.get('seasonNumber')), + 'episode': video.get('name'), + 'episode_number': int_or_none(video.get('episodeNumber')), + 'release_year': int_or_none(video.get('releaseYear')), 'subtitles': subtitles, } - - urlh = self._request_webpage(HEADRequest(release_url), video_id) - video_url = compat_str(urlh.geturl()) - - if UplynkPreplayIE.suitable(video_url): - info.update({ - '_type': 'url_transparent', - 'url': video_url, - 'ie_key': UplynkPreplayIE.ie_key(), - }) - else: - m3u8_url = self._download_json(release_url, video_id)['playURL'] - formats = self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', - entry_protocol='m3u8_native', m3u8_id='hls') - self._sort_formats(formats) - info['formats'] = formats - return info diff --git a/youtube_dl/extractor/foxnews.py b/youtube_dl/extractor/foxnews.py index dc0662f74..63613cb85 100644 --- a/youtube_dl/extractor/foxnews.py +++ b/youtube_dl/extractor/foxnews.py @@ -58,6 +58,14 @@ class FoxNewsIE(AMPIE): }, ] + @staticmethod + def _extract_urls(webpage): + return [ + mobj.group('url') + for mobj in re.finditer( + r'<(?:amp-)?iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//video\.foxnews\.com/v/video-embed\.html?.*?\bvideo_id=\d+.*?)\1', + webpage)] + def _real_extract(self, url): host, video_id = re.match(self._VALID_URL, url).groups() @@ -68,21 +76,41 @@ class FoxNewsIE(AMPIE): class FoxNewsArticleIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?foxnews\.com/(?!v)([^/]+/)+(?P<id>[a-z-]+)' + _VALID_URL = r'https?://(?:www\.)?(?:insider\.)?foxnews\.com/(?!v)([^/]+/)+(?P<id>[a-z-]+)' IE_NAME = 'foxnews:article' - _TEST = { + _TESTS = [{ + # data-video-id 'url': 'http://www.foxnews.com/politics/2016/09/08/buzz-about-bud-clinton-camp-denies-claims-wore-earpiece-at-forum.html', - 'md5': '62aa5a781b308fdee212ebb6f33ae7ef', + 'md5': '83d44e1aff1433e7a29a7b537d1700b5', 'info_dict': { 'id': '5116295019001', 'ext': 'mp4', 'title': 'Trump and Clinton asked to defend positions on Iraq War', 'description': 'Veterans react on \'The Kelly File\'', - 'timestamp': 1473299755, + 'timestamp': 1473301045, 'upload_date': '20160908', }, - } + }, { + # iframe embed + 'url': 'http://www.foxnews.com/us/2018/03/09/parkland-survivor-kyle-kashuv-on-meeting-trump-his-app-to-prevent-another-school-shooting.amp.html?__twitter_impression=true', + 'info_dict': { + 'id': '5748266721001', + 'ext': 'flv', + 'title': 'Kyle Kashuv has a positive message for the Trump White House', + 'description': 'Marjory Stoneman Douglas student disagrees with classmates.', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 229, + 'timestamp': 1520594670, + 'upload_date': '20180309', + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://insider.foxnews.com/2016/08/25/univ-wisconsin-student-group-pushing-silence-certain-words', + 'only_matching': True, + }] def _real_extract(self, url): display_id = self._match_id(url) @@ -90,51 +118,10 @@ class FoxNewsArticleIE(InfoExtractor): video_id = self._html_search_regex( r'data-video-id=([\'"])(?P<id>[^\'"]+)\1', - webpage, 'video ID', group='id') + webpage, 'video ID', group='id', default=None) + if video_id: + return self.url_result( + 'http://video.foxnews.com/v/' + video_id, FoxNewsIE.ie_key()) + return self.url_result( - 'http://video.foxnews.com/v/' + video_id, - FoxNewsIE.ie_key()) - - -class FoxNewsInsiderIE(InfoExtractor): - _VALID_URL = r'https?://insider\.foxnews\.com/([^/]+/)+(?P<id>[a-z-]+)' - IE_NAME = 'foxnews:insider' - - _TEST = { - 'url': 'http://insider.foxnews.com/2016/08/25/univ-wisconsin-student-group-pushing-silence-certain-words', - 'md5': 'a10c755e582d28120c62749b4feb4c0c', - 'info_dict': { - 'id': '5099377331001', - 'display_id': 'univ-wisconsin-student-group-pushing-silence-certain-words', - 'ext': 'mp4', - 'title': 'Student Group: Saying \'Politically Correct,\' \'Trash\' and \'Lame\' Is Offensive', - 'description': 'Is campus censorship getting out of control?', - 'timestamp': 1472168725, - 'upload_date': '20160825', - 'thumbnail': r're:^https?://.*\.jpg$', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - 'add_ie': [FoxNewsIE.ie_key()], - } - - def _real_extract(self, url): - display_id = self._match_id(url) - - webpage = self._download_webpage(url, display_id) - - embed_url = self._html_search_meta('embedUrl', webpage, 'embed URL') - - title = self._og_search_title(webpage) - description = self._og_search_description(webpage) - - return { - '_type': 'url_transparent', - 'ie_key': FoxNewsIE.ie_key(), - 'url': embed_url, - 'display_id': display_id, - 'title': title, - 'description': description, - } + FoxNewsIE._extract_urls(webpage)[0], FoxNewsIE.ie_key()) diff --git a/youtube_dl/extractor/foxsports.py b/youtube_dl/extractor/foxsports.py index 985542727..2b2cb6c6f 100644 --- a/youtube_dl/extractor/foxsports.py +++ b/youtube_dl/extractor/foxsports.py @@ -1,43 +1,33 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import ( - smuggle_url, - update_url_query, -) class FoxSportsIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?foxsports\.com/(?:[^/]+/)*(?P<id>[^/]+)' + _VALID_URL = r'https?://(?:www\.)?foxsports\.com/(?:[^/]+/)*video/(?P<id>\d+)' _TEST = { 'url': 'http://www.foxsports.com/tennessee/video/432609859715', 'md5': 'b49050e955bebe32c301972e4012ac17', 'info_dict': { - 'id': 'bwduI3X_TgUB', + 'id': '432609859715', 'ext': 'mp4', 'title': 'Courtney Lee on going up 2-0 in series vs. Blazers', 'description': 'Courtney Lee talks about Memphis being focused.', - 'upload_date': '20150423', - 'timestamp': 1429761109, + # TODO: fix timestamp + 'upload_date': '19700101', # '20150423', + # 'timestamp': 1429761109, 'uploader': 'NEWA-FNG-FOXSPORTS', }, + 'params': { + # m3u8 download + 'skip_download': True, + }, 'add_ie': ['ThePlatform'], } def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - config = self._parse_json( - self._html_search_regex( - r"""class="[^"]*(?:fs-player|platformPlayer-wrapper)[^"]*".+?data-player-config='([^']+)'""", - webpage, 'data player config'), - video_id) - - return self.url_result(smuggle_url(update_url_query( - config['releaseURL'], { - 'mbr': 'true', - 'switch': 'http', - }), {'force_smil_url': True})) + return self.url_result( + 'https://feed.theplatform.com/f/BKQ29B/foxsports-all?byId=' + video_id, 'ThePlatformFeed') diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index 6fc6b0da0..2ffe83a78 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -16,6 +16,7 @@ from ..utils import ( int_or_none, parse_duration, try_get, + url_or_none, ) from .dailymotion import DailymotionIE @@ -115,14 +116,13 @@ class FranceTVIE(InfoExtractor): def sign(manifest_url, manifest_id): for host in ('hdfauthftv-a.akamaihd.net', 'hdfauth.francetv.fr'): - signed_url = self._download_webpage( + signed_url = url_or_none(self._download_webpage( 'https://%s/esi/TA' % host, video_id, 'Downloading signed %s manifest URL' % manifest_id, fatal=False, query={ 'url': manifest_url, - }) - if (signed_url and isinstance(signed_url, compat_str) and - re.search(r'^(?:https?:)?//', signed_url)): + })) + if signed_url: return signed_url return manifest_url diff --git a/youtube_dl/extractor/freespeech.py b/youtube_dl/extractor/freespeech.py index 486a49c05..ea9c3e317 100644 --- a/youtube_dl/extractor/freespeech.py +++ b/youtube_dl/extractor/freespeech.py @@ -1,6 +1,7 @@ from __future__ import unicode_literals from .common import InfoExtractor +from .youtube import YoutubeIE class FreespeechIE(InfoExtractor): @@ -27,8 +28,4 @@ class FreespeechIE(InfoExtractor): r'data-video-url="([^"]+)"', webpage, 'youtube url') - return { - '_type': 'url', - 'url': youtube_url, - 'ie_key': 'Youtube', - } + return self.url_result(youtube_url, YoutubeIE.ie_key()) diff --git a/youtube_dl/extractor/frontendmasters.py b/youtube_dl/extractor/frontendmasters.py new file mode 100644 index 000000000..cb57ba007 --- /dev/null +++ b/youtube_dl/extractor/frontendmasters.py @@ -0,0 +1,263 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import ( + compat_str, + compat_urlparse, +) +from ..utils import ( + ExtractorError, + parse_duration, + url_or_none, + urlencode_postdata, +) + + +class FrontendMastersBaseIE(InfoExtractor): + _API_BASE = 'https://api.frontendmasters.com/v1/kabuki' + _LOGIN_URL = 'https://frontendmasters.com/login/' + + _NETRC_MACHINE = 'frontendmasters' + + _QUALITIES = { + 'low': {'width': 480, 'height': 360}, + 'mid': {'width': 1280, 'height': 720}, + 'high': {'width': 1920, 'height': 1080} + } + + def _real_initialize(self): + self._login() + + def _login(self): + (username, password) = self._get_login_info() + if username is None: + return + + login_page = self._download_webpage( + self._LOGIN_URL, None, 'Downloading login page') + + login_form = self._hidden_inputs(login_page) + + login_form.update({ + 'username': username, + 'password': password + }) + + post_url = self._search_regex( + r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page, + 'post_url', default=self._LOGIN_URL, group='url') + + if not post_url.startswith('http'): + post_url = compat_urlparse.urljoin(self._LOGIN_URL, post_url) + + response = self._download_webpage( + post_url, None, 'Logging in', data=urlencode_postdata(login_form), + headers={'Content-Type': 'application/x-www-form-urlencoded'}) + + # Successful login + if any(p in response for p in ( + 'wp-login.php?action=logout', '>Logout')): + return + + error = self._html_search_regex( + r'class=(["\'])(?:(?!\1).)*\bMessageAlert\b(?:(?!\1).)*\1[^>]*>(?P<error>[^<]+)<', + response, 'error message', default=None, group='error') + if error: + raise ExtractorError('Unable to login: %s' % error, expected=True) + raise ExtractorError('Unable to log in') + + +class FrontendMastersPageBaseIE(FrontendMastersBaseIE): + def _download_course(self, course_name, url): + return self._download_json( + '%s/courses/%s' % (self._API_BASE, course_name), course_name, + 'Downloading course JSON', headers={'Referer': url}) + + @staticmethod + def _extract_chapters(course): + chapters = [] + lesson_elements = course.get('lessonElements') + if isinstance(lesson_elements, list): + chapters = [url_or_none(e) for e in lesson_elements if url_or_none(e)] + return chapters + + @staticmethod + def _extract_lesson(chapters, lesson_id, lesson): + title = lesson.get('title') or lesson_id + display_id = lesson.get('slug') + description = lesson.get('description') + thumbnail = lesson.get('thumbnail') + + chapter_number = None + index = lesson.get('index') + element_index = lesson.get('elementIndex') + if (isinstance(index, int) and isinstance(element_index, int) and + index < element_index): + chapter_number = element_index - index + chapter = (chapters[chapter_number - 1] + if chapter_number - 1 < len(chapters) else None) + + duration = None + timestamp = lesson.get('timestamp') + if isinstance(timestamp, compat_str): + mobj = re.search( + r'(?P<start>\d{1,2}:\d{1,2}:\d{1,2})\s*-(?P<end>\s*\d{1,2}:\d{1,2}:\d{1,2})', + timestamp) + if mobj: + duration = parse_duration(mobj.group('end')) - parse_duration( + mobj.group('start')) + + return { + '_type': 'url_transparent', + 'url': 'frontendmasters:%s' % lesson_id, + 'ie_key': FrontendMastersIE.ie_key(), + 'id': lesson_id, + 'display_id': display_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'chapter': chapter, + 'chapter_number': chapter_number, + } + + +class FrontendMastersIE(FrontendMastersBaseIE): + _VALID_URL = r'(?:frontendmasters:|https?://api\.frontendmasters\.com/v\d+/kabuki/video/)(?P<id>[^/]+)' + _TESTS = [{ + 'url': 'https://api.frontendmasters.com/v1/kabuki/video/a2qogef6ba', + 'md5': '7f161159710d6b7016a4f4af6fcb05e2', + 'info_dict': { + 'id': 'a2qogef6ba', + 'ext': 'mp4', + 'title': 'a2qogef6ba', + }, + 'skip': 'Requires FrontendMasters account credentials', + }, { + 'url': 'frontendmasters:a2qogef6ba', + 'only_matching': True, + }] + + def _real_extract(self, url): + lesson_id = self._match_id(url) + + source_url = '%s/video/%s/source' % (self._API_BASE, lesson_id) + + formats = [] + for ext in ('webm', 'mp4'): + for quality in ('low', 'mid', 'high'): + resolution = self._QUALITIES[quality].copy() + format_id = '%s-%s' % (ext, quality) + format_url = self._download_json( + source_url, lesson_id, + 'Downloading %s source JSON' % format_id, query={ + 'f': ext, + 'r': resolution['height'], + }, headers={ + 'Referer': url, + }, fatal=False)['url'] + + if not format_url: + continue + + f = resolution.copy() + f.update({ + 'url': format_url, + 'ext': ext, + 'format_id': format_id, + }) + formats.append(f) + self._sort_formats(formats) + + subtitles = { + 'en': [{ + 'url': '%s/transcripts/%s.vtt' % (self._API_BASE, lesson_id), + }] + } + + return { + 'id': lesson_id, + 'title': lesson_id, + 'formats': formats, + 'subtitles': subtitles + } + + +class FrontendMastersLessonIE(FrontendMastersPageBaseIE): + _VALID_URL = r'https?://(?:www\.)?frontendmasters\.com/courses/(?P<course_name>[^/]+)/(?P<lesson_name>[^/]+)' + _TEST = { + 'url': 'https://frontendmasters.com/courses/web-development/tools', + 'info_dict': { + 'id': 'a2qogef6ba', + 'display_id': 'tools', + 'ext': 'mp4', + 'title': 'Tools', + 'description': 'md5:82c1ea6472e88ed5acd1829fe992e4f7', + 'thumbnail': r're:^https?://.*\.jpg$', + 'chapter': 'Introduction', + 'chapter_number': 1, + }, + 'params': { + 'skip_download': True, + }, + 'skip': 'Requires FrontendMasters account credentials', + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + course_name, lesson_name = mobj.group('course_name', 'lesson_name') + + course = self._download_course(course_name, url) + + lesson_id, lesson = next( + (video_id, data) + for video_id, data in course['lessonData'].items() + if data.get('slug') == lesson_name) + + chapters = self._extract_chapters(course) + return self._extract_lesson(chapters, lesson_id, lesson) + + +class FrontendMastersCourseIE(FrontendMastersPageBaseIE): + _VALID_URL = r'https?://(?:www\.)?frontendmasters\.com/courses/(?P<id>[^/]+)' + _TEST = { + 'url': 'https://frontendmasters.com/courses/web-development/', + 'info_dict': { + 'id': 'web-development', + 'title': 'Introduction to Web Development', + 'description': 'md5:9317e6e842098bf725d62360e52d49a6', + }, + 'playlist_count': 81, + 'skip': 'Requires FrontendMasters account credentials', + } + + @classmethod + def suitable(cls, url): + return False if FrontendMastersLessonIE.suitable(url) else super( + FrontendMastersBaseIE, cls).suitable(url) + + def _real_extract(self, url): + course_name = self._match_id(url) + + course = self._download_course(course_name, url) + + chapters = self._extract_chapters(course) + + lessons = sorted( + course['lessonData'].values(), key=lambda data: data['index']) + + entries = [] + for lesson in lessons: + lesson_name = lesson.get('slug') + if not lesson_name: + continue + lesson_id = lesson.get('hash') or lesson.get('statsId') + entries.append(self._extract_lesson(chapters, lesson_id, lesson)) + + title = course.get('title') + description = course.get('description') + + return self.playlist_result(entries, course_name, title, description) diff --git a/youtube_dl/extractor/funimation.py b/youtube_dl/extractor/funimation.py index 07d01caec..8bbedca26 100644 --- a/youtube_dl/extractor/funimation.py +++ b/youtube_dl/extractor/funimation.py @@ -1,6 +1,9 @@ # coding: utf-8 from __future__ import unicode_literals +import random +import string + from .common import InfoExtractor from ..compat import compat_HTTPError from ..utils import ( @@ -87,7 +90,7 @@ class FunimationIE(InfoExtractor): video_id = title_data.get('id') or self._search_regex([ r"KANE_customdimensions.videoID\s*=\s*'(\d+)';", - r'<iframe[^>]+src="/player/(\d+)"', + r'<iframe[^>]+src="/player/(\d+)', ], webpage, 'video_id', default=None) if not video_id: player_url = self._html_search_meta([ @@ -108,8 +111,10 @@ class FunimationIE(InfoExtractor): if self._TOKEN: headers['Authorization'] = 'Token %s' % self._TOKEN sources = self._download_json( - 'https://prod-api-funimationnow.dadcdigital.com/api/source/catalog/video/%s/signed/' % video_id, - video_id, headers=headers)['items'] + 'https://www.funimation.com/api/showexperience/%s/' % video_id, + video_id, headers=headers, query={ + 'pinst_id': ''.join([random.choice(string.digits + string.ascii_letters) for _ in range(8)]), + })['items'] except ExtractorError as e: if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: error = self._parse_json(e.cause.read(), video_id)['errors'][0] diff --git a/youtube_dl/extractor/funk.py b/youtube_dl/extractor/funk.py index 0ff058619..7e1af95e0 100644 --- a/youtube_dl/extractor/funk.py +++ b/youtube_dl/extractor/funk.py @@ -1,10 +1,12 @@ # coding: utf-8 from __future__ import unicode_literals +import itertools import re from .common import InfoExtractor from .nexx import NexxIE +from ..compat import compat_str from ..utils import ( int_or_none, try_get, @@ -12,6 +14,19 @@ from ..utils import ( class FunkBaseIE(InfoExtractor): + _HEADERS = { + 'Accept': '*/*', + 'Accept-Language': 'en-US,en;q=0.9,ru;q=0.8', + 'authorization': 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJjbGllbnROYW1lIjoid2ViYXBwLXYzMSIsInNjb3BlIjoic3RhdGljLWNvbnRlbnQtYXBpLGN1cmF0aW9uLWFwaSxuZXh4LWNvbnRlbnQtYXBpLXYzMSx3ZWJhcHAtYXBpIn0.mbuG9wS9Yf5q6PqgR4fiaRFIagiHk9JhwoKES7ksVX4', + } + _AUTH = 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJjbGllbnROYW1lIjoid2ViYXBwLXYzMSIsInNjb3BlIjoic3RhdGljLWNvbnRlbnQtYXBpLGN1cmF0aW9uLWFwaSxuZXh4LWNvbnRlbnQtYXBpLXYzMSx3ZWJhcHAtYXBpIn0.mbuG9wS9Yf5q6PqgR4fiaRFIagiHk9JhwoKES7ksVX4' + + @staticmethod + def _make_headers(referer): + headers = FunkBaseIE._HEADERS.copy() + headers['Referer'] = referer + return headers + def _make_url_result(self, video): return { '_type': 'url_transparent', @@ -48,19 +63,19 @@ class FunkMixIE(FunkBaseIE): lists = self._download_json( 'https://www.funk.net/api/v3.1/curation/curatedLists/', - mix_id, headers={ - 'authorization': 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJjbGllbnROYW1lIjoiY3VyYXRpb24tdG9vbC12Mi4wIiwic2NvcGUiOiJzdGF0aWMtY29udGVudC1hcGksY3VyYXRpb24tc2VydmljZSxzZWFyY2gtYXBpIn0.SGCC1IXHLtZYoo8PvRKlU2gXH1su8YSu47sB3S4iXBI', - 'Referer': url, - }, query={ + mix_id, headers=self._make_headers(url), query={ 'size': 100, - })['result']['lists'] + })['_embedded']['curatedListList'] metas = next( l for l in lists if mix_id in (l.get('entityId'), l.get('alias')))['videoMetas'] video = next( meta['videoDataDelegate'] - for meta in metas if meta.get('alias') == alias) + for meta in metas + if try_get( + meta, lambda x: x['videoDataDelegate']['alias'], + compat_str) == alias) return self._make_url_result(video) @@ -104,25 +119,53 @@ class FunkChannelIE(FunkBaseIE): channel_id = mobj.group('id') alias = mobj.group('alias') - headers = { - 'authorization': 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJjbGllbnROYW1lIjoiY3VyYXRpb24tdG9vbCIsInNjb3BlIjoic3RhdGljLWNvbnRlbnQtYXBpLGN1cmF0aW9uLWFwaSxzZWFyY2gtYXBpIn0.q4Y2xZG8PFHai24-4Pjx2gym9RmJejtmK6lMXP5wAgc', - 'Referer': url, - } + headers = self._make_headers(url) video = None - by_id_list = self._download_json( - 'https://www.funk.net/api/v3.0/content/videos/byIdList', channel_id, - headers=headers, query={ - 'ids': alias, - }, fatal=False) - if by_id_list: - video = try_get(by_id_list, lambda x: x['result'][0], dict) + # Id-based channels are currently broken on their side: webplayer + # tries to process them via byChannelAlias endpoint and fails + # predictably. + for page_num in itertools.count(): + by_channel_alias = self._download_json( + 'https://www.funk.net/api/v3.1/webapp/videos/byChannelAlias/%s' + % channel_id, + 'Downloading byChannelAlias JSON page %d' % (page_num + 1), + headers=headers, query={ + 'filterFsk': 'false', + 'sort': 'creationDate,desc', + 'size': 100, + 'page': page_num, + }, fatal=False) + if not by_channel_alias: + break + video_list = try_get( + by_channel_alias, lambda x: x['_embedded']['videoList'], list) + if not video_list: + break + try: + video = next(r for r in video_list if r.get('alias') == alias) + break + except StopIteration: + pass + if not try_get( + by_channel_alias, lambda x: x['_links']['next']): + break + + if not video: + by_id_list = self._download_json( + 'https://www.funk.net/api/v3.0/content/videos/byIdList', + channel_id, 'Downloading byIdList JSON', headers=headers, + query={ + 'ids': alias, + }, fatal=False) + if by_id_list: + video = try_get(by_id_list, lambda x: x['result'][0], dict) if not video: results = self._download_json( - 'https://www.funk.net/api/v3.0/content/videos/filter', channel_id, - headers=headers, query={ + 'https://www.funk.net/api/v3.0/content/videos/filter', + channel_id, 'Downloading filter JSON', headers=headers, query={ 'channelId': channel_id, 'size': 100, })['result'] diff --git a/youtube_dl/extractor/gaia.py b/youtube_dl/extractor/gaia.py new file mode 100644 index 000000000..f2eef3f4c --- /dev/null +++ b/youtube_dl/extractor/gaia.py @@ -0,0 +1,98 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + int_or_none, + str_or_none, + strip_or_none, + try_get, +) + + +class GaiaIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?gaia\.com/video/(?P<id>[^/?]+).*?\bfullplayer=(?P<type>feature|preview)' + _TESTS = [{ + 'url': 'https://www.gaia.com/video/connecting-universal-consciousness?fullplayer=feature', + 'info_dict': { + 'id': '89356', + 'ext': 'mp4', + 'title': 'Connecting with Universal Consciousness', + 'description': 'md5:844e209ad31b7d31345f5ed689e3df6f', + 'upload_date': '20151116', + 'timestamp': 1447707266, + 'duration': 936, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + 'url': 'https://www.gaia.com/video/connecting-universal-consciousness?fullplayer=preview', + 'info_dict': { + 'id': '89351', + 'ext': 'mp4', + 'title': 'Connecting with Universal Consciousness', + 'description': 'md5:844e209ad31b7d31345f5ed689e3df6f', + 'upload_date': '20151116', + 'timestamp': 1447707266, + 'duration': 53, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }] + + def _real_extract(self, url): + display_id, vtype = re.search(self._VALID_URL, url).groups() + node_id = self._download_json( + 'https://brooklyn.gaia.com/pathinfo', display_id, query={ + 'path': 'video/' + display_id, + })['id'] + node = self._download_json( + 'https://brooklyn.gaia.com/node/%d' % node_id, node_id) + vdata = node[vtype] + media_id = compat_str(vdata['nid']) + title = node['title'] + + media = self._download_json( + 'https://brooklyn.gaia.com/media/' + media_id, media_id) + formats = self._extract_m3u8_formats( + media['mediaUrls']['bcHLS'], media_id, 'mp4') + self._sort_formats(formats) + + subtitles = {} + text_tracks = media.get('textTracks', {}) + for key in ('captions', 'subtitles'): + for lang, sub_url in text_tracks.get(key, {}).items(): + subtitles.setdefault(lang, []).append({ + 'url': sub_url, + }) + + fivestar = node.get('fivestar', {}) + fields = node.get('fields', {}) + + def get_field_value(key, value_key='value'): + return try_get(fields, lambda x: x[key][0][value_key]) + + return { + 'id': media_id, + 'display_id': display_id, + 'title': title, + 'formats': formats, + 'description': strip_or_none(get_field_value('body') or get_field_value('teaser')), + 'timestamp': int_or_none(node.get('created')), + 'subtitles': subtitles, + 'duration': int_or_none(vdata.get('duration')), + 'like_count': int_or_none(try_get(fivestar, lambda x: x['up_count']['value'])), + 'dislike_count': int_or_none(try_get(fivestar, lambda x: x['down_count']['value'])), + 'comment_count': int_or_none(node.get('comment_count')), + 'series': try_get(node, lambda x: x['series']['title'], compat_str), + 'season_number': int_or_none(get_field_value('season')), + 'season_id': str_or_none(get_field_value('series_nid', 'nid')), + 'episode_number': int_or_none(get_field_value('episode')), + } diff --git a/youtube_dl/extractor/gamespot.py b/youtube_dl/extractor/gamespot.py index ab647dd41..4236a5ed8 100644 --- a/youtube_dl/extractor/gamespot.py +++ b/youtube_dl/extractor/gamespot.py @@ -14,7 +14,7 @@ from ..utils import ( class GameSpotIE(OnceIE): - _VALID_URL = r'https?://(?:www\.)?gamespot\.com/(?:video|article)s/(?:[^/]+/\d+-|embed/)(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?gamespot\.com/(?:video|article|review)s/(?:[^/]+/\d+-|embed/)(?P<id>\d+)' _TESTS = [{ 'url': 'http://www.gamespot.com/videos/arma-3-community-guide-sitrep-i/2300-6410818/', 'md5': 'b2a30deaa8654fcccd43713a6b6a4825', @@ -41,6 +41,9 @@ class GameSpotIE(OnceIE): }, { 'url': 'https://www.gamespot.com/articles/the-last-of-us-2-receives-new-ps4-trailer/1100-6454469/', 'only_matching': True, + }, { + 'url': 'https://www.gamespot.com/reviews/gears-of-war-review/1900-6161188/', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index dad951b75..067de28cd 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -47,7 +47,7 @@ from .nbc import NBCSportsVPlayerIE from .ooyala import OoyalaIE from .rutv import RUTVIE from .tvc import TVCIE -from .sportbox import SportBoxEmbedIE +from .sportbox import SportBoxIE from .smotri import SmotriIE from .myvi import MyviIE from .condenast import CondeNastIE @@ -109,8 +109,13 @@ from .vice import ViceIE from .xfileshare import XFileShareIE from .cloudflarestream import CloudflareStreamIE from .peertube import PeerTubeIE +from .teachable import TeachableIE from .indavideo import IndavideoEmbedIE from .apa import APAIE +from .foxnews import FoxNewsIE +from .viqeo import ViqeoIE +from .expressen import ExpressenIE +from .zype import ZypeIE class GenericIE(InfoExtractor): @@ -1394,17 +1399,6 @@ class GenericIE(InfoExtractor): 'skip_download': True, }, }, - # SVT embed - { - 'url': 'http://www.svt.se/sport/ishockey/jagr-tacklar-giroux-under-intervjun', - 'info_dict': { - 'id': '2900353', - 'ext': 'flv', - 'title': 'Här trycker Jagr till Giroux (under SVT-intervjun)', - 'duration': 27, - 'age_limit': 0, - }, - }, # Crooks and Liars embed { 'url': 'http://crooksandliars.com/2015/04/fox-friends-says-protecting-atheists', @@ -2069,6 +2063,44 @@ class GenericIE(InfoExtractor): }, 'skip': 'TODO: fix nested playlists processing in tests', }, + { + # Viqeo embeds + 'url': 'https://viqeo.tv/', + 'info_dict': { + 'id': 'viqeo', + 'title': 'All-new video platform', + }, + 'playlist_count': 6, + }, + { + # Zype embed + 'url': 'https://www.cookscountry.com/episode/554-smoky-barbecue-favorites', + 'info_dict': { + 'id': '5b400b834b32992a310622b9', + 'ext': 'mp4', + 'title': 'Smoky Barbecue Favorites', + 'thumbnail': r're:^https?://.*\.jpe?g', + }, + 'add_ie': [ZypeIE.ie_key()], + 'params': { + 'skip_download': True, + }, + }, + { + # videojs embed + 'url': 'https://video.sibnet.ru/shell.php?videoid=3422904', + 'info_dict': { + 'id': 'shell', + 'ext': 'mp4', + 'title': 'Доставщик пиццы спросил разрешения сыграть на фортепиано', + 'description': 'md5:89209cdc587dab1e4a090453dbaa2cb1', + 'thumbnail': r're:^https?://.*\.jpg$', + }, + 'params': { + 'skip_download': True, + }, + 'expected_warnings': ['Failed to download MPD manifest'], + }, # { # # TODO: find another test # # http://schema.org/VideoObject @@ -2165,10 +2197,7 @@ class GenericIE(InfoExtractor): def _real_extract(self, url): if url.startswith('//'): - return { - '_type': 'url', - 'url': self.http_scheme() + url, - } + return self.url_result(self.http_scheme() + url) parsed_url = compat_urlparse.urlparse(url) if not parsed_url.scheme: @@ -2620,9 +2649,9 @@ class GenericIE(InfoExtractor): return self.url_result(tvc_url, 'TVC') # Look for embedded SportBox player - sportbox_urls = SportBoxEmbedIE._extract_urls(webpage) + sportbox_urls = SportBoxIE._extract_urls(webpage) if sportbox_urls: - return self.playlist_from_matches(sportbox_urls, video_id, video_title, ie='SportBoxEmbed') + return self.playlist_from_matches(sportbox_urls, video_id, video_title, ie=SportBoxIE.ie_key()) # Look for embedded XHamster player xhamster_urls = XHamsterEmbedIE._extract_urls(webpage) @@ -3007,7 +3036,7 @@ class GenericIE(InfoExtractor): wapo_urls, video_id, video_title, ie=WashingtonPostIE.ie_key()) # Look for Mediaset embeds - mediaset_urls = MediasetIE._extract_urls(webpage) + mediaset_urls = MediasetIE._extract_urls(self, webpage) if mediaset_urls: return self.playlist_from_matches( mediaset_urls, video_id, video_title, ie=MediasetIE.ie_key()) @@ -3076,11 +3105,15 @@ class GenericIE(InfoExtractor): return self.playlist_from_matches( cloudflarestream_urls, video_id, video_title, ie=CloudflareStreamIE.ie_key()) - peertube_urls = PeerTubeIE._extract_urls(webpage) + peertube_urls = PeerTubeIE._extract_urls(webpage, url) if peertube_urls: return self.playlist_from_matches( peertube_urls, video_id, video_title, ie=PeerTubeIE.ie_key()) + teachable_url = TeachableIE._extract_url(webpage, url) + if teachable_url: + return self.url_result(teachable_url) + indavideo_urls = IndavideoEmbedIE._extract_urls(webpage) if indavideo_urls: return self.playlist_from_matches( @@ -3091,13 +3124,33 @@ class GenericIE(InfoExtractor): return self.playlist_from_matches( apa_urls, video_id, video_title, ie=APAIE.ie_key()) - sharevideos_urls = [mobj.group('url') for mobj in re.finditer( + foxnews_urls = FoxNewsIE._extract_urls(webpage) + if foxnews_urls: + return self.playlist_from_matches( + foxnews_urls, video_id, video_title, ie=FoxNewsIE.ie_key()) + + sharevideos_urls = [sharevideos_mobj.group('url') for sharevideos_mobj in re.finditer( r'<iframe[^>]+?\bsrc\s*=\s*(["\'])(?P<url>(?:https?:)?//embed\.share-videos\.se/auto/embed/\d+\?.*?\buid=\d+.*?)\1', webpage)] if sharevideos_urls: return self.playlist_from_matches( sharevideos_urls, video_id, video_title) + viqeo_urls = ViqeoIE._extract_urls(webpage) + if viqeo_urls: + return self.playlist_from_matches( + viqeo_urls, video_id, video_title, ie=ViqeoIE.ie_key()) + + expressen_urls = ExpressenIE._extract_urls(webpage) + if expressen_urls: + return self.playlist_from_matches( + expressen_urls, video_id, video_title, ie=ExpressenIE.ie_key()) + + zype_urls = ZypeIE._extract_urls(webpage) + if zype_urls: + return self.playlist_from_matches( + zype_urls, video_id, video_title, ie=ZypeIE.ie_key()) + # Look for HTML5 media entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls') if entries: @@ -3119,9 +3172,13 @@ class GenericIE(InfoExtractor): jwplayer_data = self._find_jwplayer_data( webpage, video_id, transform_source=js_to_json) if jwplayer_data: - info = self._parse_jwplayer_data( - jwplayer_data, video_id, require_title=False, base_url=url) - return merge_dicts(info, info_dict) + try: + info = self._parse_jwplayer_data( + jwplayer_data, video_id, require_title=False, base_url=url) + return merge_dicts(info, info_dict) + except ExtractorError: + # See https://github.com/rg3/youtube-dl/pull/16735 + pass # Video.js embed mobj = re.search( diff --git a/youtube_dl/extractor/gfycat.py b/youtube_dl/extractor/gfycat.py index a0670b645..c1b36a59b 100644 --- a/youtube_dl/extractor/gfycat.py +++ b/youtube_dl/extractor/gfycat.py @@ -53,7 +53,7 @@ class GfycatIE(InfoExtractor): video_id = self._match_id(url) gfy = self._download_json( - 'http://gfycat.com/cajax/get/%s' % video_id, + 'https://api.gfycat.com/v1/gfycats/%s' % video_id, video_id, 'Downloading video info') if 'error' in gfy: raise ExtractorError('Gfycat said: ' + gfy['error'], expected=True) diff --git a/youtube_dl/extractor/globo.py b/youtube_dl/extractor/globo.py index c2140c362..fb8f7679b 100644 --- a/youtube_dl/extractor/globo.py +++ b/youtube_dl/extractor/globo.py @@ -72,7 +72,7 @@ class GloboIE(InfoExtractor): return try: - self._download_json( + glb_id = (self._download_json( 'https://login.globo.com/api/authentication', None, data=json.dumps({ 'payload': { 'email': email, @@ -81,7 +81,9 @@ class GloboIE(InfoExtractor): }, }).encode(), headers={ 'Content-Type': 'application/json; charset=utf-8', - }) + }) or {}).get('glbId') + if glb_id: + self._set_cookie('.globo.com', 'GLBID', glb_id) except ExtractorError as e: if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: resp = self._parse_json(e.cause.read(), None) diff --git a/youtube_dl/extractor/go.py b/youtube_dl/extractor/go.py index e781405f2..206d89e82 100644 --- a/youtube_dl/extractor/go.py +++ b/youtube_dl/extractor/go.py @@ -25,18 +25,19 @@ class GoIE(AdobePassIE): }, 'watchdisneychannel': { 'brand': '004', - 'requestor_id': 'Disney', + 'resource_id': 'Disney', }, 'watchdisneyjunior': { 'brand': '008', - 'requestor_id': 'DisneyJunior', + 'resource_id': 'DisneyJunior', }, 'watchdisneyxd': { 'brand': '009', - 'requestor_id': 'DisneyXD', + 'resource_id': 'DisneyXD', } } - _VALID_URL = r'https?://(?:(?P<sub_domain>%s)\.)?go\.com/(?:(?:[^/]+/)*(?P<id>vdka\w+)|(?:[^/]+/)*(?P<display_id>[^/?#]+))' % '|'.join(_SITE_INFO.keys()) + _VALID_URL = r'https?://(?:(?P<sub_domain>%s)\.)?go\.com/(?:(?:[^/]+/)*(?P<id>vdka\w+)|(?:[^/]+/)*(?P<display_id>[^/?#]+))'\ + % '|'.join(list(_SITE_INFO.keys()) + ['disneynow']) _TESTS = [{ 'url': 'http://abc.go.com/shows/designated-survivor/video/most-recent/VDKA3807643', 'info_dict': { @@ -62,6 +63,14 @@ class GoIE(AdobePassIE): }, { 'url': 'http://abc.go.com/shows/world-news-tonight/episode-guide/2017-02/17-021717-intense-stand-off-between-man-with-rifle-and-police-in-oakland', 'only_matching': True, + }, { + # brand 004 + 'url': 'http://disneynow.go.com/shows/big-hero-6-the-series/season-01/episode-10-mr-sparkles-loses-his-sparkle/vdka4637915', + 'only_matching': True, + }, { + # brand 008 + 'url': 'http://disneynow.go.com/shows/minnies-bow-toons/video/happy-campers/vdka4872013', + 'only_matching': True, }] def _extract_videos(self, brand, video_id='-1', show_id='-1'): @@ -72,14 +81,23 @@ class GoIE(AdobePassIE): def _real_extract(self, url): sub_domain, video_id, display_id = re.match(self._VALID_URL, url).groups() - site_info = self._SITE_INFO[sub_domain] - brand = site_info['brand'] - if not video_id: - webpage = self._download_webpage(url, display_id) + site_info = self._SITE_INFO.get(sub_domain, {}) + brand = site_info.get('brand') + if not video_id or not site_info: + webpage = self._download_webpage(url, display_id or video_id) video_id = self._search_regex( # There may be inner quotes, e.g. data-video-id="'VDKA3609139'" # from http://freeform.go.com/shows/shadowhunters/episodes/season-2/1-this-guilty-blood - r'data-video-id=["\']*(VDKA\w+)', webpage, 'video id', default=None) + r'data-video-id=["\']*(VDKA\w+)', webpage, 'video id', + default=None) + if not site_info: + brand = self._search_regex( + (r'data-brand=\s*["\']\s*(\d+)', + r'data-page-brand=\s*["\']\s*(\d+)'), webpage, 'brand', + default='004') + site_info = next( + si for _, si in self._SITE_INFO.items() + if si.get('brand') == brand) if not video_id: # show extraction works for Disney, DisneyJunior and DisneyXD # ABC and Freeform has different layout @@ -112,8 +130,8 @@ class GoIE(AdobePassIE): 'device': '001', } if video_data.get('accesslevel') == '1': - requestor_id = site_info['requestor_id'] - resource = self._get_mvpd_resource( + requestor_id = site_info.get('requestor_id', 'DisneyChannels') + resource = site_info.get('resource_id') or self._get_mvpd_resource( requestor_id, title, video_id, None) auth = self._extract_mvpd_auth( url, video_id, requestor_id, resource) diff --git a/youtube_dl/extractor/go90.py b/youtube_dl/extractor/go90.py index 35dde42d0..c3ea717bc 100644 --- a/youtube_dl/extractor/go90.py +++ b/youtube_dl/extractor/go90.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_HTTPError from ..utils import ( determine_ext, ExtractorError, @@ -14,8 +15,8 @@ from ..utils import ( class Go90IE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?go90\.com/videos/(?P<id>[0-9a-zA-Z]+)' - _TEST = { + _VALID_URL = r'https?://(?:www\.)?go90\.com/(?:videos|embed)/(?P<id>[0-9a-zA-Z]+)' + _TESTS = [{ 'url': 'https://www.go90.com/videos/84BUqjLpf9D', 'md5': 'efa7670dbbbf21a7b07b360652b24a32', 'info_dict': { @@ -27,15 +28,31 @@ class Go90IE(InfoExtractor): 'upload_date': '20170411', 'age_limit': 14, } - } + }, { + 'url': 'https://www.go90.com/embed/261MflWkD3N', + 'only_matching': True, + }] + _GEO_BYPASS = False def _real_extract(self, url): video_id = self._match_id(url) - video_data = self._download_json( - 'https://www.go90.com/api/view/items/' + video_id, - video_id, headers={ + + try: + headers = self.geo_verification_headers() + headers.update({ 'Content-Type': 'application/json; charset=utf-8', - }, data=b'{"client":"web","device_type":"pc"}') + }) + video_data = self._download_json( + 'https://www.go90.com/api/view/items/' + video_id, video_id, + headers=headers, data=b'{"client":"web","device_type":"pc"}') + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400: + message = self._parse_json(e.cause.read().decode(), None)['error']['message'] + if 'region unavailable' in message: + self.raise_geo_restricted(countries=['US']) + raise ExtractorError(message, expected=True) + raise + if video_data.get('requires_drm'): raise ExtractorError('This video is DRM protected.', expected=True) main_video_asset = video_data['main_video_asset'] diff --git a/youtube_dl/extractor/hidive.py b/youtube_dl/extractor/hidive.py index 39fabe8a5..f26f80265 100644 --- a/youtube_dl/extractor/hidive.py +++ b/youtube_dl/extractor/hidive.py @@ -8,6 +8,7 @@ from ..compat import compat_str from ..utils import ( ExtractorError, int_or_none, + url_or_none, urlencode_postdata, ) @@ -80,8 +81,8 @@ class HiDiveIE(InfoExtractor): bitrates = rendition.get('bitrates') if not isinstance(bitrates, dict): continue - m3u8_url = bitrates.get('hls') - if not isinstance(m3u8_url, compat_str): + m3u8_url = url_or_none(bitrates.get('hls')) + if not m3u8_url: continue formats.extend(self._extract_m3u8_formats( m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native', @@ -93,9 +94,8 @@ class HiDiveIE(InfoExtractor): if not isinstance(cc_file, list) or len(cc_file) < 3: continue cc_lang = cc_file[0] - cc_url = cc_file[2] - if not isinstance(cc_lang, compat_str) or not isinstance( - cc_url, compat_str): + cc_url = url_or_none(cc_file[2]) + if not isinstance(cc_lang, compat_str) or not cc_url: continue subtitles.setdefault(cc_lang, []).append({ 'url': cc_url, diff --git a/youtube_dl/extractor/hketv.py b/youtube_dl/extractor/hketv.py new file mode 100644 index 000000000..b57927fc1 --- /dev/null +++ b/youtube_dl/extractor/hketv.py @@ -0,0 +1,191 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + clean_html, + ExtractorError, + int_or_none, + merge_dicts, + parse_count, + str_or_none, + try_get, + unified_strdate, + urlencode_postdata, + urljoin, +) + + +class HKETVIE(InfoExtractor): + IE_NAME = 'hketv' + IE_DESC = '香港教育局教育電視 (HKETV) Educational Television, Hong Kong Educational Bureau' + _GEO_BYPASS = False + _GEO_COUNTRIES = ['HK'] + _VALID_URL = r'https?://(?:www\.)?hkedcity\.net/etv/resource/(?P<id>[0-9]+)' + _TESTS = [{ + 'url': 'https://www.hkedcity.net/etv/resource/2932360618', + 'md5': 'f193712f5f7abb208ddef3c5ea6ed0b7', + 'info_dict': { + 'id': '2932360618', + 'ext': 'mp4', + 'title': '喜閱一生(共享閱讀樂) (中、英文字幕可供選擇)', + 'description': 'md5:d5286d05219ef50e0613311cbe96e560', + 'upload_date': '20181024', + 'duration': 900, + 'subtitles': 'count:2', + }, + 'skip': 'Geo restricted to HK', + }, { + 'url': 'https://www.hkedcity.net/etv/resource/972641418', + 'md5': '1ed494c1c6cf7866a8290edad9b07dc9', + 'info_dict': { + 'id': '972641418', + 'ext': 'mp4', + 'title': '衣冠楚楚 (天使系列之一)', + 'description': 'md5:10bb3d659421e74f58e5db5691627b0f', + 'upload_date': '20070109', + 'duration': 907, + 'subtitles': {}, + }, + 'params': { + 'geo_verification_proxy': '<HK proxy here>', + }, + 'skip': 'Geo restricted to HK', + }] + + _CC_LANGS = { + '中文(繁體中文)': 'zh-Hant', + '中文(简体中文)': 'zh-Hans', + 'English': 'en', + 'Bahasa Indonesia': 'id', + '\u0939\u093f\u0928\u094d\u0926\u0940': 'hi', + '\u0928\u0947\u092a\u093e\u0932\u0940': 'ne', + 'Tagalog': 'tl', + '\u0e44\u0e17\u0e22': 'th', + '\u0627\u0631\u062f\u0648': 'ur', + } + _FORMAT_HEIGHTS = { + 'SD': 360, + 'HD': 720, + } + _APPS_BASE_URL = 'https://apps.hkedcity.net' + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + title = ( + self._html_search_meta( + ('ed_title', 'search.ed_title'), webpage, default=None) or + self._search_regex( + r'data-favorite_title_(?:eng|chi)=(["\'])(?P<id>(?:(?!\1).)+)\1', + webpage, 'title', default=None, group='url') or + self._html_search_regex( + r'<h1>([^<]+)</h1>', webpage, 'title', default=None) or + self._og_search_title(webpage) + ) + + file_id = self._search_regex( + r'post_var\[["\']file_id["\']\s*\]\s*=\s*(.+?);', + webpage, 'file ID') + curr_url = self._search_regex( + r'post_var\[["\']curr_url["\']\s*\]\s*=\s*"(.+?)";', + webpage, 'curr URL') + data = { + 'action': 'get_info', + 'curr_url': curr_url, + 'file_id': file_id, + 'video_url': file_id, + } + + response = self._download_json( + self._APPS_BASE_URL + '/media/play/handler.php', video_id, + data=urlencode_postdata(data), + headers=merge_dicts({ + 'Content-Type': 'application/x-www-form-urlencoded'}, + self.geo_verification_headers())) + + result = response['result'] + + if not response.get('success') or not response.get('access'): + error = clean_html(response.get('access_err_msg')) + if 'Video streaming is not available in your country' in error: + self.raise_geo_restricted( + msg=error, countries=self._GEO_COUNTRIES) + else: + raise ExtractorError(error, expected=True) + + formats = [] + + width = int_or_none(result.get('width')) + height = int_or_none(result.get('height')) + + playlist0 = result['playlist'][0] + for fmt in playlist0['sources']: + file_url = urljoin(self._APPS_BASE_URL, fmt.get('file')) + if not file_url: + continue + # If we ever wanted to provide the final resolved URL that + # does not require cookies, albeit with a shorter lifespan: + # urlh = self._downloader.urlopen(file_url) + # resolved_url = urlh.geturl() + label = fmt.get('label') + h = self._FORMAT_HEIGHTS.get(label) + w = h * width // height if h and width and height else None + formats.append({ + 'format_id': label, + 'ext': fmt.get('type'), + 'url': file_url, + 'width': w, + 'height': h, + }) + self._sort_formats(formats) + + subtitles = {} + tracks = try_get(playlist0, lambda x: x['tracks'], list) or [] + for track in tracks: + if not isinstance(track, dict): + continue + track_kind = str_or_none(track.get('kind')) + if not track_kind or not isinstance(track_kind, compat_str): + continue + if track_kind.lower() not in ('captions', 'subtitles'): + continue + track_url = urljoin(self._APPS_BASE_URL, track.get('file')) + if not track_url: + continue + track_label = track.get('label') + subtitles.setdefault(self._CC_LANGS.get( + track_label, track_label), []).append({ + 'url': self._proto_relative_url(track_url), + 'ext': 'srt', + }) + + # Likes + emotion = self._download_json( + 'https://emocounter.hkedcity.net/handler.php', video_id, + data=urlencode_postdata({ + 'action': 'get_emotion', + 'data[bucket_id]': 'etv', + 'data[identifier]': video_id, + }), + headers={'Content-Type': 'application/x-www-form-urlencoded'}, + fatal=False) or {} + like_count = int_or_none(try_get( + emotion, lambda x: x['data']['emotion_data'][0]['count'])) + + return { + 'id': video_id, + 'title': title, + 'description': self._html_search_meta( + 'description', webpage, fatal=False), + 'upload_date': unified_strdate(self._html_search_meta( + 'ed_date', webpage, fatal=False), day_first=False), + 'duration': int_or_none(result.get('length')), + 'formats': formats, + 'subtitles': subtitles, + 'thumbnail': urljoin(self._APPS_BASE_URL, result.get('image')), + 'view_count': parse_count(result.get('view_count')), + 'like_count': like_count, + } diff --git a/youtube_dl/extractor/hotstar.py b/youtube_dl/extractor/hotstar.py index d28af36ec..8de9c4faf 100644 --- a/youtube_dl/extractor/hotstar.py +++ b/youtube_dl/extractor/hotstar.py @@ -1,49 +1,56 @@ # coding: utf-8 from __future__ import unicode_literals -import re +import hashlib +import hmac +import time from .common import InfoExtractor -from ..compat import compat_str +from ..compat import compat_HTTPError from ..utils import ( determine_ext, ExtractorError, int_or_none, + try_get, ) class HotStarBaseIE(InfoExtractor): - _GEO_COUNTRIES = ['IN'] + _AKAMAI_ENCRYPTION_KEY = b'\x05\xfc\x1a\x01\xca\xc9\x4b\xc4\x12\xfc\x53\x12\x07\x75\xf9\xee' - def _download_json(self, *args, **kwargs): - response = super(HotStarBaseIE, self)._download_json(*args, **kwargs) - if response['resultCode'] != 'OK': - if kwargs.get('fatal'): - raise ExtractorError( - response['errorDescription'], expected=True) - return None - return response['resultObj'] - - def _download_content_info(self, content_id): - return self._download_json( - 'https://account.hotstar.com/AVS/besc', content_id, query={ - 'action': 'GetAggregatedContentDetails', - 'appVersion': '5.0.40', - 'channel': 'PCTV', - 'contentId': content_id, - })['contentInfo'][0] + def _call_api(self, path, video_id, query_name='contentId'): + st = int(time.time()) + exp = st + 6000 + auth = 'st=%d~exp=%d~acl=/*' % (st, exp) + auth += '~hmac=' + hmac.new(self._AKAMAI_ENCRYPTION_KEY, auth.encode(), hashlib.sha256).hexdigest() + response = self._download_json( + 'https://api.hotstar.com/' + path, + video_id, headers={ + 'hotstarauth': auth, + 'x-country-code': 'IN', + 'x-platform-code': 'JIO', + }, query={ + query_name: video_id, + 'tas': 10000, + }) + if response['statusCode'] != 'OK': + raise ExtractorError( + response['body']['message'], expected=True) + return response['body']['results'] class HotStarIE(HotStarBaseIE): + IE_NAME = 'hotstar' _VALID_URL = r'https?://(?:www\.)?hotstar\.com/(?:.+?[/-])?(?P<id>\d{10})' _TESTS = [{ - 'url': 'http://www.hotstar.com/on-air-with-aib--english-1000076273', + # contentData + 'url': 'https://www.hotstar.com/can-you-not-spread-rumours/1000076273', 'info_dict': { 'id': '1000076273', 'ext': 'mp4', - 'title': 'On Air With AIB', + 'title': 'Can You Not Spread Rumours?', 'description': 'md5:c957d8868e9bc793ccb813691cc4c434', - 'timestamp': 1447227000, + 'timestamp': 1447248600, 'upload_date': '20151111', 'duration': 381, }, @@ -51,6 +58,10 @@ class HotStarIE(HotStarBaseIE): # m3u8 download 'skip_download': True, } + }, { + # contentDetail + 'url': 'https://www.hotstar.com/movies/radha-gopalam/1000057157', + 'only_matching': True, }, { 'url': 'http://www.hotstar.com/sports/cricket/rajitha-sizzles-on-debut-with-329/2001477583', 'only_matching': True, @@ -58,47 +69,52 @@ class HotStarIE(HotStarBaseIE): 'url': 'http://www.hotstar.com/1000000515', 'only_matching': True, }] + _GEO_BYPASS = False def _real_extract(self, url): video_id = self._match_id(url) - video_data = self._download_content_info(video_id) + webpage = self._download_webpage(url, video_id) + app_state = self._parse_json(self._search_regex( + r'<script>window\.APP_STATE\s*=\s*({.+?})</script>', + webpage, 'app state'), video_id) + video_data = {} + getters = list( + lambda x, k=k: x['initialState']['content%s' % k]['content'] + for k in ('Data', 'Detail') + ) + for v in app_state.values(): + content = try_get(v, getters, dict) + if content and content.get('contentId') == video_id: + video_data = content + break - title = video_data['episodeTitle'] + title = video_data['title'] - if video_data.get('encrypted') == 'Y': + if video_data.get('drmProtected'): raise ExtractorError('This video is DRM protected.', expected=True) formats = [] - for f in ('JIO',): - format_data = self._download_json( - 'http://getcdn.hotstar.com/AVS/besc', - video_id, 'Downloading %s JSON metadata' % f, - fatal=False, query={ - 'action': 'GetCDN', - 'asJson': 'Y', - 'channel': f, - 'id': video_id, - 'type': 'VOD', - }) - if format_data: - format_url = format_data.get('src') - if not format_url: - continue - ext = determine_ext(format_url) - if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - format_url, video_id, 'mp4', - m3u8_id='hls', fatal=False)) - elif ext == 'f4m': - # produce broken files - continue - else: - formats.append({ - 'url': format_url, - 'width': int_or_none(format_data.get('width')), - 'height': int_or_none(format_data.get('height')), - }) + format_data = self._call_api('h/v1/play', video_id)['item'] + format_url = format_data['playbackUrl'] + ext = determine_ext(format_url) + if ext == 'm3u8': + try: + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', m3u8_id='hls')) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + self.raise_geo_restricted(countries=['IN']) + raise + elif ext == 'f4m': + # produce broken files + pass + else: + formats.append({ + 'url': format_url, + 'width': int_or_none(format_data.get('width')), + 'height': int_or_none(format_data.get('height')), + }) self._sort_formats(formats) return { @@ -106,57 +122,43 @@ class HotStarIE(HotStarBaseIE): 'title': title, 'description': video_data.get('description'), 'duration': int_or_none(video_data.get('duration')), - 'timestamp': int_or_none(video_data.get('broadcastDate')), + 'timestamp': int_or_none(video_data.get('broadcastDate') or video_data.get('startDate')), 'formats': formats, + 'channel': video_data.get('channelName'), + 'channel_id': video_data.get('channelId'), + 'series': video_data.get('showName'), + 'season': video_data.get('seasonName'), + 'season_number': int_or_none(video_data.get('seasonNo')), + 'season_id': video_data.get('seasonId'), 'episode': title, - 'episode_number': int_or_none(video_data.get('episodeNumber')), - 'series': video_data.get('contentTitle'), + 'episode_number': int_or_none(video_data.get('episodeNo')), } class HotStarPlaylistIE(HotStarBaseIE): IE_NAME = 'hotstar:playlist' - _VALID_URL = r'(?P<url>https?://(?:www\.)?hotstar\.com/tv/[^/]+/(?P<content_id>\d+))/(?P<type>[^/]+)/(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?hotstar\.com/tv/[^/]+/s-\w+/list/[^/]+/t-(?P<id>\w+)' _TESTS = [{ - 'url': 'http://www.hotstar.com/tv/pratidaan/14982/episodes/14812/9993', + 'url': 'https://www.hotstar.com/tv/savdhaan-india/s-26/list/popular-clips/t-3_2_26', 'info_dict': { - 'id': '14812', + 'id': '3_2_26', }, - 'playlist_mincount': 75, + 'playlist_mincount': 20, }, { - 'url': 'http://www.hotstar.com/tv/pratidaan/14982/popular-clips/9998/9998', + 'url': 'https://www.hotstar.com/tv/savdhaan-india/s-26/list/extras/t-2480', 'only_matching': True, }] - _ITEM_TYPES = { - 'episodes': 'EPISODE', - 'popular-clips': 'CLIPS', - } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - base_url = mobj.group('url') - content_id = mobj.group('content_id') - playlist_type = mobj.group('type') + playlist_id = self._match_id(url) - content_info = self._download_content_info(content_id) - playlist_id = compat_str(content_info['categoryId']) - - collection = self._download_json( - 'https://search.hotstar.com/AVS/besc', playlist_id, query={ - 'action': 'SearchContents', - 'appVersion': '5.0.40', - 'channel': 'PCTV', - 'moreFilters': 'series:%s;' % playlist_id, - 'query': '*', - 'searchOrder': 'last_broadcast_date desc,year desc,title asc', - 'type': self._ITEM_TYPES.get(playlist_type, 'EPISODE'), - }) + collection = self._call_api('o/v1/tray/find', playlist_id, 'uqId') entries = [ self.url_result( - '%s/_/%s' % (base_url, video['contentId']), + 'https://www.hotstar.com/%s' % video['contentId'], ie=HotStarIE.ie_key(), video_id=video['contentId']) - for video in collection['response']['docs'] + for video in collection['assets']['items'] if video.get('contentId')] return self.playlist_result(entries, playlist_id) diff --git a/youtube_dl/extractor/hungama.py b/youtube_dl/extractor/hungama.py new file mode 100644 index 000000000..3fdaac5b6 --- /dev/null +++ b/youtube_dl/extractor/hungama.py @@ -0,0 +1,117 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + urlencode_postdata, +) + + +class HungamaIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?:// + (?:www\.)?hungama\.com/ + (?: + (?:video|movie)/[^/]+/| + tv-show/(?:[^/]+/){2}\d+/episode/[^/]+/ + ) + (?P<id>\d+) + ''' + _TESTS = [{ + 'url': 'http://www.hungama.com/video/krishna-chants/39349649/', + 'md5': 'a845a6d1ebd08d80c1035126d49bd6a0', + 'info_dict': { + 'id': '2931166', + 'ext': 'mp4', + 'title': 'Lucky Ali - Kitni Haseen Zindagi', + 'track': 'Kitni Haseen Zindagi', + 'artist': 'Lucky Ali', + 'album': 'Aks', + 'release_year': 2000, + } + }, { + 'url': 'https://www.hungama.com/movie/kahaani-2/44129919/', + 'only_matching': True, + }, { + 'url': 'https://www.hungama.com/tv-show/padded-ki-pushup/season-1/44139461/episode/ep-02-training-sasu-pathlaag-karing/44139503/', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + info = self._search_json_ld(webpage, video_id) + + m3u8_url = self._download_json( + 'https://www.hungama.com/index.php', video_id, + data=urlencode_postdata({'content_id': video_id}), headers={ + 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', + 'X-Requested-With': 'XMLHttpRequest', + }, query={ + 'c': 'common', + 'm': 'get_video_mdn_url', + })['stream_url'] + + formats = self._extract_m3u8_formats( + m3u8_url, video_id, ext='mp4', entry_protocol='m3u8_native', + m3u8_id='hls') + self._sort_formats(formats) + + info.update({ + 'id': video_id, + 'formats': formats, + }) + return info + + +class HungamaSongIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?hungama\.com/song/[^/]+/(?P<id>\d+)' + _TEST = { + 'url': 'https://www.hungama.com/song/kitni-haseen-zindagi/2931166/', + 'md5': 'a845a6d1ebd08d80c1035126d49bd6a0', + 'info_dict': { + 'id': '2931166', + 'ext': 'mp4', + 'title': 'Lucky Ali - Kitni Haseen Zindagi', + 'track': 'Kitni Haseen Zindagi', + 'artist': 'Lucky Ali', + 'album': 'Aks', + 'release_year': 2000, + } + } + + def _real_extract(self, url): + audio_id = self._match_id(url) + + data = self._download_json( + 'https://www.hungama.com/audio-player-data/track/%s' % audio_id, + audio_id, query={'_country': 'IN'})[0] + + track = data['song_name'] + artist = data.get('singer_name') + + m3u8_url = self._download_json( + data.get('file') or data['preview_link'], + audio_id)['response']['media_url'] + + formats = self._extract_m3u8_formats( + m3u8_url, audio_id, ext='mp4', entry_protocol='m3u8_native', + m3u8_id='hls') + self._sort_formats(formats) + + title = '%s - %s' % (artist, track) if artist else track + thumbnail = data.get('img_src') or data.get('album_image') + + return { + 'id': audio_id, + 'title': title, + 'thumbnail': thumbnail, + 'track': track, + 'artist': artist, + 'album': data.get('album_name'), + 'release_year': int_or_none(data.get('date')), + 'formats': formats, + } diff --git a/youtube_dl/extractor/imdb.py b/youtube_dl/extractor/imdb.py index 4bafa54a2..436759da5 100644 --- a/youtube_dl/extractor/imdb.py +++ b/youtube_dl/extractor/imdb.py @@ -3,12 +3,12 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( determine_ext, mimetype2ext, parse_duration, qualities, + url_or_none, ) @@ -61,10 +61,11 @@ class ImdbIE(InfoExtractor): for encoding in video_metadata.get('encodings', []): if not encoding or not isinstance(encoding, dict): continue - video_url = encoding.get('videoUrl') - if not video_url or not isinstance(video_url, compat_str): + video_url = url_or_none(encoding.get('videoUrl')) + if not video_url: continue - ext = determine_ext(video_url, mimetype2ext(encoding.get('mimeType'))) + ext = mimetype2ext(encoding.get( + 'mimeType')) or determine_ext(video_url) if ext == 'm3u8': formats.extend(self._extract_m3u8_formats( video_url, video_id, 'mp4', entry_protocol='m3u8_native', diff --git a/youtube_dl/extractor/imgur.py b/youtube_dl/extractor/imgur.py index 2901960a5..a5ba03efa 100644 --- a/youtube_dl/extractor/imgur.py +++ b/youtube_dl/extractor/imgur.py @@ -12,7 +12,7 @@ from ..utils import ( class ImgurIE(InfoExtractor): - _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?:(?:gallery|(?:topic|r)/[^/]+)/)?(?P<id>[a-zA-Z0-9]{6,})(?:[/?#&]+|\.[a-z]+)?$' + _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?!(?:a|gallery|(?:t(?:opic)?|r)/[^/]+)/)(?P<id>[a-zA-Z0-9]+)' _TESTS = [{ 'url': 'https://i.imgur.com/A61SaA1.gifv', @@ -20,35 +20,23 @@ class ImgurIE(InfoExtractor): 'id': 'A61SaA1', 'ext': 'mp4', 'title': 're:Imgur GIF$|MRW gifv is up and running without any bugs$', - 'description': 'Imgur: The magic of the Internet', }, }, { 'url': 'https://imgur.com/A61SaA1', - 'info_dict': { - 'id': 'A61SaA1', - 'ext': 'mp4', - 'title': 're:Imgur GIF$|MRW gifv is up and running without any bugs$', - 'description': 'Imgur: The magic of the Internet', - }, - }, { - 'url': 'https://imgur.com/gallery/YcAQlkx', - 'info_dict': { - 'id': 'YcAQlkx', - 'ext': 'mp4', - 'title': 'Classic Steve Carell gif...cracks me up everytime....damn the repost downvotes....', - } - }, { - 'url': 'http://imgur.com/topic/Funny/N8rOudd', 'only_matching': True, }, { - 'url': 'http://imgur.com/r/aww/VQcQPhM', + 'url': 'https://i.imgur.com/crGpqCV.mp4', + 'only_matching': True, + }, { + # no title + 'url': 'https://i.imgur.com/jxBXAMC.gifv', 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) - gifv_url = 'https://i.imgur.com/{id}.gifv'.format(id=video_id) - webpage = self._download_webpage(gifv_url, video_id) + webpage = self._download_webpage( + 'https://i.imgur.com/{id}.gifv'.format(id=video_id), video_id) width = int_or_none(self._og_search_property( 'video:width', webpage, default=None)) @@ -69,7 +57,6 @@ class ImgurIE(InfoExtractor): 'format_id': m.group('type').partition('/')[2], 'url': self._proto_relative_url(m.group('src')), 'ext': mimetype2ext(m.group('type')), - 'acodec': 'none', 'width': width, 'height': height, 'http_headers': { @@ -104,44 +91,64 @@ class ImgurIE(InfoExtractor): return { 'id': video_id, 'formats': formats, - 'description': self._og_search_description(webpage, default=None), - 'title': self._og_search_title(webpage), + 'title': self._og_search_title(webpage, default=video_id), } -class ImgurAlbumIE(InfoExtractor): - _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?:(?:a|gallery|topic/[^/]+)/)?(?P<id>[a-zA-Z0-9]{5})(?:[/?#&]+)?$' +class ImgurGalleryIE(InfoExtractor): + IE_NAME = 'imgur:gallery' + _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?:gallery|(?:t(?:opic)?|r)/[^/]+)/(?P<id>[a-zA-Z0-9]+)' _TESTS = [{ 'url': 'http://imgur.com/gallery/Q95ko', 'info_dict': { 'id': 'Q95ko', + 'title': 'Adding faces make every GIF better', }, 'playlist_count': 25, }, { - 'url': 'http://imgur.com/a/j6Orj', + 'url': 'http://imgur.com/topic/Aww/ll5Vk', 'only_matching': True, }, { - 'url': 'http://imgur.com/topic/Aww/ll5Vk', + 'url': 'https://imgur.com/gallery/YcAQlkx', + 'info_dict': { + 'id': 'YcAQlkx', + 'ext': 'mp4', + 'title': 'Classic Steve Carell gif...cracks me up everytime....damn the repost downvotes....', + } + }, { + 'url': 'http://imgur.com/topic/Funny/N8rOudd', + 'only_matching': True, + }, { + 'url': 'http://imgur.com/r/aww/VQcQPhM', 'only_matching': True, }] def _real_extract(self, url): - album_id = self._match_id(url) + gallery_id = self._match_id(url) - album_images = self._download_json( - 'http://imgur.com/gallery/%s/album_images/hit.json?all=true' % album_id, - album_id, fatal=False) + data = self._download_json( + 'https://imgur.com/gallery/%s.json' % gallery_id, + gallery_id)['data']['image'] - if album_images: - data = album_images.get('data') - if data and isinstance(data, dict): - images = data.get('images') - if images and isinstance(images, list): - entries = [ - self.url_result('http://imgur.com/%s' % image['hash']) - for image in images if image.get('hash')] - return self.playlist_result(entries, album_id) + if data.get('is_album'): + entries = [ + self.url_result('http://imgur.com/%s' % image['hash'], ImgurIE.ie_key(), image['hash']) + for image in data['album_images']['images'] if image.get('hash')] + return self.playlist_result(entries, gallery_id, data.get('title'), data.get('description')) - # Fallback to single video - return self.url_result('http://imgur.com/%s' % album_id, ImgurIE.ie_key()) + return self.url_result('http://imgur.com/%s' % gallery_id, ImgurIE.ie_key(), gallery_id) + + +class ImgurAlbumIE(ImgurGalleryIE): + IE_NAME = 'imgur:album' + _VALID_URL = r'https?://(?:i\.)?imgur\.com/a/(?P<id>[a-zA-Z0-9]+)' + + _TESTS = [{ + 'url': 'http://imgur.com/a/j6Orj', + 'info_dict': { + 'id': 'j6Orj', + 'title': 'A Literary Analysis of "Star Wars: The Force Awakens"', + }, + 'playlist_count': 12, + }] diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py index 0c13f54ee..ffd87b55f 100644 --- a/youtube_dl/extractor/instagram.py +++ b/youtube_dl/extractor/instagram.py @@ -17,6 +17,7 @@ from ..utils import ( lowercase_escape, std_headers, try_get, + url_or_none, ) @@ -170,7 +171,7 @@ class InstagramIE(InfoExtractor): node = try_get(edge, lambda x: x['node'], dict) if not node: continue - node_video_url = try_get(node, lambda x: x['video_url'], compat_str) + node_video_url = url_or_none(node.get('video_url')) if not node_video_url: continue entries.append({ @@ -226,44 +227,37 @@ class InstagramIE(InfoExtractor): } -class InstagramUserIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?instagram\.com/(?P<id>[^/]{2,})/?(?:$|[?#])' - IE_DESC = 'Instagram user profile' - IE_NAME = 'instagram:user' - _TEST = { - 'url': 'https://instagram.com/porsche', - 'info_dict': { - 'id': 'porsche', - 'title': 'porsche', - }, - 'playlist_count': 5, - 'params': { - 'extract_flat': True, - 'skip_download': True, - 'playlistend': 5, - } - } +class InstagramPlaylistIE(InfoExtractor): + # A superclass for handling any kind of query based on GraphQL which + # results in a playlist. - _gis_tmpl = None + _gis_tmpl = None # used to cache GIS request type - def _entries(self, data): + def _parse_graphql(self, webpage, item_id): + # Reads a webpage and returns its GraphQL data. + return self._parse_json( + self._search_regex( + r'sharedData\s*=\s*({.+?})\s*;\s*[<\n]', webpage, 'data'), + item_id) + + def _extract_graphql(self, data, url): + # Parses GraphQL queries containing videos and generates a playlist. def get_count(suffix): return int_or_none(try_get( node, lambda x: x['edge_media_' + suffix]['count'])) - uploader_id = data['entry_data']['ProfilePage'][0]['graphql']['user']['id'] + uploader_id = self._match_id(url) csrf_token = data['config']['csrf_token'] rhx_gis = data.get('rhx_gis') or '3c7ca9dcefcf966d11dacf1f151335e8' - self._set_cookie('instagram.com', 'ig_pr', '1') - cursor = '' for page_num in itertools.count(1): - variables = json.dumps({ - 'id': uploader_id, + variables = { 'first': 12, 'after': cursor, - }) + } + variables.update(self._query_vars_for(data)) + variables = json.dumps(variables) if self._gis_tmpl: gis_tmpls = [self._gis_tmpl] @@ -275,21 +269,26 @@ class InstagramUserIE(InfoExtractor): '%s:%s:%s' % (rhx_gis, csrf_token, std_headers['User-Agent']), ] + # try all of the ways to generate a GIS query, and not only use the + # first one that works, but cache it for future requests for gis_tmpl in gis_tmpls: try: - media = self._download_json( + json_data = self._download_json( 'https://www.instagram.com/graphql/query/', uploader_id, 'Downloading JSON page %d' % page_num, headers={ 'X-Requested-With': 'XMLHttpRequest', 'X-Instagram-GIS': hashlib.md5( ('%s:%s' % (gis_tmpl, variables)).encode('utf-8')).hexdigest(), }, query={ - 'query_hash': '42323d64886122307be10013ad2dcc44', + 'query_hash': self._QUERY_HASH, 'variables': variables, - })['data']['user']['edge_owner_to_timeline_media'] + }) + media = self._parse_timeline_from(json_data) self._gis_tmpl = gis_tmpl break except ExtractorError as e: + # if it's an error caused by a bad query, and there are + # more GIS templates to try, ignore it and keep trying if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: if gis_tmpl != gis_tmpls[-1]: continue @@ -347,14 +346,80 @@ class InstagramUserIE(InfoExtractor): break def _real_extract(self, url): - username = self._match_id(url) + user_or_tag = self._match_id(url) + webpage = self._download_webpage(url, user_or_tag) + data = self._parse_graphql(webpage, user_or_tag) - webpage = self._download_webpage(url, username) - - data = self._parse_json( - self._search_regex( - r'sharedData\s*=\s*({.+?})\s*;\s*[<\n]', webpage, 'data'), - username) + self._set_cookie('instagram.com', 'ig_pr', '1') return self.playlist_result( - self._entries(data), username, username) + self._extract_graphql(data, url), user_or_tag, user_or_tag) + + +class InstagramUserIE(InstagramPlaylistIE): + _VALID_URL = r'https?://(?:www\.)?instagram\.com/(?P<id>[^/]{2,})/?(?:$|[?#])' + IE_DESC = 'Instagram user profile' + IE_NAME = 'instagram:user' + _TEST = { + 'url': 'https://instagram.com/porsche', + 'info_dict': { + 'id': 'porsche', + 'title': 'porsche', + }, + 'playlist_count': 5, + 'params': { + 'extract_flat': True, + 'skip_download': True, + 'playlistend': 5, + } + } + + _QUERY_HASH = '42323d64886122307be10013ad2dcc44', + + @staticmethod + def _parse_timeline_from(data): + # extracts the media timeline data from a GraphQL result + return data['data']['user']['edge_owner_to_timeline_media'] + + @staticmethod + def _query_vars_for(data): + # returns a dictionary of variables to add to the timeline query based + # on the GraphQL of the original page + return { + 'id': data['entry_data']['ProfilePage'][0]['graphql']['user']['id'] + } + + +class InstagramTagIE(InstagramPlaylistIE): + _VALID_URL = r'https?://(?:www\.)?instagram\.com/explore/tags/(?P<id>[^/]+)' + IE_DESC = 'Instagram hashtag search' + IE_NAME = 'instagram:tag' + _TEST = { + 'url': 'https://instagram.com/explore/tags/lolcats', + 'info_dict': { + 'id': 'lolcats', + 'title': 'lolcats', + }, + 'playlist_count': 50, + 'params': { + 'extract_flat': True, + 'skip_download': True, + 'playlistend': 50, + } + } + + _QUERY_HASH = 'f92f56d47dc7a55b606908374b43a314', + + @staticmethod + def _parse_timeline_from(data): + # extracts the media timeline data from a GraphQL result + return data['data']['hashtag']['edge_hashtag_to_media'] + + @staticmethod + def _query_vars_for(data): + # returns a dictionary of variables to add to the timeline query based + # on the GraphQL of the original page + return { + 'tag_name': + data['entry_data']['TagPage'][0]['graphql']['hashtag']['name'] + } diff --git a/youtube_dl/extractor/internazionale.py b/youtube_dl/extractor/internazionale.py index 10ba1f6cf..676e8e269 100644 --- a/youtube_dl/extractor/internazionale.py +++ b/youtube_dl/extractor/internazionale.py @@ -7,7 +7,7 @@ from ..utils import unified_timestamp class InternazionaleIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?internazionale\.it/video/(?:[^/]+/)*(?P<id>[^/?#&]+)' - _TEST = { + _TESTS = [{ 'url': 'https://www.internazionale.it/video/2015/02/19/richard-linklater-racconta-una-scena-di-boyhood', 'md5': '3e39d32b66882c1218e305acbf8348ca', 'info_dict': { @@ -23,7 +23,23 @@ class InternazionaleIE(InfoExtractor): 'params': { 'format': 'bestvideo', }, - } + }, { + 'url': 'https://www.internazionale.it/video/2018/08/29/telefono-stare-con-noi-stessi', + 'md5': '9db8663704cab73eb972d1cee0082c79', + 'info_dict': { + 'id': '761344', + 'display_id': 'telefono-stare-con-noi-stessi', + 'ext': 'mp4', + 'title': 'Usiamo il telefono per evitare di stare con noi stessi', + 'description': 'md5:75ccfb0d6bcefc6e7428c68b4aa1fe44', + 'timestamp': 1535528954, + 'upload_date': '20180829', + 'thumbnail': r're:^https?://.*\.jpg$', + }, + 'params': { + 'format': 'bestvideo', + }, + }] def _real_extract(self, url): display_id = self._match_id(url) @@ -40,8 +56,13 @@ class InternazionaleIE(InfoExtractor): DATA_RE % 'job-id', webpage, 'video id', group='value') video_path = self._search_regex( DATA_RE % 'video-path', webpage, 'video path', group='value') + video_available_abroad = self._search_regex( + DATA_RE % 'video-available_abroad', webpage, + 'video available aboard', default='1', group='value') + video_available_abroad = video_available_abroad == '1' - video_base = 'https://video.internazionale.it/%s/%s.' % (video_path, video_id) + video_base = 'https://video%s.internazionale.it/%s/%s.' % \ + ('' if video_available_abroad else '-ita', video_path, video_id) formats = self._extract_m3u8_formats( video_base + 'm3u8', display_id, 'mp4', diff --git a/youtube_dl/extractor/iprima.py b/youtube_dl/extractor/iprima.py index a29e6a5ba..11bbeb592 100644 --- a/youtube_dl/extractor/iprima.py +++ b/youtube_dl/extractor/iprima.py @@ -12,7 +12,7 @@ from ..utils import ( class IPrimaIE(InfoExtractor): - _VALID_URL = r'https?://play\.iprima\.cz/(?:.+/)?(?P<id>[^?#]+)' + _VALID_URL = r'https?://(?:[^/]+)\.iprima\.cz/(?:[^/]+/)*(?P<id>[^/?#&]+)' _GEO_BYPASS = False _TESTS = [{ @@ -33,14 +33,45 @@ class IPrimaIE(InfoExtractor): # geo restricted 'url': 'http://play.iprima.cz/closer-nove-pripady/closer-nove-pripady-iv-1', 'only_matching': True, + }, { + # iframe api.play-backend.iprima.cz + 'url': 'https://prima.iprima.cz/my-little-pony/mapa-znameni-2-2', + 'only_matching': True, + }, { + # iframe prima.iprima.cz + 'url': 'https://prima.iprima.cz/porady/jak-se-stavi-sen/rodina-rathousova-praha', + 'only_matching': True, + }, { + 'url': 'http://www.iprima.cz/filmy/desne-rande', + 'only_matching': True, + }, { + 'url': 'https://zoom.iprima.cz/10-nejvetsich-tajemstvi-zahad/posvatna-mista-a-stavby', + 'only_matching': True, + }, { + 'url': 'https://krimi.iprima.cz/mraz-0/sebevrazdy', + 'only_matching': True, + }, { + 'url': 'https://cool.iprima.cz/derava-silnice-nevadi', + 'only_matching': True, + }, { + 'url': 'https://love.iprima.cz/laska-az-za-hrob/slib-dany-bratrovi', + 'only_matching': True, + }, { + 'url': 'https://autosalon.iprima.cz/motorsport/7-epizoda-1', + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) + self._set_cookie('play.iprima.cz', 'ott_adult_confirmed', '1') + webpage = self._download_webpage(url, video_id) - video_id = self._search_regex(r'data-product="([^"]+)">', webpage, 'real id') + video_id = self._search_regex( + (r'<iframe[^>]+\bsrc=["\'](?:https?:)?//(?:api\.play-backend\.iprima\.cz/prehravac/embedded|prima\.iprima\.cz/[^/]+/[^/]+)\?.*?\bid=(p\d+)', + r'data-product="([^"]+)">'), + webpage, 'real id') playerpage = self._download_webpage( 'http://play.iprima.cz/prehravac/init', diff --git a/youtube_dl/extractor/itv.py b/youtube_dl/extractor/itv.py index 6a4f8a505..de65b6bb4 100644 --- a/youtube_dl/extractor/itv.py +++ b/youtube_dl/extractor/itv.py @@ -13,15 +13,17 @@ from ..compat import ( compat_etree_register_namespace, ) from ..utils import ( + determine_ext, + ExtractorError, extract_attributes, + int_or_none, + merge_dicts, + parse_duration, + smuggle_url, + url_or_none, xpath_with_ns, xpath_element, xpath_text, - int_or_none, - parse_duration, - smuggle_url, - ExtractorError, - determine_ext, ) @@ -129,64 +131,65 @@ class ITVIE(InfoExtractor): resp_env = self._download_xml( params['data-playlist-url'], video_id, - headers=headers, data=etree.tostring(req_env)) - playlist = xpath_element(resp_env, './/Playlist') - if playlist is None: - fault_code = xpath_text(resp_env, './/faultcode') - fault_string = xpath_text(resp_env, './/faultstring') - if fault_code == 'InvalidGeoRegion': - self.raise_geo_restricted( - msg=fault_string, countries=self._GEO_COUNTRIES) - elif fault_code not in ( - 'InvalidEntity', 'InvalidVodcrid', 'ContentUnavailable'): - raise ExtractorError( - '%s said: %s' % (self.IE_NAME, fault_string), expected=True) - info.update({ - 'title': self._og_search_title(webpage), - 'episode_title': params.get('data-video-episode'), - 'series': params.get('data-video-title'), - }) - else: - title = xpath_text(playlist, 'EpisodeTitle', default=None) - info.update({ - 'title': title, - 'episode_title': title, - 'episode_number': int_or_none(xpath_text(playlist, 'EpisodeNumber')), - 'series': xpath_text(playlist, 'ProgrammeTitle'), - 'duration': parse_duration(xpath_text(playlist, 'Duration')), - }) - video_element = xpath_element(playlist, 'VideoEntries/Video', fatal=True) - media_files = xpath_element(video_element, 'MediaFiles', fatal=True) - rtmp_url = media_files.attrib['base'] + headers=headers, data=etree.tostring(req_env), fatal=False) + if resp_env: + playlist = xpath_element(resp_env, './/Playlist') + if playlist is None: + fault_code = xpath_text(resp_env, './/faultcode') + fault_string = xpath_text(resp_env, './/faultstring') + if fault_code == 'InvalidGeoRegion': + self.raise_geo_restricted( + msg=fault_string, countries=self._GEO_COUNTRIES) + elif fault_code not in ( + 'InvalidEntity', 'InvalidVodcrid', 'ContentUnavailable'): + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, fault_string), expected=True) + info.update({ + 'title': self._og_search_title(webpage), + 'episode_title': params.get('data-video-episode'), + 'series': params.get('data-video-title'), + }) + else: + title = xpath_text(playlist, 'EpisodeTitle', default=None) + info.update({ + 'title': title, + 'episode_title': title, + 'episode_number': int_or_none(xpath_text(playlist, 'EpisodeNumber')), + 'series': xpath_text(playlist, 'ProgrammeTitle'), + 'duration': parse_duration(xpath_text(playlist, 'Duration')), + }) + video_element = xpath_element(playlist, 'VideoEntries/Video', fatal=True) + media_files = xpath_element(video_element, 'MediaFiles', fatal=True) + rtmp_url = media_files.attrib['base'] - for media_file in media_files.findall('MediaFile'): - play_path = xpath_text(media_file, 'URL') - if not play_path: - continue - tbr = int_or_none(media_file.get('bitrate'), 1000) - f = { - 'format_id': 'rtmp' + ('-%d' % tbr if tbr else ''), - 'play_path': play_path, - # Providing this swfVfy allows to avoid truncated downloads - 'player_url': 'http://www.itv.com/mercury/Mercury_VideoPlayer.swf', - 'page_url': url, - 'tbr': tbr, - 'ext': 'flv', - } - app = self._search_regex( - 'rtmpe?://[^/]+/(.+)$', rtmp_url, 'app', default=None) - if app: - f.update({ - 'url': rtmp_url.split('?', 1)[0], - 'app': app, - }) - else: - f['url'] = rtmp_url - formats.append(f) + for media_file in media_files.findall('MediaFile'): + play_path = xpath_text(media_file, 'URL') + if not play_path: + continue + tbr = int_or_none(media_file.get('bitrate'), 1000) + f = { + 'format_id': 'rtmp' + ('-%d' % tbr if tbr else ''), + 'play_path': play_path, + # Providing this swfVfy allows to avoid truncated downloads + 'player_url': 'http://www.itv.com/mercury/Mercury_VideoPlayer.swf', + 'page_url': url, + 'tbr': tbr, + 'ext': 'flv', + } + app = self._search_regex( + 'rtmpe?://[^/]+/(.+)$', rtmp_url, 'app', default=None) + if app: + f.update({ + 'url': rtmp_url.split('?', 1)[0], + 'app': app, + }) + else: + f['url'] = rtmp_url + formats.append(f) - for caption_url in video_element.findall('ClosedCaptioningURIs/URL'): - if caption_url.text: - extract_subtitle(caption_url.text) + for caption_url in video_element.findall('ClosedCaptioningURIs/URL'): + if caption_url.text: + extract_subtitle(caption_url.text) ios_playlist_url = params.get('data-video-playlist') or params.get('data-video-id') hmac = params.get('data-video-hmac') @@ -248,8 +251,8 @@ class ITVIE(InfoExtractor): for sub in subs: if not isinstance(sub, dict): continue - href = sub.get('Href') - if isinstance(href, compat_str): + href = url_or_none(sub.get('Href')) + if href: extract_subtitle(href) if not info.get('duration'): info['duration'] = parse_duration(video_data.get('Duration')) @@ -261,7 +264,17 @@ class ITVIE(InfoExtractor): 'formats': formats, 'subtitles': subtitles, }) - return info + + webpage_info = self._search_json_ld(webpage, video_id, default={}) + if not webpage_info.get('title'): + webpage_info['title'] = self._html_search_regex( + r'(?s)<h\d+[^>]+\bclass=["\'][^>]*episode-title["\'][^>]*>([^<]+)<', + webpage, 'title', default=None) or self._og_search_title( + webpage, default=None) or self._html_search_meta( + 'twitter:title', webpage, 'title', + default=None) or webpage_info['episode'] + + return merge_dicts(info, webpage_info) class ITVBTCCIE(InfoExtractor): diff --git a/youtube_dl/extractor/ivi.py b/youtube_dl/extractor/ivi.py index cb51cef2d..86c014b07 100644 --- a/youtube_dl/extractor/ivi.py +++ b/youtube_dl/extractor/ivi.py @@ -15,7 +15,7 @@ from ..utils import ( class IviIE(InfoExtractor): IE_DESC = 'ivi.ru' IE_NAME = 'ivi' - _VALID_URL = r'https?://(?:www\.)?ivi\.ru/(?:watch/(?:[^/]+/)?|video/player\?.*?videoId=)(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?ivi\.(?:ru|tv)/(?:watch/(?:[^/]+/)?|video/player\?.*?videoId=)(?P<id>\d+)' _GEO_BYPASS = False _GEO_COUNTRIES = ['RU'] @@ -65,7 +65,11 @@ class IviIE(InfoExtractor): 'thumbnail': r're:^https?://.*\.jpg$', }, 'skip': 'Only works from Russia', - } + }, + { + 'url': 'https://www.ivi.tv/watch/33560/', + 'only_matching': True, + }, ] # Sorted by quality diff --git a/youtube_dl/extractor/iwara.py b/youtube_dl/extractor/iwara.py index a7514fc80..907d5fc8b 100644 --- a/youtube_dl/extractor/iwara.py +++ b/youtube_dl/extractor/iwara.py @@ -7,6 +7,7 @@ from ..utils import ( int_or_none, mimetype2ext, remove_end, + url_or_none, ) @@ -73,11 +74,14 @@ class IwaraIE(InfoExtractor): formats = [] for a_format in video_data: + format_uri = url_or_none(a_format.get('uri')) + if not format_uri: + continue format_id = a_format.get('resolution') height = int_or_none(self._search_regex( r'(\d+)p', format_id, 'height', default=None)) formats.append({ - 'url': a_format['uri'], + 'url': self._proto_relative_url(format_uri, 'https:'), 'format_id': format_id, 'ext': mimetype2ext(a_format.get('mime')) or 'mp4', 'height': height, diff --git a/youtube_dl/extractor/jamendo.py b/youtube_dl/extractor/jamendo.py index 595d7a5b7..c21827618 100644 --- a/youtube_dl/extractor/jamendo.py +++ b/youtube_dl/extractor/jamendo.py @@ -26,8 +26,15 @@ class JamendoBaseIE(InfoExtractor): class JamendoIE(JamendoBaseIE): - _VALID_URL = r'https?://(?:www\.)?jamendo\.com/track/(?P<id>[0-9]+)/(?P<display_id>[^/?#&]+)' - _TEST = { + _VALID_URL = r'''(?x) + https?:// + (?: + licensing\.jamendo\.com/[^/]+| + (?:www\.)?jamendo\.com + ) + /track/(?P<id>[0-9]+)/(?P<display_id>[^/?#&]+) + ''' + _TESTS = [{ 'url': 'https://www.jamendo.com/track/196219/stories-from-emona-i', 'md5': '6e9e82ed6db98678f171c25a8ed09ffd', 'info_dict': { @@ -40,14 +47,19 @@ class JamendoIE(JamendoBaseIE): 'duration': 210, 'thumbnail': r're:^https?://.*\.jpg' } - } + }, { + 'url': 'https://licensing.jamendo.com/en/track/1496667/energetic-rock', + 'only_matching': True, + }] def _real_extract(self, url): mobj = self._VALID_URL_RE.match(url) track_id = mobj.group('id') display_id = mobj.group('display_id') - webpage = self._download_webpage(url, display_id) + webpage = self._download_webpage( + 'https://www.jamendo.com/track/%s/%s' % (track_id, display_id), + display_id) title, artist, track = self._extract_meta(webpage) diff --git a/youtube_dl/extractor/joj.py b/youtube_dl/extractor/joj.py index a764023e9..62b28e980 100644 --- a/youtube_dl/extractor/joj.py +++ b/youtube_dl/extractor/joj.py @@ -18,7 +18,7 @@ class JojIE(InfoExtractor): joj:| https?://media\.joj\.sk/embed/ ) - (?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}) + (?P<id>[^/?#^]+) ''' _TESTS = [{ 'url': 'https://media.joj.sk/embed/a388ec4c-6019-4a4a-9312-b1bee194e932', @@ -29,16 +29,24 @@ class JojIE(InfoExtractor): 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 3118, } + }, { + 'url': 'https://media.joj.sk/embed/9i1cxv', + 'only_matching': True, }, { 'url': 'joj:a388ec4c-6019-4a4a-9312-b1bee194e932', 'only_matching': True, + }, { + 'url': 'joj:9i1cxv', + 'only_matching': True, }] @staticmethod def _extract_urls(webpage): - return re.findall( - r'<iframe\b[^>]+\bsrc=["\'](?P<url>(?:https?:)?//media\.joj\.sk/embed/[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})', - webpage) + return [ + mobj.group('url') + for mobj in re.finditer( + r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//media\.joj\.sk/embed/(?:(?!\1).)+)\1', + webpage)] def _real_extract(self, url): video_id = self._match_id(url) @@ -53,7 +61,7 @@ class JojIE(InfoExtractor): bitrates = self._parse_json( self._search_regex( - r'(?s)bitrates\s*=\s*({.+?});', webpage, 'bitrates', + r'(?s)(?:src|bitrates)\s*=\s*({.+?});', webpage, 'bitrates', default='{}'), video_id, transform_source=js_to_json, fatal=False) diff --git a/youtube_dl/extractor/jwplatform.py b/youtube_dl/extractor/jwplatform.py index 63d0dc998..d19a6a774 100644 --- a/youtube_dl/extractor/jwplatform.py +++ b/youtube_dl/extractor/jwplatform.py @@ -7,8 +7,8 @@ from .common import InfoExtractor class JWPlatformIE(InfoExtractor): - _VALID_URL = r'(?:https?://content\.jwplatform\.com/(?:feeds|players|jw6)/|jwplatform:)(?P<id>[a-zA-Z0-9]{8})' - _TEST = { + _VALID_URL = r'(?:https?://(?:content\.jwplatform|cdn\.jwplayer)\.com/(?:(?:feed|player|thumb|preview|video|manifest)s|jw6|v2/media)/|jwplatform:)(?P<id>[a-zA-Z0-9]{8})' + _TESTS = [{ 'url': 'http://content.jwplatform.com/players/nPripu9l-ALJ3XQCI.js', 'md5': 'fa8899fa601eb7c83a64e9d568bdf325', 'info_dict': { @@ -19,7 +19,10 @@ class JWPlatformIE(InfoExtractor): 'upload_date': '20081127', 'timestamp': 1227796140, } - } + }, { + 'url': 'https://cdn.jwplayer.com/players/nPripu9l-ALJ3XQCI.js', + 'only_matching': True, + }] @staticmethod def _extract_url(webpage): @@ -34,5 +37,5 @@ class JWPlatformIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - json_data = self._download_json('http://content.jwplatform.com/feeds/%s.json' % video_id, video_id) + json_data = self._download_json('https://cdn.jwplayer.com/v2/media/' + video_id, video_id) return self._parse_jwplayer_data(json_data, video_id) diff --git a/youtube_dl/extractor/kaltura.py b/youtube_dl/extractor/kaltura.py index 04f68fce4..fdf7f5bbc 100644 --- a/youtube_dl/extractor/kaltura.py +++ b/youtube_dl/extractor/kaltura.py @@ -192,6 +192,8 @@ class KalturaIE(InfoExtractor): 'entryId': video_id, 'service': 'baseentry', 'ks': '{1:result:ks}', + 'responseProfile:fields': 'createdAt,dataUrl,duration,name,plays,thumbnailUrl,userId', + 'responseProfile:type': 1, }, { 'action': 'getbyentryid', diff --git a/youtube_dl/extractor/keezmovies.py b/youtube_dl/extractor/keezmovies.py index d4e6f7ac1..c3eb74c17 100644 --- a/youtube_dl/extractor/keezmovies.py +++ b/youtube_dl/extractor/keezmovies.py @@ -4,16 +4,14 @@ import re from .common import InfoExtractor from ..aes import aes_decrypt_text -from ..compat import ( - compat_str, - compat_urllib_parse_unquote, -) +from ..compat import compat_urllib_parse_unquote from ..utils import ( determine_ext, ExtractorError, int_or_none, str_to_int, strip_or_none, + url_or_none, ) @@ -55,7 +53,8 @@ class KeezMoviesIE(InfoExtractor): encrypted = False def extract_format(format_url, height=None): - if not isinstance(format_url, compat_str) or not format_url.startswith(('http', '//')): + format_url = url_or_none(format_url) + if not format_url or not format_url.startswith(('http', '//')): return if format_url in format_urls: return diff --git a/youtube_dl/extractor/kinopoisk.py b/youtube_dl/extractor/kinopoisk.py new file mode 100644 index 000000000..9e8d01f53 --- /dev/null +++ b/youtube_dl/extractor/kinopoisk.py @@ -0,0 +1,70 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + dict_get, + int_or_none, +) + + +class KinoPoiskIE(InfoExtractor): + _GEO_COUNTRIES = ['RU'] + _VALID_URL = r'https?://(?:www\.)?kinopoisk\.ru/film/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://www.kinopoisk.ru/film/81041/watch/', + 'md5': '4f71c80baea10dfa54a837a46111d326', + 'info_dict': { + 'id': '81041', + 'ext': 'mp4', + 'title': 'Алеша попович и тугарин змей', + 'description': 'md5:43787e673d68b805d0aa1df5a5aea701', + 'thumbnail': r're:^https?://.*', + 'duration': 4533, + 'age_limit': 12, + }, + 'params': { + 'format': 'bestvideo', + }, + }, { + 'url': 'https://www.kinopoisk.ru/film/81041', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage( + 'https://ott-widget.kinopoisk.ru/v1/kp/', video_id, + query={'kpId': video_id}) + + data = self._parse_json( + self._search_regex( + r'(?s)<script[^>]+\btype=["\']application/json[^>]+>(.+?)<', + webpage, 'data'), + video_id)['models'] + + film = data['filmStatus'] + title = film.get('title') or film['originalTitle'] + + formats = self._extract_m3u8_formats( + data['playlistEntity']['uri'], video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls') + self._sort_formats(formats) + + description = dict_get( + film, ('descriptscription', 'description', + 'shortDescriptscription', 'shortDescription')) + thumbnail = film.get('coverUrl') or film.get('posterUrl') + duration = int_or_none(film.get('duration')) + age_limit = int_or_none(film.get('restrictionAge')) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'age_limit': age_limit, + 'formats': formats, + } diff --git a/youtube_dl/extractor/konserthusetplay.py b/youtube_dl/extractor/konserthusetplay.py index c11cbcf47..dd42bb2f2 100644 --- a/youtube_dl/extractor/konserthusetplay.py +++ b/youtube_dl/extractor/konserthusetplay.py @@ -2,11 +2,11 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( determine_ext, float_or_none, int_or_none, + url_or_none, ) @@ -109,7 +109,8 @@ class KonserthusetPlayIE(InfoExtractor): captions = source.get('captionsAvailableLanguages') if isinstance(captions, dict): for lang, subtitle_url in captions.items(): - if lang != 'none' and isinstance(subtitle_url, compat_str): + subtitle_url = url_or_none(subtitle_url) + if lang != 'none' and subtitle_url: subtitles.setdefault(lang, []).append({'url': subtitle_url}) return { diff --git a/youtube_dl/extractor/laola1tv.py b/youtube_dl/extractor/laola1tv.py index c7f813370..fa217365a 100644 --- a/youtube_dl/extractor/laola1tv.py +++ b/youtube_dl/extractor/laola1tv.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals import json +import re from .common import InfoExtractor from ..utils import ( @@ -32,7 +33,8 @@ class Laola1TvEmbedIE(InfoExtractor): def _extract_token_url(self, stream_access_url, video_id, data): return self._download_json( - stream_access_url, video_id, headers={ + self._proto_relative_url(stream_access_url, 'https:'), video_id, + headers={ 'Content-Type': 'application/json', }, data=json.dumps(data).encode())['data']['stream-access'][0] @@ -119,9 +121,59 @@ class Laola1TvEmbedIE(InfoExtractor): } -class Laola1TvIE(Laola1TvEmbedIE): +class Laola1TvBaseIE(Laola1TvEmbedIE): + def _extract_video(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + if 'Dieser Livestream ist bereits beendet.' in webpage: + raise ExtractorError('This live stream has already finished.', expected=True) + + conf = self._parse_json(self._search_regex( + r'(?s)conf\s*=\s*({.+?});', webpage, 'conf'), + display_id, + transform_source=lambda s: js_to_json(re.sub(r'shareurl:.+,', '', s))) + video_id = conf['videoid'] + + config = self._download_json(conf['configUrl'], video_id, query={ + 'videoid': video_id, + 'partnerid': conf['partnerid'], + 'language': conf.get('language', ''), + 'portal': conf.get('portalid', ''), + }) + error = config.get('error') + if error: + raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True) + + video_data = config['video'] + title = video_data['title'] + is_live = video_data.get('isLivestream') and video_data.get('isLive') + meta = video_data.get('metaInformation') + sports = meta.get('sports') + categories = sports.split(',') if sports else [] + + token_url = self._extract_token_url( + video_data['streamAccess'], video_id, + video_data['abo']['required']) + + formats = self._extract_formats(token_url, video_id) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': self._live_title(title) if is_live else title, + 'description': video_data.get('description'), + 'thumbnail': video_data.get('image'), + 'categories': categories, + 'formats': formats, + 'is_live': is_live, + } + + +class Laola1TvIE(Laola1TvBaseIE): IE_NAME = 'laola1tv' _VALID_URL = r'https?://(?:www\.)?laola1\.tv/[a-z]+-[a-z]+/[^/]+/(?P<id>[^/?#&]+)' + _TESTS = [{ 'url': 'http://www.laola1.tv/de-de/video/straubing-tigers-koelner-haie/227883.html', 'info_dict': { @@ -169,52 +221,30 @@ class Laola1TvIE(Laola1TvEmbedIE): }] def _real_extract(self, url): - display_id = self._match_id(url) + return self._extract_video(url) - webpage = self._download_webpage(url, display_id) - if 'Dieser Livestream ist bereits beendet.' in webpage: - raise ExtractorError('This live stream has already finished.', expected=True) +class EHFTVIE(Laola1TvBaseIE): + IE_NAME = 'ehftv' + _VALID_URL = r'https?://(?:www\.)?ehftv\.com/[a-z]+(?:-[a-z]+)?/[^/]+/(?P<id>[^/?#&]+)' - conf = self._parse_json(self._search_regex( - r'(?s)conf\s*=\s*({.+?});', webpage, 'conf'), - display_id, js_to_json) + _TESTS = [{ + 'url': 'https://www.ehftv.com/int/video/paris-saint-germain-handball-pge-vive-kielce/1166761', + 'info_dict': { + 'id': '1166761', + 'display_id': 'paris-saint-germain-handball-pge-vive-kielce', + 'ext': 'mp4', + 'title': 'Paris Saint-Germain Handball - PGE Vive Kielce', + 'is_live': False, + 'categories': ['Handball'], + }, + 'params': { + 'skip_download': True, + }, + }] - video_id = conf['videoid'] - - config = self._download_json(conf['configUrl'], video_id, query={ - 'videoid': video_id, - 'partnerid': conf['partnerid'], - 'language': conf.get('language', ''), - 'portal': conf.get('portalid', ''), - }) - error = config.get('error') - if error: - raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True) - - video_data = config['video'] - title = video_data['title'] - is_live = video_data.get('isLivestream') and video_data.get('isLive') - meta = video_data.get('metaInformation') - sports = meta.get('sports') - categories = sports.split(',') if sports else [] - - token_url = self._extract_token_url( - video_data['streamAccess'], video_id, - video_data['abo']['required']) - - formats = self._extract_formats(token_url, video_id) - - return { - 'id': video_id, - 'display_id': display_id, - 'title': self._live_title(title) if is_live else title, - 'description': video_data.get('description'), - 'thumbnail': video_data.get('image'), - 'categories': categories, - 'formats': formats, - 'is_live': is_live, - } + def _real_extract(self, url): + return self._extract_video(url) class ITTFIE(InfoExtractor): diff --git a/youtube_dl/extractor/lci.py b/youtube_dl/extractor/lci.py index af34829e7..920872f5c 100644 --- a/youtube_dl/extractor/lci.py +++ b/youtube_dl/extractor/lci.py @@ -20,5 +20,7 @@ class LCIIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - wat_id = self._search_regex(r'data-watid=[\'"](\d+)', webpage, 'wat id') + wat_id = self._search_regex( + (r'data-watid=[\'"](\d+)', r'idwat["\']?\s*:\s*["\']?(\d+)'), + webpage, 'wat id') return self.url_result('wat:' + wat_id, 'Wat', wat_id) diff --git a/youtube_dl/extractor/lecturio.py b/youtube_dl/extractor/lecturio.py new file mode 100644 index 000000000..24f78d928 --- /dev/null +++ b/youtube_dl/extractor/lecturio.py @@ -0,0 +1,229 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + determine_ext, + extract_attributes, + ExtractorError, + float_or_none, + int_or_none, + str_or_none, + url_or_none, + urlencode_postdata, + urljoin, +) + + +class LecturioBaseIE(InfoExtractor): + _LOGIN_URL = 'https://app.lecturio.com/en/login' + _NETRC_MACHINE = 'lecturio' + + def _real_initialize(self): + self._login() + + def _login(self): + username, password = self._get_login_info() + if username is None: + return + + # Sets some cookies + _, urlh = self._download_webpage_handle( + self._LOGIN_URL, None, 'Downloading login popup') + + def is_logged(url_handle): + return self._LOGIN_URL not in compat_str(url_handle.geturl()) + + # Already logged in + if is_logged(urlh): + return + + login_form = { + 'signin[email]': username, + 'signin[password]': password, + 'signin[remember]': 'on', + } + + response, urlh = self._download_webpage_handle( + self._LOGIN_URL, None, 'Logging in', + data=urlencode_postdata(login_form)) + + # Logged in successfully + if is_logged(urlh): + return + + errors = self._html_search_regex( + r'(?s)<ul[^>]+class=["\']error_list[^>]+>(.+?)</ul>', response, + 'errors', default=None) + if errors: + raise ExtractorError('Unable to login: %s' % errors, expected=True) + raise ExtractorError('Unable to log in') + + +class LecturioIE(LecturioBaseIE): + _VALID_URL = r'''(?x) + https:// + (?: + app\.lecturio\.com/[^/]+/(?P<id>[^/?#&]+)\.lecture| + (?:www\.)?lecturio\.de/[^/]+/(?P<id_de>[^/?#&]+)\.vortrag + ) + ''' + _TESTS = [{ + 'url': 'https://app.lecturio.com/medical-courses/important-concepts-and-terms-introduction-to-microbiology.lecture#tab/videos', + 'md5': 'f576a797a5b7a5e4e4bbdfc25a6a6870', + 'info_dict': { + 'id': '39634', + 'ext': 'mp4', + 'title': 'Important Concepts and Terms – Introduction to Microbiology', + }, + 'skip': 'Requires lecturio account credentials', + }, { + 'url': 'https://www.lecturio.de/jura/oeffentliches-recht-staatsexamen.vortrag', + 'only_matching': True, + }] + + _CC_LANGS = { + 'German': 'de', + 'English': 'en', + 'Spanish': 'es', + 'French': 'fr', + 'Polish': 'pl', + 'Russian': 'ru', + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + display_id = mobj.group('id') or mobj.group('id_de') + + webpage = self._download_webpage( + 'https://app.lecturio.com/en/lecture/%s/player.html' % display_id, + display_id) + + lecture_id = self._search_regex( + r'lecture_id\s*=\s*(?:L_)?(\d+)', webpage, 'lecture id') + + api_url = self._search_regex( + r'lectureDataLink\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, + 'api url', group='url') + + video = self._download_json(api_url, display_id) + + title = video['title'].strip() + + formats = [] + for format_ in video['content']['media']: + if not isinstance(format_, dict): + continue + file_ = format_.get('file') + if not file_: + continue + ext = determine_ext(file_) + if ext == 'smil': + # smil contains only broken RTMP formats anyway + continue + file_url = url_or_none(file_) + if not file_url: + continue + label = str_or_none(format_.get('label')) + filesize = int_or_none(format_.get('fileSize')) + formats.append({ + 'url': file_url, + 'format_id': label, + 'filesize': float_or_none(filesize, invscale=1000) + }) + self._sort_formats(formats) + + subtitles = {} + automatic_captions = {} + cc = self._parse_json( + self._search_regex( + r'subtitleUrls\s*:\s*({.+?})\s*,', webpage, 'subtitles', + default='{}'), display_id, fatal=False) + for cc_label, cc_url in cc.items(): + cc_url = url_or_none(cc_url) + if not cc_url: + continue + lang = self._search_regex( + r'/([a-z]{2})_', cc_url, 'lang', + default=cc_label.split()[0] if cc_label else 'en') + original_lang = self._search_regex( + r'/[a-z]{2}_([a-z]{2})_', cc_url, 'original lang', + default=None) + sub_dict = (automatic_captions + if 'auto-translated' in cc_label or original_lang + else subtitles) + sub_dict.setdefault(self._CC_LANGS.get(lang, lang), []).append({ + 'url': cc_url, + }) + + return { + 'id': lecture_id, + 'title': title, + 'formats': formats, + 'subtitles': subtitles, + 'automatic_captions': automatic_captions, + } + + +class LecturioCourseIE(LecturioBaseIE): + _VALID_URL = r'https://app\.lecturio\.com/[^/]+/(?P<id>[^/?#&]+)\.course' + _TEST = { + 'url': 'https://app.lecturio.com/medical-courses/microbiology-introduction.course#/', + 'info_dict': { + 'id': 'microbiology-introduction', + 'title': 'Microbiology: Introduction', + }, + 'playlist_count': 45, + 'skip': 'Requires lecturio account credentials', + } + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + entries = [] + for mobj in re.finditer( + r'(?s)<[^>]+\bdata-url=(["\'])(?:(?!\1).)+\.lecture\b[^>]+>', + webpage): + params = extract_attributes(mobj.group(0)) + lecture_url = urljoin(url, params.get('data-url')) + lecture_id = params.get('data-id') + entries.append(self.url_result( + lecture_url, ie=LecturioIE.ie_key(), video_id=lecture_id)) + + title = self._search_regex( + r'<span[^>]+class=["\']content-title[^>]+>([^<]+)', webpage, + 'title', default=None) + + return self.playlist_result(entries, display_id, title) + + +class LecturioDeCourseIE(LecturioBaseIE): + _VALID_URL = r'https://(?:www\.)?lecturio\.de/[^/]+/(?P<id>[^/?#&]+)\.kurs' + _TEST = { + 'url': 'https://www.lecturio.de/jura/grundrechte.kurs', + 'only_matching': True, + } + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + entries = [] + for mobj in re.finditer( + r'(?s)<td[^>]+\bdata-lecture-id=["\'](?P<id>\d+).+?\bhref=(["\'])(?P<url>(?:(?!\2).)+\.vortrag)\b[^>]+>', + webpage): + lecture_url = urljoin(url, mobj.group('url')) + lecture_id = mobj.group('id') + entries.append(self.url_result( + lecture_url, ie=LecturioIE.ie_key(), video_id=lecture_id)) + + title = self._search_regex( + r'<h1[^>]*>([^<]+)', webpage, 'title', default=None) + + return self.playlist_result(entries, display_id, title) diff --git a/youtube_dl/extractor/libraryofcongress.py b/youtube_dl/extractor/libraryofcongress.py index 40295a30b..03f205144 100644 --- a/youtube_dl/extractor/libraryofcongress.py +++ b/youtube_dl/extractor/libraryofcongress.py @@ -16,16 +16,15 @@ from ..utils import ( class LibraryOfCongressIE(InfoExtractor): IE_NAME = 'loc' IE_DESC = 'Library of Congress' - _VALID_URL = r'https?://(?:www\.)?loc\.gov/(?:item/|today/cyberlc/feature_wdesc\.php\?.*\brec=)(?P<id>[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?loc\.gov/(?:item/|today/cyberlc/feature_wdesc\.php\?.*\brec=)(?P<id>[0-9a-z_.]+)' _TESTS = [{ # embedded via <div class="media-player" 'url': 'http://loc.gov/item/90716351/', - 'md5': '353917ff7f0255aa6d4b80a034833de8', + 'md5': '6ec0ae8f07f86731b1b2ff70f046210a', 'info_dict': { 'id': '90716351', 'ext': 'mp4', 'title': "Pa's trip to Mars", - 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 0, 'view_count': int, }, @@ -57,6 +56,12 @@ class LibraryOfCongressIE(InfoExtractor): 'params': { 'skip_download': True, }, + }, { + 'url': 'https://www.loc.gov/item/ihas.200197114/', + 'only_matching': True, + }, { + 'url': 'https://www.loc.gov/item/afc1981005_afs20503/', + 'only_matching': True, }] def _real_extract(self, url): @@ -67,12 +72,13 @@ class LibraryOfCongressIE(InfoExtractor): (r'id=(["\'])media-player-(?P<id>.+?)\1', r'<video[^>]+id=(["\'])uuid-(?P<id>.+?)\1', r'<video[^>]+data-uuid=(["\'])(?P<id>.+?)\1', - r'mediaObjectId\s*:\s*(["\'])(?P<id>.+?)\1'), + r'mediaObjectId\s*:\s*(["\'])(?P<id>.+?)\1', + r'data-tab="share-media-(?P<id>[0-9A-F]{32})"'), webpage, 'media id', group='id') data = self._download_json( 'https://media.loc.gov/services/v1/media?id=%s&context=json' % media_id, - video_id)['mediaObject'] + media_id)['mediaObject'] derivative = data['derivatives'][0] media_url = derivative['derivativeUrl'] @@ -89,25 +95,29 @@ class LibraryOfCongressIE(InfoExtractor): if ext not in ('mp4', 'mp3'): media_url += '.mp4' if is_video else '.mp3' - if 'vod/mp4:' in media_url: - formats = [{ - 'url': media_url.replace('vod/mp4:', 'hls-vod/media/') + '.m3u8', + formats = [] + if '/vod/mp4:' in media_url: + formats.append({ + 'url': media_url.replace('/vod/mp4:', '/hls-vod/media/') + '.m3u8', 'format_id': 'hls', 'ext': 'mp4', 'protocol': 'm3u8_native', 'quality': 1, - }] - elif 'vod/mp3:' in media_url: - formats = [{ - 'url': media_url.replace('vod/mp3:', ''), - 'vcodec': 'none', - }] + }) + http_format = { + 'url': re.sub(r'(://[^/]+/)(?:[^/]+/)*(?:mp4|mp3):', r'\1', media_url), + 'format_id': 'http', + 'quality': 1, + } + if not is_video: + http_format['vcodec'] = 'none' + formats.append(http_format) download_urls = set() for m in re.finditer( r'<option[^>]+value=(["\'])(?P<url>.+?)\1[^>]+data-file-download=[^>]+>\s*(?P<id>.+?)(?:(?: |\s+)\((?P<size>.+?)\))?\s*<', webpage): format_id = m.group('id').lower() - if format_id == 'gif': + if format_id in ('gif', 'jpeg'): continue download_url = m.group('url') if download_url in download_urls: diff --git a/youtube_dl/extractor/linkedin.py b/youtube_dl/extractor/linkedin.py new file mode 100644 index 000000000..5a86b0064 --- /dev/null +++ b/youtube_dl/extractor/linkedin.py @@ -0,0 +1,181 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + float_or_none, + int_or_none, + urlencode_postdata, +) + + +class LinkedInLearningBaseIE(InfoExtractor): + _NETRC_MACHINE = 'linkedin' + + def _call_api(self, course_slug, fields, video_slug=None, resolution=None): + query = { + 'courseSlug': course_slug, + 'fields': fields, + 'q': 'slugs', + } + sub = '' + if video_slug: + query.update({ + 'videoSlug': video_slug, + 'resolution': '_%s' % resolution, + }) + sub = ' %dp' % resolution + api_url = 'https://www.linkedin.com/learning-api/detailedCourses' + return self._download_json( + api_url, video_slug, 'Downloading%s JSON metadata' % sub, headers={ + 'Csrf-Token': self._get_cookies(api_url)['JSESSIONID'].value, + }, query=query)['elements'][0] + + def _get_urn_id(self, video_data): + urn = video_data.get('urn') + if urn: + mobj = re.search(r'urn:li:lyndaCourse:\d+,(\d+)', urn) + if mobj: + return mobj.group(1) + + def _get_video_id(self, video_data, course_slug, video_slug): + return self._get_urn_id(video_data) or '%s/%s' % (course_slug, video_slug) + + def _real_initialize(self): + email, password = self._get_login_info() + if email is None: + return + + login_page = self._download_webpage( + 'https://www.linkedin.com/uas/login?trk=learning', + None, 'Downloading login page') + action_url = self._search_regex( + r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page, 'post url', + default='https://www.linkedin.com/uas/login-submit', group='url') + data = self._hidden_inputs(login_page) + data.update({ + 'session_key': email, + 'session_password': password, + }) + login_submit_page = self._download_webpage( + action_url, None, 'Logging in', + data=urlencode_postdata(data)) + error = self._search_regex( + r'<span[^>]+class="error"[^>]*>\s*(.+?)\s*</span>', + login_submit_page, 'error', default=None) + if error: + raise ExtractorError(error, expected=True) + + +class LinkedInLearningIE(LinkedInLearningBaseIE): + IE_NAME = 'linkedin:learning' + _VALID_URL = r'https?://(?:www\.)?linkedin\.com/learning/(?P<course_slug>[^/]+)/(?P<id>[^/?#]+)' + _TEST = { + 'url': 'https://www.linkedin.com/learning/programming-foundations-fundamentals/welcome?autoplay=true', + 'md5': 'a1d74422ff0d5e66a792deb996693167', + 'info_dict': { + 'id': '90426', + 'ext': 'mp4', + 'title': 'Welcome', + 'timestamp': 1430396150.82, + 'upload_date': '20150430', + }, + } + + def _real_extract(self, url): + course_slug, video_slug = re.match(self._VALID_URL, url).groups() + + video_data = None + formats = [] + for width, height in ((640, 360), (960, 540), (1280, 720)): + video_data = self._call_api( + course_slug, 'selectedVideo', video_slug, height)['selectedVideo'] + + video_url_data = video_data.get('url') or {} + progressive_url = video_url_data.get('progressiveUrl') + if progressive_url: + formats.append({ + 'format_id': 'progressive-%dp' % height, + 'url': progressive_url, + 'height': height, + 'width': width, + 'source_preference': 1, + }) + + title = video_data['title'] + + audio_url = video_data.get('audio', {}).get('progressiveUrl') + if audio_url: + formats.append({ + 'abr': 64, + 'ext': 'm4a', + 'format_id': 'audio', + 'url': audio_url, + 'vcodec': 'none', + }) + + streaming_url = video_url_data.get('streamingUrl') + if streaming_url: + formats.extend(self._extract_m3u8_formats( + streaming_url, video_slug, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False)) + + self._sort_formats(formats, ('width', 'height', 'source_preference', 'tbr', 'abr')) + + return { + 'id': self._get_video_id(video_data, course_slug, video_slug), + 'title': title, + 'formats': formats, + 'thumbnail': video_data.get('defaultThumbnail'), + 'timestamp': float_or_none(video_data.get('publishedOn'), 1000), + 'duration': int_or_none(video_data.get('durationInSeconds')), + } + + +class LinkedInLearningCourseIE(LinkedInLearningBaseIE): + IE_NAME = 'linkedin:learning:course' + _VALID_URL = r'https?://(?:www\.)?linkedin\.com/learning/(?P<id>[^/?#]+)' + _TEST = { + 'url': 'https://www.linkedin.com/learning/programming-foundations-fundamentals', + 'info_dict': { + 'id': 'programming-foundations-fundamentals', + 'title': 'Programming Foundations: Fundamentals', + 'description': 'md5:76e580b017694eb89dc8e8923fff5c86', + }, + 'playlist_mincount': 61, + } + + @classmethod + def suitable(cls, url): + return False if LinkedInLearningIE.suitable(url) else super(LinkedInLearningCourseIE, cls).suitable(url) + + def _real_extract(self, url): + course_slug = self._match_id(url) + course_data = self._call_api(course_slug, 'chapters,description,title') + + entries = [] + for chapter_number, chapter in enumerate(course_data.get('chapters', []), 1): + chapter_title = chapter.get('title') + chapter_id = self._get_urn_id(chapter) + for video in chapter.get('videos', []): + video_slug = video.get('slug') + if not video_slug: + continue + entries.append({ + '_type': 'url_transparent', + 'id': self._get_video_id(video, course_slug, video_slug), + 'title': video.get('title'), + 'url': 'https://www.linkedin.com/learning/%s/%s' % (course_slug, video_slug), + 'chapter': chapter_title, + 'chapter_number': chapter_number, + 'chapter_id': chapter_id, + 'ie_key': LinkedInLearningIE.ie_key(), + }) + + return self.playlist_result( + entries, course_slug, + course_data.get('title'), + course_data.get('description')) diff --git a/youtube_dl/extractor/linuxacademy.py b/youtube_dl/extractor/linuxacademy.py new file mode 100644 index 000000000..a78c6556e --- /dev/null +++ b/youtube_dl/extractor/linuxacademy.py @@ -0,0 +1,174 @@ +from __future__ import unicode_literals + +import json +import random +import re + +from .common import InfoExtractor +from ..compat import ( + compat_b64decode, + compat_HTTPError, + compat_str, +) +from ..utils import ( + ExtractorError, + orderedSet, + unescapeHTML, + urlencode_postdata, + urljoin, +) + + +class LinuxAcademyIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?:// + (?:www\.)?linuxacademy\.com/cp/ + (?: + courses/lesson/course/(?P<chapter_id>\d+)/lesson/(?P<lesson_id>\d+)| + modules/view/id/(?P<course_id>\d+) + ) + ''' + _TESTS = [{ + 'url': 'https://linuxacademy.com/cp/courses/lesson/course/1498/lesson/2/module/154', + 'info_dict': { + 'id': '1498-2', + 'ext': 'mp4', + 'title': "Introduction to the Practitioner's Brief", + }, + 'params': { + 'skip_download': True, + }, + 'skip': 'Requires Linux Academy account credentials', + }, { + 'url': 'https://linuxacademy.com/cp/courses/lesson/course/1498/lesson/2', + 'only_matching': True, + }, { + 'url': 'https://linuxacademy.com/cp/modules/view/id/154', + 'info_dict': { + 'id': '154', + 'title': 'AWS Certified Cloud Practitioner', + 'description': 'md5:039db7e60e4aac9cf43630e0a75fa834', + }, + 'playlist_count': 41, + 'skip': 'Requires Linux Academy account credentials', + }] + + _AUTHORIZE_URL = 'https://login.linuxacademy.com/authorize' + _ORIGIN_URL = 'https://linuxacademy.com' + _CLIENT_ID = 'KaWxNn1C2Gc7n83W9OFeXltd8Utb5vvx' + _NETRC_MACHINE = 'linuxacademy' + + def _real_initialize(self): + self._login() + + def _login(self): + username, password = self._get_login_info() + if username is None: + return + + def random_string(): + return ''.join([ + random.choice('0123456789ABCDEFGHIJKLMNOPQRSTUVXYZabcdefghijklmnopqrstuvwxyz-._~') + for _ in range(32)]) + + webpage, urlh = self._download_webpage_handle( + self._AUTHORIZE_URL, None, 'Downloading authorize page', query={ + 'client_id': self._CLIENT_ID, + 'response_type': 'token id_token', + 'redirect_uri': self._ORIGIN_URL, + 'scope': 'openid email user_impersonation profile', + 'audience': self._ORIGIN_URL, + 'state': random_string(), + 'nonce': random_string(), + }) + + login_data = self._parse_json( + self._search_regex( + r'atob\(\s*(["\'])(?P<value>(?:(?!\1).)+)\1', webpage, + 'login info', group='value'), None, + transform_source=lambda x: compat_b64decode(x).decode('utf-8') + )['extraParams'] + + login_data.update({ + 'client_id': self._CLIENT_ID, + 'redirect_uri': self._ORIGIN_URL, + 'tenant': 'lacausers', + 'connection': 'Username-Password-Authentication', + 'username': username, + 'password': password, + 'sso': 'true', + }) + + login_state_url = compat_str(urlh.geturl()) + + try: + login_page = self._download_webpage( + 'https://login.linuxacademy.com/usernamepassword/login', None, + 'Downloading login page', data=json.dumps(login_data).encode(), + headers={ + 'Content-Type': 'application/json', + 'Origin': 'https://login.linuxacademy.com', + 'Referer': login_state_url, + }) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: + error = self._parse_json(e.cause.read(), None) + message = error.get('description') or error['code'] + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, message), expected=True) + raise + + callback_page, urlh = self._download_webpage_handle( + 'https://login.linuxacademy.com/login/callback', None, + 'Downloading callback page', + data=urlencode_postdata(self._hidden_inputs(login_page)), + headers={ + 'Content-Type': 'application/x-www-form-urlencoded', + 'Origin': 'https://login.linuxacademy.com', + 'Referer': login_state_url, + }) + + access_token = self._search_regex( + r'access_token=([^=&]+)', compat_str(urlh.geturl()), + 'access token') + + self._download_webpage( + 'https://linuxacademy.com/cp/login/tokenValidateLogin/token/%s' + % access_token, None, 'Downloading token validation page') + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + chapter_id, lecture_id, course_id = mobj.group('chapter_id', 'lesson_id', 'course_id') + item_id = course_id if course_id else '%s-%s' % (chapter_id, lecture_id) + + webpage = self._download_webpage(url, item_id) + + # course path + if course_id: + entries = [ + self.url_result( + urljoin(url, lesson_url), ie=LinuxAcademyIE.ie_key()) + for lesson_url in orderedSet(re.findall( + r'<a[^>]+\bhref=["\'](/cp/courses/lesson/course/\d+/lesson/\d+/module/\d+)', + webpage))] + title = unescapeHTML(self._html_search_regex( + (r'class=["\']course-title["\'][^>]*>(?P<value>[^<]+)', + r'var\s+title\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1'), + webpage, 'title', default=None, group='value')) + description = unescapeHTML(self._html_search_regex( + r'var\s+description\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1', + webpage, 'description', default=None, group='value')) + return self.playlist_result(entries, course_id, title, description) + + # single video path + info = self._extract_jwplayer_data( + webpage, item_id, require_title=False, m3u8_id='hls',) + title = self._search_regex( + (r'>Lecture\s*:\s*(?P<value>[^<]+)', + r'lessonName\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1'), webpage, + 'title', group='value') + info.update({ + 'id': item_id, + 'title': title, + }) + return info diff --git a/youtube_dl/extractor/liveleak.py b/youtube_dl/extractor/liveleak.py index 26671753c..22a067e40 100644 --- a/youtube_dl/extractor/liveleak.py +++ b/youtube_dl/extractor/liveleak.py @@ -87,7 +87,7 @@ class LiveLeakIE(InfoExtractor): @staticmethod def _extract_urls(webpage): return re.findall( - r'<iframe[^>]+src="(https?://(?:\w+\.)?liveleak\.com/ll_embed\?[^"]*[if]=[\w_]+[^"]+)"', + r'<iframe[^>]+src="(https?://(?:\w+\.)?liveleak\.com/ll_embed\?[^"]*[ift]=[\w_]+[^"]+)"', webpage) def _real_extract(self, url): @@ -120,13 +120,27 @@ class LiveLeakIE(InfoExtractor): } for idx, info_dict in enumerate(entries): + formats = [] for a_format in info_dict['formats']: if not a_format.get('height'): a_format['height'] = int_or_none(self._search_regex( r'([0-9]+)p\.mp4', a_format['url'], 'height label', default=None)) + formats.append(a_format) - self._sort_formats(info_dict['formats']) + # Removing '.*.mp4' gives the raw video, which is essentially + # the same video without the LiveLeak logo at the top (see + # https://github.com/rg3/youtube-dl/pull/4768) + orig_url = re.sub(r'\.mp4\.[^.]+', '', a_format['url']) + if a_format['url'] != orig_url: + format_id = a_format.get('format_id') + formats.append({ + 'format_id': 'original' + ('-' + format_id if format_id else ''), + 'url': orig_url, + 'preference': 1, + }) + self._sort_formats(formats) + info_dict['formats'] = formats # Don't append entry ID for one-video pages to keep backward compatibility if len(entries) > 1: @@ -146,7 +160,7 @@ class LiveLeakIE(InfoExtractor): class LiveLeakEmbedIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?liveleak\.com/ll_embed\?.*?\b(?P<kind>[if])=(?P<id>[\w_]+)' + _VALID_URL = r'https?://(?:www\.)?liveleak\.com/ll_embed\?.*?\b(?P<kind>[ift])=(?P<id>[\w_]+)' # See generic.py for actual test cases _TESTS = [{ @@ -158,15 +172,14 @@ class LiveLeakEmbedIE(InfoExtractor): }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - kind, video_id = mobj.group('kind', 'id') + kind, video_id = re.match(self._VALID_URL, url).groups() if kind == 'f': webpage = self._download_webpage(url, video_id) liveleak_url = self._search_regex( - r'logourl\s*:\s*(?P<q1>[\'"])(?P<url>%s)(?P=q1)' % LiveLeakIE._VALID_URL, + r'(?:logourl\s*:\s*|window\.open\()(?P<q1>[\'"])(?P<url>%s)(?P=q1)' % LiveLeakIE._VALID_URL, webpage, 'LiveLeak URL', group='url') - elif kind == 'i': - liveleak_url = 'http://www.liveleak.com/view?i=%s' % video_id + else: + liveleak_url = 'http://www.liveleak.com/view?%s=%s' % (kind, video_id) return self.url_result(liveleak_url, ie=LiveLeakIE.ie_key()) diff --git a/youtube_dl/extractor/livestream.py b/youtube_dl/extractor/livestream.py index c4776bbf3..e55b1a202 100644 --- a/youtube_dl/extractor/livestream.py +++ b/youtube_dl/extractor/livestream.py @@ -363,7 +363,4 @@ class LivestreamShortenerIE(InfoExtractor): id = mobj.group('id') webpage = self._download_webpage(url, id) - return { - '_type': 'url', - 'url': self._og_search_url(webpage), - } + return self.url_result(self._og_search_url(webpage)) diff --git a/youtube_dl/extractor/lynda.py b/youtube_dl/extractor/lynda.py index f5c7abc13..3084c6dff 100644 --- a/youtube_dl/extractor/lynda.py +++ b/youtube_dl/extractor/lynda.py @@ -4,7 +4,6 @@ import re from .common import InfoExtractor from ..compat import ( - compat_HTTPError, compat_str, compat_urlparse, ) @@ -16,7 +15,7 @@ from ..utils import ( class LyndaBaseIE(InfoExtractor): - _SIGNIN_URL = 'https://www.lynda.com/signin' + _SIGNIN_URL = 'https://www.lynda.com/signin/lynda' _PASSWORD_URL = 'https://www.lynda.com/signin/password' _USER_URL = 'https://www.lynda.com/signin/user' _ACCOUNT_CREDENTIALS_HINT = 'Use --username and --password options to provide lynda.com account credentials.' @@ -44,21 +43,15 @@ class LyndaBaseIE(InfoExtractor): form_data = self._hidden_inputs(form_html) form_data.update(extra_form_data) - try: - response = self._download_json( - action_url, None, note, - data=urlencode_postdata(form_data), - headers={ - 'Referer': referrer_url, - 'X-Requested-With': 'XMLHttpRequest', - }) - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 500: - response = self._parse_json(e.cause.read().decode('utf-8'), None) - self._check_error(response, ('email', 'password')) - raise + response = self._download_json( + action_url, None, note, + data=urlencode_postdata(form_data), + headers={ + 'Referer': referrer_url, + 'X-Requested-With': 'XMLHttpRequest', + }, expected_status=(418, 500, )) - self._check_error(response, 'ErrorMessage') + self._check_error(response, ('email', 'password', 'ErrorMessage')) return response, action_url diff --git a/youtube_dl/extractor/malltv.py b/youtube_dl/extractor/malltv.py new file mode 100644 index 000000000..e13c2e11a --- /dev/null +++ b/youtube_dl/extractor/malltv.py @@ -0,0 +1,53 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import merge_dicts + + +class MallTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?mall\.tv/(?:[^/]+/)*(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://www.mall.tv/18-miliard-pro-neziskovky-opravdu-jsou-sportovci-nebo-clovek-v-tisni-pijavice', + 'md5': '1c4a37f080e1f3023103a7b43458e518', + 'info_dict': { + 'id': 't0zzt0', + 'display_id': '18-miliard-pro-neziskovky-opravdu-jsou-sportovci-nebo-clovek-v-tisni-pijavice', + 'ext': 'mp4', + 'title': '18 miliard pro neziskovky. Opravdu jsou sportovci nebo Člověk v tísni pijavice?', + 'description': 'md5:25fc0ec42a72ba602b602c683fa29deb', + 'duration': 216, + 'timestamp': 1538870400, + 'upload_date': '20181007', + 'view_count': int, + } + }, { + 'url': 'https://www.mall.tv/kdo-to-plati/18-miliard-pro-neziskovky-opravdu-jsou-sportovci-nebo-clovek-v-tisni-pijavice', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage( + url, display_id, headers=self.geo_verification_headers()) + + SOURCE_RE = r'(<source[^>]+\bsrc=(?:(["\'])(?:(?!\2).)+|[^\s]+)/(?P<id>[\da-z]+)/index)\b' + video_id = self._search_regex( + SOURCE_RE, webpage, 'video id', group='id') + + media = self._parse_html5_media_entries( + url, re.sub(SOURCE_RE, r'\1.m3u8', webpage), video_id, + m3u8_id='hls', m3u8_entry_protocol='m3u8_native')[0] + + info = self._search_json_ld(webpage, video_id, default={}) + + return merge_dicts(media, info, { + 'id': video_id, + 'display_id': display_id, + 'title': self._og_search_title(webpage, default=None) or display_id, + 'description': self._og_search_description(webpage, default=None), + 'thumbnail': self._og_search_thumbnail(webpage, default=None), + }) diff --git a/youtube_dl/extractor/manyvids.py b/youtube_dl/extractor/manyvids.py index b94b3c2ab..e8d7163e4 100644 --- a/youtube_dl/extractor/manyvids.py +++ b/youtube_dl/extractor/manyvids.py @@ -2,12 +2,18 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import int_or_none +from ..utils import ( + determine_ext, + int_or_none, + str_to_int, + urlencode_postdata, +) class ManyVidsIE(InfoExtractor): _VALID_URL = r'(?i)https?://(?:www\.)?manyvids\.com/video/(?P<id>\d+)' - _TEST = { + _TESTS = [{ + # preview video 'url': 'https://www.manyvids.com/Video/133957/everthing-about-me/', 'md5': '03f11bb21c52dd12a05be21a5c7dcc97', 'info_dict': { @@ -17,7 +23,18 @@ class ManyVidsIE(InfoExtractor): 'view_count': int, 'like_count': int, }, - } + }, { + # full video + 'url': 'https://www.manyvids.com/Video/935718/MY-FACE-REVEAL/', + 'md5': 'f3e8f7086409e9b470e2643edb96bdcc', + 'info_dict': { + 'id': '935718', + 'ext': 'mp4', + 'title': 'MY FACE REVEAL', + 'view_count': int, + 'like_count': int, + }, + }] def _real_extract(self, url): video_id = self._match_id(url) @@ -28,12 +45,41 @@ class ManyVidsIE(InfoExtractor): r'data-(?:video-filepath|meta-video)\s*=s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, 'video URL', group='url') - title = '%s (Preview)' % self._html_search_regex( - r'<h2[^>]+class="m-a-0"[^>]*>([^<]+)', webpage, 'title') + title = self._html_search_regex( + (r'<span[^>]+class=["\']item-title[^>]+>([^<]+)', + r'<h2[^>]+class=["\']h2 m-0["\'][^>]*>([^<]+)'), + webpage, 'title', default=None) or self._html_search_meta( + 'twitter:title', webpage, 'title', fatal=True) + + if any(p in webpage for p in ('preview_videos', '_preview.mp4')): + title += ' (Preview)' + + mv_token = self._search_regex( + r'data-mvtoken=(["\'])(?P<value>(?:(?!\1).)+)\1', webpage, + 'mv token', default=None, group='value') + + if mv_token: + # Sets some cookies + self._download_webpage( + 'https://www.manyvids.com/includes/ajax_repository/you_had_me_at_hello.php', + video_id, fatal=False, data=urlencode_postdata({ + 'mvtoken': mv_token, + 'vid': video_id, + }), headers={ + 'Referer': url, + 'X-Requested-With': 'XMLHttpRequest' + }) + + if determine_ext(video_url) == 'm3u8': + formats = self._extract_m3u8_formats( + video_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls') + else: + formats = [{'url': video_url}] like_count = int_or_none(self._search_regex( r'data-likes=["\'](\d+)', webpage, 'like count', default=None)) - view_count = int_or_none(self._html_search_regex( + view_count = str_to_int(self._html_search_regex( r'(?s)<span[^>]+class="views-wrapper"[^>]*>(.+?)</span', webpage, 'view count', default=None)) @@ -42,7 +88,5 @@ class ManyVidsIE(InfoExtractor): 'title': title, 'view_count': view_count, 'like_count': like_count, - 'formats': [{ - 'url': video_url, - }], + 'formats': formats, } diff --git a/youtube_dl/extractor/markiza.py b/youtube_dl/extractor/markiza.py new file mode 100644 index 000000000..def960a0c --- /dev/null +++ b/youtube_dl/extractor/markiza.py @@ -0,0 +1,125 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + orderedSet, + parse_duration, + try_get, +) + + +class MarkizaIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?videoarchiv\.markiza\.sk/(?:video/(?:[^/]+/)*|embed/)(?P<id>\d+)(?:[_/]|$)' + _TESTS = [{ + 'url': 'http://videoarchiv.markiza.sk/video/oteckovia/84723_oteckovia-109', + 'md5': 'ada4e9fad038abeed971843aa028c7b0', + 'info_dict': { + 'id': '139078', + 'ext': 'mp4', + 'title': 'Oteckovia 109', + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 2760, + }, + }, { + 'url': 'http://videoarchiv.markiza.sk/video/televizne-noviny/televizne-noviny/85430_televizne-noviny', + 'info_dict': { + 'id': '85430', + 'title': 'Televízne noviny', + }, + 'playlist_count': 23, + }, { + 'url': 'http://videoarchiv.markiza.sk/video/oteckovia/84723', + 'only_matching': True, + }, { + 'url': 'http://videoarchiv.markiza.sk/video/84723', + 'only_matching': True, + }, { + 'url': 'http://videoarchiv.markiza.sk/video/filmy/85190_kamenak', + 'only_matching': True, + }, { + 'url': 'http://videoarchiv.markiza.sk/video/reflex/zo-zakulisia/84651_pribeh-alzbetky', + 'only_matching': True, + }, { + 'url': 'http://videoarchiv.markiza.sk/embed/85295', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + data = self._download_json( + 'http://videoarchiv.markiza.sk/json/video_jwplayer7.json', + video_id, query={'id': video_id}) + + info = self._parse_jwplayer_data(data, m3u8_id='hls', mpd_id='dash') + + if info.get('_type') == 'playlist': + info.update({ + 'id': video_id, + 'title': try_get( + data, lambda x: x['details']['name'], compat_str), + }) + else: + info['duration'] = parse_duration( + try_get(data, lambda x: x['details']['duration'], compat_str)) + return info + + +class MarkizaPageIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?(?:(?:[^/]+\.)?markiza|tvnoviny)\.sk/(?:[^/]+/)*(?P<id>\d+)_' + _TESTS = [{ + 'url': 'http://www.markiza.sk/soubiz/zahranicny/1923705_oteckovia-maju-svoj-den-ti-slavni-nie-su-o-nic-menej-rozkosni', + 'md5': 'ada4e9fad038abeed971843aa028c7b0', + 'info_dict': { + 'id': '139355', + 'ext': 'mp4', + 'title': 'Oteckovia 110', + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 2604, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://dajto.markiza.sk/filmy-a-serialy/1774695_frajeri-vo-vegas', + 'only_matching': True, + }, { + 'url': 'http://superstar.markiza.sk/aktualne/1923870_to-je-ale-telo-spevacka-ukazala-sexy-postavicku-v-bikinach', + 'only_matching': True, + }, { + 'url': 'http://hybsa.markiza.sk/aktualne/1923790_uzasna-atmosfera-na-hybsa-v-poprade-superstaristi-si-prve-koncerty-pred-davom-ludi-poriadne-uzili', + 'only_matching': True, + }, { + 'url': 'http://doma.markiza.sk/filmy/1885250_moja-vysnivana-svadba', + 'only_matching': True, + }, { + 'url': 'http://www.tvnoviny.sk/domace/1923887_po-smrti-manzela-ju-cakalo-poriadne-prekvapenie', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if MarkizaIE.suitable(url) else super(MarkizaPageIE, cls).suitable(url) + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + webpage = self._download_webpage( + # Downloading for some hosts (e.g. dajto, doma) fails with 500 + # although everything seems to be OK, so considering 500 + # status code to be expected. + url, playlist_id, expected_status=500) + + entries = [ + self.url_result('http://videoarchiv.markiza.sk/video/%s' % video_id) + for video_id in orderedSet(re.findall( + r'(?:initPlayer_|data-entity=["\']|id=["\']player_)(\d+)', + webpage))] + + return self.playlist_result(entries, playlist_id) diff --git a/youtube_dl/extractor/mediaset.py b/youtube_dl/extractor/mediaset.py index 9760eafd5..df3748798 100644 --- a/youtube_dl/extractor/mediaset.py +++ b/youtube_dl/extractor/mediaset.py @@ -3,116 +3,161 @@ from __future__ import unicode_literals import re -from .common import InfoExtractor -from ..compat import compat_str +from .theplatform import ThePlatformBaseIE +from ..compat import ( + compat_parse_qs, + compat_str, + compat_urllib_parse_urlparse, +) from ..utils import ( - determine_ext, - parse_duration, - try_get, - unified_strdate, + ExtractorError, + int_or_none, + update_url_query, ) -class MediasetIE(InfoExtractor): +class MediasetIE(ThePlatformBaseIE): + _TP_TLD = 'eu' _VALID_URL = r'''(?x) (?: mediaset:| https?:// - (?:www\.)?video\.mediaset\.it/ + (?:(?:www|static3)\.)?mediasetplay\.mediaset\.it/ (?: (?:video|on-demand)/(?:[^/]+/)+[^/]+_| - player/playerIFrame(?:Twitter)?\.shtml\?.*?\bid= + player/index\.html\?.*?\bprogramGuid= ) - )(?P<id>[0-9]+) + )(?P<id>[0-9A-Z]{16}) ''' _TESTS = [{ # full episode - 'url': 'http://www.video.mediaset.it/video/hello_goodbye/full/quarta-puntata_661824.html', + 'url': 'https://www.mediasetplay.mediaset.it/video/hellogoodbye/quarta-puntata_FAFU000000661824', 'md5': '9b75534d42c44ecef7bf1ffeacb7f85d', 'info_dict': { - 'id': '661824', + 'id': 'FAFU000000661824', 'ext': 'mp4', 'title': 'Quarta puntata', - 'description': 'md5:7183696d6df570e3412a5ef74b27c5e2', + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 1414, - 'creator': 'mediaset', + 'duration': 1414.26, 'upload_date': '20161107', 'series': 'Hello Goodbye', - 'categories': ['reality'], + 'timestamp': 1478532900, + 'uploader': 'Rete 4', + 'uploader_id': 'R4', }, - 'expected_warnings': ['is not a supported codec'], + }, { + 'url': 'https://www.mediasetplay.mediaset.it/video/matrix/puntata-del-25-maggio_F309013801000501', + 'md5': '288532f0ad18307705b01e581304cd7b', + 'info_dict': { + 'id': 'F309013801000501', + 'ext': 'mp4', + 'title': 'Puntata del 25 maggio', + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 6565.007, + 'upload_date': '20180526', + 'series': 'Matrix', + 'timestamp': 1527326245, + 'uploader': 'Canale 5', + 'uploader_id': 'C5', + }, + 'expected_warnings': ['HTTP Error 403: Forbidden'], }, { # clip - 'url': 'http://www.video.mediaset.it/video/gogglebox/clip/un-grande-classico-della-commedia-sexy_661680.html', + 'url': 'https://www.mediasetplay.mediaset.it/video/gogglebox/un-grande-classico-della-commedia-sexy_FAFU000000661680', 'only_matching': True, }, { # iframe simple - 'url': 'http://www.video.mediaset.it/player/playerIFrame.shtml?id=665924&autoplay=true', + 'url': 'https://static3.mediasetplay.mediaset.it/player/index.html?appKey=5ad3966b1de1c4000d5cec48&programGuid=FAFU000000665924&id=665924', 'only_matching': True, }, { # iframe twitter (from http://www.wittytv.it/se-prima-mi-fidavo-zero/) - 'url': 'https://www.video.mediaset.it/player/playerIFrameTwitter.shtml?id=665104&playrelated=false&autoplay=false&related=true&hidesocial=true', + 'url': 'https://static3.mediasetplay.mediaset.it/player/index.html?appKey=5ad3966b1de1c4000d5cec48&programGuid=FAFU000000665104&id=665104', 'only_matching': True, }, { - 'url': 'mediaset:661824', + 'url': 'mediaset:FAFU000000665924', 'only_matching': True, }] @staticmethod - def _extract_urls(webpage): - return [ - mobj.group('url') - for mobj in re.finditer( - r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>https?://(?:www\.)?video\.mediaset\.it/player/playerIFrame(?:Twitter)?\.shtml\?.*?\bid=\d+.*?)\1', - webpage)] + def _extract_urls(ie, webpage): + def _qs(url): + return compat_parse_qs(compat_urllib_parse_urlparse(url).query) + + def _program_guid(qs): + return qs.get('programGuid', [None])[0] + + entries = [] + for mobj in re.finditer( + r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//(?:www\.)?video\.mediaset\.it/player/playerIFrame(?:Twitter)?\.shtml.*?)\1', + webpage): + embed_url = mobj.group('url') + embed_qs = _qs(embed_url) + program_guid = _program_guid(embed_qs) + if program_guid: + entries.append(embed_url) + continue + video_id = embed_qs.get('id', [None])[0] + if not video_id: + continue + urlh = ie._request_webpage( + embed_url, video_id, note='Following embed URL redirect') + embed_url = compat_str(urlh.geturl()) + program_guid = _program_guid(_qs(embed_url)) + if program_guid: + entries.append(embed_url) + return entries def _real_extract(self, url): - video_id = self._match_id(url) - - video_list = self._download_json( - 'http://cdnsel01.mediaset.net/GetCdn.aspx', - video_id, 'Downloading video CDN JSON', query={ - 'streamid': video_id, - 'format': 'json', - })['videoList'] + guid = self._match_id(url) + tp_path = 'PR1GhC/media/guid/2702976343/' + guid + info = self._extract_theplatform_metadata(tp_path, guid) formats = [] - for format_url in video_list: - if '.ism' in format_url: - formats.extend(self._extract_ism_formats( - format_url, video_id, ism_id='mss', fatal=False)) - else: - formats.append({ - 'url': format_url, - 'format_id': determine_ext(format_url), - }) + subtitles = {} + first_e = None + for asset_type in ('SD', 'HD'): + for f in ('MPEG4', 'MPEG-DASH', 'M3U', 'ISM'): + try: + tp_formats, tp_subtitles = self._extract_theplatform_smil( + update_url_query('http://link.theplatform.%s/s/%s' % (self._TP_TLD, tp_path), { + 'mbr': 'true', + 'formats': f, + 'assetTypes': asset_type, + }), guid, 'Downloading %s %s SMIL data' % (f, asset_type)) + except ExtractorError as e: + if not first_e: + first_e = e + break + for tp_f in tp_formats: + tp_f['quality'] = 1 if asset_type == 'HD' else 0 + formats.extend(tp_formats) + subtitles = self._merge_subtitles(subtitles, tp_subtitles) + if first_e and not formats: + raise first_e self._sort_formats(formats) - mediainfo = self._download_json( - 'http://plr.video.mediaset.it/html/metainfo.sjson', - video_id, 'Downloading video info JSON', query={ - 'id': video_id, - })['video'] + fields = [] + for templ, repls in (('tvSeason%sNumber', ('', 'Episode')), ('mediasetprogram$%s', ('brandTitle', 'numberOfViews', 'publishInfo'))): + fields.extend(templ % repl for repl in repls) + feed_data = self._download_json( + 'https://feed.entertainment.tv.theplatform.eu/f/PR1GhC/mediaset-prod-all-programs/guid/-/' + guid, + guid, fatal=False, query={'fields': ','.join(fields)}) + if feed_data: + publish_info = feed_data.get('mediasetprogram$publishInfo') or {} + info.update({ + 'episode_number': int_or_none(feed_data.get('tvSeasonEpisodeNumber')), + 'season_number': int_or_none(feed_data.get('tvSeasonNumber')), + 'series': feed_data.get('mediasetprogram$brandTitle'), + 'uploader': publish_info.get('description'), + 'uploader_id': publish_info.get('channel'), + 'view_count': int_or_none(feed_data.get('mediasetprogram$numberOfViews')), + }) - title = mediainfo['title'] - - creator = try_get( - mediainfo, lambda x: x['brand-info']['publisher'], compat_str) - category = try_get( - mediainfo, lambda x: x['brand-info']['category'], compat_str) - categories = [category] if category else None - - return { - 'id': video_id, - 'title': title, - 'description': mediainfo.get('short-description'), - 'thumbnail': mediainfo.get('thumbnail'), - 'duration': parse_duration(mediainfo.get('duration')), - 'creator': creator, - 'upload_date': unified_strdate(mediainfo.get('production-date')), - 'webpage_url': mediainfo.get('url'), - 'series': mediainfo.get('brand-value'), - 'categories': categories, + info.update({ + 'id': guid, 'formats': formats, - } + 'subtitles': subtitles, + }) + return info diff --git a/youtube_dl/extractor/mediasite.py b/youtube_dl/extractor/mediasite.py index 0e2645c55..ef9628e65 100644 --- a/youtube_dl/extractor/mediasite.py +++ b/youtube_dl/extractor/mediasite.py @@ -15,12 +15,13 @@ from ..utils import ( mimetype2ext, unescapeHTML, unsmuggle_url, + url_or_none, urljoin, ) class MediasiteIE(InfoExtractor): - _VALID_URL = r'(?xi)https?://[^/]+/Mediasite/Play/(?P<id>[0-9a-f]{32,34})(?P<query>\?[^#]+|)' + _VALID_URL = r'(?xi)https?://[^/]+/Mediasite/(?:Play|Showcase/(?:default|livebroadcast)/Presentation)/(?P<id>[0-9a-f]{32,34})(?P<query>\?[^#]+|)' _TESTS = [ { 'url': 'https://hitsmediaweb.h-its.org/mediasite/Play/2db6c271681e4f199af3c60d1f82869b1d', @@ -83,7 +84,15 @@ class MediasiteIE(InfoExtractor): 'timestamp': 1333983600, 'duration': 7794, } - } + }, + { + 'url': 'https://collegerama.tudelft.nl/Mediasite/Showcase/livebroadcast/Presentation/ada7020854f743c49fbb45c9ec7dbb351d', + 'only_matching': True, + }, + { + 'url': 'https://mediasite.ntnu.no/Mediasite/Showcase/default/Presentation/7d8b913259334b688986e970fae6fcb31d', + 'only_matching': True, + }, ] # look in Mediasite.Core.js (Mediasite.ContentStreamType[*]) @@ -156,8 +165,8 @@ class MediasiteIE(InfoExtractor): stream_formats = [] for unum, VideoUrl in enumerate(video_urls): - video_url = VideoUrl.get('Location') - if not video_url or not isinstance(video_url, compat_str): + video_url = url_or_none(VideoUrl.get('Location')) + if not video_url: continue # XXX: if Stream.get('CanChangeScheme', False), switch scheme to HTTP/HTTPS diff --git a/youtube_dl/extractor/mitele.py b/youtube_dl/extractor/mitele.py index 42759eae8..40f214a87 100644 --- a/youtube_dl/extractor/mitele.py +++ b/youtube_dl/extractor/mitele.py @@ -1,84 +1,14 @@ # coding: utf-8 from __future__ import unicode_literals -import json -import uuid - from .common import InfoExtractor -from .ooyala import OoyalaIE -from ..compat import ( - compat_str, - compat_urlparse, -) from ..utils import ( int_or_none, - extract_attributes, - determine_ext, smuggle_url, parse_duration, ) -class MiTeleBaseIE(InfoExtractor): - def _get_player_info(self, url, webpage): - player_data = extract_attributes(self._search_regex( - r'(?s)(<ms-video-player.+?</ms-video-player>)', - webpage, 'ms video player')) - video_id = player_data['data-media-id'] - if player_data.get('data-cms-id') == 'ooyala': - return self.url_result( - 'ooyala:%s' % video_id, ie=OoyalaIE.ie_key(), video_id=video_id) - config_url = compat_urlparse.urljoin(url, player_data['data-config']) - config = self._download_json( - config_url, video_id, 'Downloading config JSON') - mmc_url = config['services']['mmc'] - - duration = None - formats = [] - for m_url in (mmc_url, mmc_url.replace('/flash.json', '/html5.json')): - mmc = self._download_json( - m_url, video_id, 'Downloading mmc JSON') - if not duration: - duration = int_or_none(mmc.get('duration')) - for location in mmc['locations']: - gat = self._proto_relative_url(location.get('gat'), 'http:') - gcp = location.get('gcp') - ogn = location.get('ogn') - if None in (gat, gcp, ogn): - continue - token_data = { - 'gcp': gcp, - 'ogn': ogn, - 'sta': 0, - } - media = self._download_json( - gat, video_id, data=json.dumps(token_data).encode('utf-8'), - headers={ - 'Content-Type': 'application/json;charset=utf-8', - 'Referer': url, - }) - stream = media.get('stream') or media.get('file') - if not stream: - continue - ext = determine_ext(stream) - if ext == 'f4m': - formats.extend(self._extract_f4m_formats( - stream + '&hdcore=3.2.0&plugin=aasp-3.2.0.77.18', - video_id, f4m_id='hds', fatal=False)) - elif ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - stream, video_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) - self._sort_formats(formats) - - return { - 'id': video_id, - 'formats': formats, - 'thumbnail': player_data.get('data-poster') or config.get('poster', {}).get('imageUrl'), - 'duration': duration, - } - - class MiTeleIE(InfoExtractor): IE_DESC = 'mitele.es' _VALID_URL = r'https?://(?:www\.)?mitele\.es/(?:[^/]+/)+(?P<id>[^/]+)/player' @@ -86,7 +16,7 @@ class MiTeleIE(InfoExtractor): _TESTS = [{ 'url': 'http://www.mitele.es/programas-tv/diario-de/57b0dfb9c715da65618b4afa/player', 'info_dict': { - 'id': '57b0dfb9c715da65618b4afa', + 'id': 'FhYW1iNTE6J6H7NkQRIEzfne6t2quqPg', 'ext': 'mp4', 'title': 'Tor, la web invisible', 'description': 'md5:3b6fce7eaa41b2d97358726378d9369f', @@ -104,7 +34,7 @@ class MiTeleIE(InfoExtractor): # no explicit title 'url': 'http://www.mitele.es/programas-tv/cuarto-milenio/57b0de3dc915da14058b4876/player', 'info_dict': { - 'id': '57b0de3dc915da14058b4876', + 'id': 'oyNG1iNTE6TAPP-JmCjbwfwJqqMMX3Vq', 'ext': 'mp4', 'title': 'Cuarto Milenio Temporada 6 Programa 226', 'description': 'md5:5ff132013f0cd968ffbf1f5f3538a65f', @@ -128,40 +58,21 @@ class MiTeleIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - gigya_url = self._search_regex( - r'<gigya-api>[^>]*</gigya-api>[^>]*<script\s+src="([^"]*)">[^>]*</script>', - webpage, 'gigya', default=None) - gigya_sc = self._download_webpage( - compat_urlparse.urljoin('http://www.mitele.es/', gigya_url), - video_id, 'Downloading gigya script') - - # Get a appKey/uuid for getting the session key - appKey = self._search_regex( - r'constant\s*\(\s*["\']_appGridApplicationKey["\']\s*,\s*["\']([0-9a-f]+)', - gigya_sc, 'appKey') - - session_json = self._download_json( - 'https://appgrid-api.cloud.accedo.tv/session', - video_id, 'Downloading session keys', query={ - 'appKey': appKey, - 'uuid': compat_str(uuid.uuid4()), - }) paths = self._download_json( - 'https://appgrid-api.cloud.accedo.tv/metadata/general_configuration,%20web_configuration', - video_id, 'Downloading paths JSON', - query={'sessionKey': compat_str(session_json['sessionKey'])}) + 'https://www.mitele.es/amd/agp/web/metadata/general_configuration', + video_id, 'Downloading paths JSON') ooyala_s = paths['general_configuration']['api_configuration']['ooyala_search'] + base_url = ooyala_s.get('base_url', 'cdn-search-mediaset.carbyne.ps.ooyala.com') + full_path = ooyala_s.get('full_path', '/search/v1/full/providers/') source = self._download_json( - 'http://%s%s%s/docs/%s' % ( - ooyala_s['base_url'], ooyala_s['full_path'], - ooyala_s['provider_id'], video_id), + '%s://%s%s%s/docs/%s' % ( + ooyala_s.get('protocol', 'https'), base_url, full_path, + ooyala_s.get('provider_id', '104951'), video_id), video_id, 'Downloading data JSON', query={ 'include_titles': 'Series,Season', - 'product_name': 'test', + 'product_name': ooyala_s.get('product_name', 'test'), 'format': 'full', })['hits']['hits'][0]['_source'] diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py index b7bccb504..bcac13ec5 100644 --- a/youtube_dl/extractor/mixcloud.py +++ b/youtube_dl/extractor/mixcloud.py @@ -161,11 +161,17 @@ class MixcloudIE(InfoExtractor): stream_info = info_json['streamInfo'] formats = [] + def decrypt_url(f_url): + for k in (key, 'IFYOUWANTTHEARTISTSTOGETPAIDDONOTDOWNLOADFROMMIXCLOUD'): + decrypted_url = self._decrypt_xor_cipher(k, f_url) + if re.search(r'^https?://[0-9a-z.]+/[0-9A-Za-z/.?=&_-]+$', decrypted_url): + return decrypted_url + for url_key in ('url', 'hlsUrl', 'dashUrl'): format_url = stream_info.get(url_key) if not format_url: continue - decrypted = self._decrypt_xor_cipher(key, compat_b64decode(format_url)) + decrypted = decrypt_url(compat_b64decode(format_url)) if not decrypted: continue if url_key == 'hlsUrl': diff --git a/youtube_dl/extractor/motherless.py b/youtube_dl/extractor/motherless.py index e24396e79..d4bd273b6 100644 --- a/youtube_dl/extractor/motherless.py +++ b/youtube_dl/extractor/motherless.py @@ -77,8 +77,11 @@ class MotherlessIE(InfoExtractor): title = self._html_search_regex( r'id="view-upload-title">\s+([^<]+)<', webpage, 'title') - video_url = self._html_search_regex( - r'setup\(\{\s+"file".+: "([^"]+)",', webpage, 'video URL') + video_url = (self._html_search_regex( + (r'setup\(\{\s*["\']file["\']\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', + r'fileurl\s*=\s*(["\'])(?P<url>(?:(?!\1).)+)\1'), + webpage, 'video URL', default=None, group='url') or + 'http://cdn4.videos.motherlessmedia.com/videos/%s.mp4?fs=opencloud' % video_id) age_limit = self._rta_search(webpage) view_count = str_to_int(self._html_search_regex( r'<strong>Views</strong>\s+([^<]+)<', @@ -120,7 +123,7 @@ class MotherlessIE(InfoExtractor): class MotherlessGroupIE(InfoExtractor): - _VALID_URL = 'https?://(?:www\.)?motherless\.com/gv?/(?P<id>[a-z0-9_]+)' + _VALID_URL = r'https?://(?:www\.)?motherless\.com/gv?/(?P<id>[a-z0-9_]+)' _TESTS = [{ 'url': 'http://motherless.com/g/movie_scenes', 'info_dict': { @@ -164,9 +167,9 @@ class MotherlessGroupIE(InfoExtractor): if not entries: entries = [ self.url_result( - compat_urlparse.urljoin(base, '/' + video_id), - ie=MotherlessIE.ie_key(), video_id=video_id) - for video_id in orderedSet(re.findall( + compat_urlparse.urljoin(base, '/' + entry_id), + ie=MotherlessIE.ie_key(), video_id=entry_id) + for entry_id in orderedSet(re.findall( r'data-codename=["\']([A-Z0-9]+)', webpage))] return entries diff --git a/youtube_dl/extractor/nationalgeographic.py b/youtube_dl/extractor/nationalgeographic.py index 4d2ee6408..ee12e2b47 100644 --- a/youtube_dl/extractor/nationalgeographic.py +++ b/youtube_dl/extractor/nationalgeographic.py @@ -1,15 +1,10 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor -from .adobepass import AdobePassIE -from .theplatform import ThePlatformIE +from .fox import FOXIE from ..utils import ( smuggle_url, url_basename, - update_url_query, - get_element_by_class, ) @@ -66,130 +61,22 @@ class NationalGeographicVideoIE(InfoExtractor): } -class NationalGeographicIE(ThePlatformIE, AdobePassIE): - IE_NAME = 'natgeo' - _VALID_URL = r'https?://channel\.nationalgeographic\.com/(?:(?:(?:wild/)?[^/]+/)?(?:videos|episodes)|u)/(?P<id>[^/?]+)' - - _TESTS = [ - { - 'url': 'http://channel.nationalgeographic.com/u/kdi9Ld0PN2molUUIMSBGxoeDhD729KRjQcnxtetilWPMevo8ZwUBIDuPR0Q3D2LVaTsk0MPRkRWDB8ZhqWVeyoxfsZZm36yRp1j-zPfsHEyI_EgAeFY/', - 'md5': '518c9aa655686cf81493af5cc21e2a04', - 'info_dict': { - 'id': 'vKInpacll2pC', - 'ext': 'mp4', - 'title': 'Uncovering a Universal Knowledge', - 'description': 'md5:1a89148475bf931b3661fcd6ddb2ae3a', - 'timestamp': 1458680907, - 'upload_date': '20160322', - 'uploader': 'NEWA-FNG-NGTV', - }, - 'add_ie': ['ThePlatform'], +class NationalGeographicTVIE(FOXIE): + _VALID_URL = r'https?://(?:www\.)?nationalgeographic\.com/tv/watch/(?P<id>[\da-fA-F]+)' + _TESTS = [{ + 'url': 'https://www.nationalgeographic.com/tv/watch/6a875e6e734b479beda26438c9f21138/', + 'info_dict': { + 'id': '6a875e6e734b479beda26438c9f21138', + 'ext': 'mp4', + 'title': 'Why Nat Geo? Valley of the Boom', + 'description': 'The lives of prominent figures in the tech world, including their friendships, rivalries, victories and failures.', + 'timestamp': 1542662458, + 'upload_date': '20181119', + 'age_limit': 14, }, - { - 'url': 'http://channel.nationalgeographic.com/u/kdvOstqYaBY-vSBPyYgAZRUL4sWUJ5XUUPEhc7ISyBHqoIO4_dzfY3K6EjHIC0hmFXoQ7Cpzm6RkET7S3oMlm6CFnrQwSUwo/', - 'md5': 'c4912f656b4cbe58f3e000c489360989', - 'info_dict': { - 'id': 'Pok5lWCkiEFA', - 'ext': 'mp4', - 'title': 'The Stunning Red Bird of Paradise', - 'description': 'md5:7bc8cd1da29686be4d17ad1230f0140c', - 'timestamp': 1459362152, - 'upload_date': '20160330', - 'uploader': 'NEWA-FNG-NGTV', - }, - 'add_ie': ['ThePlatform'], + 'params': { + 'skip_download': True, }, - { - 'url': 'http://channel.nationalgeographic.com/the-story-of-god-with-morgan-freeman/episodes/the-power-of-miracles/', - 'only_matching': True, - }, - { - 'url': 'http://channel.nationalgeographic.com/videos/treasures-rediscovered/', - 'only_matching': True, - }, - { - 'url': 'http://channel.nationalgeographic.com/the-story-of-god-with-morgan-freeman/videos/uncovering-a-universal-knowledge/', - 'only_matching': True, - }, - { - 'url': 'http://channel.nationalgeographic.com/wild/destination-wild/videos/the-stunning-red-bird-of-paradise/', - 'only_matching': True, - } - ] - - def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - release_url = self._search_regex( - r'video_auth_playlist_url\s*=\s*"([^"]+)"', - webpage, 'release url') - theplatform_path = self._search_regex(r'https?://link\.theplatform\.com/s/([^?]+)', release_url, 'theplatform path') - video_id = theplatform_path.split('/')[-1] - query = { - 'mbr': 'true', - } - is_auth = self._search_regex(r'video_is_auth\s*=\s*"([^"]+)"', webpage, 'is auth', fatal=False) - if is_auth == 'auth': - auth_resource_id = self._search_regex( - r"video_auth_resourceId\s*=\s*'([^']+)'", - webpage, 'auth resource id') - query['auth'] = self._extract_mvpd_auth(url, video_id, 'natgeo', auth_resource_id) - - formats = [] - subtitles = {} - for key, value in (('switch', 'http'), ('manifest', 'm3u')): - tp_query = query.copy() - tp_query.update({ - key: value, - }) - tp_formats, tp_subtitles = self._extract_theplatform_smil( - update_url_query(release_url, tp_query), video_id, 'Downloading %s SMIL data' % value) - formats.extend(tp_formats) - subtitles = self._merge_subtitles(subtitles, tp_subtitles) - self._sort_formats(formats) - - info = self._extract_theplatform_metadata(theplatform_path, display_id) - info.update({ - 'id': video_id, - 'formats': formats, - 'subtitles': subtitles, - 'display_id': display_id, - }) - return info - - -class NationalGeographicEpisodeGuideIE(InfoExtractor): - IE_NAME = 'natgeo:episodeguide' - _VALID_URL = r'https?://channel\.nationalgeographic\.com/(?:wild/)?(?P<id>[^/]+)/episode-guide' - _TESTS = [ - { - 'url': 'http://channel.nationalgeographic.com/the-story-of-god-with-morgan-freeman/episode-guide/', - 'info_dict': { - 'id': 'the-story-of-god-with-morgan-freeman-season-1', - 'title': 'The Story of God with Morgan Freeman - Season 1', - }, - 'playlist_mincount': 6, - }, - { - 'url': 'http://channel.nationalgeographic.com/underworld-inc/episode-guide/?s=2', - 'info_dict': { - 'id': 'underworld-inc-season-2', - 'title': 'Underworld, Inc. - Season 2', - }, - 'playlist_mincount': 7, - }, - ] - - def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - show = get_element_by_class('show', webpage) - selected_season = self._search_regex( - r'<div[^>]+class="select-seasons[^"]*".*?<a[^>]*>(.*?)</a>', - webpage, 'selected season') - entries = [ - self.url_result(self._proto_relative_url(entry_url), 'NationalGeographic') - for entry_url in re.findall('(?s)<div[^>]+class="col-inner"[^>]*?>.*?<a[^>]+href="([^"]+)"', webpage)] - return self.playlist_result( - entries, '%s-%s' % (display_id, selected_season.lower().replace(' ', '-')), - '%s - %s' % (show, selected_season)) + }] + _HOME_PAGE_URL = 'https://www.nationalgeographic.com/tv/' + _API_KEY = '238bb0a0c2aba67922c48709ce0c06fd' diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py index c843f8649..3282f84ee 100644 --- a/youtube_dl/extractor/nbc.py +++ b/youtube_dl/extractor/nbc.py @@ -7,11 +7,10 @@ import re from .common import InfoExtractor from .theplatform import ThePlatformIE from .adobepass import AdobePassIE +from ..compat import compat_urllib_parse_unquote from ..utils import ( - find_xpath_attr, smuggle_url, try_get, - unescapeHTML, update_url_query, int_or_none, ) @@ -75,11 +74,16 @@ class NBCIE(AdobePassIE): 'url': 'https://www.nbc.com/classic-tv/charles-in-charge/video/charles-in-charge-pilot/n3310', 'only_matching': True, }, + { + # Percent escaped url + 'url': 'https://www.nbc.com/up-all-night/video/day-after-valentine%27s-day/n2189', + 'only_matching': True, + } ] def _real_extract(self, url): permalink, video_id = re.match(self._VALID_URL, url).groups() - permalink = 'http' + permalink + permalink = 'http' + compat_urllib_parse_unquote(permalink) response = self._download_json( 'https://api.nbc.com/v3/videos', video_id, query={ 'filter[permalink]': permalink, @@ -263,27 +267,14 @@ class CSNNEIE(InfoExtractor): class NBCNewsIE(ThePlatformIE): - _VALID_URL = r'''(?x)https?://(?:www\.)?(?:nbcnews|today|msnbc)\.com/ - (?:video/.+?/(?P<id>\d+)| - ([^/]+/)*(?:.*-)?(?P<mpx_id>[^/?]+)) - ''' + _VALID_URL = r'(?x)https?://(?:www\.)?(?:nbcnews|today|msnbc)\.com/([^/]+/)*(?:.*-)?(?P<id>[^/?]+)' _TESTS = [ - { - 'url': 'http://www.nbcnews.com/video/nbc-news/52753292', - 'md5': '47abaac93c6eaf9ad37ee6c4463a5179', - 'info_dict': { - 'id': '52753292', - 'ext': 'flv', - 'title': 'Crew emerges after four-month Mars food study', - 'description': 'md5:24e632ffac72b35f8b67a12d1b6ddfc1', - }, - }, { 'url': 'http://www.nbcnews.com/watch/nbcnews-com/how-twitter-reacted-to-the-snowden-interview-269389891880', 'md5': 'af1adfa51312291a017720403826bb64', 'info_dict': { - 'id': 'p_tweet_snow_140529', + 'id': '269389891880', 'ext': 'mp4', 'title': 'How Twitter Reacted To The Snowden Interview', 'description': 'md5:65a0bd5d76fe114f3c2727aa3a81fe64', @@ -307,7 +298,7 @@ class NBCNewsIE(ThePlatformIE): 'url': 'http://www.nbcnews.com/nightly-news/video/nightly-news-with-brian-williams-full-broadcast-february-4-394064451844', 'md5': '73135a2e0ef819107bbb55a5a9b2a802', 'info_dict': { - 'id': 'nn_netcast_150204', + 'id': '394064451844', 'ext': 'mp4', 'title': 'Nightly News with Brian Williams Full Broadcast (February 4)', 'description': 'md5:1c10c1eccbe84a26e5debb4381e2d3c5', @@ -320,7 +311,7 @@ class NBCNewsIE(ThePlatformIE): 'url': 'http://www.nbcnews.com/business/autos/volkswagen-11-million-vehicles-could-have-suspect-software-emissions-scandal-n431456', 'md5': 'a49e173825e5fcd15c13fc297fced39d', 'info_dict': { - 'id': 'x_lon_vwhorn_150922', + 'id': '529953347624', 'ext': 'mp4', 'title': 'Volkswagen U.S. Chief:\xa0 We Have Totally Screwed Up', 'description': 'md5:c8be487b2d80ff0594c005add88d8351', @@ -333,7 +324,7 @@ class NBCNewsIE(ThePlatformIE): 'url': 'http://www.today.com/video/see-the-aurora-borealis-from-space-in-stunning-new-nasa-video-669831235788', 'md5': '118d7ca3f0bea6534f119c68ef539f71', 'info_dict': { - 'id': 'tdy_al_space_160420', + 'id': '669831235788', 'ext': 'mp4', 'title': 'See the aurora borealis from space in stunning new NASA video', 'description': 'md5:74752b7358afb99939c5f8bb2d1d04b1', @@ -346,7 +337,7 @@ class NBCNewsIE(ThePlatformIE): 'url': 'http://www.msnbc.com/all-in-with-chris-hayes/watch/the-chaotic-gop-immigration-vote-314487875924', 'md5': '6d236bf4f3dddc226633ce6e2c3f814d', 'info_dict': { - 'id': 'n_hayes_Aimm_140801_272214', + 'id': '314487875924', 'ext': 'mp4', 'title': 'The chaotic GOP immigration vote', 'description': 'The Republican House votes on a border bill that has no chance of getting through the Senate or signed by the President and is drawing criticism from all sides.', @@ -368,60 +359,22 @@ class NBCNewsIE(ThePlatformIE): ] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - if video_id is not None: - all_info = self._download_xml('http://www.nbcnews.com/id/%s/displaymode/1219' % video_id, video_id) - info = all_info.find('video') - - return { - 'id': video_id, - 'title': info.find('headline').text, - 'ext': 'flv', - 'url': find_xpath_attr(info, 'media', 'type', 'flashVideo').text, - 'description': info.find('caption').text, - 'thumbnail': find_xpath_attr(info, 'media', 'type', 'thumbnail').text, - } - else: - # "feature" and "nightly-news" pages use theplatform.com - video_id = mobj.group('mpx_id') + video_id = self._match_id(url) + if not video_id.isdigit(): webpage = self._download_webpage(url, video_id) - filter_param = 'byId' - bootstrap_json = self._search_regex( - [r'(?m)(?:var\s+(?:bootstrapJson|playlistData)|NEWS\.videoObj)\s*=\s*({.+});?\s*$', - r'videoObj\s*:\s*({.+})', r'data-video="([^"]+)"', - r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);'], - webpage, 'bootstrap json', default=None) - if bootstrap_json: - bootstrap = self._parse_json( - bootstrap_json, video_id, transform_source=unescapeHTML) + data = self._parse_json(self._search_regex( + r'window\.__data\s*=\s*({.+});', webpage, + 'bootstrap json'), video_id) + video_id = data['article']['content'][0]['primaryMedia']['video']['mpxMetadata']['id'] - info = None - if 'results' in bootstrap: - info = bootstrap['results'][0]['video'] - elif 'video' in bootstrap: - info = bootstrap['video'] - elif 'msnbcVideoInfo' in bootstrap: - info = bootstrap['msnbcVideoInfo']['meta'] - elif 'msnbcThePlatform' in bootstrap: - info = bootstrap['msnbcThePlatform']['videoPlayer']['video'] - else: - info = bootstrap - - if 'guid' in info: - video_id = info['guid'] - filter_param = 'byGuid' - elif 'mpxId' in info: - video_id = info['mpxId'] - - return { - '_type': 'url_transparent', - 'id': video_id, - # http://feed.theplatform.com/f/2E2eJC/nbcnews also works - 'url': update_url_query('http://feed.theplatform.com/f/2E2eJC/nnd_NBCNews', {filter_param: video_id}), - 'ie_key': 'ThePlatformFeed', - } + return { + '_type': 'url_transparent', + 'id': video_id, + # http://feed.theplatform.com/f/2E2eJC/nbcnews also works + 'url': update_url_query('http://feed.theplatform.com/f/2E2eJC/nnd_NBCNews', {'byId': video_id}), + 'ie_key': 'ThePlatformFeed', + } class NBCOlympicsIE(InfoExtractor): diff --git a/youtube_dl/extractor/nhk.py b/youtube_dl/extractor/nhk.py index 5c8cd76dc..d4acbcc3e 100644 --- a/youtube_dl/extractor/nhk.py +++ b/youtube_dl/extractor/nhk.py @@ -5,8 +5,8 @@ from ..utils import ExtractorError class NhkVodIE(InfoExtractor): - _VALID_URL = r'https?://www3\.nhk\.or\.jp/nhkworld/en/vod/(?P<id>[^/]+/[^/?#&]+)' - _TEST = { + _VALID_URL = r'https?://www3\.nhk\.or\.jp/nhkworld/en/(?:vod|ondemand)/(?P<id>[^/]+/[^/?#&]+)' + _TESTS = [{ # Videos available only for a limited period of time. Visit # http://www3.nhk.or.jp/nhkworld/en/vod/ for working samples. 'url': 'http://www3.nhk.or.jp/nhkworld/en/vod/tokyofashion/20160815', @@ -19,7 +19,10 @@ class NhkVodIE(InfoExtractor): 'episode': 'The Kimono as Global Fashion', }, 'skip': 'Videos available only for a limited period of time', - } + }, { + 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/2015173/', + 'only_matching': True, + }] _API_URL = 'http://api.nhk.or.jp/nhkworld/vodesdlist/v1/all/all/all.json?apikey=EJfK8jdS57GqlupFgAfAAwr573q01y6k' def _real_extract(self, url): diff --git a/youtube_dl/extractor/niconico.py b/youtube_dl/extractor/niconico.py index dbe871f16..76b412ff1 100644 --- a/youtube_dl/extractor/niconico.py +++ b/youtube_dl/extractor/niconico.py @@ -252,7 +252,7 @@ class NiconicoIE(InfoExtractor): }, 'timing_constraint': 'unlimited' } - })) + }).encode()) resolution = video_quality.get('resolution', {}) diff --git a/youtube_dl/extractor/njpwworld.py b/youtube_dl/extractor/njpwworld.py index febef097a..025c5d249 100644 --- a/youtube_dl/extractor/njpwworld.py +++ b/youtube_dl/extractor/njpwworld.py @@ -31,6 +31,8 @@ class NJPWWorldIE(InfoExtractor): 'skip': 'Requires login', } + _LOGIN_URL = 'https://front.njpwworld.com/auth/login' + def _real_initialize(self): self._login() @@ -40,13 +42,17 @@ class NJPWWorldIE(InfoExtractor): if not username: return True + # Setup session (will set necessary cookies) + self._request_webpage( + 'https://njpwworld.com/', None, note='Setting up session') + webpage, urlh = self._download_webpage_handle( - 'https://njpwworld.com/auth/login', None, + self._LOGIN_URL, None, note='Logging in', errnote='Unable to login', data=urlencode_postdata({'login_id': username, 'pw': password}), - headers={'Referer': 'https://njpwworld.com/auth'}) + headers={'Referer': 'https://front.njpwworld.com/auth'}) # /auth/login will return 302 for successful logins - if urlh.geturl() == 'https://njpwworld.com/auth/login': + if urlh.geturl() == self._LOGIN_URL: self.report_warning('unable to login') return False diff --git a/youtube_dl/extractor/noovo.py b/youtube_dl/extractor/noovo.py index 974de3c3e..b40770d07 100644 --- a/youtube_dl/extractor/noovo.py +++ b/youtube_dl/extractor/noovo.py @@ -57,7 +57,8 @@ class NoovoIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - bc_url = BrightcoveNewIE._extract_url(self, webpage) + brightcove_id = self._search_regex( + r'data-video-id=["\'](\d+)', webpage, 'brightcove id') data = self._parse_json( self._search_regex( @@ -89,7 +90,10 @@ class NoovoIE(InfoExtractor): return { '_type': 'url_transparent', 'ie_key': BrightcoveNewIE.ie_key(), - 'url': smuggle_url(bc_url, {'geo_countries': ['CA']}), + 'url': smuggle_url( + self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, + {'geo_countries': ['CA']}), + 'id': brightcove_id, 'title': title, 'description': description, 'series': series, diff --git a/youtube_dl/extractor/nova.py b/youtube_dl/extractor/nova.py index 06cb8cb3f..901f44b54 100644 --- a/youtube_dl/extractor/nova.py +++ b/youtube_dl/extractor/nova.py @@ -6,28 +6,90 @@ import re from .common import InfoExtractor from ..utils import ( clean_html, + int_or_none, + js_to_json, + qualities, unified_strdate, + url_or_none, ) +class NovaEmbedIE(InfoExtractor): + _VALID_URL = r'https?://media\.cms\.nova\.cz/embed/(?P<id>[^/?#&]+)' + _TEST = { + 'url': 'https://media.cms.nova.cz/embed/8o0n0r?autoplay=1', + 'md5': 'b3834f6de5401baabf31ed57456463f7', + 'info_dict': { + 'id': '8o0n0r', + 'ext': 'mp4', + 'title': '2180. díl', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 2578, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + bitrates = self._parse_json( + self._search_regex( + r'(?s)(?:src|bitrates)\s*=\s*({.+?})\s*;', webpage, 'formats'), + video_id, transform_source=js_to_json) + + QUALITIES = ('lq', 'mq', 'hq', 'hd') + quality_key = qualities(QUALITIES) + + formats = [] + for format_id, format_list in bitrates.items(): + if not isinstance(format_list, list): + continue + for format_url in format_list: + format_url = url_or_none(format_url) + if not format_url: + continue + f = { + 'url': format_url, + } + f_id = format_id + for quality in QUALITIES: + if '%s.mp4' % quality in format_url: + f_id += '-%s' % quality + f.update({ + 'quality': quality_key(quality), + 'format_note': quality.upper(), + }) + break + f['format_id'] = f_id + formats.append(f) + self._sort_formats(formats) + + title = self._og_search_title( + webpage, default=None) or self._search_regex( + (r'<value>(?P<title>[^<]+)', + r'videoTitle\s*:\s*(["\'])(?P<value>(?:(?!\1).)+)\1'), webpage, + 'title', group='value') + thumbnail = self._og_search_thumbnail( + webpage, default=None) or self._search_regex( + r'poster\s*:\s*(["\'])(?P<value>(?:(?!\1).)+)\1', webpage, + 'thumbnail', fatal=False, group='value') + duration = int_or_none(self._search_regex( + r'videoDuration\s*:\s*(\d+)', webpage, 'duration', fatal=False)) + + return { + 'id': video_id, + 'title': title, + 'thumbnail': thumbnail, + 'duration': duration, + 'formats': formats, + } + + class NovaIE(InfoExtractor): IE_DESC = 'TN.cz, Prásk.tv, Nova.cz, Novaplus.cz, FANDA.tv, Krásná.cz and Doma.cz' _VALID_URL = r'https?://(?:[^.]+\.)?(?P<site>tv(?:noviny)?|tn|novaplus|vymena|fanda|krasna|doma|prask)\.nova\.cz/(?:[^/]+/)+(?P<id>[^/]+?)(?:\.html|/|$)' _TESTS = [{ - 'url': 'http://tvnoviny.nova.cz/clanek/novinky/co-na-sebe-sportaci-praskli-vime-jestli-pujde-hrdlicka-na-materskou.html?utm_source=tvnoviny&utm_medium=cpfooter&utm_campaign=novaplus', - 'info_dict': { - 'id': '1608920', - 'display_id': 'co-na-sebe-sportaci-praskli-vime-jestli-pujde-hrdlicka-na-materskou', - 'ext': 'flv', - 'title': 'Duel: Michal Hrdlička a Petr Suchoň', - 'description': 'md5:d0cc509858eee1b1374111c588c6f5d5', - 'thumbnail': r're:^https?://.*\.(?:jpg)', - }, - 'params': { - # rtmp download - 'skip_download': True, - } - }, { 'url': 'http://tn.nova.cz/clanek/tajemstvi-ukryte-v-podzemi-specialni-nemocnice-v-prazske-krci.html#player_13260', 'md5': '1dd7b9d5ea27bc361f110cd855a19bd3', 'info_dict': { @@ -38,33 +100,6 @@ class NovaIE(InfoExtractor): 'description': 'md5:f0a42dd239c26f61c28f19e62d20ef53', 'thumbnail': r're:^https?://.*\.(?:jpg)', } - }, { - 'url': 'http://novaplus.nova.cz/porad/policie-modrava/video/5591-policie-modrava-15-dil-blondynka-na-hrbitove', - 'info_dict': { - 'id': '1756825', - 'display_id': '5591-policie-modrava-15-dil-blondynka-na-hrbitove', - 'ext': 'flv', - 'title': 'Policie Modrava - 15. díl - Blondýnka na hřbitově', - 'description': 'md5:dc24e50be5908df83348e50d1431295e', # Make sure this description is clean of html tags - 'thumbnail': r're:^https?://.*\.(?:jpg)', - }, - 'params': { - # rtmp download - 'skip_download': True, - } - }, { - 'url': 'http://novaplus.nova.cz/porad/televizni-noviny/video/5585-televizni-noviny-30-5-2015/', - 'info_dict': { - 'id': '1756858', - 'ext': 'flv', - 'title': 'Televizní noviny - 30. 5. 2015', - 'thumbnail': r're:^https?://.*\.(?:jpg)', - 'upload_date': '20150530', - }, - 'params': { - # rtmp download - 'skip_download': True, - } }, { 'url': 'http://fanda.nova.cz/clanek/fun-and-games/krvavy-epos-zaklinac-3-divoky-hon-vychazi-vyhrajte-ho-pro-sebe.html', 'info_dict': { @@ -79,6 +114,20 @@ class NovaIE(InfoExtractor): # rtmp download 'skip_download': True, } + }, { + # media.cms.nova.cz embed + 'url': 'https://novaplus.nova.cz/porad/ulice/epizoda/18760-2180-dil', + 'info_dict': { + 'id': '8o0n0r', + 'ext': 'mp4', + 'title': '2180. díl', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 2578, + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': [NovaEmbedIE.ie_key()], }, { 'url': 'http://sport.tn.nova.cz/clanek/sport/hokej/nhl/zivot-jde-dal-hodnotil-po-vyrazeni-z-playoff-jiri-sekac.html', 'only_matching': True, @@ -103,6 +152,15 @@ class NovaIE(InfoExtractor): webpage = self._download_webpage(url, display_id) + # novaplus + embed_id = self._search_regex( + r'<iframe[^>]+\bsrc=["\'](?:https?:)?//media\.cms\.nova\.cz/embed/([^/?#&]+)', + webpage, 'embed url', default=None) + if embed_id: + return self.url_result( + 'https://media.cms.nova.cz/embed/%s' % embed_id, + ie=NovaEmbedIE.ie_key(), video_id=embed_id) + video_id = self._search_regex( [r"(?:media|video_id)\s*:\s*'(\d+)'", r'media=(\d+)', @@ -111,8 +169,21 @@ class NovaIE(InfoExtractor): webpage, 'video id') config_url = self._search_regex( - r'src="(http://tn\.nova\.cz/bin/player/videojs/config\.php\?[^"]+)"', + r'src="(https?://(?:tn|api)\.nova\.cz/bin/player/videojs/config\.php\?[^"]+)"', webpage, 'config url', default=None) + config_params = {} + + if not config_url: + player = self._parse_json( + self._search_regex( + r'(?s)Player\s*\(.+?\s*,\s*({.+?\bmedia\b["\']?\s*:\s*["\']?\d+.+?})\s*\)', webpage, + 'player', default='{}'), + video_id, transform_source=js_to_json, fatal=False) + if player: + config_url = url_or_none(player.get('configUrl')) + params = player.get('configParams') + if isinstance(params, dict): + config_params = params if not config_url: DEFAULT_SITE_ID = '23000' @@ -127,14 +198,20 @@ class NovaIE(InfoExtractor): } site_id = self._search_regex( - r'site=(\d+)', webpage, 'site id', default=None) or SITES.get(site, DEFAULT_SITE_ID) + r'site=(\d+)', webpage, 'site id', default=None) or SITES.get( + site, DEFAULT_SITE_ID) - config_url = ('http://tn.nova.cz/bin/player/videojs/config.php?site=%s&media=%s&jsVar=vjsconfig' - % (site_id, video_id)) + config_url = 'https://api.nova.cz/bin/player/videojs/config.php' + config_params = { + 'site': site_id, + 'media': video_id, + 'quality': 3, + 'version': 1, + } config = self._download_json( config_url, display_id, - 'Downloading config JSON', + 'Downloading config JSON', query=config_params, transform_source=lambda s: s[s.index('{'):s.rindex('}') + 1]) mediafile = config['mediafile'] diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index cb8319f0d..5a427c396 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -282,7 +282,7 @@ class NPOIE(NPOBaseIE): video_url = stream_info.get('url') if not video_url or video_url in urls: continue - urls.add(item_url) + urls.add(video_url) if determine_ext(video_url) == 'm3u8': formats.extend(self._extract_m3u8_formats( video_url, video_id, ext='mp4', @@ -363,7 +363,7 @@ class NPOIE(NPOBaseIE): class NPOLiveIE(NPOBaseIE): IE_NAME = 'npo.nl:live' - _VALID_URL = r'https?://(?:www\.)?npo\.nl/live(?:/(?P<id>[^/?#&]+))?' + _VALID_URL = r'https?://(?:www\.)?npo(?:start)?\.nl/live(?:/(?P<id>[^/?#&]+))?' _TESTS = [{ 'url': 'http://www.npo.nl/live/npo-1', @@ -380,6 +380,9 @@ class NPOLiveIE(NPOBaseIE): }, { 'url': 'http://www.npo.nl/live', 'only_matching': True, + }, { + 'url': 'https://www.npostart.nl/live/npo-1', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index 7157e2390..072f920a9 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -4,12 +4,18 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_urllib_parse_unquote +from ..compat import ( + compat_str, + compat_urllib_parse_unquote, +) from ..utils import ( ExtractorError, int_or_none, + JSON_LD_RE, + NO_DEFAULT, parse_age_limit, parse_duration, + try_get, ) @@ -205,13 +211,13 @@ class NRKIE(NRKBaseIE): _TESTS = [{ # video 'url': 'http://www.nrk.no/video/PS*150533', - 'md5': '2f7f6eeb2aacdd99885f355428715cfa', + 'md5': '706f34cdf1322577589e369e522b50ef', 'info_dict': { 'id': '150533', 'ext': 'mp4', 'title': 'Dompap og andre fugler i Piip-Show', 'description': 'md5:d9261ba34c43b61c812cb6b0269a5c8f', - 'duration': 263, + 'duration': 262, } }, { # audio @@ -242,7 +248,7 @@ class NRKTVIE(NRKBaseIE): _VALID_URL = r'''(?x) https?:// (?:tv|radio)\.nrk(?:super)?\.no/ - (?:serie/[^/]+|program)/ + (?:serie(?:/[^/]+){1,2}|program)/ (?![Ee]pisodes)%s (?:/\d{2}-\d{2}-\d{4})? (?:\#del=(?P<part_id>\d+))? @@ -250,14 +256,14 @@ class NRKTVIE(NRKBaseIE): _API_HOSTS = ('psapi-ne.nrk.no', 'psapi-we.nrk.no') _TESTS = [{ 'url': 'https://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014', - 'md5': '4e9ca6629f09e588ed240fb11619922a', + 'md5': '9a167e54d04671eb6317a37b7bc8a280', 'info_dict': { 'id': 'MUHH48000314AA', 'ext': 'mp4', 'title': '20 spørsmål 23.05.2014', 'description': 'md5:bdea103bc35494c143c6a9acdd84887a', 'duration': 1741, - 'series': '20 spørsmål - TV', + 'series': '20 spørsmål', 'episode': '23.05.2014', }, }, { @@ -295,7 +301,7 @@ class NRKTVIE(NRKBaseIE): 'id': 'MSPO40010515AH', 'ext': 'mp4', 'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015 (Part 1)', - 'description': 'md5:c03aba1e917561eface5214020551b7a', + 'description': 'md5:1f97a41f05a9486ee00c56f35f82993d', 'duration': 772, 'series': 'Tour de Ski', 'episode': '06.01.2015', @@ -308,7 +314,7 @@ class NRKTVIE(NRKBaseIE): 'id': 'MSPO40010515BH', 'ext': 'mp4', 'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015 (Part 2)', - 'description': 'md5:c03aba1e917561eface5214020551b7a', + 'description': 'md5:1f97a41f05a9486ee00c56f35f82993d', 'duration': 6175, 'series': 'Tour de Ski', 'episode': '06.01.2015', @@ -320,7 +326,7 @@ class NRKTVIE(NRKBaseIE): 'info_dict': { 'id': 'MSPO40010515', 'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015', - 'description': 'md5:c03aba1e917561eface5214020551b7a', + 'description': 'md5:1f97a41f05a9486ee00c56f35f82993d', }, 'expected_warnings': ['Video is geo restricted'], }, { @@ -356,9 +362,207 @@ class NRKTVIE(NRKBaseIE): }, { 'url': 'https://radio.nrk.no/serie/dagsnytt/NPUB21019315/12-07-2015#', 'only_matching': True, + }, { + 'url': 'https://tv.nrk.no/serie/lindmo/2018/MUHU11006318/avspiller', + 'only_matching': True, }] +class NRKTVEpisodeIE(InfoExtractor): + _VALID_URL = r'https?://tv\.nrk\.no/serie/(?P<id>[^/]+/sesong/\d+/episode/\d+)' + _TEST = { + 'url': 'https://tv.nrk.no/serie/backstage/sesong/1/episode/8', + 'info_dict': { + 'id': 'MSUI14000816AA', + 'ext': 'mp4', + 'title': 'Backstage 8:30', + 'description': 'md5:de6ca5d5a2d56849e4021f2bf2850df4', + 'duration': 1320, + 'series': 'Backstage', + 'season_number': 1, + 'episode_number': 8, + 'episode': '8:30', + }, + 'params': { + 'skip_download': True, + }, + } + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + nrk_id = self._parse_json( + self._search_regex(JSON_LD_RE, webpage, 'JSON-LD', group='json_ld'), + display_id)['@id'] + + assert re.match(NRKTVIE._EPISODE_RE, nrk_id) + return self.url_result( + 'nrk:%s' % nrk_id, ie=NRKIE.ie_key(), video_id=nrk_id) + + +class NRKTVSerieBaseIE(InfoExtractor): + def _extract_series(self, webpage, display_id, fatal=True): + config = self._parse_json( + self._search_regex( + (r'INITIAL_DATA_*\s*=\s*({.+?})\s*;', + r'({.+?})\s*,\s*"[^"]+"\s*\)\s*</script>'), + webpage, 'config', default='{}' if not fatal else NO_DEFAULT), + display_id, fatal=False) + if not config: + return + return try_get( + config, + (lambda x: x['initialState']['series'], lambda x: x['series']), + dict) + + def _extract_seasons(self, seasons): + if not isinstance(seasons, list): + return [] + entries = [] + for season in seasons: + entries.extend(self._extract_episodes(season)) + return entries + + def _extract_episodes(self, season): + if not isinstance(season, dict): + return [] + return self._extract_entries(season.get('episodes')) + + def _extract_entries(self, entry_list): + if not isinstance(entry_list, list): + return [] + entries = [] + for episode in entry_list: + nrk_id = episode.get('prfId') + if not nrk_id or not isinstance(nrk_id, compat_str): + continue + entries.append(self.url_result( + 'nrk:%s' % nrk_id, ie=NRKIE.ie_key(), video_id=nrk_id)) + return entries + + +class NRKTVSeasonIE(NRKTVSerieBaseIE): + _VALID_URL = r'https?://tv\.nrk\.no/serie/[^/]+/sesong/(?P<id>\d+)' + _TEST = { + 'url': 'https://tv.nrk.no/serie/backstage/sesong/1', + 'info_dict': { + 'id': '1', + 'title': 'Sesong 1', + }, + 'playlist_mincount': 30, + } + + @classmethod + def suitable(cls, url): + return (False if NRKTVIE.suitable(url) or NRKTVEpisodeIE.suitable(url) + else super(NRKTVSeasonIE, cls).suitable(url)) + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + series = self._extract_series(webpage, display_id) + + season = next( + s for s in series['seasons'] + if int(display_id) == s.get('seasonNumber')) + + title = try_get(season, lambda x: x['titles']['title'], compat_str) + return self.playlist_result( + self._extract_episodes(season), display_id, title) + + +class NRKTVSeriesIE(NRKTVSerieBaseIE): + _VALID_URL = r'https?://(?:tv|radio)\.nrk(?:super)?\.no/serie/(?P<id>[^/]+)' + _ITEM_RE = r'(?:data-season=["\']|id=["\']season-)(?P<id>\d+)' + _TESTS = [{ + # new layout, seasons + 'url': 'https://tv.nrk.no/serie/backstage', + 'info_dict': { + 'id': 'backstage', + 'title': 'Backstage', + 'description': 'md5:c3ec3a35736fca0f9e1207b5511143d3', + }, + 'playlist_mincount': 60, + }, { + # new layout, instalments + 'url': 'https://tv.nrk.no/serie/groenn-glede', + 'info_dict': { + 'id': 'groenn-glede', + 'title': 'Grønn glede', + 'description': 'md5:7576e92ae7f65da6993cf90ee29e4608', + }, + 'playlist_mincount': 10, + }, { + # old layout + 'url': 'https://tv.nrksuper.no/serie/labyrint', + 'info_dict': { + 'id': 'labyrint', + 'title': 'Labyrint', + 'description': 'md5:318b597330fdac5959247c9b69fdb1ec', + }, + 'playlist_mincount': 3, + }, { + 'url': 'https://tv.nrk.no/serie/broedrene-dal-og-spektralsteinene', + 'only_matching': True, + }, { + 'url': 'https://tv.nrk.no/serie/saving-the-human-race', + 'only_matching': True, + }, { + 'url': 'https://tv.nrk.no/serie/postmann-pat', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return ( + False if any(ie.suitable(url) + for ie in (NRKTVIE, NRKTVEpisodeIE, NRKTVSeasonIE)) + else super(NRKTVSeriesIE, cls).suitable(url)) + + def _real_extract(self, url): + series_id = self._match_id(url) + + webpage = self._download_webpage(url, series_id) + + # New layout (e.g. https://tv.nrk.no/serie/backstage) + series = self._extract_series(webpage, series_id, fatal=False) + if series: + title = try_get(series, lambda x: x['titles']['title'], compat_str) + description = try_get( + series, lambda x: x['titles']['subtitle'], compat_str) + entries = [] + entries.extend(self._extract_seasons(series.get('seasons'))) + entries.extend(self._extract_entries(series.get('instalments'))) + entries.extend(self._extract_episodes(series.get('extraMaterial'))) + return self.playlist_result(entries, series_id, title, description) + + # Old layout (e.g. https://tv.nrksuper.no/serie/labyrint) + entries = [ + self.url_result( + 'https://tv.nrk.no/program/Episodes/{series}/{season}'.format( + series=series_id, season=season_id)) + for season_id in re.findall(self._ITEM_RE, webpage) + ] + + title = self._html_search_meta( + 'seriestitle', webpage, + 'title', default=None) or self._og_search_title( + webpage, fatal=False) + if title: + title = self._search_regex( + r'NRK (?:Super )?TV\s*[-–]\s*(.+)', title, 'title', default=title) + + description = self._html_search_meta( + 'series_description', webpage, + 'description', default=None) or self._og_search_description(webpage) + + return self.playlist_result(entries, series_id, title, description) + + class NRKTVDirekteIE(NRKTVIE): IE_DESC = 'NRK TV Direkte and NRK Radio Direkte' _VALID_URL = r'https?://(?:tv|radio)\.nrk\.no/direkte/(?P<id>[^/?#&]+)' @@ -411,7 +615,7 @@ class NRKPlaylistIE(NRKPlaylistBaseIE): 'title': 'Rivertonprisen til Karin Fossum', 'description': 'Første kvinne på 15 år til å vinne krimlitteraturprisen.', }, - 'playlist_count': 5, + 'playlist_count': 2, }] def _extract_title(self, webpage): @@ -438,64 +642,6 @@ class NRKTVEpisodesIE(NRKPlaylistBaseIE): r'<h1>([^<]+)</h1>', webpage, 'title', fatal=False) -class NRKTVSeriesIE(InfoExtractor): - _VALID_URL = r'https?://(?:tv|radio)\.nrk(?:super)?\.no/serie/(?P<id>[^/]+)' - _ITEM_RE = r'(?:data-season=["\']|id=["\']season-)(?P<id>\d+)' - _TESTS = [{ - 'url': 'https://tv.nrk.no/serie/groenn-glede', - 'info_dict': { - 'id': 'groenn-glede', - 'title': 'Grønn glede', - 'description': 'md5:7576e92ae7f65da6993cf90ee29e4608', - }, - 'playlist_mincount': 9, - }, { - 'url': 'http://tv.nrksuper.no/serie/labyrint', - 'info_dict': { - 'id': 'labyrint', - 'title': 'Labyrint', - 'description': 'md5:58afd450974c89e27d5a19212eee7115', - }, - 'playlist_mincount': 3, - }, { - 'url': 'https://tv.nrk.no/serie/broedrene-dal-og-spektralsteinene', - 'only_matching': True, - }, { - 'url': 'https://tv.nrk.no/serie/saving-the-human-race', - 'only_matching': True, - }, { - 'url': 'https://tv.nrk.no/serie/postmann-pat', - 'only_matching': True, - }] - - @classmethod - def suitable(cls, url): - return False if NRKTVIE.suitable(url) else super(NRKTVSeriesIE, cls).suitable(url) - - def _real_extract(self, url): - series_id = self._match_id(url) - - webpage = self._download_webpage(url, series_id) - - entries = [ - self.url_result( - 'https://tv.nrk.no/program/Episodes/{series}/{season}'.format( - series=series_id, season=season_id)) - for season_id in re.findall(self._ITEM_RE, webpage) - ] - - title = self._html_search_meta( - 'seriestitle', webpage, - 'title', default=None) or self._og_search_title( - webpage, fatal=False) - - description = self._html_search_meta( - 'series_description', webpage, - 'description', default=None) or self._og_search_description(webpage) - - return self.playlist_result(entries, series_id, title, description) - - class NRKSkoleIE(InfoExtractor): IE_DESC = 'NRK Skole' _VALID_URL = r'https?://(?:www\.)?nrk\.no/skole/?\?.*\bmediaId=(?P<id>\d+)' diff --git a/youtube_dl/extractor/nzz.py b/youtube_dl/extractor/nzz.py index 2d352f53f..61ee77adb 100644 --- a/youtube_dl/extractor/nzz.py +++ b/youtube_dl/extractor/nzz.py @@ -11,20 +11,27 @@ from ..utils import ( class NZZIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?nzz\.ch/(?:[^/]+/)*[^/?#]+-ld\.(?P<id>\d+)' - _TEST = { + _TESTS = [{ 'url': 'http://www.nzz.ch/zuerich/gymizyte/gymizyte-schreiben-schueler-heute-noch-diktate-ld.9153', 'info_dict': { 'id': '9153', }, 'playlist_mincount': 6, - } + }, { + 'url': 'https://www.nzz.ch/video/nzz-standpunkte/cvp-auf-der-suche-nach-dem-mass-der-mitte-ld.1368112', + 'info_dict': { + 'id': '1368112', + }, + 'playlist_count': 1, + }] def _real_extract(self, url): page_id = self._match_id(url) webpage = self._download_webpage(url, page_id) entries = [] - for player_element in re.findall(r'(<[^>]+class="kalturaPlayer"[^>]*>)', webpage): + for player_element in re.findall( + r'(<[^>]+class="kalturaPlayer[^"]*"[^>]*>)', webpage): player_params = extract_attributes(player_element) if player_params.get('data-type') not in ('kaltura_singleArticle',): self.report_warning('Unsupported player type') diff --git a/youtube_dl/extractor/odnoklassniki.py b/youtube_dl/extractor/odnoklassniki.py index 190d8af4d..114b93c07 100644 --- a/youtube_dl/extractor/odnoklassniki.py +++ b/youtube_dl/extractor/odnoklassniki.py @@ -115,6 +115,10 @@ class OdnoklassnikiIE(InfoExtractor): }, { 'url': 'https://m.ok.ru/dk?st.cmd=movieLayer&st.discId=863789452017&st.retLoc=friend&st.rtu=%2Fdk%3Fst.cmd%3DfriendMovies%26st.mode%3Down%26st.mrkId%3D%257B%2522uploadedMovieMarker%2522%253A%257B%2522marker%2522%253A%25221519410114503%2522%252C%2522hasMore%2522%253Atrue%257D%252C%2522sharedMovieMarker%2522%253A%257B%2522marker%2522%253Anull%252C%2522hasMore%2522%253Afalse%257D%257D%26st.friendId%3D561722190321%26st.frwd%3Don%26_prevCmd%3DfriendMovies%26tkn%3D7257&st.discType=MOVIE&st.mvId=863789452017&_prevCmd=friendMovies&tkn=3648#lst#', 'only_matching': True, + }, { + # Paid video + 'url': 'https://ok.ru/video/954886983203', + 'only_matching': True, }] def _real_extract(self, url): @@ -244,6 +248,11 @@ class OdnoklassnikiIE(InfoExtractor): 'ext': 'flv', }) + if not formats: + payment_info = metadata.get('paymentInfo') + if payment_info: + raise ExtractorError('This video is paid, subscribe to download it', expected=True) + self._sort_formats(formats) info['formats'] = formats diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index d264fe206..c1dcbb7eb 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -243,7 +243,18 @@ class PhantomJSwrapper(object): class OpenloadIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?:openload\.(?:co|io|link)|oload\.(?:tv|stream|site|xyz|win|download))/(?:f|embed)/(?P<id>[a-zA-Z0-9-_]+)' + _VALID_URL = r'''(?x) + https?:// + (?P<host> + (?:www\.)? + (?: + openload\.(?:co|io|link|pw)| + oload\.(?:tv|stream|site|xyz|win|download|cloud|cc|icu|fun|club|info|pw|live) + ) + )/ + (?:f|embed)/ + (?P<id>[a-zA-Z0-9-_]+) + ''' _TESTS = [{ 'url': 'https://openload.co/f/kUEfGclsU9o', @@ -307,10 +318,37 @@ class OpenloadIE(InfoExtractor): }, { 'url': 'https://oload.download/f/kUEfGclsU9o', 'only_matching': True, + }, { + 'url': 'https://oload.cloud/f/4ZDnBXRWiB8', + 'only_matching': True, }, { # Its title has not got its extension but url has it 'url': 'https://oload.download/f/N4Otkw39VCw/Tomb.Raider.2018.HDRip.XviD.AC3-EVO.avi.mp4', 'only_matching': True, + }, { + 'url': 'https://oload.cc/embed/5NEAbI2BDSk', + 'only_matching': True, + }, { + 'url': 'https://oload.icu/f/-_i4y_F_Hs8', + 'only_matching': True, + }, { + 'url': 'https://oload.fun/f/gb6G1H4sHXY', + 'only_matching': True, + }, { + 'url': 'https://oload.club/f/Nr1L-aZ2dbQ', + 'only_matching': True, + }, { + 'url': 'https://oload.info/f/5NEAbI2BDSk', + 'only_matching': True, + }, { + 'url': 'https://openload.pw/f/WyKgK8s94N0', + 'only_matching': True, + }, { + 'url': 'https://oload.pw/f/WyKgK8s94N0', + 'only_matching': True, + }, { + 'url': 'https://oload.live/f/-Z58UZ-GR4M', + 'only_matching': True, }] _USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36' @@ -322,8 +360,11 @@ class OpenloadIE(InfoExtractor): webpage) def _real_extract(self, url): - video_id = self._match_id(url) - url_pattern = 'https://openload.co/%%s/%s/' % video_id + mobj = re.match(self._VALID_URL, url) + host = mobj.group('host') + video_id = mobj.group('id') + + url_pattern = 'https://%s/%%s/%s/' % (host, video_id) headers = { 'User-Agent': self._USER_AGENT, } @@ -356,7 +397,7 @@ class OpenloadIE(InfoExtractor): r'>\s*([\w~-]+~[a-f0-9:]+~[\w~-]+)'), webpage, 'stream URL')) - video_url = 'https://openload.co/stream/%s?mime=true' % decoded_id + video_url = 'https://%s/stream/%s?mime=true' % (host, decoded_id) title = self._og_search_title(webpage, default=None) or self._search_regex( r'<span[^>]+class=["\']title["\'][^>]*>([^<]+)', webpage, @@ -367,7 +408,7 @@ class OpenloadIE(InfoExtractor): entry = entries[0] if entries else {} subtitles = entry.get('subtitles') - info_dict = { + return { 'id': video_id, 'title': title, 'thumbnail': entry.get('thumbnail') or self._og_search_thumbnail(webpage, default=None), @@ -376,4 +417,3 @@ class OpenloadIE(InfoExtractor): 'subtitles': subtitles, 'http_headers': headers, } - return info_dict diff --git a/youtube_dl/extractor/orf.py b/youtube_dl/extractor/orf.py index c1fb580ca..d432e3449 100644 --- a/youtube_dl/extractor/orf.py +++ b/youtube_dl/extractor/orf.py @@ -15,6 +15,7 @@ from ..utils import ( strip_jsonp, unescapeHTML, unified_strdate, + url_or_none, ) @@ -68,26 +69,35 @@ class ORFTVthekIE(InfoExtractor): webpage, 'playlist', group='json'), playlist_id, transform_source=unescapeHTML)['playlist']['videos'] - def quality_to_int(s): - m = re.search('([0-9]+)', s) - if m is None: - return -1 - return int(m.group(1)) - entries = [] for sd in data_jsb: video_id, title = sd.get('id'), sd.get('title') if not video_id or not title: continue video_id = compat_str(video_id) - formats = [{ - 'preference': -10 if fd['delivery'] == 'hls' else None, - 'format_id': '%s-%s-%s' % ( - fd['delivery'], fd['quality'], fd['quality_string']), - 'url': fd['src'], - 'protocol': fd['protocol'], - 'quality': quality_to_int(fd['quality']), - } for fd in sd['sources']] + formats = [] + for fd in sd['sources']: + src = url_or_none(fd.get('src')) + if not src: + continue + format_id_list = [] + for key in ('delivery', 'quality', 'quality_string'): + value = fd.get(key) + if value: + format_id_list.append(value) + format_id = '-'.join(format_id_list) + if determine_ext(fd['src']) == 'm3u8': + formats.extend(self._extract_m3u8_formats( + fd['src'], video_id, 'mp4', m3u8_id=format_id)) + elif determine_ext(fd['src']) == 'f4m': + formats.extend(self._extract_f4m_formats( + fd['src'], video_id, f4m_id=format_id)) + else: + formats.append({ + 'format_id': format_id, + 'url': src, + 'protocol': fd.get('protocol'), + }) # Check for geoblocking. # There is a property is_geoprotection, but that's always false diff --git a/youtube_dl/extractor/outsidetv.py b/youtube_dl/extractor/outsidetv.py new file mode 100644 index 000000000..c5333b08c --- /dev/null +++ b/youtube_dl/extractor/outsidetv.py @@ -0,0 +1,28 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class OutsideTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?outsidetv\.com/(?:[^/]+/)*?play/[a-zA-Z0-9]{8}/\d+/\d+/(?P<id>[a-zA-Z0-9]{8})' + _TESTS = [{ + 'url': 'http://www.outsidetv.com/category/snow/play/ZjQYboH6/1/10/Hdg0jukV/4', + 'md5': '192d968fedc10b2f70ec31865ffba0da', + 'info_dict': { + 'id': 'Hdg0jukV', + 'ext': 'mp4', + 'title': 'Home - Jackson Ep 1 | Arbor Snowboards', + 'description': 'md5:41a12e94f3db3ca253b04bb1e8d8f4cd', + 'upload_date': '20181225', + 'timestamp': 1545742800, + } + }, { + 'url': 'http://www.outsidetv.com/home/play/ZjQYboH6/1/10/Hdg0jukV/4', + 'only_matching': True, + }] + + def _real_extract(self, url): + jw_media_id = self._match_id(url) + return self.url_result( + 'jwplatform:' + jw_media_id, 'JWPlatform', jw_media_id) diff --git a/youtube_dl/extractor/packtpub.py b/youtube_dl/extractor/packtpub.py index 56a2a1083..1324137df 100644 --- a/youtube_dl/extractor/packtpub.py +++ b/youtube_dl/extractor/packtpub.py @@ -24,9 +24,9 @@ class PacktPubBaseIE(InfoExtractor): class PacktPubIE(PacktPubBaseIE): - _VALID_URL = r'https?://(?:www\.)?packtpub\.com/mapt/video/[^/]+/(?P<course_id>\d+)/(?P<chapter_id>\d+)/(?P<id>\d+)' + _VALID_URL = r'https?://(?:(?:www\.)?packtpub\.com/mapt|subscription\.packtpub\.com)/video/[^/]+/(?P<course_id>\d+)/(?P<chapter_id>\d+)/(?P<id>\d+)' - _TEST = { + _TESTS = [{ 'url': 'https://www.packtpub.com/mapt/video/web-development/9781787122215/20528/20530/Project+Intro', 'md5': '1e74bd6cfd45d7d07666f4684ef58f70', 'info_dict': { @@ -37,7 +37,10 @@ class PacktPubIE(PacktPubBaseIE): 'timestamp': 1490918400, 'upload_date': '20170331', }, - } + }, { + 'url': 'https://subscription.packtpub.com/video/web_development/9781787122215/20528/20530/project-intro', + 'only_matching': True, + }] _NETRC_MACHINE = 'packtpub' _TOKEN = None @@ -110,15 +113,18 @@ class PacktPubIE(PacktPubBaseIE): class PacktPubCourseIE(PacktPubBaseIE): - _VALID_URL = r'(?P<url>https?://(?:www\.)?packtpub\.com/mapt/video/[^/]+/(?P<id>\d+))' - _TEST = { + _VALID_URL = r'(?P<url>https?://(?:(?:www\.)?packtpub\.com/mapt|subscription\.packtpub\.com)/video/[^/]+/(?P<id>\d+))' + _TESTS = [{ 'url': 'https://www.packtpub.com/mapt/video/web-development/9781787122215', 'info_dict': { 'id': '9781787122215', 'title': 'Learn Nodejs by building 12 projects [Video]', }, 'playlist_count': 90, - } + }, { + 'url': 'https://subscription.packtpub.com/video/web_development/9781787122215', + 'only_matching': True, + }] @classmethod def suitable(cls, url): diff --git a/youtube_dl/extractor/patreon.py b/youtube_dl/extractor/patreon.py index 9eb027679..426dd8121 100644 --- a/youtube_dl/extractor/patreon.py +++ b/youtube_dl/extractor/patreon.py @@ -2,52 +2,63 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import js_to_json +from ..utils import ( + clean_html, + determine_ext, + int_or_none, + parse_iso8601, +) class PatreonIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?patreon\.com/creation\?hid=(?P<id>[^&#]+)' - _TESTS = [ - { - 'url': 'http://www.patreon.com/creation?hid=743933', - 'md5': 'e25505eec1053a6e6813b8ed369875cc', - 'info_dict': { - 'id': '743933', - 'ext': 'mp3', - 'title': 'Episode 166: David Smalley of Dogma Debate', - 'uploader': 'Cognitive Dissonance Podcast', - 'thumbnail': 're:^https?://.*$', - }, + _VALID_URL = r'https?://(?:www\.)?patreon\.com/(?:creation\?hid=|posts/(?:[\w-]+-)?)(?P<id>\d+)' + _TESTS = [{ + 'url': 'http://www.patreon.com/creation?hid=743933', + 'md5': 'e25505eec1053a6e6813b8ed369875cc', + 'info_dict': { + 'id': '743933', + 'ext': 'mp3', + 'title': 'Episode 166: David Smalley of Dogma Debate', + 'description': 'md5:713b08b772cd6271b9f3906683cfacdf', + 'uploader': 'Cognitive Dissonance Podcast', + 'thumbnail': 're:^https?://.*$', + 'timestamp': 1406473987, + 'upload_date': '20140727', }, - { - 'url': 'http://www.patreon.com/creation?hid=754133', - 'md5': '3eb09345bf44bf60451b8b0b81759d0a', - 'info_dict': { - 'id': '754133', - 'ext': 'mp3', - 'title': 'CD 167 Extra', - 'uploader': 'Cognitive Dissonance Podcast', - 'thumbnail': 're:^https?://.*$', - }, + }, { + 'url': 'http://www.patreon.com/creation?hid=754133', + 'md5': '3eb09345bf44bf60451b8b0b81759d0a', + 'info_dict': { + 'id': '754133', + 'ext': 'mp3', + 'title': 'CD 167 Extra', + 'uploader': 'Cognitive Dissonance Podcast', + 'thumbnail': 're:^https?://.*$', }, - { - 'url': 'https://www.patreon.com/creation?hid=1682498', - 'info_dict': { - 'id': 'SU4fj_aEMVw', - 'ext': 'mp4', - 'title': 'I\'m on Patreon!', - 'uploader': 'TraciJHines', - 'thumbnail': 're:^https?://.*$', - 'upload_date': '20150211', - 'description': 'md5:c5a706b1f687817a3de09db1eb93acd4', - 'uploader_id': 'TraciJHines', - }, - 'params': { - 'noplaylist': True, - 'skip_download': True, - } + 'skip': 'Patron-only content', + }, { + 'url': 'https://www.patreon.com/creation?hid=1682498', + 'info_dict': { + 'id': 'SU4fj_aEMVw', + 'ext': 'mp4', + 'title': 'I\'m on Patreon!', + 'uploader': 'TraciJHines', + 'thumbnail': 're:^https?://.*$', + 'upload_date': '20150211', + 'description': 'md5:c5a706b1f687817a3de09db1eb93acd4', + 'uploader_id': 'TraciJHines', + }, + 'params': { + 'noplaylist': True, + 'skip_download': True, } - ] + }, { + 'url': 'https://www.patreon.com/posts/episode-166-of-743933', + 'only_matching': True, + }, { + 'url': 'https://www.patreon.com/posts/743933', + 'only_matching': True, + }] # Currently Patreon exposes download URL via hidden CSS, so login is not # needed. Keeping this commented for when this inevitably changes. @@ -78,38 +89,48 @@ class PatreonIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - title = self._og_search_title(webpage).strip() - - attach_fn = self._html_search_regex( - r'<div class="attach"><a target="_blank" href="([^"]+)">', - webpage, 'attachment URL', default=None) - embed = self._html_search_regex( - r'<div[^>]+id="watchCreation"[^>]*>\s*<iframe[^>]+src="([^"]+)"', - webpage, 'embedded URL', default=None) - - if attach_fn is not None: - video_url = 'http://www.patreon.com' + attach_fn - thumbnail = self._og_search_thumbnail(webpage) - uploader = self._html_search_regex( - r'<strong>(.*?)</strong> is creating', webpage, 'uploader') - elif embed is not None: - return self.url_result(embed) - else: - playlist = self._parse_json(self._search_regex( - r'(?s)new\s+jPlayerPlaylist\(\s*\{\s*[^}]*},\s*(\[.*?,?\s*\])', - webpage, 'playlist JSON'), - video_id, transform_source=js_to_json) - data = playlist[0] - video_url = self._proto_relative_url(data['mp3']) - thumbnail = self._proto_relative_url(data.get('cover')) - uploader = data.get('artist') - - return { + post = self._download_json( + 'https://www.patreon.com/api/posts/' + video_id, video_id) + attributes = post['data']['attributes'] + title = attributes['title'].strip() + image = attributes.get('image') or {} + info = { 'id': video_id, - 'url': video_url, - 'ext': 'mp3', 'title': title, - 'uploader': uploader, - 'thumbnail': thumbnail, + 'description': clean_html(attributes.get('content')), + 'thumbnail': image.get('large_url') or image.get('url'), + 'timestamp': parse_iso8601(attributes.get('published_at')), + 'like_count': int_or_none(attributes.get('like_count')), + 'comment_count': int_or_none(attributes.get('comment_count')), } + + def add_file(file_data): + file_url = file_data.get('url') + if file_url: + info.update({ + 'url': file_url, + 'ext': determine_ext(file_data.get('name'), 'mp3'), + }) + + for i in post.get('included', []): + i_type = i.get('type') + if i_type == 'attachment': + add_file(i.get('attributes') or {}) + elif i_type == 'user': + user_attributes = i.get('attributes') + if user_attributes: + info.update({ + 'uploader': user_attributes.get('full_name'), + 'uploader_url': user_attributes.get('url'), + }) + + if not info.get('url'): + add_file(attributes.get('post_file') or {}) + + if not info.get('url'): + info.update({ + '_type': 'url', + 'url': attributes['embed']['url'], + }) + + return info diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index 8d6f2dd3d..80340f595 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( ExtractorError, determine_ext, @@ -14,6 +15,7 @@ from ..utils import ( strip_jsonp, strip_or_none, unified_strdate, + url_or_none, US_RATINGS, ) @@ -375,6 +377,35 @@ class PBSIE(InfoExtractor): }, 'expected_warnings': ['HTTP Error 403: Forbidden'], }, + { + 'url': 'https://www.pbs.org/wgbh/masterpiece/episodes/victoria-s2-e1/', + 'info_dict': { + 'id': '3007193718', + 'ext': 'mp4', + 'title': "Victoria - A Soldier's Daughter / The Green-Eyed Monster", + 'description': 'md5:37efbac85e0c09b009586523ec143652', + 'duration': 6292, + 'thumbnail': r're:^https?://.*\.(?:jpg|JPG)$', + }, + 'params': { + 'skip_download': True, + }, + 'expected_warnings': ['HTTP Error 403: Forbidden'], + }, + { + 'url': 'https://player.pbs.org/partnerplayer/tOz9tM5ljOXQqIIWke53UA==/', + 'info_dict': { + 'id': '3011407934', + 'ext': 'mp4', + 'title': 'Stories from the Stage - Road Trip', + 'duration': 1619, + 'thumbnail': r're:^https?://.*\.(?:jpg|JPG)$', + }, + 'params': { + 'skip_download': True, + }, + 'expected_warnings': ['HTTP Error 403: Forbidden'], + }, { 'url': 'http://player.pbs.org/widget/partnerplayer/2365297708/?start=0&end=0&chapterbar=false&endscreen=false&topbar=true', 'only_matching': True, @@ -438,6 +469,7 @@ class PBSIE(InfoExtractor): r'<input type="hidden" id="pbs_video_id_[0-9]+" value="([0-9]+)"/>', # jwplayer r"(?s)window\.PBS\.playerConfig\s*=\s*{.*?id\s*:\s*'([0-9]+)',", r'<div[^>]+\bdata-cove-id=["\'](\d+)"', # http://www.pbs.org/wgbh/roadshow/watch/episode/2105-indianapolis-hour-2/ + r'<iframe[^>]+\bsrc=["\'](?:https?:)?//video\.pbs\.org/widget/partnerplayer/(\d+)', # https://www.pbs.org/wgbh/masterpiece/episodes/victoria-s2-e1/ ] media_id = self._search_regex( @@ -472,7 +504,8 @@ class PBSIE(InfoExtractor): if not url: url = self._og_search_url(webpage) - mobj = re.match(self._VALID_URL, url) + mobj = re.match( + self._VALID_URL, self._proto_relative_url(url.strip())) player_id = mobj.group('player_id') if not display_id: @@ -482,13 +515,27 @@ class PBSIE(InfoExtractor): url, display_id, note='Downloading player page', errnote='Could not download player page') video_id = self._search_regex( - r'<div\s+id="video_([0-9]+)"', player_page, 'video ID') + r'<div\s+id=["\']video_(\d+)', player_page, 'video ID', + default=None) + if not video_id: + video_info = self._extract_video_data( + player_page, 'video data', display_id) + video_id = compat_str( + video_info.get('id') or video_info['contentID']) else: video_id = mobj.group('id') display_id = video_id return video_id, display_id, None, description + def _extract_video_data(self, string, name, video_id, fatal=True): + return self._parse_json( + self._search_regex( + [r'(?s)PBS\.videoData\s*=\s*({.+?});\n', + r'window\.videoBridge\s*=\s*({.+?});'], + string, name, default='{}'), + video_id, transform_source=js_to_json, fatal=fatal) + def _real_extract(self, url): video_id, display_id, upload_date, description = self._extract_webpage(url) @@ -511,6 +558,13 @@ class PBSIE(InfoExtractor): if redirect_url and redirect_url not in redirect_urls: redirects.append(redirect) redirect_urls.add(redirect_url) + encodings = info.get('encodings') + if isinstance(encodings, list): + for encoding in encodings: + encoding_url = url_or_none(encoding) + if encoding_url and encoding_url not in redirect_urls: + redirects.append({'url': encoding_url}) + redirect_urls.add(encoding_url) chapters = [] # Player pages may also serve different qualities @@ -519,11 +573,8 @@ class PBSIE(InfoExtractor): 'http://player.pbs.org/%s/%s' % (page, video_id), display_id, 'Downloading %s page' % page, fatal=False) if player: - video_info = self._parse_json( - self._search_regex( - [r'(?s)PBS\.videoData\s*=\s*({.+?});\n', r'window\.videoBridge\s*=\s*({.+?});'], - player, '%s video data' % page, default='{}'), - display_id, transform_source=js_to_json, fatal=False) + video_info = self._extract_video_data( + player, '%s video data' % page, display_id, fatal=False) if video_info: extract_redirect_urls(video_info) if not info: diff --git a/youtube_dl/extractor/peertube.py b/youtube_dl/extractor/peertube.py index a481b3151..e03c3d1d3 100644 --- a/youtube_dl/extractor/peertube.py +++ b/youtube_dl/extractor/peertube.py @@ -10,6 +10,7 @@ from ..utils import ( parse_resolution, try_get, unified_timestamp, + url_or_none, urljoin, ) @@ -116,12 +117,14 @@ class PeerTubeIE(InfoExtractor): videos\.tcit\.fr| peertube\.cpy\.re )''' + _UUID_RE = r'[\da-fA-F]{8}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{12}' _VALID_URL = r'''(?x) - https?:// - %s - /(?:videos/(?:watch|embed)|api/v\d/videos)/ - (?P<id>[^/?\#&]+) - ''' % _INSTANCES_RE + (?: + peertube:(?P<host>[^:]+):| + https?://(?P<host_2>%s)/(?:videos/(?:watch|embed)|api/v\d/videos)/ + ) + (?P<id>%s) + ''' % (_INSTANCES_RE, _UUID_RE) _TESTS = [{ 'url': 'https://peertube.moe/videos/watch/2790feb0-8120-4e63-9af3-c943c69f5e6c', 'md5': '80f24ff364cc9d333529506a263e7feb', @@ -157,21 +160,40 @@ class PeerTubeIE(InfoExtractor): }, { 'url': 'https://tube.openalgeria.org/api/v1/videos/c1875674-97d0-4c94-a058-3f7e64c962e8', 'only_matching': True, + }, { + 'url': 'peertube:video.blender.org:b37a5b9f-e6b5-415c-b700-04a5cd6ec205', + 'only_matching': True, }] @staticmethod - def _extract_urls(webpage): - return [ - mobj.group('url') - for mobj in re.finditer( - r'''(?x)<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//%s/videos/embed/[^/?\#&]+)\1''' - % PeerTubeIE._INSTANCES_RE, webpage)] + def _extract_peertube_url(webpage, source_url): + mobj = re.match( + r'https?://(?P<host>[^/]+)/videos/watch/(?P<id>%s)' + % PeerTubeIE._UUID_RE, source_url) + if mobj and any(p in webpage for p in ( + '<title>PeerTube<', + 'There will be other non JS-based clients to access PeerTube', + '>We are sorry but it seems that PeerTube is not compatible with your web browser.<')): + return 'peertube:%s:%s' % mobj.group('host', 'id') + + @staticmethod + def _extract_urls(webpage, source_url): + entries = re.findall( + r'''(?x)<iframe[^>]+\bsrc=["\'](?P<url>(?:https?:)?//%s/videos/embed/%s)''' + % (PeerTubeIE._INSTANCES_RE, PeerTubeIE._UUID_RE), webpage) + if not entries: + peertube_url = PeerTubeIE._extract_peertube_url(webpage, source_url) + if peertube_url: + entries = [peertube_url] + return entries def _real_extract(self, url): - video_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + host = mobj.group('host') or mobj.group('host_2') + video_id = mobj.group('id') video = self._download_json( - urljoin(url, '/api/v1/videos/%s' % video_id), video_id) + 'https://%s/api/v1/videos/%s' % (host, video_id), video_id) title = video['name'] @@ -179,8 +201,8 @@ class PeerTubeIE(InfoExtractor): for file_ in video['files']: if not isinstance(file_, dict): continue - file_url = file_.get('fileUrl') - if not file_url or not isinstance(file_url, compat_str): + file_url = url_or_none(file_.get('fileUrl')) + if not file_url: continue file_size = int_or_none(file_.get('size')) format_id = try_get( diff --git a/youtube_dl/extractor/philharmoniedeparis.py b/youtube_dl/extractor/philharmoniedeparis.py index f1008ae51..f723a2b3b 100644 --- a/youtube_dl/extractor/philharmoniedeparis.py +++ b/youtube_dl/extractor/philharmoniedeparis.py @@ -2,31 +2,38 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( - float_or_none, - int_or_none, - parse_iso8601, - xpath_text, + try_get, + urljoin, ) class PhilharmonieDeParisIE(InfoExtractor): IE_DESC = 'Philharmonie de Paris' - _VALID_URL = r'https?://live\.philharmoniedeparis\.fr/(?:[Cc]oncert/|misc/Playlist\.ashx\?id=)(?P<id>\d+)' + _VALID_URL = r'''(?x) + https?:// + (?: + live\.philharmoniedeparis\.fr/(?:[Cc]oncert/|misc/Playlist\.ashx\?id=)| + pad\.philharmoniedeparis\.fr/doc/CIMU/ + ) + (?P<id>\d+) + ''' _TESTS = [{ + 'url': 'http://pad.philharmoniedeparis.fr/doc/CIMU/1086697/jazz-a-la-villette-knower', + 'md5': 'a0a4b195f544645073631cbec166a2c2', + 'info_dict': { + 'id': '1086697', + 'ext': 'mp4', + 'title': 'Jazz à la Villette : Knower', + }, + }, { 'url': 'http://live.philharmoniedeparis.fr/concert/1032066.html', 'info_dict': { 'id': '1032066', - 'ext': 'flv', - 'title': 'md5:d1f5585d87d041d07ce9434804bc8425', - 'timestamp': 1428179400, - 'upload_date': '20150404', - 'duration': 6592.278, + 'title': 'md5:0a031b81807b3593cffa3c9a87a167a0', }, - 'params': { - # rtmp download - 'skip_download': True, - } + 'playlist_mincount': 2, }, { 'url': 'http://live.philharmoniedeparis.fr/Concert/1030324.html', 'only_matching': True, @@ -34,45 +41,60 @@ class PhilharmonieDeParisIE(InfoExtractor): 'url': 'http://live.philharmoniedeparis.fr/misc/Playlist.ashx?id=1030324&track=&lang=fr', 'only_matching': True, }] + _LIVE_URL = 'https://live.philharmoniedeparis.fr' def _real_extract(self, url): video_id = self._match_id(url) - concert = self._download_xml( - 'http://live.philharmoniedeparis.fr/misc/Playlist.ashx?id=%s' % video_id, - video_id).find('./concert') + config = self._download_json( + '%s/otoPlayer/config.ashx' % self._LIVE_URL, video_id, query={ + 'id': video_id, + 'lang': 'fr-FR', + }) - formats = [] - info_dict = { - 'id': video_id, - 'title': xpath_text(concert, './titre', 'title', fatal=True), - 'formats': formats, - } - - fichiers = concert.find('./fichiers') - stream = fichiers.attrib['serveurstream'] - for fichier in fichiers.findall('./fichier'): - info_dict['duration'] = float_or_none(fichier.get('timecodefin')) - for quality, (format_id, suffix) in enumerate([('lq', ''), ('hq', '_hd')]): - format_url = fichier.get('url%s' % suffix) - if not format_url: + def extract_entry(source): + if not isinstance(source, dict): + return + title = source.get('title') + if not title: + return + files = source.get('files') + if not isinstance(files, dict): + return + format_urls = set() + formats = [] + for format_id in ('mobile', 'desktop'): + format_url = try_get( + files, lambda x: x[format_id]['file'], compat_str) + if not format_url or format_url in format_urls: continue - formats.append({ - 'url': stream, - 'play_path': format_url, - 'ext': 'flv', - 'format_id': format_id, - 'width': int_or_none(concert.get('largeur%s' % suffix)), - 'height': int_or_none(concert.get('hauteur%s' % suffix)), - 'quality': quality, - }) - self._sort_formats(formats) + format_urls.add(format_url) + m3u8_url = urljoin(self._LIVE_URL, format_url) + formats.extend(self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + if not formats: + return + self._sort_formats(formats) + return { + 'title': title, + 'formats': formats, + } - date, hour = concert.get('date'), concert.get('heure') - if date and hour: - info_dict['timestamp'] = parse_iso8601( - '%s-%s-%sT%s:00' % (date[0:4], date[4:6], date[6:8], hour)) - elif date: - info_dict['upload_date'] = date + thumbnail = urljoin(self._LIVE_URL, config.get('image')) - return info_dict + info = extract_entry(config) + if info: + info.update({ + 'id': video_id, + 'thumbnail': thumbnail, + }) + return info + + entries = [] + for num, chapter in enumerate(config['chapters'], start=1): + entry = extract_entry(chapter) + entry['id'] = '%s-%d' % (video_id, num) + entries.append(entry) + + return self.playlist_result(entries, video_id, config.get('title')) diff --git a/youtube_dl/extractor/picarto.py b/youtube_dl/extractor/picarto.py index 2366dfb34..8099ef1d6 100644 --- a/youtube_dl/extractor/picarto.py +++ b/youtube_dl/extractor/picarto.py @@ -1,6 +1,7 @@ # coding: utf-8 from __future__ import unicode_literals +import re import time from .common import InfoExtractor @@ -15,7 +16,7 @@ from ..utils import ( class PicartoIE(InfoExtractor): - _VALID_URL = r'https?://(?:www.)?picarto\.tv/(?P<id>[a-zA-Z0-9]+)' + _VALID_URL = r'https?://(?:www.)?picarto\.tv/(?P<id>[a-zA-Z0-9]+)(?:/(?P<token>[a-zA-Z0-9]+))?' _TEST = { 'url': 'https://picarto.tv/Setz', 'info_dict': { @@ -33,20 +34,14 @@ class PicartoIE(InfoExtractor): return False if PicartoVodIE.suitable(url) else super(PicartoIE, cls).suitable(url) def _real_extract(self, url): - channel_id = self._match_id(url) - stream_page = self._download_webpage(url, channel_id) + mobj = re.match(self._VALID_URL, url) + channel_id = mobj.group('id') - if '>This channel does not exist' in stream_page: - raise ExtractorError( - 'Channel %s does not exist' % channel_id, expected=True) + metadata = self._download_json( + 'https://api.picarto.tv/v1/channel/name/' + channel_id, + channel_id) - player = self._parse_json( - self._search_regex( - r'(?s)playerSettings\[\d+\]\s*=\s*(\{.+?\}\s*\n)', stream_page, - 'player settings'), - channel_id, transform_source=js_to_json) - - if player.get('online') is False: + if metadata.get('online') is False: raise ExtractorError('Stream is offline', expected=True) cdn_data = self._download_json( @@ -54,20 +49,13 @@ class PicartoIE(InfoExtractor): data=urlencode_postdata({'loadbalancinginfo': channel_id}), note='Downloading load balancing info') - def get_event(key): - return try_get(player, lambda x: x['event'][key], compat_str) or '' - + token = mobj.group('token') or 'public' params = { - 'token': player.get('token') or '', - 'ticket': get_event('ticket'), 'con': int(time.time() * 1000), - 'type': get_event('ticket'), - 'scope': get_event('scope'), + 'token': token, } prefered_edge = cdn_data.get('preferedEdge') - default_tech = player.get('defaultTech') - formats = [] for edge in cdn_data['edges']: @@ -81,8 +69,6 @@ class PicartoIE(InfoExtractor): preference = 0 if edge_id == prefered_edge: preference += 1 - if tech_type == default_tech: - preference += 1 format_id = [] if edge_id: format_id.append(edge_id) @@ -109,7 +95,7 @@ class PicartoIE(InfoExtractor): continue self._sort_formats(formats) - mature = player.get('mature') + mature = metadata.get('adult') if mature is None: age_limit = None else: @@ -117,9 +103,11 @@ class PicartoIE(InfoExtractor): return { 'id': channel_id, - 'title': self._live_title(channel_id), + 'title': self._live_title(metadata.get('title') or channel_id), 'is_live': True, - 'thumbnail': player.get('vodThumb'), + 'thumbnail': try_get(metadata, lambda x: x['thumbnails']['web']), + 'channel': channel_id, + 'channel_url': 'https://picarto.tv/%s' % channel_id, 'age_limit': age_limit, 'formats': formats, } diff --git a/youtube_dl/extractor/playplustv.py b/youtube_dl/extractor/playplustv.py new file mode 100644 index 000000000..1e30ab23a --- /dev/null +++ b/youtube_dl/extractor/playplustv.py @@ -0,0 +1,109 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json +import re + +from .common import InfoExtractor +from ..compat import compat_HTTPError +from ..utils import ( + clean_html, + ExtractorError, + int_or_none, + PUTRequest, +) + + +class PlayPlusTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?playplus\.(?:com|tv)/VOD/(?P<project_id>[0-9]+)/(?P<id>[0-9a-f]{32})' + _TEST = { + 'url': 'https://www.playplus.tv/VOD/7572/db8d274a5163424e967f35a30ddafb8e', + 'md5': 'd078cb89d7ab6b9df37ce23c647aef72', + 'info_dict': { + 'id': 'db8d274a5163424e967f35a30ddafb8e', + 'ext': 'mp4', + 'title': 'Capítulo 179 - Final', + 'description': 'md5:01085d62d8033a1e34121d3c3cabc838', + 'timestamp': 1529992740, + 'upload_date': '20180626', + }, + 'skip': 'Requires account credential', + } + _NETRC_MACHINE = 'playplustv' + _GEO_COUNTRIES = ['BR'] + _token = None + _profile_id = None + + def _call_api(self, resource, video_id=None, query=None): + return self._download_json('https://api.playplus.tv/api/media/v2/get' + resource, video_id, headers={ + 'Authorization': 'Bearer ' + self._token, + }, query=query) + + def _real_initialize(self): + email, password = self._get_login_info() + if email is None: + self.raise_login_required() + + req = PUTRequest( + 'https://api.playplus.tv/api/web/login', json.dumps({ + 'email': email, + 'password': password, + }).encode(), { + 'Content-Type': 'application/json; charset=utf-8', + }) + + try: + self._token = self._download_json(req, None)['token'] + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: + raise ExtractorError(self._parse_json( + e.cause.read(), None)['errorMessage'], expected=True) + raise + + self._profile = self._call_api('Profiles')['list'][0]['_id'] + + def _real_extract(self, url): + project_id, media_id = re.match(self._VALID_URL, url).groups() + media = self._call_api( + 'Media', media_id, { + 'profileId': self._profile, + 'projectId': project_id, + 'mediaId': media_id, + })['obj'] + title = media['title'] + + formats = [] + for f in media.get('files', []): + f_url = f.get('url') + if not f_url: + continue + file_info = f.get('fileInfo') or {} + formats.append({ + 'url': f_url, + 'width': int_or_none(file_info.get('width')), + 'height': int_or_none(file_info.get('height')), + }) + self._sort_formats(formats) + + thumbnails = [] + for thumb in media.get('thumbs', []): + thumb_url = thumb.get('url') + if not thumb_url: + continue + thumbnails.append({ + 'url': thumb_url, + 'width': int_or_none(thumb.get('width')), + 'height': int_or_none(thumb.get('height')), + }) + + return { + 'id': media_id, + 'title': title, + 'formats': formats, + 'thumbnails': thumbnails, + 'description': clean_html(media.get('description')) or media.get('shortDescription'), + 'timestamp': int_or_none(media.get('publishDate'), 1000), + 'view_count': int_or_none(media.get('numberOfViews')), + 'comment_count': int_or_none(media.get('numberOfComments')), + 'tags': media.get('tags'), + } diff --git a/youtube_dl/extractor/pluralsight.py b/youtube_dl/extractor/pluralsight.py index a207ca9cb..eafe56897 100644 --- a/youtube_dl/extractor/pluralsight.py +++ b/youtube_dl/extractor/pluralsight.py @@ -4,6 +4,7 @@ import collections import json import os import random +import re from .common import InfoExtractor from ..compat import ( @@ -27,6 +28,60 @@ from ..utils import ( class PluralsightBaseIE(InfoExtractor): _API_BASE = 'https://app.pluralsight.com' + _GRAPHQL_EP = '%s/player/api/graphql' % _API_BASE + _GRAPHQL_HEADERS = { + 'Content-Type': 'application/json;charset=UTF-8', + } + _GRAPHQL_COURSE_TMPL = ''' +query BootstrapPlayer { + rpc { + bootstrapPlayer { + profile { + firstName + lastName + email + username + userHandle + authed + isAuthed + plan + } + course(courseId: "%s") { + name + title + courseHasCaptions + translationLanguages { + code + name + } + supportsWideScreenVideoFormats + timestamp + modules { + name + title + duration + formattedDuration + author + authorized + clips { + authorized + clipId + duration + formattedDuration + id + index + moduleIndex + moduleTitle + name + title + watched + } + } + } + } + } +}''' + def _download_course(self, course_id, url, display_id): try: return self._download_course_rpc(course_id, url, display_id) @@ -39,20 +94,14 @@ class PluralsightBaseIE(InfoExtractor): def _download_course_rpc(self, course_id, url, display_id): response = self._download_json( - '%s/player/functions/rpc' % self._API_BASE, display_id, - 'Downloading course JSON', - data=json.dumps({ - 'fn': 'bootstrapPlayer', - 'payload': { - 'courseId': course_id, - }, - }).encode('utf-8'), - headers={ - 'Content-Type': 'application/json;charset=utf-8', - 'Referer': url, - }) + self._GRAPHQL_EP, display_id, data=json.dumps({ + 'query': self._GRAPHQL_COURSE_TMPL % course_id, + 'variables': {} + }).encode('utf-8'), headers=self._GRAPHQL_HEADERS) - course = try_get(response, lambda x: x['payload']['course'], dict) + course = try_get( + response, lambda x: x['data']['rpc']['bootstrapPlayer']['course'], + dict) if course: return course @@ -90,6 +139,28 @@ class PluralsightIE(PluralsightBaseIE): 'only_matching': True, }] + GRAPHQL_VIEWCLIP_TMPL = ''' +query viewClip { + viewClip(input: { + author: "%(author)s", + clipIndex: %(clipIndex)d, + courseName: "%(courseName)s", + includeCaptions: %(includeCaptions)s, + locale: "%(locale)s", + mediaType: "%(mediaType)s", + moduleName: "%(moduleName)s", + quality: "%(quality)s" + }) { + urls { + url + cdn + rank + source + }, + status + } +}''' + def _real_initialize(self): self._login() @@ -126,7 +197,10 @@ class PluralsightIE(PluralsightBaseIE): if error: raise ExtractorError('Unable to login: %s' % error, expected=True) - if all(p not in response for p in ('__INITIAL_STATE__', '"currentUser"')): + if all(not re.search(p, response) for p in ( + r'__INITIAL_STATE__', r'["\']currentUser["\']', + # new layout? + r'>\s*Sign out\s*<')): BLOCKED = 'Your account has been blocked due to suspicious activity' if BLOCKED in response: raise ExtractorError( @@ -140,18 +214,26 @@ class PluralsightIE(PluralsightBaseIE): raise ExtractorError('Unable to log in') - def _get_subtitles(self, author, clip_idx, lang, name, duration, video_id): - captions_post = { - 'a': author, - 'cn': clip_idx, - 'lc': lang, - 'm': name, - } - captions = self._download_json( - '%s/player/retrieve-captions' % self._API_BASE, video_id, - 'Downloading captions JSON', 'Unable to download captions JSON', - fatal=False, data=json.dumps(captions_post).encode('utf-8'), - headers={'Content-Type': 'application/json;charset=utf-8'}) + def _get_subtitles(self, author, clip_idx, clip_id, lang, name, duration, video_id): + captions = None + if clip_id: + captions = self._download_json( + '%s/transcript/api/v1/caption/json/%s/%s' + % (self._API_BASE, clip_id, lang), video_id, + 'Downloading captions JSON', 'Unable to download captions JSON', + fatal=False) + if not captions: + captions_post = { + 'a': author, + 'cn': int(clip_idx), + 'lc': lang, + 'm': name, + } + captions = self._download_json( + '%s/player/retrieve-captions' % self._API_BASE, video_id, + 'Downloading captions JSON', 'Unable to download captions JSON', + fatal=False, data=json.dumps(captions_post).encode('utf-8'), + headers={'Content-Type': 'application/json;charset=utf-8'}) if captions: return { lang: [{ @@ -277,7 +359,7 @@ class PluralsightIE(PluralsightBaseIE): f = QUALITIES[quality].copy() clip_post = { 'author': author, - 'includeCaptions': False, + 'includeCaptions': 'false', 'clipIndex': int(clip_idx), 'courseName': course_name, 'locale': 'en', @@ -286,11 +368,23 @@ class PluralsightIE(PluralsightBaseIE): 'quality': '%dx%d' % (f['width'], f['height']), } format_id = '%s-%s' % (ext, quality) - viewclip = self._download_json( - '%s/video/clips/viewclip' % self._API_BASE, display_id, - 'Downloading %s viewclip JSON' % format_id, fatal=False, - data=json.dumps(clip_post).encode('utf-8'), - headers={'Content-Type': 'application/json;charset=utf-8'}) + + try: + viewclip = self._download_json( + self._GRAPHQL_EP, display_id, + 'Downloading %s viewclip graphql' % format_id, + data=json.dumps({ + 'query': self.GRAPHQL_VIEWCLIP_TMPL % clip_post, + 'variables': {} + }).encode('utf-8'), + headers=self._GRAPHQL_HEADERS)['data']['viewClip'] + except ExtractorError: + # Still works but most likely will go soon + viewclip = self._download_json( + '%s/video/clips/viewclip' % self._API_BASE, display_id, + 'Downloading %s viewclip JSON' % format_id, fatal=False, + data=json.dumps(clip_post).encode('utf-8'), + headers={'Content-Type': 'application/json;charset=utf-8'}) # Pluralsight tracks multiple sequential calls to ViewClip API and start # to return 429 HTTP errors after some time (see @@ -331,7 +425,7 @@ class PluralsightIE(PluralsightBaseIE): # TODO: other languages? subtitles = self.extract_subtitles( - author, clip_idx, 'en', name, duration, display_id) + author, clip_idx, clip.get('clipId'), 'en', name, duration, display_id) return { 'id': clip_id, diff --git a/youtube_dl/extractor/popcorntv.py b/youtube_dl/extractor/popcorntv.py index ac901f426..9f834fb6c 100644 --- a/youtube_dl/extractor/popcorntv.py +++ b/youtube_dl/extractor/popcorntv.py @@ -58,8 +58,6 @@ class PopcornTVIE(InfoExtractor): thumbnail = self._og_search_thumbnail(webpage) timestamp = unified_timestamp(self._html_search_meta( 'uploadDate', webpage, 'timestamp')) - print(self._html_search_meta( - 'duration', webpage)) duration = int_or_none(self._html_search_meta( 'duration', webpage), invscale=60) view_count = int_or_none(self._html_search_meta( diff --git a/youtube_dl/extractor/porncom.py b/youtube_dl/extractor/porncom.py index 60ade06da..5726cab3a 100644 --- a/youtube_dl/extractor/porncom.py +++ b/youtube_dl/extractor/porncom.py @@ -43,7 +43,8 @@ class PornComIE(InfoExtractor): config = self._parse_json( self._search_regex( - r'=\s*({.+?})\s*,\s*[\da-zA-Z_]+\s*=', + (r'=\s*({.+?})\s*;\s*v1ar\b', + r'=\s*({.+?})\s*,\s*[\da-zA-Z_]+\s*='), webpage, 'config', default='{}'), display_id, transform_source=js_to_json, fatal=False) @@ -69,7 +70,7 @@ class PornComIE(InfoExtractor): 'height': int(height), 'filesize_approx': parse_filesize(filesize), } for format_url, height, filesize in re.findall( - r'<a[^>]+href="(/download/[^"]+)">MPEG4 (\d+)p<span[^>]*>(\d+\s+[a-zA-Z]+)<', + r'<a[^>]+href="(/download/[^"]+)">[^<]*?(\d+)p<span[^>]*>(\d+\s*[a-zA-Z]+)<', webpage)] thumbnail = None duration = None diff --git a/youtube_dl/extractor/pornhd.py b/youtube_dl/extractor/pornhd.py index b52879c7a..27d65d4b9 100644 --- a/youtube_dl/extractor/pornhd.py +++ b/youtube_dl/extractor/pornhd.py @@ -4,9 +4,11 @@ import re from .common import InfoExtractor from ..utils import ( + determine_ext, ExtractorError, int_or_none, js_to_json, + urljoin, ) @@ -14,7 +16,7 @@ class PornHdIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?pornhd\.com/(?:[a-z]{2,4}/)?videos/(?P<id>\d+)(?:/(?P<display_id>.+))?' _TESTS = [{ 'url': 'http://www.pornhd.com/videos/9864/selfie-restroom-masturbation-fun-with-chubby-cutie-hd-porn-video', - 'md5': 'c8b964b1f0a4b5f7f28ae3a5c9f86ad5', + 'md5': '87f1540746c1d32ec7a2305c12b96b25', 'info_dict': { 'id': '9864', 'display_id': 'selfie-restroom-masturbation-fun-with-chubby-cutie-hd-porn-video', @@ -23,6 +25,7 @@ class PornHdIE(InfoExtractor): 'description': 'md5:3748420395e03e31ac96857a8f125b2b', 'thumbnail': r're:^https?://.*\.jpg', 'view_count': int, + 'like_count': int, 'age_limit': 18, } }, { @@ -37,6 +40,7 @@ class PornHdIE(InfoExtractor): 'description': 'md5:8ff0523848ac2b8f9b065ba781ccf294', 'thumbnail': r're:^https?://.*\.jpg', 'view_count': int, + 'like_count': int, 'age_limit': 18, }, 'skip': 'Not available anymore', @@ -65,12 +69,14 @@ class PornHdIE(InfoExtractor): formats = [] for format_id, video_url in sources.items(): + video_url = urljoin(url, video_url) if not video_url: continue height = int_or_none(self._search_regex( r'^(\d+)[pP]', format_id, 'height', default=None)) formats.append({ 'url': video_url, + 'ext': determine_ext(video_url, 'mp4'), 'format_id': format_id, 'height': height, }) @@ -85,6 +91,11 @@ class PornHdIE(InfoExtractor): r"poster'?\s*:\s*([\"'])(?P<url>(?:(?!\1).)+)\1", webpage, 'thumbnail', fatal=False, group='url') + like_count = int_or_none(self._search_regex( + (r'(\d+)\s*</11[^>]+>(?: |\s)*\blikes', + r'class=["\']save-count["\'][^>]*>\s*(\d+)'), + webpage, 'like count', fatal=False)) + return { 'id': video_id, 'display_id': display_id, @@ -92,6 +103,7 @@ class PornHdIE(InfoExtractor): 'description': description, 'thumbnail': thumbnail, 'view_count': view_count, + 'like_count': like_count, 'formats': formats, 'age_limit': 18, } diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 23e24d216..641083da7 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -4,36 +4,53 @@ from __future__ import unicode_literals import functools import itertools import operator -# import os import re from .common import InfoExtractor from ..compat import ( compat_HTTPError, - # compat_urllib_parse_unquote, - # compat_urllib_parse_unquote_plus, - # compat_urllib_parse_urlparse, + compat_str, + compat_urllib_request, ) +from .openload import PhantomJSwrapper from ..utils import ( ExtractorError, int_or_none, - js_to_json, orderedSet, - # sanitized_Request, remove_quotes, str_to_int, + url_or_none, ) -# from ..aes import ( -# aes_decrypt_text -# ) -class PornHubIE(InfoExtractor): +class PornHubBaseIE(InfoExtractor): + def _download_webpage_handle(self, *args, **kwargs): + def dl(*args, **kwargs): + return super(PornHubBaseIE, self)._download_webpage_handle(*args, **kwargs) + + webpage, urlh = dl(*args, **kwargs) + + if any(re.search(p, webpage) for p in ( + r'<body\b[^>]+\bonload=["\']go\(\)', + r'document\.cookie\s*=\s*["\']RNKEY=', + r'document\.location\.reload\(true\)')): + url_or_request = args[0] + url = (url_or_request.get_full_url() + if isinstance(url_or_request, compat_urllib_request.Request) + else url_or_request) + phantom = PhantomJSwrapper(self, required_version='2.0') + phantom.get(url, html=webpage) + webpage, urlh = dl(*args, **kwargs) + + return webpage, urlh + + +class PornHubIE(PornHubBaseIE): IE_DESC = 'PornHub and Thumbzilla' _VALID_URL = r'''(?x) https?:// (?: - (?:[^/]+\.)?pornhub\.com/(?:(?:view_video\.php|video/show)\?viewkey=|embed/)| + (?:[^/]+\.)?(?P<host>pornhub\.(?:com|net))/(?:(?:view_video\.php|video/show)\?viewkey=|embed/)| (?:www\.)?thumbzilla\.com/video/ ) (?P<id>[\da-z]+) @@ -46,6 +63,7 @@ class PornHubIE(InfoExtractor): 'ext': 'mp4', 'title': 'Seductive Indian beauty strips down and fingers her pink pussy', 'uploader': 'Babes', + 'upload_date': '20130628', 'duration': 361, 'view_count': int, 'like_count': int, @@ -62,7 +80,8 @@ class PornHubIE(InfoExtractor): 'id': '1331683002', 'ext': 'mp4', 'title': '重庆婷婷女王足交', - 'uploader': 'cj397186295', + 'uploader': 'Unknown', + 'upload_date': '20150213', 'duration': 1753, 'view_count': int, 'like_count': int, @@ -75,6 +94,31 @@ class PornHubIE(InfoExtractor): 'params': { 'skip_download': True, }, + }, { + # subtitles + 'url': 'https://www.pornhub.com/view_video.php?viewkey=ph5af5fef7c2aa7', + 'info_dict': { + 'id': 'ph5af5fef7c2aa7', + 'ext': 'mp4', + 'title': 'BFFS - Cute Teen Girls Share Cock On the Floor', + 'uploader': 'BFFs', + 'duration': 622, + 'view_count': int, + 'like_count': int, + 'dislike_count': int, + 'comment_count': int, + 'age_limit': 18, + 'tags': list, + 'categories': list, + 'subtitles': { + 'en': [{ + "ext": 'srt' + }] + }, + }, + 'params': { + 'skip_download': True, + }, }, { 'url': 'http://www.pornhub.com/view_video.php?viewkey=ph557bbb6676d2d', 'only_matching': True, @@ -100,12 +144,15 @@ class PornHubIE(InfoExtractor): }, { 'url': 'http://www.pornhub.com/video/show?viewkey=648719015', 'only_matching': True, + }, { + 'url': 'https://www.pornhub.net/view_video.php?viewkey=203640933', + 'only_matching': True, }] @staticmethod def _extract_urls(webpage): return re.findall( - r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//(?:www\.)?pornhub\.com/embed/[\da-z]+)', + r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//(?:www\.)?pornhub\.(?:com|net)/embed/[\da-z]+)', webpage) def _extract_count(self, pattern, webpage, name): @@ -113,15 +160,17 @@ class PornHubIE(InfoExtractor): pattern, webpage, '%s count' % name, fatal=False)) def _real_extract(self, url): - video_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + host = mobj.group('host') or 'pornhub.com' + video_id = mobj.group('id') - self._set_cookie('pornhub.com', 'age_verified', '1') + self._set_cookie(host, 'age_verified', '1') def dl_webpage(platform): - self._set_cookie('pornhub.com', 'platform', platform) + self._set_cookie(host, 'platform', platform) return self._download_webpage( - 'http://www.pornhub.com/view_video.php?viewkey=%s' % video_id, - video_id) + 'http://www.%s/view_video.php?viewkey=%s' % (host, video_id), + video_id, 'Downloading %s webpage' % platform) webpage = dl_webpage('pc') @@ -134,60 +183,114 @@ class PornHubIE(InfoExtractor): 'PornHub said: %s' % error_msg, expected=True, video_id=video_id) - tv_webpage = dl_webpage('tv') - - assignments = self._search_regex( - r'(var.+?mediastring.+?)</script>', tv_webpage, - 'encoded url').split(';') - - js_vars = {} - - def parse_js_value(inp): - inp = re.sub(r'/\*(?:(?!\*/).)*?\*/', '', inp) - if '+' in inp: - inps = inp.split('+') - return functools.reduce( - operator.concat, map(parse_js_value, inps)) - inp = inp.strip() - if inp in js_vars: - return js_vars[inp] - return remove_quotes(inp) - - for assn in assignments: - assn = assn.strip() - if not assn: - continue - assn = re.sub(r'var\s+', '', assn) - vname, value = assn.split('=', 1) - js_vars[vname] = parse_js_value(value) - - video_url = js_vars['mediastring'] - - title = self._search_regex( - r'<h1>([^>]+)</h1>', tv_webpage, 'title', default=None) - # video_title from flashvars contains whitespace instead of non-ASCII (see # http://www.pornhub.com/view_video.php?viewkey=1331683002), not relying # on that anymore. - title = title or self._html_search_meta( + title = self._html_search_meta( 'twitter:title', webpage, default=None) or self._search_regex( (r'<h1[^>]+class=["\']title["\'][^>]*>(?P<title>[^<]+)', r'<div[^>]+data-video-title=(["\'])(?P<title>.+?)\1', r'shareTitle\s*=\s*(["\'])(?P<title>.+?)\1'), webpage, 'title', group='title') + video_urls = [] + video_urls_set = set() + subtitles = {} + flashvars = self._parse_json( self._search_regex( r'var\s+flashvars_\d+\s*=\s*({.+?});', webpage, 'flashvars', default='{}'), video_id) if flashvars: + subtitle_url = url_or_none(flashvars.get('closedCaptionsFile')) + if subtitle_url: + subtitles.setdefault('en', []).append({ + 'url': subtitle_url, + 'ext': 'srt', + }) thumbnail = flashvars.get('image_url') duration = int_or_none(flashvars.get('video_duration')) + media_definitions = flashvars.get('mediaDefinitions') + if isinstance(media_definitions, list): + for definition in media_definitions: + if not isinstance(definition, dict): + continue + video_url = definition.get('videoUrl') + if not video_url or not isinstance(video_url, compat_str): + continue + if video_url in video_urls_set: + continue + video_urls_set.add(video_url) + video_urls.append( + (video_url, int_or_none(definition.get('quality')))) else: - title, thumbnail, duration = [None] * 3 + thumbnail, duration = [None] * 2 + + if not video_urls: + tv_webpage = dl_webpage('tv') + + assignments = self._search_regex( + r'(var.+?mediastring.+?)</script>', tv_webpage, + 'encoded url').split(';') + + js_vars = {} + + def parse_js_value(inp): + inp = re.sub(r'/\*(?:(?!\*/).)*?\*/', '', inp) + if '+' in inp: + inps = inp.split('+') + return functools.reduce( + operator.concat, map(parse_js_value, inps)) + inp = inp.strip() + if inp in js_vars: + return js_vars[inp] + return remove_quotes(inp) + + for assn in assignments: + assn = assn.strip() + if not assn: + continue + assn = re.sub(r'var\s+', '', assn) + vname, value = assn.split('=', 1) + js_vars[vname] = parse_js_value(value) + + video_url = js_vars['mediastring'] + if video_url not in video_urls_set: + video_urls.append((video_url, None)) + video_urls_set.add(video_url) + + for mobj in re.finditer( + r'<a[^>]+\bclass=["\']downloadBtn\b[^>]+\bhref=(["\'])(?P<url>(?:(?!\1).)+)\1', + webpage): + video_url = mobj.group('url') + if video_url not in video_urls_set: + video_urls.append((video_url, None)) + video_urls_set.add(video_url) + + upload_date = None + formats = [] + for video_url, height in video_urls: + if not upload_date: + upload_date = self._search_regex( + r'/(\d{6}/\d{2})/', video_url, 'upload data', default=None) + if upload_date: + upload_date = upload_date.replace('/', '') + tbr = None + mobj = re.search(r'(?P<height>\d+)[pP]?_(?P<tbr>\d+)[kK]', video_url) + if mobj: + if not height: + height = int(mobj.group('height')) + tbr = int(mobj.group('tbr')) + formats.append({ + 'url': video_url, + 'format_id': '%dp' % height if height else None, + 'height': height, + 'tbr': tbr, + }) + self._sort_formats(formats) video_uploader = self._html_search_regex( - r'(?s)From: .+?<(?:a\b[^>]+\bhref=["\']/(?:user|channel)s/|span\b[^>]+\bclass=["\']username)[^>]+>(.+?)<', + r'(?s)From: .+?<(?:a\b[^>]+\bhref=["\']/(?:(?:user|channel)s|model|pornstar)/|span\b[^>]+\bclass=["\']username)[^>]+>(.+?)<', webpage, 'uploader', fatal=False) view_count = self._extract_count( @@ -199,19 +302,17 @@ class PornHubIE(InfoExtractor): comment_count = self._extract_count( r'All Comments\s*<span>\(([\d,.]+)\)', webpage, 'comment') - page_params = self._parse_json(self._search_regex( - r'page_params\.zoneDetails\[([\'"])[^\'"]+\1\]\s*=\s*(?P<data>{[^}]+})', - webpage, 'page parameters', group='data', default='{}'), - video_id, transform_source=js_to_json, fatal=False) - tags = categories = None - if page_params: - tags = page_params.get('tags', '').split(',') - categories = page_params.get('categories', '').split(',') + def extract_list(meta_key): + div = self._search_regex( + r'(?s)<div[^>]+\bclass=["\'].*?\b%sWrapper[^>]*>(.+?)</div>' + % meta_key, webpage, meta_key, default=None) + if div: + return re.findall(r'<a[^>]+\bhref=[^>]+>([^<]+)', div) return { 'id': video_id, - 'url': video_url, 'uploader': video_uploader, + 'upload_date': upload_date, 'title': title, 'thumbnail': thumbnail, 'duration': duration, @@ -219,15 +320,16 @@ class PornHubIE(InfoExtractor): 'like_count': like_count, 'dislike_count': dislike_count, 'comment_count': comment_count, - # 'formats': formats, + 'formats': formats, 'age_limit': 18, - 'tags': tags, - 'categories': categories, + 'tags': extract_list('tags'), + 'categories': extract_list('categories'), + 'subtitles': subtitles, } -class PornHubPlaylistBaseIE(InfoExtractor): - def _extract_entries(self, webpage): +class PornHubPlaylistBaseIE(PornHubBaseIE): + def _extract_entries(self, webpage, host): # Only process container div with main playlist content skipping # drop-down menu that uses similar pattern for videos (see # https://github.com/rg3/youtube-dl/issues/11594). @@ -237,7 +339,7 @@ class PornHubPlaylistBaseIE(InfoExtractor): return [ self.url_result( - 'http://www.pornhub.com/%s' % video_url, + 'http://www.%s/%s' % (host, video_url), PornHubIE.ie_key(), video_title=title) for video_url, title in orderedSet(re.findall( r'href="/?(view_video\.php\?.*\bviewkey=[\da-z]+[^"]*)"[^>]*\s+title="([^"]+)"', @@ -245,11 +347,13 @@ class PornHubPlaylistBaseIE(InfoExtractor): ] def _real_extract(self, url): - playlist_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + host = mobj.group('host') + playlist_id = mobj.group('id') webpage = self._download_webpage(url, playlist_id) - entries = self._extract_entries(webpage) + entries = self._extract_entries(webpage, host) playlist = self._parse_json( self._search_regex( @@ -264,7 +368,7 @@ class PornHubPlaylistBaseIE(InfoExtractor): class PornHubPlaylistIE(PornHubPlaylistBaseIE): - _VALID_URL = r'https?://(?:[^/]+\.)?pornhub\.com/playlist/(?P<id>\d+)' + _VALID_URL = r'https?://(?:[^/]+\.)?(?P<host>pornhub\.(?:com|net))/playlist/(?P<id>\d+)' _TESTS = [{ 'url': 'http://www.pornhub.com/playlist/4667351', 'info_dict': { @@ -279,7 +383,7 @@ class PornHubPlaylistIE(PornHubPlaylistBaseIE): class PornHubUserVideosIE(PornHubPlaylistBaseIE): - _VALID_URL = r'https?://(?:[^/]+\.)?pornhub\.com/(?:user|channel)s/(?P<id>[^/]+)/videos' + _VALID_URL = r'https?://(?:[^/]+\.)?(?P<host>pornhub\.(?:com|net))/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/]+)/videos' _TESTS = [{ 'url': 'http://www.pornhub.com/users/zoe_ph/videos/public', 'info_dict': { @@ -311,10 +415,18 @@ class PornHubUserVideosIE(PornHubPlaylistBaseIE): }, { 'url': 'http://www.pornhub.com/users/zoe_ph/videos/public', 'only_matching': True, + }, { + 'url': 'https://www.pornhub.com/model/jayndrea/videos/upload', + 'only_matching': True, + }, { + 'url': 'https://www.pornhub.com/pornstar/jenny-blighe/videos/upload', + 'only_matching': True, }] def _real_extract(self, url): - user_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + host = mobj.group('host') + user_id = mobj.group('id') entries = [] for page_num in itertools.count(1): @@ -326,7 +438,7 @@ class PornHubUserVideosIE(PornHubPlaylistBaseIE): if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404: break raise - page_entries = self._extract_entries(webpage) + page_entries = self._extract_entries(webpage, host) if not page_entries: break entries.extend(page_entries) diff --git a/youtube_dl/extractor/puhutv.py b/youtube_dl/extractor/puhutv.py new file mode 100644 index 000000000..5465e8ab7 --- /dev/null +++ b/youtube_dl/extractor/puhutv.py @@ -0,0 +1,247 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import ( + compat_HTTPError, + compat_str, +) +from ..utils import ( + ExtractorError, + int_or_none, + float_or_none, + parse_resolution, + str_or_none, + try_get, + unified_timestamp, + url_or_none, + urljoin, +) + + +class PuhuTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?puhutv\.com/(?P<id>[^/?#&]+)-izle' + IE_NAME = 'puhutv' + _TESTS = [{ + # film + 'url': 'https://puhutv.com/sut-kardesler-izle', + 'md5': 'fbd8f2d8e7681f8bcd51b592475a6ae7', + 'info_dict': { + 'id': '5085', + 'display_id': 'sut-kardesler', + 'ext': 'mp4', + 'title': 'Süt Kardeşler', + 'description': 'md5:405fd024df916ca16731114eb18e511a', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 4832.44, + 'creator': 'Arzu Film', + 'timestamp': 1469778212, + 'upload_date': '20160729', + 'release_year': 1976, + 'view_count': int, + 'tags': ['Aile', 'Komedi', 'Klasikler'], + }, + }, { + # episode, geo restricted, bypassable with --geo-verification-proxy + 'url': 'https://puhutv.com/jet-sosyete-1-bolum-izle', + 'only_matching': True, + }, { + # 4k, with subtitles + 'url': 'https://puhutv.com/dip-1-bolum-izle', + 'only_matching': True, + }] + _SUBTITLE_LANGS = { + 'English': 'en', + 'Deutsch': 'de', + 'عربى': 'ar' + } + + def _real_extract(self, url): + display_id = self._match_id(url) + + info = self._download_json( + urljoin(url, '/api/slug/%s-izle' % display_id), + display_id)['data'] + + video_id = compat_str(info['id']) + title = info.get('name') or info['title']['name'] + if info.get('display_name'): + title = '%s %s' % (title, info.get('display_name')) + + try: + videos = self._download_json( + 'https://puhutv.com/api/assets/%s/videos' % video_id, + display_id, 'Downloading video JSON', + headers=self.geo_verification_headers()) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + self.raise_geo_restricted() + raise + + formats = [] + for video in videos['data']['videos']: + media_url = url_or_none(video.get('url')) + if not media_url: + continue + playlist = video.get('is_playlist') + if video.get('stream_type') == 'hls' and playlist is True: + formats.extend(self._extract_m3u8_formats( + media_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + continue + quality = int_or_none(video.get('quality')) + f = { + 'url': media_url, + 'ext': 'mp4', + 'height': quality + } + video_format = video.get('video_format') + if video_format == 'hls' and playlist is False: + format_id = 'hls' + f['protocol'] = 'm3u8_native' + elif video_format == 'mp4': + format_id = 'http' + + else: + continue + if quality: + format_id += '-%sp' % quality + f['format_id'] = format_id + formats.append(f) + self._sort_formats(formats) + + description = try_get( + info, lambda x: x['title']['description'], + compat_str) or info.get('description') + timestamp = unified_timestamp(info.get('created_at')) + creator = try_get( + info, lambda x: x['title']['producer']['name'], compat_str) + + duration = float_or_none( + try_get(info, lambda x: x['content']['duration_in_ms'], int), + scale=1000) + view_count = try_get(info, lambda x: x['content']['watch_count'], int) + + images = try_get( + info, lambda x: x['content']['images']['wide'], dict) or {} + thumbnails = [] + for image_id, image_url in images.items(): + if not isinstance(image_url, compat_str): + continue + if not image_url.startswith(('http', '//')): + image_url = 'https://%s' % image_url + t = parse_resolution(image_id) + t.update({ + 'id': image_id, + 'url': image_url + }) + thumbnails.append(t) + + release_year = try_get(info, lambda x: x['title']['released_at'], int) + + season_number = int_or_none(info.get('season_number')) + season_id = str_or_none(info.get('season_id')) + episode_number = int_or_none(info.get('episode_number')) + + tags = [] + for genre in try_get(info, lambda x: x['title']['genres'], list) or []: + if not isinstance(genre, dict): + continue + genre_name = genre.get('name') + if genre_name and isinstance(genre_name, compat_str): + tags.append(genre_name) + + subtitles = {} + for subtitle in try_get( + info, lambda x: x['content']['subtitles'], list) or []: + if not isinstance(subtitle, dict): + continue + lang = subtitle.get('language') + sub_url = url_or_none(subtitle.get('url')) + if not lang or not isinstance(lang, compat_str) or not sub_url: + continue + subtitles[self._SUBTITLE_LANGS.get(lang, lang)] = [{ + 'url': sub_url + }] + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': description, + 'season_id': season_id, + 'season_number': season_number, + 'episode_number': episode_number, + 'release_year': release_year, + 'timestamp': timestamp, + 'creator': creator, + 'view_count': view_count, + 'duration': duration, + 'tags': tags, + 'subtitles': subtitles, + 'thumbnails': thumbnails, + 'formats': formats + } + + +class PuhuTVSerieIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?puhutv\.com/(?P<id>[^/?#&]+)-detay' + IE_NAME = 'puhutv:serie' + _TESTS = [{ + 'url': 'https://puhutv.com/deniz-yildizi-detay', + 'info_dict': { + 'title': 'Deniz Yıldızı', + 'id': 'deniz-yildizi', + }, + 'playlist_mincount': 205, + }, { + # a film detail page which is using same url with serie page + 'url': 'https://puhutv.com/kaybedenler-kulubu-detay', + 'only_matching': True, + }] + + def _extract_entries(self, seasons): + for season in seasons: + season_id = season.get('id') + if not season_id: + continue + page = 1 + has_more = True + while has_more is True: + season = self._download_json( + 'https://galadriel.puhutv.com/seasons/%s' % season_id, + season_id, 'Downloading page %s' % page, query={ + 'page': page, + 'per': 40, + }) + episodes = season.get('episodes') + if isinstance(episodes, list): + for ep in episodes: + slug_path = str_or_none(ep.get('slugPath')) + if not slug_path: + continue + video_id = str_or_none(int_or_none(ep.get('id'))) + yield self.url_result( + 'https://puhutv.com/%s' % slug_path, + ie=PuhuTVIE.ie_key(), video_id=video_id, + video_title=ep.get('name') or ep.get('eventLabel')) + page += 1 + has_more = season.get('hasMore') + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + info = self._download_json( + urljoin(url, '/api/slug/%s-detay' % playlist_id), + playlist_id)['data'] + + seasons = info.get('seasons') + if seasons: + return self.playlist_result( + self._extract_entries(seasons), playlist_id, info.get('name')) + + # For films, these are using same url with series + video_id = info.get('slug') or info['assets'][0]['slug'] + return self.url_result( + 'https://puhutv.com/%s-izle' % video_id, + PuhuTVIE.ie_key(), video_id) diff --git a/youtube_dl/extractor/radiocanada.py b/youtube_dl/extractor/radiocanada.py index b952e59b4..dd95f99f2 100644 --- a/youtube_dl/extractor/radiocanada.py +++ b/youtube_dl/extractor/radiocanada.py @@ -4,16 +4,12 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_HTTPError from ..utils import ( - xpath_text, - find_xpath_attr, determine_ext, + ExtractorError, int_or_none, unified_strdate, - xpath_element, - ExtractorError, - determine_protocol, - unsmuggle_url, ) @@ -49,107 +45,79 @@ class RadioCanadaIE(InfoExtractor): # m3u8 download 'skip_download': True, }, + }, + { + # with protectionType but not actually DRM protected + 'url': 'radiocanada:toutv:140872', + 'info_dict': { + 'id': '140872', + 'title': 'Épisode 1', + 'series': 'District 31', + }, + 'only_matching': True, } ] + _GEO_COUNTRIES = ['CA'] + _access_token = None + _claims = None - def _real_extract(self, url): - url, smuggled_data = unsmuggle_url(url, {}) - app_code, video_id = re.match(self._VALID_URL, url).groups() - - metadata = self._download_xml( - 'http://api.radio-canada.ca/metaMedia/v1/index.ashx', - video_id, note='Downloading metadata XML', query={ + def _call_api(self, path, video_id=None, app_code=None, query=None): + if not query: + query = {} + query.update({ + 'client_key': '773aea60-0e80-41bb-9c7f-e6d7c3ad17fb', + 'output': 'json', + }) + if video_id: + query.update({ 'appCode': app_code, 'idMedia': video_id, }) + if self._access_token: + query['access_token'] = self._access_token + try: + return self._download_json( + 'https://services.radio-canada.ca/media/' + path, video_id, query=query) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code in (401, 422): + data = self._parse_json(e.cause.read().decode(), None) + error = data.get('error_description') or data['errorMessage']['text'] + raise ExtractorError(error, expected=True) + raise + + def _extract_info(self, app_code, video_id): + metas = self._call_api('meta/v1/index.ashx', video_id, app_code)['Metas'] def get_meta(name): - el = find_xpath_attr(metadata, './/Meta', 'name', name) - return el.text if el is not None else None + for meta in metas: + if meta.get('name') == name: + text = meta.get('text') + if text: + return text + # protectionType does not necessarily mean the video is DRM protected (see + # https://github.com/rg3/youtube-dl/pull/18609). if get_meta('protectionType'): - raise ExtractorError('This video is DRM protected.', expected=True) + self.report_warning('This video is probably DRM protected.') - device_types = ['ipad'] - if not smuggled_data: - device_types.append('flash') - device_types.append('android') - - formats = [] - error = None - # TODO: extract f4m formats - # f4m formats can be extracted using flashhd device_type but they produce unplayable file - for device_type in device_types: - validation_url = 'http://api.radio-canada.ca/validationMedia/v1/Validation.ashx' - query = { - 'appCode': app_code, - 'idMedia': video_id, - 'connectionType': 'broadband', - 'multibitrate': 'true', - 'deviceType': device_type, - } - if smuggled_data: - validation_url = 'https://services.radio-canada.ca/media/validation/v2/' - query.update(smuggled_data) - else: - query.update({ - # paysJ391wsHjbOJwvCs26toz and bypasslock are used to bypass geo-restriction - 'paysJ391wsHjbOJwvCs26toz': 'CA', - 'bypasslock': 'NZt5K62gRqfc', - }) - v_data = self._download_xml(validation_url, video_id, note='Downloading %s XML' % device_type, query=query, fatal=False) - v_url = xpath_text(v_data, 'url') - if not v_url: - continue - if v_url == 'null': - error = xpath_text(v_data, 'message') - continue - ext = determine_ext(v_url) - if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - v_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) - elif ext == 'f4m': - formats.extend(self._extract_f4m_formats( - v_url, video_id, f4m_id='hds', fatal=False)) - else: - ext = determine_ext(v_url) - bitrates = xpath_element(v_data, 'bitrates') - for url_e in bitrates.findall('url'): - tbr = int_or_none(url_e.get('bitrate')) - if not tbr: - continue - f_url = re.sub(r'\d+\.%s' % ext, '%d.%s' % (tbr, ext), v_url) - protocol = determine_protocol({'url': f_url}) - f = { - 'format_id': '%s-%d' % (protocol, tbr), - 'url': f_url, - 'ext': 'flv' if protocol == 'rtmp' else ext, - 'protocol': protocol, - 'width': int_or_none(url_e.get('width')), - 'height': int_or_none(url_e.get('height')), - 'tbr': tbr, - } - mobj = re.match(r'(?P<url>rtmp://[^/]+/[^/]+)/(?P<playpath>[^?]+)(?P<auth>\?.+)', f_url) - if mobj: - f.update({ - 'url': mobj.group('url') + mobj.group('auth'), - 'play_path': mobj.group('playpath'), - }) - formats.append(f) - if protocol == 'rtsp': - base_url = self._search_regex( - r'rtsp://([^?]+)', f_url, 'base url', default=None) - if base_url: - base_url = 'http://' + base_url - formats.extend(self._extract_m3u8_formats( - base_url + '/playlist.m3u8', video_id, 'mp4', - 'm3u8_native', m3u8_id='hls', fatal=False)) - formats.extend(self._extract_f4m_formats( - base_url + '/manifest.f4m', video_id, - f4m_id='hds', fatal=False)) - if not formats and error: + query = { + 'connectionType': 'hd', + 'deviceType': 'ipad', + 'multibitrate': 'true', + } + if self._claims: + query['claims'] = self._claims + v_data = self._call_api('validation/v2/', video_id, app_code, query) + v_url = v_data.get('url') + if not v_url: + error = v_data['message'] + if error == "Le contenu sélectionné n'est pas disponible dans votre pays": + raise self.raise_geo_restricted(error, self._GEO_COUNTRIES) + if error == 'Le contenu sélectionné est disponible seulement en premium': + self.raise_login_required(error) raise ExtractorError( '%s said: %s' % (self.IE_NAME, error), expected=True) + formats = self._extract_m3u8_formats(v_url, video_id, 'mp4') self._sort_formats(formats) subtitles = {} @@ -174,11 +142,14 @@ class RadioCanadaIE(InfoExtractor): 'formats': formats, } + def _real_extract(self, url): + return self._extract_info(*re.match(self._VALID_URL, url).groups()) + class RadioCanadaAudioVideoIE(InfoExtractor): 'radiocanada:audiovideo' - _VALID_URL = r'https?://ici\.radio-canada\.ca/audio-video/media-(?P<id>[0-9]+)' - _TEST = { + _VALID_URL = r'https?://ici\.radio-canada\.ca/([^/]+/)*media-(?P<id>[0-9]+)' + _TESTS = [{ 'url': 'http://ici.radio-canada.ca/audio-video/media-7527184/barack-obama-au-vietnam', 'info_dict': { 'id': '7527184', @@ -191,7 +162,10 @@ class RadioCanadaAudioVideoIE(InfoExtractor): # m3u8 download 'skip_download': True, }, - } + }, { + 'url': 'https://ici.radio-canada.ca/info/videos/media-7527184/barack-obama-au-vietnam', + 'only_matching': True, + }] def _real_extract(self, url): return self.url_result('radiocanada:medianet:%s' % self._match_id(url)) diff --git a/youtube_dl/extractor/radiojavan.py b/youtube_dl/extractor/radiojavan.py index a53ad97a5..3f74f0c01 100644 --- a/youtube_dl/extractor/radiojavan.py +++ b/youtube_dl/extractor/radiojavan.py @@ -4,8 +4,11 @@ import re from .common import InfoExtractor from ..utils import ( - unified_strdate, + parse_resolution, str_to_int, + unified_strdate, + urlencode_postdata, + urljoin, ) @@ -29,13 +32,26 @@ class RadioJavanIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) + download_host = self._download_json( + 'https://www.radiojavan.com/videos/video_host', video_id, + data=urlencode_postdata({'id': video_id}), + headers={ + 'Content-Type': 'application/x-www-form-urlencoded', + 'Referer': url, + }).get('host', 'https://host1.rjmusicmedia.com') + webpage = self._download_webpage(url, video_id) - formats = [{ - 'url': 'https://media.rdjavan.com/media/music_video/%s' % video_path, - 'format_id': '%sp' % height, - 'height': int(height), - } for height, video_path in re.findall(r"RJ\.video(\d+)p\s*=\s*'/?([^']+)'", webpage)] + formats = [] + for format_id, _, video_path in re.findall( + r'RJ\.video(?P<format_id>\d+[pPkK])\s*=\s*(["\'])(?P<url>(?:(?!\2).)+)\2', + webpage): + f = parse_resolution(format_id) + f.update({ + 'url': urljoin(download_host, video_path), + 'format_id': format_id, + }) + formats.append(f) self._sort_formats(formats) title = self._og_search_title(webpage) diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py index d22311031..149153b8f 100644 --- a/youtube_dl/extractor/rai.py +++ b/youtube_dl/extractor/rai.py @@ -32,6 +32,9 @@ class RaiBaseIE(InfoExtractor): _GEO_BYPASS = False def _extract_relinker_info(self, relinker_url, video_id): + if not re.match(r'https?://', relinker_url): + return {'formats': [{'url': relinker_url}]} + formats = [] geoprotection = None is_live = None @@ -271,7 +274,6 @@ class RaiPlayPlaylistIE(InfoExtractor): ('programma', 'nomeProgramma'), webpage, 'title') description = unescapeHTML(self._html_search_meta( ('description', 'og:description'), webpage, 'description')) - print(description) entries = [] for mobj in re.finditer( @@ -286,7 +288,7 @@ class RaiPlayPlaylistIE(InfoExtractor): class RaiIE(RaiBaseIE): - _VALID_URL = r'https?://[^/]+\.(?:rai\.(?:it|tv)|rainews\.it)/dl/.+?-(?P<id>%s)(?:-.+?)?\.html' % RaiBaseIE._UUID_RE + _VALID_URL = r'https?://[^/]+\.(?:rai\.(?:it|tv)|rainews\.it)/.+?-(?P<id>%s)(?:-.+?)?\.html' % RaiBaseIE._UUID_RE _TESTS = [{ # var uniquename = "ContentItem-..." # data-id="ContentItem-..." @@ -369,6 +371,13 @@ class RaiIE(RaiBaseIE): 'params': { 'skip_download': True, }, + }, { + # Direct MMS URL + 'url': 'http://www.rai.it/dl/RaiTV/programmi/media/ContentItem-b63a4089-ac28-48cf-bca5-9f5b5bc46df5.html', + 'only_matching': True, + }, { + 'url': 'https://www.rainews.it/tgr/marche/notiziari/video/2019/02/ContentItem-6ba945a2-889c-4a80-bdeb-8489c70a8db9.html', + 'only_matching': True, }] def _extract_from_content_id(self, content_id, url): diff --git a/youtube_dl/extractor/raywenderlich.py b/youtube_dl/extractor/raywenderlich.py index 640c3ee23..5411ece21 100644 --- a/youtube_dl/extractor/raywenderlich.py +++ b/youtube_dl/extractor/raywenderlich.py @@ -4,24 +4,37 @@ import re from .common import InfoExtractor from .vimeo import VimeoIE +from ..compat import compat_str from ..utils import ( - extract_attributes, ExtractorError, - smuggle_url, - unsmuggle_url, + int_or_none, + merge_dicts, + try_get, + unescapeHTML, + unified_timestamp, urljoin, ) class RayWenderlichIE(InfoExtractor): - _VALID_URL = r'https?://videos\.raywenderlich\.com/courses/(?P<course_id>[^/]+)/lessons/(?P<id>\d+)' + _VALID_URL = r'''(?x) + https?:// + (?: + videos\.raywenderlich\.com/courses| + (?:www\.)?raywenderlich\.com + )/ + (?P<course_id>[^/]+)/lessons/(?P<id>\d+) + ''' _TESTS = [{ - 'url': 'https://videos.raywenderlich.com/courses/105-testing-in-ios/lessons/1', + 'url': 'https://www.raywenderlich.com/3530-testing-in-ios/lessons/1', 'info_dict': { 'id': '248377018', 'ext': 'mp4', - 'title': 'Testing In iOS Episode 1: Introduction', + 'title': 'Introduction', + 'description': 'md5:804d031b3efa9fcb49777d512d74f722', + 'timestamp': 1513906277, + 'upload_date': '20171222', 'duration': 133, 'uploader': 'Ray Wenderlich', 'uploader_id': 'user3304672', @@ -34,69 +47,133 @@ class RayWenderlichIE(InfoExtractor): 'expected_warnings': ['HTTP Error 403: Forbidden'], }, { 'url': 'https://videos.raywenderlich.com/courses/105-testing-in-ios/lessons/1', + 'only_matching': True, + }] + + @staticmethod + def _extract_video_id(data, lesson_id): + if not data: + return + groups = try_get(data, lambda x: x['groups'], list) or [] + if not groups: + return + for group in groups: + if not isinstance(group, dict): + continue + contents = try_get(data, lambda x: x['contents'], list) or [] + for content in contents: + if not isinstance(content, dict): + continue + ordinal = int_or_none(content.get('ordinal')) + if ordinal != lesson_id: + continue + video_id = content.get('identifier') + if video_id: + return compat_str(video_id) + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + course_id, lesson_id = mobj.group('course_id', 'id') + display_id = '%s/%s' % (course_id, lesson_id) + + webpage = self._download_webpage(url, display_id) + + thumbnail = self._og_search_thumbnail( + webpage, default=None) or self._html_search_meta( + 'twitter:image', webpage, 'thumbnail') + + if '>Subscribe to unlock' in webpage: + raise ExtractorError( + 'This content is only available for subscribers', + expected=True) + + info = { + 'thumbnail': thumbnail, + } + + vimeo_id = self._search_regex( + r'data-vimeo-id=["\'](\d+)', webpage, 'vimeo id', default=None) + + if not vimeo_id: + data = self._parse_json( + self._search_regex( + r'data-collection=(["\'])(?P<data>{.+?})\1', webpage, + 'data collection', default='{}', group='data'), + display_id, transform_source=unescapeHTML, fatal=False) + video_id = self._extract_video_id( + data, lesson_id) or self._search_regex( + r'/videos/(\d+)/', thumbnail, 'video id') + headers = { + 'Referer': url, + 'X-Requested-With': 'XMLHttpRequest', + } + csrf_token = self._html_search_meta( + 'csrf-token', webpage, 'csrf token', default=None) + if csrf_token: + headers['X-CSRF-Token'] = csrf_token + video = self._download_json( + 'https://videos.raywenderlich.com/api/v1/videos/%s.json' + % video_id, display_id, headers=headers)['video'] + vimeo_id = video['clips'][0]['provider_id'] + info.update({ + '_type': 'url_transparent', + 'title': video.get('name'), + 'description': video.get('description') or video.get( + 'meta_description'), + 'duration': int_or_none(video.get('duration')), + 'timestamp': unified_timestamp(video.get('created_at')), + }) + + return merge_dicts(info, self.url_result( + VimeoIE._smuggle_referrer( + 'https://player.vimeo.com/video/%s' % vimeo_id, url), + ie=VimeoIE.ie_key(), video_id=vimeo_id)) + + +class RayWenderlichCourseIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?:// + (?: + videos\.raywenderlich\.com/courses| + (?:www\.)?raywenderlich\.com + )/ + (?P<id>[^/]+) + ''' + + _TEST = { + 'url': 'https://www.raywenderlich.com/3530-testing-in-ios', 'info_dict': { 'title': 'Testing in iOS', - 'id': '105-testing-in-ios', + 'id': '3530-testing-in-ios', }, 'params': { 'noplaylist': False, }, 'playlist_count': 29, - }] + } + + @classmethod + def suitable(cls, url): + return False if RayWenderlichIE.suitable(url) else super( + RayWenderlichCourseIE, cls).suitable(url) def _real_extract(self, url): - url, smuggled_data = unsmuggle_url(url, {}) + course_id = self._match_id(url) - mobj = re.match(self._VALID_URL, url) - course_id, lesson_id = mobj.group('course_id', 'id') - video_id = '%s/%s' % (course_id, lesson_id) - - webpage = self._download_webpage(url, video_id) - - no_playlist = self._downloader.params.get('noplaylist') - if no_playlist or smuggled_data.get('force_video', False): - if no_playlist: - self.to_screen( - 'Downloading just video %s because of --no-playlist' - % video_id) - if '>Subscribe to unlock' in webpage: - raise ExtractorError( - 'This content is only available for subscribers', - expected=True) - vimeo_id = self._search_regex( - r'data-vimeo-id=["\'](\d+)', webpage, 'video id') - return self.url_result( - VimeoIE._smuggle_referrer( - 'https://player.vimeo.com/video/%s' % vimeo_id, url), - ie=VimeoIE.ie_key(), video_id=vimeo_id) - - self.to_screen( - 'Downloading playlist %s - add --no-playlist to just download video' - % course_id) - - lesson_ids = set((lesson_id, )) - for lesson in re.findall( - r'(<a[^>]+\bclass=["\']lesson-link[^>]+>)', webpage): - attrs = extract_attributes(lesson) - if not attrs: - continue - lesson_url = attrs.get('href') - if not lesson_url: - continue - lesson_id = self._search_regex( - r'/lessons/(\d+)', lesson_url, 'lesson id', default=None) - if not lesson_id: - continue - lesson_ids.add(lesson_id) + webpage = self._download_webpage(url, course_id) entries = [] - for lesson_id in sorted(lesson_ids): + lesson_urls = set() + for lesson_url in re.findall( + r'<a[^>]+\bhref=["\'](/%s/lessons/\d+)' % course_id, webpage): + if lesson_url in lesson_urls: + continue + lesson_urls.add(lesson_url) entries.append(self.url_result( - smuggle_url(urljoin(url, lesson_id), {'force_video': True}), - ie=RayWenderlichIE.ie_key())) + urljoin(url, lesson_url), ie=RayWenderlichIE.ie_key())) - title = self._search_regex( - r'class=["\']course-title[^>]+>([^<]+)', webpage, 'course title', - default=None) + title = self._og_search_title( + webpage, default=None) or self._html_search_meta( + 'twitter:title', webpage, 'title', default=None) return self.playlist_result(entries, course_id, title) diff --git a/youtube_dl/extractor/redbulltv.py b/youtube_dl/extractor/redbulltv.py index 243603676..7e8d58f38 100644 --- a/youtube_dl/extractor/redbulltv.py +++ b/youtube_dl/extractor/redbulltv.py @@ -10,7 +10,7 @@ from ..utils import ( class RedBullTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?redbull\.tv/video/(?P<id>AP-\w+)' + _VALID_URL = r'https?://(?:www\.)?redbull(?:\.tv|\.com/(?:[^/]+/)?tv)/video/(?P<id>AP-\w+)' _TESTS = [{ # film 'url': 'https://www.redbull.tv/video/AP-1Q6XCDTAN1W11', @@ -35,6 +35,9 @@ class RedBullTVIE(InfoExtractor): 'params': { 'skip_download': True, }, + }, { + 'url': 'https://www.redbull.com/int-en/tv/video/AP-1UWHCAR9S1W11/rob-meets-sam-gaze?playlist=playlists::3f81040a-2f31-4832-8e2e-545b1d39d173', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/redtube.py b/youtube_dl/extractor/redtube.py index 879bcf81d..10311a81a 100644 --- a/youtube_dl/extractor/redtube.py +++ b/youtube_dl/extractor/redtube.py @@ -3,12 +3,12 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( ExtractorError, int_or_none, str_to_int, unified_strdate, + url_or_none, ) @@ -71,8 +71,8 @@ class RedTubeIE(InfoExtractor): video_id, fatal=False) if medias and isinstance(medias, list): for media in medias: - format_url = media.get('videoUrl') - if not format_url or not isinstance(format_url, compat_str): + format_url = url_or_none(media.get('videoUrl')) + if not format_url: continue format_id = media.get('quality') formats.append({ diff --git a/youtube_dl/extractor/rentv.py b/youtube_dl/extractor/rentv.py index 8bcf87126..7c8909d95 100644 --- a/youtube_dl/extractor/rentv.py +++ b/youtube_dl/extractor/rentv.py @@ -6,6 +6,7 @@ from ..compat import compat_str from ..utils import ( determine_ext, int_or_none, + url_or_none, ) @@ -37,8 +38,8 @@ class RENTVIE(InfoExtractor): title = config['title'] formats = [] for video in config['src']: - src = video.get('src') - if not src or not isinstance(src, compat_str): + src = url_or_none(video.get('src')) + if not src: continue ext = determine_ext(src) if ext == 'm3u8': diff --git a/youtube_dl/extractor/rmcdecouverte.py b/youtube_dl/extractor/rmcdecouverte.py index e921ca3e6..c3623edcc 100644 --- a/youtube_dl/extractor/rmcdecouverte.py +++ b/youtube_dl/extractor/rmcdecouverte.py @@ -1,38 +1,46 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor from .brightcove import BrightcoveLegacyIE from ..compat import ( compat_parse_qs, compat_urlparse, ) +from ..utils import smuggle_url class RMCDecouverteIE(InfoExtractor): - _VALID_URL = r'https?://rmcdecouverte\.bfmtv\.com/mediaplayer-replay.*?\bid=(?P<id>\d+)' + _VALID_URL = r'https?://rmcdecouverte\.bfmtv\.com/(?:(?:[^/]+/)*program_(?P<id>\d+)|(?P<live_id>mediaplayer-direct))' - _TEST = { - 'url': 'http://rmcdecouverte.bfmtv.com/mediaplayer-replay/?id=13502&title=AQUAMEN:LES%20ROIS%20DES%20AQUARIUMS%20:UN%20DELICIEUX%20PROJET', + _TESTS = [{ + 'url': 'https://rmcdecouverte.bfmtv.com/wheeler-dealers-occasions-a-saisir/program_2566/', 'info_dict': { - 'id': '5419055995001', + 'id': '5983675500001', 'ext': 'mp4', - 'title': 'UN DELICIEUX PROJET', - 'description': 'md5:63610df7c8b1fc1698acd4d0d90ba8b5', + 'title': 'CORVETTE', + 'description': 'md5:c1e8295521e45ffebf635d6a7658f506', 'uploader_id': '1969646226001', - 'upload_date': '20170502', - 'timestamp': 1493745308, + 'upload_date': '20181226', + 'timestamp': 1545861635, }, 'params': { 'skip_download': True, }, 'skip': 'only available for a week', - } + }, { + # live, geo restricted, bypassable + 'url': 'https://rmcdecouverte.bfmtv.com/mediaplayer-direct/', + 'only_matching': True, + }] BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1969646226001/default_default/index.html?videoId=%s' def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + mobj = re.match(self._VALID_URL, url) + display_id = mobj.group('id') or mobj.group('live_id') + webpage = self._download_webpage(url, display_id) brightcove_legacy_url = BrightcoveLegacyIE._extract_brightcove_url(webpage) if brightcove_legacy_url: brightcove_id = compat_parse_qs(compat_urlparse.urlparse( @@ -41,5 +49,7 @@ class RMCDecouverteIE(InfoExtractor): brightcove_id = self._search_regex( r'data-video-id=["\'](\d+)', webpage, 'brightcove id') return self.url_result( - self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, 'BrightcoveNew', - brightcove_id) + smuggle_url( + self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, + {'geo_countries': ['FR']}), + 'BrightcoveNew', brightcove_id) diff --git a/youtube_dl/extractor/rtbf.py b/youtube_dl/extractor/rtbf.py index 28cc5522d..3b0f3080b 100644 --- a/youtube_dl/extractor/rtbf.py +++ b/youtube_dl/extractor/rtbf.py @@ -1,10 +1,14 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..utils import ( - int_or_none, ExtractorError, + float_or_none, + int_or_none, + strip_or_none, ) @@ -14,20 +18,19 @@ class RTBFIE(InfoExtractor): (?: video/[^?]+\?.*\bid=| ouftivi/(?:[^/]+/)*[^?]+\?.*\bvideoId=| - auvio/[^/]+\?.*id= + auvio/[^/]+\?.*\b(?P<live>l)?id= )(?P<id>\d+)''' _TESTS = [{ 'url': 'https://www.rtbf.be/video/detail_les-diables-au-coeur-episode-2?id=1921274', - 'md5': '799f334ddf2c0a582ba80c44655be570', + 'md5': '8c876a1cceeb6cf31b476461ade72384', 'info_dict': { 'id': '1921274', 'ext': 'mp4', 'title': 'Les Diables au coeur (épisode 2)', - 'description': 'Football - Diables Rouges', - 'duration': 3099, + 'description': '(du 25/04/2014)', + 'duration': 3099.54, 'upload_date': '20140425', - 'timestamp': 1398456336, - 'uploader': 'rtbfsport', + 'timestamp': 1398456300, } }, { # geo restricted @@ -39,6 +42,18 @@ class RTBFIE(InfoExtractor): }, { 'url': 'http://www.rtbf.be/auvio/detail_jeudi-en-prime-siegfried-bracke?id=2102996', 'only_matching': True, + }, { + # Live + 'url': 'https://www.rtbf.be/auvio/direct_pure-fm?lid=134775', + 'only_matching': True, + }, { + # Audio + 'url': 'https://www.rtbf.be/auvio/detail_cinq-heures-cinema?id=2360811', + 'only_matching': True, + }, { + # With Subtitle + 'url': 'https://www.rtbf.be/auvio/detail_les-carnets-du-bourlingueur?id=2361588', + 'only_matching': True, }] _IMAGE_HOST = 'http://ds1.ds.static.rtbf.be' _PROVIDERS = { @@ -53,46 +68,94 @@ class RTBFIE(InfoExtractor): ] def _real_extract(self, url): - video_id = self._match_id(url) - data = self._download_json( - 'http://www.rtbf.be/api/media/video?method=getVideoDetail&args[]=%s' % video_id, video_id) + live, media_id = re.match(self._VALID_URL, url).groups() + embed_page = self._download_webpage( + 'https://www.rtbf.be/auvio/embed/' + ('direct' if live else 'media'), + media_id, query={'id': media_id}) + data = self._parse_json(self._html_search_regex( + r'data-media="([^"]+)"', embed_page, 'media data'), media_id) error = data.get('error') if error: raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True) - data = data['data'] - provider = data.get('provider') if provider in self._PROVIDERS: return self.url_result(data['url'], self._PROVIDERS[provider]) + title = data['title'] + is_live = data.get('isLive') + if is_live: + title = self._live_title(title) + height_re = r'-(\d+)p\.' formats = [] - for key, format_id in self._QUALITIES: - format_url = data.get(key + 'Url') - if format_url: + + m3u8_url = data.get('urlHlsAes128') or data.get('urlHls') + if m3u8_url: + formats.extend(self._extract_m3u8_formats( + m3u8_url, media_id, 'mp4', m3u8_id='hls', fatal=False)) + + fix_url = lambda x: x.replace('//rtbf-vod.', '//rtbf.') if '/geo/drm/' in x else x + http_url = data.get('url') + if formats and http_url and re.search(height_re, http_url): + http_url = fix_url(http_url) + for m3u8_f in formats[:]: + height = m3u8_f.get('height') + if not height: + continue + f = m3u8_f.copy() + del f['protocol'] + f.update({ + 'format_id': m3u8_f['format_id'].replace('hls-', 'http-'), + 'url': re.sub(height_re, '-%dp.' % height, http_url), + }) + formats.append(f) + else: + sources = data.get('sources') or {} + for key, format_id in self._QUALITIES: + format_url = sources.get(key) + if not format_url: + continue + height = int_or_none(self._search_regex( + height_re, format_url, 'height', default=None)) formats.append({ 'format_id': format_id, - 'url': format_url, + 'url': fix_url(format_url), + 'height': height, }) - thumbnails = [] - for thumbnail_id, thumbnail_url in data.get('thumbnail', {}).items(): - if thumbnail_id != 'default': - thumbnails.append({ - 'url': self._IMAGE_HOST + thumbnail_url, - 'id': thumbnail_id, - }) + mpd_url = data.get('urlDash') + if not data.get('drm') and mpd_url: + formats.extend(self._extract_mpd_formats( + mpd_url, media_id, mpd_id='dash', fatal=False)) + + audio_url = data.get('urlAudio') + if audio_url: + formats.append({ + 'format_id': 'audio', + 'url': audio_url, + 'vcodec': 'none', + }) + self._sort_formats(formats) + + subtitles = {} + for track in (data.get('tracks') or {}).values(): + sub_url = track.get('url') + if not sub_url: + continue + subtitles.setdefault(track.get('lang') or 'fr', []).append({ + 'url': sub_url, + }) return { - 'id': video_id, + 'id': media_id, 'formats': formats, - 'title': data['title'], - 'description': data.get('description') or data.get('subtitle'), - 'thumbnails': thumbnails, - 'duration': data.get('duration') or data.get('realDuration'), - 'timestamp': int_or_none(data.get('created')), - 'view_count': int_or_none(data.get('viewCount')), - 'uploader': data.get('channel'), - 'tags': data.get('tags'), + 'title': title, + 'description': strip_or_none(data.get('description')), + 'thumbnail': data.get('thumbnail'), + 'duration': float_or_none(data.get('realDuration')), + 'timestamp': int_or_none(data.get('liveFrom')), + 'series': data.get('programLabel'), + 'subtitles': subtitles, + 'is_live': is_live, } diff --git a/youtube_dl/extractor/rte.py b/youtube_dl/extractor/rte.py index a6fac6c35..1fbc72915 100644 --- a/youtube_dl/extractor/rte.py +++ b/youtube_dl/extractor/rte.py @@ -8,7 +8,10 @@ from ..compat import compat_HTTPError from ..utils import ( float_or_none, parse_iso8601, + str_or_none, + try_get, unescapeHTML, + url_or_none, ExtractorError, ) @@ -17,65 +20,87 @@ class RteBaseIE(InfoExtractor): def _real_extract(self, url): item_id = self._match_id(url) - try: - json_string = self._download_json( - 'http://www.rte.ie/rteavgen/getplaylist/?type=web&format=json&id=' + item_id, - item_id) - except ExtractorError as ee: - if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404: - error_info = self._parse_json(ee.cause.read().decode(), item_id, fatal=False) - if error_info: - raise ExtractorError( - '%s said: %s' % (self.IE_NAME, error_info['message']), - expected=True) - raise - - # NB the string values in the JSON are stored using XML escaping(!) - show = json_string['shows'][0] - title = unescapeHTML(show['title']) - description = unescapeHTML(show.get('description')) - thumbnail = show.get('thumbnail') - duration = float_or_none(show.get('duration'), 1000) - timestamp = parse_iso8601(show.get('published')) - - mg = show['media:group'][0] - + info_dict = {} formats = [] - if mg.get('url'): - m = re.match(r'(?P<url>rtmpe?://[^/]+)/(?P<app>.+)/(?P<playpath>mp4:.*)', mg['url']) - if m: - m = m.groupdict() - formats.append({ - 'url': m['url'] + '/' + m['app'], - 'app': m['app'], - 'play_path': m['playpath'], - 'player_url': url, - 'ext': 'flv', - 'format_id': 'rtmp', - }) + ENDPOINTS = ( + 'https://feeds.rasset.ie/rteavgen/player/playlist?type=iptv&format=json&showId=', + 'http://www.rte.ie/rteavgen/getplaylist/?type=web&format=json&id=', + ) - if mg.get('hls_server') and mg.get('hls_url'): - formats.extend(self._extract_m3u8_formats( - mg['hls_server'] + mg['hls_url'], item_id, 'mp4', - entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) + for num, ep_url in enumerate(ENDPOINTS, start=1): + try: + data = self._download_json(ep_url + item_id, item_id) + except ExtractorError as ee: + if num < len(ENDPOINTS) or formats: + continue + if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404: + error_info = self._parse_json(ee.cause.read().decode(), item_id, fatal=False) + if error_info: + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, error_info['message']), + expected=True) + raise - if mg.get('hds_server') and mg.get('hds_url'): - formats.extend(self._extract_f4m_formats( - mg['hds_server'] + mg['hds_url'], item_id, - f4m_id='hds', fatal=False)) + # NB the string values in the JSON are stored using XML escaping(!) + show = try_get(data, lambda x: x['shows'][0], dict) + if not show: + continue + + if not info_dict: + title = unescapeHTML(show['title']) + description = unescapeHTML(show.get('description')) + thumbnail = show.get('thumbnail') + duration = float_or_none(show.get('duration'), 1000) + timestamp = parse_iso8601(show.get('published')) + info_dict = { + 'id': item_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'timestamp': timestamp, + 'duration': duration, + } + + mg = try_get(show, lambda x: x['media:group'][0], dict) + if not mg: + continue + + if mg.get('url'): + m = re.match(r'(?P<url>rtmpe?://[^/]+)/(?P<app>.+)/(?P<playpath>mp4:.*)', mg['url']) + if m: + m = m.groupdict() + formats.append({ + 'url': m['url'] + '/' + m['app'], + 'app': m['app'], + 'play_path': m['playpath'], + 'player_url': url, + 'ext': 'flv', + 'format_id': 'rtmp', + }) + + if mg.get('hls_server') and mg.get('hls_url'): + formats.extend(self._extract_m3u8_formats( + mg['hls_server'] + mg['hls_url'], item_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) + + if mg.get('hds_server') and mg.get('hds_url'): + formats.extend(self._extract_f4m_formats( + mg['hds_server'] + mg['hds_url'], item_id, + f4m_id='hds', fatal=False)) + + mg_rte_server = str_or_none(mg.get('rte:server')) + mg_url = str_or_none(mg.get('url')) + if mg_rte_server and mg_url: + hds_url = url_or_none(mg_rte_server + mg_url) + if hds_url: + formats.extend(self._extract_f4m_formats( + hds_url, item_id, f4m_id='hds', fatal=False)) self._sort_formats(formats) - return { - 'id': item_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'timestamp': timestamp, - 'duration': duration, - 'formats': formats, - } + info_dict['formats'] = formats + return info_dict class RteIE(RteBaseIE): diff --git a/youtube_dl/extractor/rutube.py b/youtube_dl/extractor/rutube.py index 89d89b65a..8f54d5675 100644 --- a/youtube_dl/extractor/rutube.py +++ b/youtube_dl/extractor/rutube.py @@ -16,11 +16,22 @@ from ..utils import ( int_or_none, try_get, unified_timestamp, + url_or_none, ) class RutubeBaseIE(InfoExtractor): - def _extract_video(self, video, video_id=None, require_title=True): + def _download_api_info(self, video_id, query=None): + if not query: + query = {} + query['format'] = 'json' + return self._download_json( + 'http://rutube.ru/api/video/%s/' % video_id, + video_id, 'Downloading video JSON', + 'Unable to download video JSON', query=query) + + @staticmethod + def _extract_info(video, video_id=None, require_title=True): title = video['title'] if require_title else video.get('title') age_limit = video.get('is_adult') @@ -31,7 +42,7 @@ class RutubeBaseIE(InfoExtractor): category = try_get(video, lambda x: x['category']['name']) return { - 'id': video.get('id') or video_id, + 'id': video.get('id') or video_id if video_id else video['id'], 'title': title, 'description': video.get('description'), 'thumbnail': video.get('thumbnail_url'), @@ -46,6 +57,42 @@ class RutubeBaseIE(InfoExtractor): 'is_live': bool_or_none(video.get('is_livestream')), } + def _download_and_extract_info(self, video_id, query=None): + return self._extract_info( + self._download_api_info(video_id, query=query), video_id) + + def _download_api_options(self, video_id, query=None): + if not query: + query = {} + query['format'] = 'json' + return self._download_json( + 'http://rutube.ru/api/play/options/%s/' % video_id, + video_id, 'Downloading options JSON', + 'Unable to download options JSON', + headers=self.geo_verification_headers(), query=query) + + def _extract_formats(self, options, video_id): + formats = [] + for format_id, format_url in options['video_balancer'].items(): + ext = determine_ext(format_url) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', m3u8_id=format_id, fatal=False)) + elif ext == 'f4m': + formats.extend(self._extract_f4m_formats( + format_url, video_id, f4m_id=format_id, fatal=False)) + else: + formats.append({ + 'url': format_url, + 'format_id': format_id, + }) + self._sort_formats(formats) + return formats + + def _download_and_extract_formats(self, video_id, query=None): + return self._extract_formats( + self._download_api_options(video_id, query=query), video_id) + class RutubeIE(RutubeBaseIE): IE_NAME = 'rutube' @@ -54,13 +101,13 @@ class RutubeIE(RutubeBaseIE): _TESTS = [{ 'url': 'http://rutube.ru/video/3eac3b4561676c17df9132a9a1e62e3e/', - 'md5': '79938ade01294ef7e27574890d0d3769', + 'md5': '1d24f180fac7a02f3900712e5a5764d6', 'info_dict': { 'id': '3eac3b4561676c17df9132a9a1e62e3e', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Раненный кенгуру забежал в аптеку', 'description': 'http://www.ntdtv.ru ', - 'duration': 80, + 'duration': 81, 'uploader': 'NTDRussian', 'uploader_id': '29790', 'timestamp': 1381943602, @@ -93,38 +140,12 @@ class RutubeIE(RutubeBaseIE): def _real_extract(self, url): video_id = self._match_id(url) - - video = self._download_json( - 'http://rutube.ru/api/video/%s/?format=json' % video_id, - video_id, 'Downloading video JSON') - - info = self._extract_video(video, video_id) - - options = self._download_json( - 'http://rutube.ru/api/play/options/%s/?format=json' % video_id, - video_id, 'Downloading options JSON') - - formats = [] - for format_id, format_url in options['video_balancer'].items(): - ext = determine_ext(format_url) - if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - format_url, video_id, 'mp4', m3u8_id=format_id, fatal=False)) - elif ext == 'f4m': - formats.extend(self._extract_f4m_formats( - format_url, video_id, f4m_id=format_id, fatal=False)) - else: - formats.append({ - 'url': format_url, - 'format_id': format_id, - }) - self._sort_formats(formats) - - info['formats'] = formats + info = self._download_and_extract_info(video_id) + info['formats'] = self._download_and_extract_formats(video_id) return info -class RutubeEmbedIE(InfoExtractor): +class RutubeEmbedIE(RutubeBaseIE): IE_NAME = 'rutube:embed' IE_DESC = 'Rutube embedded videos' _VALID_URL = r'https?://rutube\.ru/(?:video|play)/embed/(?P<id>[0-9]+)' @@ -133,7 +154,7 @@ class RutubeEmbedIE(InfoExtractor): 'url': 'http://rutube.ru/video/embed/6722881?vk_puid37=&vk_puid38=', 'info_dict': { 'id': 'a10e53b86e8f349080f718582ce4c661', - 'ext': 'flv', + 'ext': 'mp4', 'timestamp': 1387830582, 'upload_date': '20131223', 'uploader_id': '297833', @@ -147,16 +168,26 @@ class RutubeEmbedIE(InfoExtractor): }, { 'url': 'http://rutube.ru/play/embed/8083783', 'only_matching': True, + }, { + # private video + 'url': 'https://rutube.ru/play/embed/10631925?p=IbAigKqWd1do4mjaM5XLIQ', + 'only_matching': True, }] def _real_extract(self, url): embed_id = self._match_id(url) - webpage = self._download_webpage(url, embed_id) - - canonical_url = self._html_search_regex( - r'<link\s+rel="canonical"\s+href="([^"]+?)"', webpage, - 'Canonical URL') - return self.url_result(canonical_url, RutubeIE.ie_key()) + # Query may contain private videos token and should be passed to API + # requests (see #19163) + query = compat_parse_qs(compat_urllib_parse_urlparse(url).query) + options = self._download_api_options(embed_id, query) + video_id = options['effective_video'] + formats = self._extract_formats(options, video_id) + info = self._download_and_extract_info(video_id, query) + info.update({ + 'extractor_key': 'Rutube', + 'formats': formats, + }) + return info class RutubePlaylistBaseIE(RutubeBaseIE): @@ -176,10 +207,10 @@ class RutubePlaylistBaseIE(RutubeBaseIE): break for result in results: - video_url = result.get('video_url') - if not video_url or not isinstance(video_url, compat_str): + video_url = url_or_none(result.get('video_url')) + if not video_url: continue - entry = self._extract_video(result, require_title=False) + entry = self._extract_info(result, require_title=False) entry.update({ '_type': 'url', 'url': video_url, diff --git a/youtube_dl/extractor/ruutu.py b/youtube_dl/extractor/ruutu.py index 9fa8688f8..f530f0083 100644 --- a/youtube_dl/extractor/ruutu.py +++ b/youtube_dl/extractor/ruutu.py @@ -65,7 +65,8 @@ class RuutuIE(InfoExtractor): video_id = self._match_id(url) video_xml = self._download_xml( - 'http://gatling.ruutu.fi/media-xml-cache?id=%s' % video_id, video_id) + 'https://gatling.nelonenmedia.fi/media-xml-cache', video_id, + query={'id': video_id}) formats = [] processed_urls = [] diff --git a/youtube_dl/extractor/safari.py b/youtube_dl/extractor/safari.py index 30e2a38b4..c0d32a1b9 100644 --- a/youtube_dl/extractor/safari.py +++ b/youtube_dl/extractor/safari.py @@ -15,10 +15,10 @@ from ..utils import ( class SafariBaseIE(InfoExtractor): - _LOGIN_URL = 'https://www.safaribooksonline.com/accounts/login/' + _LOGIN_URL = 'https://learning.oreilly.com/accounts/login/' _NETRC_MACHINE = 'safari' - _API_BASE = 'https://www.safaribooksonline.com/api/v1' + _API_BASE = 'https://learning.oreilly.com/api/v1' _API_FORMAT = 'json' LOGGED_IN = False @@ -76,7 +76,7 @@ class SafariIE(SafariBaseIE): IE_DESC = 'safaribooksonline.com online video' _VALID_URL = r'''(?x) https?:// - (?:www\.)?safaribooksonline\.com/ + (?:www\.)?(?:safaribooksonline|learning\.oreilly)\.com/ (?: library/view/[^/]+/(?P<course_id>[^/]+)/(?P<part>[^/?\#&]+)\.html| videos/[^/]+/[^/]+/(?P<reference_id>[^-]+-[^/?\#&]+) @@ -104,6 +104,9 @@ class SafariIE(SafariBaseIE): }, { 'url': 'https://www.safaribooksonline.com/videos/python-programming-language/9780134217314/9780134217314-PYMC_13_00', 'only_matching': True, + }, { + 'url': 'https://learning.oreilly.com/videos/hadoop-fundamentals-livelessons/9780133392838/9780133392838-00_SeriesIntro', + 'only_matching': True, }] _PARTNER_ID = '1926081' @@ -160,7 +163,7 @@ class SafariIE(SafariBaseIE): class SafariApiIE(SafariBaseIE): IE_NAME = 'safari:api' - _VALID_URL = r'https?://(?:www\.)?safaribooksonline\.com/api/v1/book/(?P<course_id>[^/]+)/chapter(?:-content)?/(?P<part>[^/?#&]+)\.html' + _VALID_URL = r'https?://(?:www\.)?(?:safaribooksonline|learning\.oreilly)\.com/api/v1/book/(?P<course_id>[^/]+)/chapter(?:-content)?/(?P<part>[^/?#&]+)\.html' _TESTS = [{ 'url': 'https://www.safaribooksonline.com/api/v1/book/9780133392838/chapter/part00.html', @@ -185,7 +188,7 @@ class SafariCourseIE(SafariBaseIE): _VALID_URL = r'''(?x) https?:// (?: - (?:www\.)?safaribooksonline\.com/ + (?:www\.)?(?:safaribooksonline|learning\.oreilly)\.com/ (?: library/view/[^/]+| api/v1/book| @@ -213,6 +216,9 @@ class SafariCourseIE(SafariBaseIE): }, { 'url': 'https://www.safaribooksonline.com/videos/python-programming-language/9780134217314', 'only_matching': True, + }, { + 'url': 'https://learning.oreilly.com/videos/hadoop-fundamentals-livelessons/9780133392838', + 'only_matching': True, }] @classmethod diff --git a/youtube_dl/extractor/savefrom.py b/youtube_dl/extractor/savefrom.py index 30f9cf824..21e44b69a 100644 --- a/youtube_dl/extractor/savefrom.py +++ b/youtube_dl/extractor/savefrom.py @@ -30,8 +30,5 @@ class SaveFromIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = os.path.splitext(url.split('/')[-1])[0] - return { - '_type': 'url', - 'id': video_id, - 'url': mobj.group('url'), - } + + return self.url_result(mobj.group('url'), video_id=video_id) diff --git a/youtube_dl/extractor/screencast.py b/youtube_dl/extractor/screencast.py index 62a6a8337..69a0d01f3 100644 --- a/youtube_dl/extractor/screencast.py +++ b/youtube_dl/extractor/screencast.py @@ -90,6 +90,15 @@ class ScreencastIE(InfoExtractor): r'src=(.*?)(?:$|&)', video_meta, 'meta tag video URL', default=None) + if video_url is None: + video_url = self._html_search_regex( + r'MediaContentUrl["\']\s*:(["\'])(?P<url>(?:(?!\1).)+)\1', + webpage, 'video url', default=None, group='url') + + if video_url is None: + video_url = self._html_search_meta( + 'og:video', webpage, default=None) + if video_url is None: raise ExtractorError('Cannot find video') diff --git a/youtube_dl/extractor/scrippsnetworks.py b/youtube_dl/extractor/scrippsnetworks.py index 4023aeef8..8b3275735 100644 --- a/youtube_dl/extractor/scrippsnetworks.py +++ b/youtube_dl/extractor/scrippsnetworks.py @@ -19,7 +19,7 @@ class ScrippsNetworksWatchIE(AWSIE): _VALID_URL = r'''(?x) https?:// watch\. - (?P<site>hgtv|foodnetwork|travelchannel|diynetwork|cookingchanneltv|geniuskitchen)\.com/ + (?P<site>geniuskitchen)\.com/ (?: player\.[A-Z0-9]+\.html\#| show/(?:[^/]+/){2}| @@ -28,38 +28,23 @@ class ScrippsNetworksWatchIE(AWSIE): (?P<id>\d+) ''' _TESTS = [{ - 'url': 'http://watch.hgtv.com/show/HGTVE/Best-Ever-Treehouses/2241515/Best-Ever-Treehouses/', - 'md5': '26545fd676d939954c6808274bdb905a', + 'url': 'http://watch.geniuskitchen.com/player/3787617/Ample-Hills-Ice-Cream-Bike/', 'info_dict': { - 'id': '4173834', + 'id': '4194875', 'ext': 'mp4', - 'title': 'Best Ever Treehouses', - 'description': "We're searching for the most over the top treehouses.", + 'title': 'Ample Hills Ice Cream Bike', + 'description': 'Courtney Rada churns up a signature GK Now ice cream with The Scoopmaster.', 'uploader': 'ANV', - 'upload_date': '20170922', - 'timestamp': 1506056400, + 'upload_date': '20171011', + 'timestamp': 1507698000, }, 'params': { 'skip_download': True, }, 'add_ie': [AnvatoIE.ie_key()], - }, { - 'url': 'http://watch.diynetwork.com/show/DSAL/Salvage-Dawgs/2656646/Covington-Church/', - 'only_matching': True, - }, { - 'url': 'http://watch.diynetwork.com/player.HNT.html#2656646', - 'only_matching': True, - }, { - 'url': 'http://watch.geniuskitchen.com/player/3787617/Ample-Hills-Ice-Cream-Bike/', - 'only_matching': True, }] _SNI_TABLE = { - 'hgtv': 'hgtv', - 'diynetwork': 'diy', - 'foodnetwork': 'food', - 'cookingchanneltv': 'cook', - 'travelchannel': 'trav', 'geniuskitchen': 'genius', } diff --git a/youtube_dl/extractor/seznamzpravy.py b/youtube_dl/extractor/seznamzpravy.py index 6d4e3b76d..7a1c7e38b 100644 --- a/youtube_dl/extractor/seznamzpravy.py +++ b/youtube_dl/extractor/seznamzpravy.py @@ -164,6 +164,6 @@ class SeznamZpravyArticleIE(InfoExtractor): description = info.get('description') or self._og_search_description(webpage) return self.playlist_result([ - self.url_result(url, ie=SeznamZpravyIE.ie_key()) - for url in SeznamZpravyIE._extract_urls(webpage)], + self.url_result(entry_url, ie=SeznamZpravyIE.ie_key()) + for entry_url in SeznamZpravyIE._extract_urls(webpage)], article_id, title, description) diff --git a/youtube_dl/extractor/shared.py b/youtube_dl/extractor/shared.py index b2250afdd..931a0f70e 100644 --- a/youtube_dl/extractor/shared.py +++ b/youtube_dl/extractor/shared.py @@ -5,6 +5,7 @@ from ..compat import compat_b64decode from ..utils import ( ExtractorError, int_or_none, + url_or_none, urlencode_postdata, ) @@ -86,9 +87,16 @@ class VivoIE(SharedBaseIE): } def _extract_video_url(self, webpage, video_id, *args): + def decode_url(encoded_url): + return compat_b64decode(encoded_url).decode('utf-8') + + stream_url = url_or_none(decode_url(self._search_regex( + r'data-stream\s*=\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, + 'stream url', default=None, group='url'))) + if stream_url: + return stream_url return self._parse_json( self._search_regex( r'InitializeStream\s*\(\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, 'stream', group='url'), - video_id, - transform_source=lambda x: compat_b64decode(x).decode('utf-8'))[0] + video_id, transform_source=decode_url)[0] diff --git a/youtube_dl/extractor/sixplay.py b/youtube_dl/extractor/sixplay.py index 69951e387..0c4f865ef 100644 --- a/youtube_dl/extractor/sixplay.py +++ b/youtube_dl/extractor/sixplay.py @@ -19,29 +19,37 @@ from ..utils import ( class SixPlayIE(InfoExtractor): IE_NAME = '6play' - _VALID_URL = r'(?:6play:|https?://(?:www\.)?6play\.fr/.+?-c_)(?P<id>[0-9]+)' - _TEST = { - 'url': 'http://www.6play.fr/le-meilleur-patissier-p_1807/le-meilleur-patissier-special-fetes-mercredi-a-21-00-sur-m6-c_11638450', - 'md5': '42310bffe4ba3982db112b9cd3467328', + _VALID_URL = r'(?:6play:|https?://(?:www\.)?(?P<domain>6play\.fr|rtlplay\.be|play\.rtl\.hr)/.+?-c_)(?P<id>[0-9]+)' + _TESTS = [{ + 'url': 'https://www.6play.fr/minute-par-minute-p_9533/le-but-qui-a-marque-lhistoire-du-football-francais-c_12041051', + 'md5': '31fcd112637baa0c2ab92c4fcd8baf27', 'info_dict': { - 'id': '11638450', + 'id': '12041051', 'ext': 'mp4', - 'title': 'Le Meilleur Pâtissier, spécial fêtes mercredi à 21:00 sur M6', - 'description': 'md5:308853f6a5f9e2d55a30fc0654de415f', - 'duration': 39, - 'series': 'Le meilleur pâtissier', + 'title': 'Le but qui a marqué l\'histoire du football français !', + 'description': 'md5:b59e7e841d646ef1eb42a7868eb6a851', }, - 'params': { - 'skip_download': True, - }, - } + }, { + 'url': 'https://www.rtlplay.be/rtl-info-13h-p_8551/les-titres-du-rtlinfo-13h-c_12045869', + 'only_matching': True, + }, { + 'url': 'https://play.rtl.hr/pj-masks-p_9455/epizoda-34-sezona-1-catboyevo-cudo-na-dva-kotaca-c_11984989', + 'only_matching': True, + }] def _real_extract(self, url): - video_id = self._match_id(url) + domain, video_id = re.search(self._VALID_URL, url).groups() + service, consumer_name = { + '6play.fr': ('6play', 'm6web'), + 'rtlplay.be': ('rtlbe_rtl_play', 'rtlbe'), + 'play.rtl.hr': ('rtlhr_rtl_play', 'rtlhr'), + }.get(domain, ('6play', 'm6web')) data = self._download_json( - 'https://pc.middleware.6play.fr/6play/v2/platforms/m6group_web/services/6play/videos/clip_%s' % video_id, - video_id, query={ + 'https://pc.middleware.6play.fr/6play/v2/platforms/m6group_web/services/%s/videos/clip_%s' % (service, video_id), + video_id, headers={ + 'x-customer-name': consumer_name + }, query={ 'csa': 5, 'with': 'clips', }) @@ -56,7 +64,7 @@ class SixPlayIE(InfoExtractor): for asset in clip_data['assets']: asset_url = asset.get('full_physical_path') protocol = asset.get('protocol') - if not asset_url or protocol == 'primetime' or asset_url in urls: + if not asset_url or protocol == 'primetime' or asset.get('type') == 'usp_hlsfp_h264' or asset_url in urls: continue urls.append(asset_url) container = asset.get('video_container') @@ -65,20 +73,25 @@ class SixPlayIE(InfoExtractor): subtitles.setdefault('fr', []).append({'url': asset_url}) continue if container == 'm3u8' or ext == 'm3u8': - if protocol == 'usp' and not compat_parse_qs(compat_urllib_parse_urlparse(asset_url).query).get('token', [None])[0]: - asset_url = re.sub(r'/([^/]+)\.ism/[^/]*\.m3u8', r'/\1.ism/\1.m3u8', asset_url) - formats.extend(self._extract_m3u8_formats( - asset_url, video_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) - formats.extend(self._extract_f4m_formats( - asset_url.replace('.m3u8', '.f4m'), - video_id, f4m_id='hds', fatal=False)) - formats.extend(self._extract_mpd_formats( - asset_url.replace('.m3u8', '.mpd'), - video_id, mpd_id='dash', fatal=False)) - formats.extend(self._extract_ism_formats( - re.sub(r'/[^/]+\.m3u8', '/Manifest', asset_url), - video_id, ism_id='mss', fatal=False)) + if protocol == 'usp': + if compat_parse_qs(compat_urllib_parse_urlparse(asset_url).query).get('token', [None])[0]: + urlh = self._request_webpage( + asset_url, video_id, fatal=False, + headers=self.geo_verification_headers()) + if not urlh: + continue + asset_url = urlh.geturl() + for i in range(3, 0, -1): + asset_url = asset_url = asset_url.replace('_sd1/', '_sd%d/' % i) + m3u8_formats = self._extract_m3u8_formats( + asset_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False) + formats.extend(m3u8_formats) + formats.extend(self._extract_mpd_formats( + asset_url.replace('.m3u8', '.mpd'), + video_id, mpd_id='dash', fatal=False)) + if m3u8_formats: + break else: formats.extend(self._extract_m3u8_formats( asset_url, video_id, 'mp4', 'm3u8_native', diff --git a/youtube_dl/extractor/skylinewebcams.py b/youtube_dl/extractor/skylinewebcams.py index 5b4aaac6f..b7f8ac736 100644 --- a/youtube_dl/extractor/skylinewebcams.py +++ b/youtube_dl/extractor/skylinewebcams.py @@ -26,7 +26,7 @@ class SkylineWebcamsIE(InfoExtractor): webpage = self._download_webpage(url, video_id) stream_url = self._search_regex( - r'url\s*:\s*(["\'])(?P<url>(?:https?:)?//.+?\.m3u8.*?)\1', webpage, + r'(?:url|source)\s*:\s*(["\'])(?P<url>(?:https?:)?//.+?\.m3u8.*?)\1', webpage, 'stream url', group='url') title = self._og_search_title(webpage) diff --git a/youtube_dl/extractor/slideslive.py b/youtube_dl/extractor/slideslive.py index 104576033..ed84322c5 100644 --- a/youtube_dl/extractor/slideslive.py +++ b/youtube_dl/extractor/slideslive.py @@ -8,6 +8,7 @@ from ..utils import ExtractorError class SlidesLiveIE(InfoExtractor): _VALID_URL = r'https?://slideslive\.com/(?P<id>[0-9]+)' _TESTS = [{ + # video_service_name = YOUTUBE 'url': 'https://slideslive.com/38902413/gcc-ia16-backend', 'md5': 'b29fcd6c6952d0c79c5079b0e7a07e6f', 'info_dict': { @@ -19,14 +20,18 @@ class SlidesLiveIE(InfoExtractor): 'uploader_id': 'UC62SdArr41t_-_fX40QCLRw', 'upload_date': '20170925', } + }, { + # video_service_name = youtube + 'url': 'https://slideslive.com/38903721/magic-a-scientific-resurrection-of-an-esoteric-legend', + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) video_data = self._download_json( url, video_id, headers={'Accept': 'application/json'}) - service_name = video_data['video_service_name'] - if service_name == 'YOUTUBE': + service_name = video_data['video_service_name'].lower() + if service_name == 'youtube': yt_video_id = video_data['video_service_id'] return self.url_result(yt_video_id, 'Youtube', video_id=yt_video_id) else: diff --git a/youtube_dl/extractor/slutload.py b/youtube_dl/extractor/slutload.py index 6fc2ff60d..661f9e59d 100644 --- a/youtube_dl/extractor/slutload.py +++ b/youtube_dl/extractor/slutload.py @@ -1,12 +1,10 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor class SlutloadIE(InfoExtractor): - _VALID_URL = r'^https?://(?:\w+\.)?slutload\.com/video/[^/]+/(?P<id>[^/]+)/?$' + _VALID_URL = r'https?://(?:\w+\.)?slutload\.com/(?:video/[^/]+|embed_player|watch)/(?P<id>[^/]+)' _TESTS = [{ 'url': 'http://www.slutload.com/video/virginie-baisee-en-cam/TD73btpBqSxc/', 'md5': '868309628ba00fd488cf516a113fd717', @@ -16,33 +14,52 @@ class SlutloadIE(InfoExtractor): 'title': 'virginie baisee en cam', 'age_limit': 18, 'thumbnail': r're:https?://.*?\.jpg' - } + }, }, { # mobile site 'url': 'http://mobile.slutload.com/video/masturbation-solo/fviFLmc6kzJ/', 'only_matching': True, + }, { + 'url': 'http://www.slutload.com/embed_player/TD73btpBqSxc/', + 'only_matching': True, + }, { + 'url': 'http://www.slutload.com/watch/TD73btpBqSxc/Virginie-Baisee-En-Cam.html', + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) - desktop_url = re.sub(r'^(https?://)mobile\.', r'\1', url) - webpage = self._download_webpage(desktop_url, video_id) + embed_page = self._download_webpage( + 'http://www.slutload.com/embed_player/%s' % video_id, video_id, + 'Downloading embed page', fatal=False) - video_title = self._html_search_regex(r'<h1><strong>([^<]+)</strong>', - webpage, 'title').strip() + if embed_page: + def extract(what): + return self._html_search_regex( + r'data-video-%s=(["\'])(?P<url>(?:(?!\1).)+)\1' % what, + embed_page, 'video %s' % what, default=None, group='url') - video_url = self._html_search_regex( - r'(?s)<div id="vidPlayer"\s+data-url="([^"]+)"', - webpage, 'video URL') - thumbnail = self._html_search_regex( - r'(?s)<div id="vidPlayer"\s+.*?previewer-file="([^"]+)"', - webpage, 'thumbnail', fatal=False) + video_url = extract('url') + if video_url: + title = self._html_search_regex( + r'<title>([^<]+)', embed_page, 'title', default=video_id) + return { + 'id': video_id, + 'url': video_url, + 'title': title, + 'thumbnail': extract('preview'), + 'age_limit': 18 + } - return { + webpage = self._download_webpage( + 'http://www.slutload.com/video/_/%s/' % video_id, video_id) + title = self._html_search_regex( + r'<h1><strong>([^<]+)</strong>', webpage, 'title').strip() + info = self._parse_html5_media_entries(url, webpage, video_id)[0] + info.update({ 'id': video_id, - 'url': video_url, - 'title': video_title, - 'thumbnail': thumbnail, - 'age_limit': 18 - } + 'title': title, + 'age_limit': 18, + }) + return info diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 81c81c8d5..15da3496e 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -16,8 +16,10 @@ from ..compat import ( from ..utils import ( ExtractorError, int_or_none, - unified_strdate, + try_get, + unified_timestamp, update_url_query, + url_or_none, ) @@ -34,7 +36,7 @@ class SoundcloudIE(InfoExtractor): (?:(?:(?:www\.|m\.)?soundcloud\.com/ (?!stations/track) (?P<uploader>[\w\d-]+)/ - (?!(?:tracks|sets(?:/.+?)?|reposts|likes|spotlight)/?(?:$|[?#])) + (?!(?:tracks|albums|sets(?:/.+?)?|reposts|likes|spotlight)/?(?:$|[?#])) (?P<title>[\w\d-]+)/? (?P<token>[^?]+?)?(?:[?].*)?$) |(?:api\.soundcloud\.com/tracks/(?P<track_id>\d+) @@ -50,12 +52,17 @@ class SoundcloudIE(InfoExtractor): 'info_dict': { 'id': '62986583', 'ext': 'mp3', - 'upload_date': '20121011', + 'title': 'Lostin Powers - She so Heavy (SneakPreview) Adrian Ackers Blueprint 1', 'description': 'No Downloads untill we record the finished version this weekend, i was too pumped n i had to post it , earl is prolly gonna b hella p.o\'d', 'uploader': 'E.T. ExTerrestrial Music', - 'title': 'Lostin Powers - She so Heavy (SneakPreview) Adrian Ackers Blueprint 1', + 'timestamp': 1349920598, + 'upload_date': '20121011', 'duration': 143, 'license': 'all-rights-reserved', + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'repost_count': int, } }, # not streamable song @@ -67,9 +74,14 @@ class SoundcloudIE(InfoExtractor): 'title': 'Goldrushed', 'description': 'From Stockholm Sweden\r\nPovel / Magnus / Filip / David\r\nwww.theroyalconcept.com', 'uploader': 'The Royal Concept', + 'timestamp': 1337635207, 'upload_date': '20120521', - 'duration': 227, + 'duration': 30, 'license': 'all-rights-reserved', + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'repost_count': int, }, 'params': { # rtmp @@ -84,11 +96,16 @@ class SoundcloudIE(InfoExtractor): 'id': '123998367', 'ext': 'mp3', 'title': 'Youtube - Dl Test Video \'\' Ä↭', - 'uploader': 'jaimeMF', 'description': 'test chars: \"\'/\\ä↭', + 'uploader': 'jaimeMF', + 'timestamp': 1386604920, 'upload_date': '20131209', 'duration': 9, 'license': 'all-rights-reserved', + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'repost_count': int, }, }, # private link (alt format) @@ -99,11 +116,16 @@ class SoundcloudIE(InfoExtractor): 'id': '123998367', 'ext': 'mp3', 'title': 'Youtube - Dl Test Video \'\' Ä↭', - 'uploader': 'jaimeMF', 'description': 'test chars: \"\'/\\ä↭', + 'uploader': 'jaimeMF', + 'timestamp': 1386604920, 'upload_date': '20131209', 'duration': 9, 'license': 'all-rights-reserved', + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'repost_count': int, }, }, # downloadable song @@ -116,9 +138,14 @@ class SoundcloudIE(InfoExtractor): 'title': 'Bus Brakes', 'description': 'md5:0053ca6396e8d2fd7b7e1595ef12ab66', 'uploader': 'oddsamples', + 'timestamp': 1389232924, 'upload_date': '20140109', 'duration': 17, 'license': 'cc-by-sa', + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'repost_count': int, }, }, # private link, downloadable format @@ -131,9 +158,14 @@ class SoundcloudIE(InfoExtractor): 'title': 'Uplifting Only 238 [No Talking] (incl. Alex Feed Guestmix) (Aug 31, 2017) [wav]', 'description': 'md5:fa20ee0fca76a3d6df8c7e57f3715366', 'uploader': 'Ori Uplift Music', + 'timestamp': 1504206263, 'upload_date': '20170831', 'duration': 7449, 'license': 'all-rights-reserved', + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'repost_count': int, }, }, # no album art, use avatar pic for thumbnail @@ -146,10 +178,15 @@ class SoundcloudIE(InfoExtractor): 'title': 'Sideways (Prod. Mad Real)', 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', 'uploader': 'garyvee', + 'timestamp': 1488152409, 'upload_date': '20170226', 'duration': 207, 'thumbnail': r're:https?://.*\.jpg', 'license': 'all-rights-reserved', + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'repost_count': int, }, 'params': { 'skip_download': True, @@ -157,7 +194,7 @@ class SoundcloudIE(InfoExtractor): }, ] - _CLIENT_ID = 'LvWovRaJZlWCHql0bISuum8Bd2KX79mb' + _CLIENT_ID = 'NmW1FlPaiL94ueEu7oziOWjYEzZzQDcK' @staticmethod def _extract_urls(webpage): @@ -175,22 +212,33 @@ class SoundcloudIE(InfoExtractor): def _extract_info_dict(self, info, full_title=None, quiet=False, secret_token=None): track_id = compat_str(info['id']) + title = info['title'] name = full_title or track_id if quiet: self.report_extraction(name) thumbnail = info.get('artwork_url') or info.get('user', {}).get('avatar_url') if isinstance(thumbnail, compat_str): thumbnail = thumbnail.replace('-large', '-t500x500') + username = try_get(info, lambda x: x['user']['username'], compat_str) + + def extract_count(key): + return int_or_none(info.get('%s_count' % key)) + result = { 'id': track_id, - 'uploader': info.get('user', {}).get('username'), - 'upload_date': unified_strdate(info.get('created_at')), - 'title': info['title'], + 'uploader': username, + 'timestamp': unified_timestamp(info.get('created_at')), + 'title': title, 'description': info.get('description'), 'thumbnail': thumbnail, 'duration': int_or_none(info.get('duration'), 1000), 'webpage_url': info.get('permalink_url'), 'license': info.get('license'), + 'view_count': extract_count('playback'), + 'like_count': extract_count('favoritings'), + 'comment_count': extract_count('comment'), + 'repost_count': extract_count('reposts'), + 'genre': info.get('genre'), } formats = [] query = {'client_id': self._CLIENT_ID} @@ -368,7 +416,6 @@ class SoundcloudSetIE(SoundcloudPlaylistBaseIE): class SoundcloudPagedPlaylistBaseIE(SoundcloudPlaylistBaseIE): - _API_BASE = 'https://api.soundcloud.com' _API_V2_BASE = 'https://api-v2.soundcloud.com' def _extract_playlist(self, base_url, playlist_id, playlist_title): @@ -389,21 +436,30 @@ class SoundcloudPagedPlaylistBaseIE(SoundcloudPlaylistBaseIE): next_href, playlist_id, 'Downloading track page %s' % (i + 1)) collection = response['collection'] - if not collection: - break - def resolve_permalink_url(candidates): + if not isinstance(collection, list): + collection = [] + + # Empty collection may be returned, in this case we proceed + # straight to next_href + + def resolve_entry(candidates): for cand in candidates: - if isinstance(cand, dict): - permalink_url = cand.get('permalink_url') - entry_id = self._extract_id(cand) - if permalink_url and permalink_url.startswith('http'): - return permalink_url, entry_id + if not isinstance(cand, dict): + continue + permalink_url = url_or_none(cand.get('permalink_url')) + if not permalink_url: + continue + return self.url_result( + permalink_url, + ie=SoundcloudIE.ie_key() if SoundcloudIE.suitable(permalink_url) else None, + video_id=self._extract_id(cand), + video_title=cand.get('title')) for e in collection: - permalink_url, entry_id = resolve_permalink_url((e, e.get('track'), e.get('playlist'))) - if permalink_url: - entries.append(self.url_result(permalink_url, video_id=entry_id)) + entry = resolve_entry((e, e.get('track'), e.get('playlist'))) + if entry: + entries.append(entry) next_href = response.get('next_href') if not next_href: @@ -429,46 +485,53 @@ class SoundcloudUserIE(SoundcloudPagedPlaylistBaseIE): (?:(?:www|m)\.)?soundcloud\.com/ (?P<user>[^/]+) (?:/ - (?P<rsrc>tracks|sets|reposts|likes|spotlight) + (?P<rsrc>tracks|albums|sets|reposts|likes|spotlight) )? /?(?:[?#].*)?$ ''' IE_NAME = 'soundcloud:user' _TESTS = [{ - 'url': 'https://soundcloud.com/the-akashic-chronicler', + 'url': 'https://soundcloud.com/soft-cell-official', 'info_dict': { - 'id': '114582580', - 'title': 'The Akashic Chronicler (All)', + 'id': '207965082', + 'title': 'Soft Cell (All)', }, - 'playlist_mincount': 74, + 'playlist_mincount': 28, }, { - 'url': 'https://soundcloud.com/the-akashic-chronicler/tracks', + 'url': 'https://soundcloud.com/soft-cell-official/tracks', 'info_dict': { - 'id': '114582580', - 'title': 'The Akashic Chronicler (Tracks)', + 'id': '207965082', + 'title': 'Soft Cell (Tracks)', }, - 'playlist_mincount': 37, + 'playlist_mincount': 27, }, { - 'url': 'https://soundcloud.com/the-akashic-chronicler/sets', + 'url': 'https://soundcloud.com/soft-cell-official/albums', 'info_dict': { - 'id': '114582580', - 'title': 'The Akashic Chronicler (Playlists)', + 'id': '207965082', + 'title': 'Soft Cell (Albums)', + }, + 'playlist_mincount': 1, + }, { + 'url': 'https://soundcloud.com/jcv246/sets', + 'info_dict': { + 'id': '12982173', + 'title': 'Jordi / cv (Playlists)', }, 'playlist_mincount': 2, }, { - 'url': 'https://soundcloud.com/the-akashic-chronicler/reposts', + 'url': 'https://soundcloud.com/jcv246/reposts', 'info_dict': { - 'id': '114582580', - 'title': 'The Akashic Chronicler (Reposts)', + 'id': '12982173', + 'title': 'Jordi / cv (Reposts)', }, - 'playlist_mincount': 7, + 'playlist_mincount': 6, }, { - 'url': 'https://soundcloud.com/the-akashic-chronicler/likes', + 'url': 'https://soundcloud.com/clalberg/likes', 'info_dict': { - 'id': '114582580', - 'title': 'The Akashic Chronicler (Likes)', + 'id': '11817582', + 'title': 'clalberg (Likes)', }, - 'playlist_mincount': 321, + 'playlist_mincount': 5, }, { 'url': 'https://soundcloud.com/grynpyret/spotlight', 'info_dict': { @@ -479,10 +542,11 @@ class SoundcloudUserIE(SoundcloudPagedPlaylistBaseIE): }] _BASE_URL_MAP = { - 'all': '%s/profile/soundcloud:users:%%s' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE, - 'tracks': '%s/users/%%s/tracks' % SoundcloudPagedPlaylistBaseIE._API_BASE, + 'all': '%s/stream/users/%%s' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE, + 'tracks': '%s/users/%%s/tracks' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE, + 'albums': '%s/users/%%s/albums' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE, 'sets': '%s/users/%%s/playlists' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE, - 'reposts': '%s/profile/soundcloud:users:%%s/reposts' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE, + 'reposts': '%s/stream/users/%%s/reposts' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE, 'likes': '%s/users/%%s/likes' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE, 'spotlight': '%s/users/%%s/spotlight' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE, } @@ -490,6 +554,7 @@ class SoundcloudUserIE(SoundcloudPagedPlaylistBaseIE): _TITLE_MAP = { 'all': 'All', 'tracks': 'Tracks', + 'albums': 'Albums', 'sets': 'Playlists', 'reposts': 'Reposts', 'likes': 'Likes', diff --git a/youtube_dl/extractor/spankbang.py b/youtube_dl/extractor/spankbang.py index 67500b69c..fbe6ef31a 100644 --- a/youtube_dl/extractor/spankbang.py +++ b/youtube_dl/extractor/spankbang.py @@ -5,6 +5,7 @@ import re from .common import InfoExtractor from ..utils import ( ExtractorError, + orderedSet, parse_duration, parse_resolution, str_to_int, @@ -12,7 +13,7 @@ from ..utils import ( class SpankBangIE(InfoExtractor): - _VALID_URL = r'https?://(?:(?:www|m|[a-z]{2})\.)?spankbang\.com/(?P<id>[\da-z]+)/video' + _VALID_URL = r'https?://(?:[^/]+\.)?spankbang\.com/(?P<id>[\da-z]+)/(?:video|play|embed)\b' _TESTS = [{ 'url': 'http://spankbang.com/3vvn/video/fantasy+solo', 'md5': '1cc433e1d6aa14bc376535b8679302f7', @@ -41,13 +42,22 @@ class SpankBangIE(InfoExtractor): # 4k 'url': 'https://spankbang.com/1vwqx/video/jade+kush+solo+4k', 'only_matching': True, + }, { + 'url': 'https://m.spankbang.com/3vvn/play/fantasy+solo/480p/', + 'only_matching': True, + }, { + 'url': 'https://m.spankbang.com/3vvn/play', + 'only_matching': True, + }, { + 'url': 'https://spankbang.com/2y3td/embed/', + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id, headers={ - 'Cookie': 'country=US' - }) + webpage = self._download_webpage( + url.replace('/%s/embed' % video_id, '/%s/video' % video_id), + video_id, headers={'Cookie': 'country=US'}) if re.search(r'<[^>]+\bid=["\']video_removed', webpage): raise ExtractorError( @@ -94,3 +104,33 @@ class SpankBangIE(InfoExtractor): 'formats': formats, 'age_limit': age_limit, } + + +class SpankBangPlaylistIE(InfoExtractor): + _VALID_URL = r'https?://(?:[^/]+\.)?spankbang\.com/(?P<id>[\da-z]+)/playlist/[^/]+' + _TEST = { + 'url': 'https://spankbang.com/ug0k/playlist/big+ass+titties', + 'info_dict': { + 'id': 'ug0k', + 'title': 'Big Ass Titties', + }, + 'playlist_mincount': 50, + } + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + webpage = self._download_webpage( + url, playlist_id, headers={'Cookie': 'country=US; mobile=on'}) + + entries = [self.url_result( + 'https://spankbang.com/%s/video' % video_id, + ie=SpankBangIE.ie_key(), video_id=video_id) + for video_id in orderedSet(re.findall( + r'<a[^>]+\bhref=["\']/?([\da-z]+)/play/', webpage))] + + title = self._html_search_regex( + r'<h1>([^<]+)\s+playlist</h1>', webpage, 'playlist title', + fatal=False) + + return self.playlist_result(entries, playlist_id, title) diff --git a/youtube_dl/extractor/spike.py b/youtube_dl/extractor/spike.py index e76522b45..6090e0066 100644 --- a/youtube_dl/extractor/spike.py +++ b/youtube_dl/extractor/spike.py @@ -44,3 +44,10 @@ class ParamountNetworkIE(MTVServicesInfoExtractor): _FEED_URL = 'http://www.paramountnetwork.com/feeds/mrss/' _GEO_COUNTRIES = ['US'] + + def _extract_mgid(self, webpage): + cs = self._parse_json(self._search_regex( + r'window\.__DATA__\s*=\s*({.+})', + webpage, 'data'), None)['children'] + c = next(c for c in cs if c.get('type') == 'VideoPlayer') + return c['props']['media']['video']['config']['uri'] diff --git a/youtube_dl/extractor/sportbox.py b/youtube_dl/extractor/sportbox.py index 54497c880..b9017fd2a 100644 --- a/youtube_dl/extractor/sportbox.py +++ b/youtube_dl/extractor/sportbox.py @@ -8,20 +8,24 @@ from ..utils import ( determine_ext, int_or_none, js_to_json, + merge_dicts, ) -class SportBoxEmbedIE(InfoExtractor): - _VALID_URL = r'https?://news\.sportbox\.ru/vdl/player(?:/[^/]+/|\?.*?\bn?id=)(?P<id>\d+)' +class SportBoxIE(InfoExtractor): + _VALID_URL = r'https?://(?:news\.sportbox|matchtv)\.ru/vdl/player(?:/[^/]+/|\?.*?\bn?id=)(?P<id>\d+)' _TESTS = [{ 'url': 'http://news.sportbox.ru/vdl/player/ci/211355', 'info_dict': { - 'id': '211355', + 'id': '109158', 'ext': 'mp4', - 'title': '211355', + 'title': 'В Новороссийске прошел детский турнир «Поле славы боевой»', + 'description': 'В Новороссийске прошел детский турнир «Поле славы боевой»', 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 292, 'view_count': int, + 'timestamp': 1426237001, + 'upload_date': '20150313', }, 'params': { # m3u8 download @@ -33,12 +37,18 @@ class SportBoxEmbedIE(InfoExtractor): }, { 'url': 'https://news.sportbox.ru/vdl/player/media/193095', 'only_matching': True, + }, { + 'url': 'https://news.sportbox.ru/vdl/player/media/109158', + 'only_matching': True, + }, { + 'url': 'https://matchtv.ru/vdl/player/media/109158', + 'only_matching': True, }] @staticmethod def _extract_urls(webpage): return re.findall( - r'<iframe[^>]+src="(https?://news\.sportbox\.ru/vdl/player[^"]+)"', + r'<iframe[^>]+src="(https?://(?:news\.sportbox|matchtv)\.ru/vdl/player[^"]+)"', webpage) def _real_extract(self, url): @@ -46,13 +56,14 @@ class SportBoxEmbedIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - wjplayer_data = self._parse_json( + sources = self._parse_json( self._search_regex( - r'(?s)wjplayer\(({.+?})\);', webpage, 'wjplayer settings'), + r'(?s)playerOptions\.sources(?:WithRes)?\s*=\s*(\[.+?\])\s*;\s*\n', + webpage, 'sources'), video_id, transform_source=js_to_json) formats = [] - for source in wjplayer_data['sources']: + for source in sources: src = source.get('src') if not src: continue @@ -66,14 +77,23 @@ class SportBoxEmbedIE(InfoExtractor): }) self._sort_formats(formats) + player = self._parse_json( + self._search_regex( + r'(?s)playerOptions\s*=\s*({.+?})\s*;\s*\n', webpage, + 'player options', default='{}'), + video_id, transform_source=js_to_json) + media_id = player['mediaId'] + + info = self._search_json_ld(webpage, media_id, default={}) + view_count = int_or_none(self._search_regex( r'Просмотров\s*:\s*(\d+)', webpage, 'view count', default=None)) - return { - 'id': video_id, - 'title': video_id, - 'thumbnail': wjplayer_data.get('poster'), - 'duration': int_or_none(wjplayer_data.get('duration')), + return merge_dicts(info, { + 'id': media_id, + 'title': self._og_search_title(webpage, default=None) or media_id, + 'thumbnail': player.get('poster'), + 'duration': int_or_none(player.get('duration')), 'view_count': view_count, 'formats': formats, - } + }) diff --git a/youtube_dl/extractor/streamango.py b/youtube_dl/extractor/streamango.py index fcaa5ac0b..efb259f96 100644 --- a/youtube_dl/extractor/streamango.py +++ b/youtube_dl/extractor/streamango.py @@ -14,7 +14,7 @@ from ..utils import ( class StreamangoIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?streamango\.com/(?:f|embed)/(?P<id>[^/?#&]+)' + _VALID_URL = r'https?://(?:www\.)?(?:streamango\.com|fruithosts\.net)/(?:f|embed)/(?P<id>[^/?#&]+)' _TESTS = [{ 'url': 'https://streamango.com/f/clapasobsptpkdfe/20170315_150006_mp4', 'md5': 'e992787515a182f55e38fc97588d802a', @@ -38,6 +38,9 @@ class StreamangoIE(InfoExtractor): }, { 'url': 'https://streamango.com/embed/clapasobsptpkdfe/20170315_150006_mp4', 'only_matching': True, + }, { + 'url': 'https://fruithosts.net/f/mreodparcdcmspsm/w1f1_r4lph_2018_brrs_720p_latino_mp4', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/streamcloud.py b/youtube_dl/extractor/streamcloud.py index 6a6bb90c4..4a410611d 100644 --- a/youtube_dl/extractor/streamcloud.py +++ b/youtube_dl/extractor/streamcloud.py @@ -72,4 +72,7 @@ class StreamcloudIE(InfoExtractor): 'title': title, 'url': video_url, 'thumbnail': thumbnail, + 'http_headers': { + 'Referer': url, + }, } diff --git a/youtube_dl/extractor/svt.py b/youtube_dl/extractor/svt.py index f71eab8b2..0901c3163 100644 --- a/youtube_dl/extractor/svt.py +++ b/youtube_dl/extractor/svt.py @@ -12,6 +12,8 @@ from ..utils import ( determine_ext, dict_get, int_or_none, + orderedSet, + strip_or_none, try_get, urljoin, compat_str, @@ -137,7 +139,12 @@ class SVTPlayBaseIE(SVTBaseIE): class SVTPlayIE(SVTPlayBaseIE): IE_DESC = 'SVT Play and Öppet arkiv' - _VALID_URL = r'https?://(?:www\.)?(?:svtplay|oppetarkiv)\.se/(?:video|klipp|kanaler)/(?P<id>[^/?#&]+)' + _VALID_URL = r'''(?x) + (?: + svt:(?P<svt_id>[^/?#&]+)| + https?://(?:www\.)?(?:svtplay|oppetarkiv)\.se/(?:video|klipp|kanaler)/(?P<id>[^/?#&]+) + ) + ''' _TESTS = [{ 'url': 'http://www.svtplay.se/video/5996901/flygplan-till-haile-selassie/flygplan-till-haile-selassie-2', 'md5': '2b6704fe4a28801e1a098bbf3c5ac611', @@ -164,10 +171,40 @@ class SVTPlayIE(SVTPlayBaseIE): }, { 'url': 'https://www.svtplay.se/kanaler/svt1', 'only_matching': True, + }, { + 'url': 'svt:1376446-003A', + 'only_matching': True, + }, { + 'url': 'svt:14278044', + 'only_matching': True, }] + def _adjust_title(self, info): + if info['is_live']: + info['title'] = self._live_title(info['title']) + + def _extract_by_video_id(self, video_id, webpage=None): + data = self._download_json( + 'https://api.svt.se/videoplayer-api/video/%s' % video_id, + video_id, headers=self.geo_verification_headers()) + info_dict = self._extract_video(data, video_id) + if not info_dict.get('title'): + title = dict_get(info_dict, ('episode', 'series')) + if not title and webpage: + title = re.sub( + r'\s*\|\s*.+?$', '', self._og_search_title(webpage)) + if not title: + title = video_id + info_dict['title'] = title + self._adjust_title(info_dict) + return info_dict + def _real_extract(self, url): - video_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + video_id, svt_id = mobj.group('id', 'svt_id') + + if svt_id: + return self._extract_by_video_id(svt_id) webpage = self._download_webpage(url, video_id) @@ -179,10 +216,6 @@ class SVTPlayIE(SVTPlayBaseIE): thumbnail = self._og_search_thumbnail(webpage) - def adjust_title(info): - if info['is_live']: - info['title'] = self._live_title(info['title']) - if data: video_info = try_get( data, lambda x: x['context']['dispatcher']['stores']['VideoTitlePageStore']['data']['video'], @@ -193,24 +226,14 @@ class SVTPlayIE(SVTPlayBaseIE): 'title': data['context']['dispatcher']['stores']['MetaStore']['title'], 'thumbnail': thumbnail, }) - adjust_title(info_dict) + self._adjust_title(info_dict) return info_dict - video_id = self._search_regex( + svt_id = self._search_regex( r'<video[^>]+data-video-id=["\']([\da-zA-Z-]+)', - webpage, 'video id', default=None) + webpage, 'video id') - if video_id: - data = self._download_json( - 'https://api.svt.se/videoplayer-api/video/%s' % video_id, - video_id, headers=self.geo_verification_headers()) - info_dict = self._extract_video(data, video_id) - if not info_dict.get('title'): - info_dict['title'] = re.sub( - r'\s*\|\s*.+?$', '', - info_dict.get('episode') or self._og_search_title(webpage)) - adjust_title(info_dict) - return info_dict + return self._extract_by_video_id(svt_id, webpage) class SVTSeriesIE(SVTPlayBaseIE): @@ -292,3 +315,57 @@ class SVTSeriesIE(SVTPlayBaseIE): return self.playlist_result( entries, series_id, title, metadata.get('description')) + + +class SVTPageIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?svt\.se/(?:[^/]+/)*(?P<id>[^/?&#]+)' + _TESTS = [{ + 'url': 'https://www.svt.se/sport/oseedat/guide-sommartraningen-du-kan-gora-var-och-nar-du-vill', + 'info_dict': { + 'id': 'guide-sommartraningen-du-kan-gora-var-och-nar-du-vill', + 'title': 'GUIDE: Sommarträning du kan göra var och när du vill', + }, + 'playlist_count': 7, + }, { + 'url': 'https://www.svt.se/nyheter/inrikes/ebba-busch-thor-kd-har-delvis-ratt-om-no-go-zoner', + 'info_dict': { + 'id': 'ebba-busch-thor-kd-har-delvis-ratt-om-no-go-zoner', + 'title': 'Ebba Busch Thor har bara delvis rätt om ”no-go-zoner”', + }, + 'playlist_count': 1, + }, { + # only programTitle + 'url': 'http://www.svt.se/sport/ishockey/jagr-tacklar-giroux-under-intervjun', + 'info_dict': { + 'id': '2900353', + 'ext': 'mp4', + 'title': 'Stjärnorna skojar till det - under SVT-intervjun', + 'duration': 27, + 'age_limit': 0, + }, + }, { + 'url': 'https://www.svt.se/nyheter/lokalt/vast/svt-testar-tar-nagon-upp-skrapet-1', + 'only_matching': True, + }, { + 'url': 'https://www.svt.se/vader/manadskronikor/maj2018', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if SVTIE.suitable(url) else super(SVTPageIE, cls).suitable(url) + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + webpage = self._download_webpage(url, playlist_id) + + entries = [ + self.url_result( + 'svt:%s' % video_id, ie=SVTPlayIE.ie_key(), video_id=video_id) + for video_id in orderedSet(re.findall( + r'data-video-id=["\'](\d+)', webpage))] + + title = strip_or_none(self._og_search_title(webpage, default=None)) + + return self.playlist_result(entries, playlist_id, title) diff --git a/youtube_dl/extractor/tbs.py b/youtube_dl/extractor/tbs.py index 784f8ed66..e8a7c65e0 100644 --- a/youtube_dl/extractor/tbs.py +++ b/youtube_dl/extractor/tbs.py @@ -16,7 +16,7 @@ from ..utils import ( class TBSIE(TurnerBaseIE): - _VALID_URL = r'https?://(?:www\.)?(?P<site>tbs|tntdrama)\.com/(?:movies|shows/[^/]+/(?:clips|season-\d+/episode-\d+))/(?P<id>[^/?#]+)' + _VALID_URL = r'https?://(?:www\.)?(?P<site>tbs|tntdrama)\.com(?P<path>/(?:movies|shows/[^/]+/(?:clips|season-\d+/episode-\d+))/(?P<id>[^/?#]+))' _TESTS = [{ 'url': 'http://www.tntdrama.com/shows/the-alienist/clips/monster', 'info_dict': { @@ -40,12 +40,12 @@ class TBSIE(TurnerBaseIE): }] def _real_extract(self, url): - site, display_id = re.match(self._VALID_URL, url).groups() + site, path, display_id = re.match(self._VALID_URL, url).groups() webpage = self._download_webpage(url, display_id) drupal_settings = self._parse_json(self._search_regex( r'<script[^>]+?data-drupal-selector="drupal-settings-json"[^>]*?>({.+?})</script>', webpage, 'drupal setting'), display_id) - video_data = drupal_settings['turner_playlist'][0] + video_data = next(v for v in drupal_settings['turner_playlist'] if v.get('url') == path) media_id = video_data['mediaID'] title = video_data['title'] diff --git a/youtube_dl/extractor/upskill.py b/youtube_dl/extractor/teachable.py similarity index 52% rename from youtube_dl/extractor/upskill.py rename to youtube_dl/extractor/teachable.py index 30297b4dd..c1a9deafe 100644 --- a/youtube_dl/extractor/upskill.py +++ b/youtube_dl/extractor/teachable.py @@ -14,20 +14,39 @@ from ..utils import ( ) -class UpskillBaseIE(InfoExtractor): - _LOGIN_URL = 'http://upskillcourses.com/sign_in' - _NETRC_MACHINE = 'upskill' +class TeachableBaseIE(InfoExtractor): + _NETRC_MACHINE = 'teachable' + _URL_PREFIX = 'teachable:' + + _SITES = { + # Only notable ones here + 'upskillcourses.com': 'upskill', + 'academy.gns3.com': 'gns3', + 'academyhacker.com': 'academyhacker', + 'stackskills.com': 'stackskills', + 'market.saleshacker.com': 'saleshacker', + 'learnability.org': 'learnability', + 'edurila.com': 'edurila', + 'courses.workitdaily.com': 'workitdaily', + } + + _VALID_URL_SUB_TUPLE = (_URL_PREFIX, '|'.join(re.escape(site) for site in _SITES.keys())) def _real_initialize(self): - self._login() + self._logged_in = False - def _login(self): - username, password = self._get_login_info() + def _login(self, site): + if self._logged_in: + return + + username, password = self._get_login_info( + netrc_machine=self._SITES.get(site, site)) if username is None: return login_page, urlh = self._download_webpage_handle( - self._LOGIN_URL, None, 'Downloading login page') + 'https://%s/sign_in' % site, None, + 'Downloading %s login page' % site) login_url = compat_str(urlh.geturl()) @@ -46,18 +65,24 @@ class UpskillBaseIE(InfoExtractor): post_url = urljoin(login_url, post_url) response = self._download_webpage( - post_url, None, 'Logging in', + post_url, None, 'Logging in to %s' % site, data=urlencode_postdata(login_form), headers={ 'Content-Type': 'application/x-www-form-urlencoded', 'Referer': login_url, }) + if '>I accept the new Privacy Policy<' in response: + raise ExtractorError( + 'Unable to login: %s asks you to accept new Privacy Policy. ' + 'Go to https://%s/ and accept.' % (site, site), expected=True) + # Successful login if any(re.search(p, response) for p in ( r'class=["\']user-signout', r'<a[^>]+\bhref=["\']/sign_out', r'>\s*Log out\s*<')): + self._logged_in = True return message = get_element_by_class('alert', response) @@ -68,8 +93,14 @@ class UpskillBaseIE(InfoExtractor): raise ExtractorError('Unable to log in') -class UpskillIE(UpskillBaseIE): - _VALID_URL = r'https?://(?:www\.)?upskillcourses\.com/courses/[^/]+/lectures/(?P<id>\d+)' +class TeachableIE(TeachableBaseIE): + _VALID_URL = r'''(?x) + (?: + %shttps?://(?P<site_t>[^/]+)| + https?://(?:www\.)?(?P<site>%s) + ) + /courses/[^/]+/lectures/(?P<id>\d+) + ''' % TeachableBaseIE._VALID_URL_SUB_TUPLE _TESTS = [{ 'url': 'http://upskillcourses.com/courses/essential-web-developer-course/lectures/1747100', @@ -77,7 +108,7 @@ class UpskillIE(UpskillBaseIE): 'id': 'uzw6zw58or', 'ext': 'mp4', 'title': 'Welcome to the Course!', - 'description': 'md5:8d66c13403783370af62ca97a7357bdd', + 'description': 'md5:65edb0affa582974de4625b9cdea1107', 'duration': 138.763, 'timestamp': 1479846621, 'upload_date': '20161122', @@ -88,10 +119,37 @@ class UpskillIE(UpskillBaseIE): }, { 'url': 'http://upskillcourses.com/courses/119763/lectures/1747100', 'only_matching': True, + }, { + 'url': 'https://academy.gns3.com/courses/423415/lectures/6885939', + 'only_matching': True, + }, { + 'url': 'teachable:https://upskillcourses.com/courses/essential-web-developer-course/lectures/1747100', + 'only_matching': True, }] + @staticmethod + def _is_teachable(webpage): + return 'teachableTracker.linker:autoLink' in webpage and re.search( + r'<link[^>]+href=["\']https?://process\.fs\.teachablecdn\.com', + webpage) + + @staticmethod + def _extract_url(webpage, source_url): + if not TeachableIE._is_teachable(webpage): + return + if re.match(r'https?://[^/]+/(?:courses|p)', source_url): + return '%s%s' % (TeachableBaseIE._URL_PREFIX, source_url) + def _real_extract(self, url): - video_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + site = mobj.group('site') or mobj.group('site_t') + video_id = mobj.group('id') + + self._login(site) + + prefixed = url.startswith(self._URL_PREFIX) + if prefixed: + url = url[len(self._URL_PREFIX):] webpage = self._download_webpage(url, video_id) @@ -113,12 +171,18 @@ class UpskillIE(UpskillBaseIE): } -class UpskillCourseIE(UpskillBaseIE): - _VALID_URL = r'https?://(?:www\.)?upskillcourses\.com/courses/(?:enrolled/)?(?P<id>[^/?#&]+)' +class TeachableCourseIE(TeachableBaseIE): + _VALID_URL = r'''(?x) + (?: + %shttps?://(?P<site_t>[^/]+)| + https?://(?:www\.)?(?P<site>%s) + ) + /(?:courses|p)/(?:enrolled/)?(?P<id>[^/?#&]+) + ''' % TeachableBaseIE._VALID_URL_SUB_TUPLE _TESTS = [{ 'url': 'http://upskillcourses.com/courses/essential-web-developer-course/', 'info_dict': { - 'id': '119763', + 'id': 'essential-web-developer-course', 'title': 'The Essential Web Developer Course (Free)', }, 'playlist_count': 192, @@ -128,21 +192,37 @@ class UpskillCourseIE(UpskillBaseIE): }, { 'url': 'http://upskillcourses.com/courses/enrolled/119763', 'only_matching': True, + }, { + 'url': 'https://academy.gns3.com/courses/enrolled/423415', + 'only_matching': True, + }, { + 'url': 'teachable:https://learn.vrdev.school/p/gear-vr-developer-mini', + 'only_matching': True, + }, { + 'url': 'teachable:https://filmsimplified.com/p/davinci-resolve-15-crash-course', + 'only_matching': True, }] @classmethod def suitable(cls, url): - return False if UpskillIE.suitable(url) else super( - UpskillCourseIE, cls).suitable(url) + return False if TeachableIE.suitable(url) else super( + TeachableCourseIE, cls).suitable(url) def _real_extract(self, url): - course_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + site = mobj.group('site') or mobj.group('site_t') + course_id = mobj.group('id') + + self._login(site) + + prefixed = url.startswith(self._URL_PREFIX) + if prefixed: + prefix = self._URL_PREFIX + url = url[len(prefix):] webpage = self._download_webpage(url, course_id) - course_id = self._search_regex( - r'data-course-id=["\'](\d+)', webpage, 'course id', - default=course_id) + url_base = 'https://%s/' % site entries = [] @@ -162,10 +242,13 @@ class UpskillCourseIE(UpskillBaseIE): title = self._html_search_regex( r'<span[^>]+class=["\']lecture-name[^>]+>([^<]+)', li, 'title', default=None) + entry_url = urljoin(url_base, lecture_url) + if prefixed: + entry_url = self._URL_PREFIX + entry_url entries.append( self.url_result( - urljoin('http://upskillcourses.com/', lecture_url), - ie=UpskillIE.ie_key(), video_id=lecture_id, + entry_url, + ie=TeachableIE.ie_key(), video_id=lecture_id, video_title=clean_html(title))) course_title = self._html_search_regex( diff --git a/youtube_dl/extractor/ted.py b/youtube_dl/extractor/ted.py index 06a27fd04..645942dfd 100644 --- a/youtube_dl/extractor/ted.py +++ b/youtube_dl/extractor/ted.py @@ -7,8 +7,10 @@ from .common import InfoExtractor from ..compat import compat_str from ..utils import ( + float_or_none, int_or_none, try_get, + url_or_none, ) @@ -30,7 +32,7 @@ class TEDIE(InfoExtractor): ''' _TESTS = [{ 'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html', - 'md5': '0de43ac406aa3e4ea74b66c9c7789b13', + 'md5': 'b0ce2b05ca215042124fbc9e3886493a', 'info_dict': { 'id': '102', 'ext': 'mp4', @@ -42,24 +44,30 @@ class TEDIE(InfoExtractor): 'uploader': 'Dan Dennett', 'width': 853, 'duration': 1308, - } - }, { - 'url': 'http://www.ted.com/watch/ted-institute/ted-bcg/vishal-sikka-the-beauty-and-power-of-algorithms', - 'md5': 'b899ac15e345fb39534d913f7606082b', - 'info_dict': { - 'id': 'tSVI8ta_P4w', - 'ext': 'mp4', - 'title': 'Vishal Sikka: The beauty and power of algorithms', - 'thumbnail': r're:^https?://.+\.jpg', - 'description': 'md5:6261fdfe3e02f4f579cbbfc00aff73f4', - 'upload_date': '20140122', - 'uploader_id': 'TEDInstitute', - 'uploader': 'TED Institute', + 'view_count': int, + 'comment_count': int, + 'tags': list, + }, + 'params': { + 'skip_download': True, + }, + }, { + # missing HTTP bitrates + 'url': 'https://www.ted.com/talks/vishal_sikka_the_beauty_and_power_of_algorithms', + 'info_dict': { + 'id': '6069', + 'ext': 'mp4', + 'title': 'The beauty and power of algorithms', + 'thumbnail': r're:^https?://.+\.jpg', + 'description': 'md5:734e352710fb00d840ab87ae31aaf688', + 'uploader': 'Vishal Sikka', + }, + 'params': { + 'skip_download': True, }, - 'add_ie': ['Youtube'], }, { 'url': 'http://www.ted.com/talks/gabby_giffords_and_mark_kelly_be_passionate_be_courageous_be_your_best', - 'md5': '71b3ab2f4233012dce09d515c9c39ce2', + 'md5': 'e6b9617c01a7970ceac8bb2c92c346c0', 'info_dict': { 'id': '1972', 'ext': 'mp4', @@ -68,6 +76,9 @@ class TEDIE(InfoExtractor): 'description': 'md5:5174aed4d0f16021b704120360f72b92', 'duration': 1128, }, + 'params': { + 'skip_download': True, + }, }, { 'url': 'http://www.ted.com/playlists/who_are_the_hackers', 'info_dict': { @@ -92,17 +103,17 @@ class TEDIE(InfoExtractor): 'skip_download': True, }, }, { - # YouTube video - 'url': 'http://www.ted.com/talks/jeffrey_kluger_the_sibling_bond', - 'add_ie': ['Youtube'], + # no nativeDownloads + 'url': 'https://www.ted.com/talks/tom_thum_the_orchestra_in_my_mouth', 'info_dict': { - 'id': 'aFBIPO-P7LM', + 'id': '1792', 'ext': 'mp4', - 'title': 'The hidden power of siblings: Jeff Kluger at TEDxAsheville', - 'description': 'md5:3d7a4f50d95ca5dd67104e2a20f43fe1', - 'uploader': 'TEDx Talks', - 'uploader_id': 'TEDxTalks', - 'upload_date': '20111216', + 'title': 'The orchestra in my mouth', + 'description': 'md5:5d1d78650e2f8dfcbb8ebee2951ac29a', + 'uploader': 'Tom Thum', + 'view_count': int, + 'comment_count': int, + 'tags': list, }, 'params': { 'skip_download': True, @@ -161,27 +172,16 @@ class TEDIE(InfoExtractor): info = self._extract_info(webpage) - talk_info = try_get( - info, lambda x: x['__INITIAL_DATA__']['talks'][0], - dict) or info['talks'][0] + data = try_get(info, lambda x: x['__INITIAL_DATA__'], dict) or info + talk_info = data['talks'][0] title = talk_info['title'].strip() - external = talk_info.get('external') - if external: - service = external['service'] - self.to_screen('Found video from %s' % service) - ext_url = None - if service.lower() == 'youtube': - ext_url = external.get('code') - return { - '_type': 'url', - 'url': ext_url or external['uri'], - } - native_downloads = try_get( - talk_info, lambda x: x['downloads']['nativeDownloads'], - dict) or talk_info['nativeDownloads'] + talk_info, + (lambda x: x['downloads']['nativeDownloads'], + lambda x: x['nativeDownloads']), + dict) or {} formats = [{ 'url': format_url, @@ -196,6 +196,16 @@ class TEDIE(InfoExtractor): player_talk = talk_info['player_talks'][0] + external = player_talk.get('external') + if isinstance(external, dict): + service = external.get('service') + if isinstance(service, compat_str): + ext_url = None + if service.lower() == 'youtube': + ext_url = external.get('code') + + return self.url_result(ext_url or external['uri']) + resources_ = player_talk.get('resources') or talk_info.get('resources') http_url = None @@ -228,8 +238,14 @@ class TEDIE(InfoExtractor): 'tbr': int_or_none(resource.get('bitrate')), }) elif format_id == 'hls': + if not isinstance(resources, dict): + continue + stream_url = url_or_none(resources.get('stream')) + if not stream_url: + continue formats.extend(self._extract_m3u8_formats( - resources.get('stream'), video_name, 'mp4', m3u8_id=format_id, fatal=False)) + stream_url, video_name, 'mp4', m3u8_id=format_id, + fatal=False)) m3u8_formats = list(filter( lambda f: f.get('protocol') == 'm3u8' and f.get('vcodec') != 'none', @@ -239,12 +255,18 @@ class TEDIE(InfoExtractor): bitrate = self._search_regex(r'(\d+k)', m3u8_format['url'], 'bitrate', default=None) if not bitrate: continue + bitrate_url = re.sub(r'\d+k', bitrate, http_url) + if not self._is_valid_url( + bitrate_url, video_name, '%s bitrate' % bitrate): + continue f = m3u8_format.copy() f.update({ - 'url': re.sub(r'\d+k', bitrate, http_url), + 'url': bitrate_url, 'format_id': m3u8_format['format_id'].replace('hls', 'http'), 'protocol': 'http', }) + if f.get('acodec') == 'none': + del f['acodec'] formats.append(f) audio_download = talk_info.get('audioDownload') @@ -267,7 +289,11 @@ class TEDIE(InfoExtractor): 'description': self._og_search_description(webpage), 'subtitles': self._get_subtitles(video_id, talk_info), 'formats': formats, - 'duration': talk_info.get('duration'), + 'duration': float_or_none(talk_info.get('duration')), + 'view_count': int_or_none(data.get('viewed_count')), + 'comment_count': int_or_none( + try_get(data, lambda x: x['comments']['count'])), + 'tags': try_get(talk_info, lambda x: x['tags'], list), } def _get_subtitles(self, video_id, talk_info): diff --git a/youtube_dl/extractor/tele5.py b/youtube_dl/extractor/tele5.py new file mode 100644 index 000000000..25573e49f --- /dev/null +++ b/youtube_dl/extractor/tele5.py @@ -0,0 +1,44 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from .nexx import NexxIE +from ..compat import compat_urlparse + + +class Tele5IE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?tele5\.de/(?:mediathek|tv)/(?P<id>[^?#&]+)' + _TESTS = [{ + 'url': 'https://www.tele5.de/mediathek/filme-online/videos?vid=1549416', + 'info_dict': { + 'id': '1549416', + 'ext': 'mp4', + 'upload_date': '20180814', + 'timestamp': 1534290623, + 'title': 'Pandorum', + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://www.tele5.de/tv/kalkofes-mattscheibe/video-clips/politik-und-gesellschaft?ve_id=1551191', + 'only_matching': True, + }, { + 'url': 'https://www.tele5.de/tv/dark-matter/videos', + 'only_matching': True, + }] + + def _real_extract(self, url): + qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) + video_id = (qs.get('vid') or qs.get('ve_id') or [None])[0] + + if not video_id: + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + video_id = self._html_search_regex( + r'id\s*=\s*["\']video-player["\'][^>]+data-id\s*=\s*["\'](\d+)', + webpage, 'video id') + + return self.url_result( + 'https://api.nexx.cloud/v3/759/videos/byid/%s' % video_id, + ie=NexxIE.ie_key(), video_id=video_id) diff --git a/youtube_dl/extractor/telecinco.py b/youtube_dl/extractor/telecinco.py index fdcc7d573..d37e1b055 100644 --- a/youtube_dl/extractor/telecinco.py +++ b/youtube_dl/extractor/telecinco.py @@ -1,26 +1,43 @@ # coding: utf-8 from __future__ import unicode_literals -from .mitele import MiTeleBaseIE +import json +import re + +from .common import InfoExtractor +from .ooyala import OoyalaIE +from ..utils import ( + clean_html, + determine_ext, + int_or_none, + str_or_none, + urljoin, +) -class TelecincoIE(MiTeleBaseIE): +class TelecincoIE(InfoExtractor): IE_DESC = 'telecinco.es, cuatro.com and mediaset.es' _VALID_URL = r'https?://(?:www\.)?(?:telecinco\.es|cuatro\.com|mediaset\.es)/(?:[^/]+/)+(?P<id>.+?)\.html' _TESTS = [{ 'url': 'http://www.telecinco.es/robinfood/temporada-01/t01xp14/Bacalao-cocochas-pil-pil_0_1876350223.html', - 'md5': '8d7b2d5f699ee2709d992a63d5cd1712', 'info_dict': { - 'id': 'JEA5ijCnF6p5W08A1rNKn7', - 'ext': 'mp4', + 'id': '1876350223', 'title': 'Bacalao con kokotxas al pil-pil', 'description': 'md5:1382dacd32dd4592d478cbdca458e5bb', - 'duration': 662, }, + 'playlist': [{ + 'md5': 'adb28c37238b675dad0f042292f209a7', + 'info_dict': { + 'id': 'JEA5ijCnF6p5W08A1rNKn7', + 'ext': 'mp4', + 'title': 'Con Martín Berasategui, hacer un bacalao al pil-pil es fácil y divertido', + 'duration': 662, + }, + }] }, { 'url': 'http://www.cuatro.com/deportes/futbol/barcelona/Leo_Messi-Champions-Roma_2_2052780128.html', - 'md5': '284393e5387b3b947b77c613ef04749a', + 'md5': '9468140ebc300fbb8b9d65dc6e5c4b43', 'info_dict': { 'id': 'jn24Od1zGLG4XUZcnUnZB6', 'ext': 'mp4', @@ -30,7 +47,7 @@ class TelecincoIE(MiTeleBaseIE): }, }, { 'url': 'http://www.mediaset.es/12meses/campanas/doylacara/conlatratanohaytrato/Ayudame-dar-cara-trata-trato_2_1986630220.html', - 'md5': '749afab6ea5a136a8806855166ae46a2', + 'md5': 'ae2dc6b7b50b2392076a51c0f70e01f6', 'info_dict': { 'id': 'aywerkD2Sv1vGNqq9b85Q2', 'ext': 'mp4', @@ -50,17 +67,90 @@ class TelecincoIE(MiTeleBaseIE): 'only_matching': True, }] + def _parse_content(self, content, url): + video_id = content['dataMediaId'] + if content.get('dataCmsId') == 'ooyala': + return self.url_result( + 'ooyala:%s' % video_id, OoyalaIE.ie_key(), video_id) + config_url = urljoin(url, content['dataConfig']) + config = self._download_json( + config_url, video_id, 'Downloading config JSON') + title = config['info']['title'] + + def mmc_url(mmc_type): + return re.sub( + r'/(?:flash|html5)\.json', '/%s.json' % mmc_type, + config['services']['mmc']) + + duration = None + formats = [] + for mmc_type in ('flash', 'html5'): + mmc = self._download_json( + mmc_url(mmc_type), video_id, + 'Downloading %s mmc JSON' % mmc_type, fatal=False) + if not mmc: + continue + if not duration: + duration = int_or_none(mmc.get('duration')) + for location in mmc['locations']: + gat = self._proto_relative_url(location.get('gat'), 'http:') + gcp = location.get('gcp') + ogn = location.get('ogn') + if None in (gat, gcp, ogn): + continue + token_data = { + 'gcp': gcp, + 'ogn': ogn, + 'sta': 0, + } + media = self._download_json( + gat, video_id, data=json.dumps(token_data).encode('utf-8'), + headers={ + 'Content-Type': 'application/json;charset=utf-8', + 'Referer': url, + }, fatal=False) or {} + stream = media.get('stream') or media.get('file') + if not stream: + continue + ext = determine_ext(stream) + if ext == 'f4m': + formats.extend(self._extract_f4m_formats( + stream + '&hdcore=3.2.0&plugin=aasp-3.2.0.77.18', + video_id, f4m_id='hds', fatal=False)) + elif ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + stream, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'thumbnail': content.get('dataPoster') or config.get('poster', {}).get('imageUrl'), + 'duration': duration, + } + def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - title = self._html_search_meta( - ['og:title', 'twitter:title'], webpage, 'title') - info = self._get_player_info(url, webpage) + article = self._parse_json(self._search_regex( + r'window\.\$REACTBASE_STATE\.article\s*=\s*({.+})', + webpage, 'article'), display_id)['article'] + title = article.get('title') + description = clean_html(article.get('leadParagraph')) + if article.get('editorialType') != 'VID': + entries = [] + for p in article.get('body', []): + content = p.get('content') + if p.get('type') != 'video' or not content: + continue + entries.append(self._parse_content(content, url)) + return self.playlist_result( + entries, str_or_none(article.get('id')), title, description) + content = article['opening']['content'] + info = self._parse_content(content, url) info.update({ - 'display_id': display_id, - 'title': title, - 'description': self._html_search_meta( - ['og:description', 'twitter:description'], - webpage, 'title', fatal=False), + 'description': description, }) return info diff --git a/youtube_dl/extractor/testurl.py b/youtube_dl/extractor/testurl.py index 46918adb0..84a14a0bd 100644 --- a/youtube_dl/extractor/testurl.py +++ b/youtube_dl/extractor/testurl.py @@ -61,8 +61,4 @@ class TestURLIE(InfoExtractor): self.to_screen('Test URL: %s' % tc['url']) - return { - '_type': 'url', - 'url': tc['url'], - 'id': video_id, - } + return self.url_result(tc['url'], video_id=video_id) diff --git a/youtube_dl/extractor/tf1.py b/youtube_dl/extractor/tf1.py index e595c4a69..903f47380 100644 --- a/youtube_dl/extractor/tf1.py +++ b/youtube_dl/extractor/tf1.py @@ -19,6 +19,7 @@ class TF1IE(InfoExtractor): # Sometimes wat serves the whole file with the --test option 'skip_download': True, }, + 'expected_warnings': ['HTTP Error 404'], }, { 'url': 'http://www.tfou.fr/chuggington/videos/le-grand-mysterioso-chuggington-7085291-739.html', 'info_dict': { diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index b1a985ff6..90b351cbb 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -32,14 +32,24 @@ _x = lambda p: xpath_with_ns(p, {'smil': default_ns}) class ThePlatformBaseIE(OnceIE): + _TP_TLD = 'com' + def _extract_theplatform_smil(self, smil_url, video_id, note='Downloading SMIL data'): meta = self._download_xml( smil_url, video_id, note=note, query={'format': 'SMIL'}, headers=self.geo_verification_headers()) error_element = find_xpath_attr(meta, _x('.//smil:ref'), 'src') - if error_element is not None and error_element.attrib['src'].startswith( - 'http://link.theplatform.com/s/errorFiles/Unavailable.'): - raise ExtractorError(error_element.attrib['abstract'], expected=True) + if error_element is not None: + exception = find_xpath_attr( + error_element, _x('.//smil:param'), 'name', 'exception') + if exception is not None: + if exception.get('value') == 'GeoLocationBlocked': + self.raise_geo_restricted(error_element.attrib['abstract']) + elif error_element.attrib['src'].startswith( + 'http://link.theplatform.%s/s/errorFiles/Unavailable.' + % self._TP_TLD): + raise ExtractorError( + error_element.attrib['abstract'], expected=True) smil_formats = self._parse_smil_formats( meta, smil_url, video_id, namespace=default_ns, @@ -66,7 +76,7 @@ class ThePlatformBaseIE(OnceIE): return formats, subtitles def _download_theplatform_metadata(self, path, video_id): - info_url = 'http://link.theplatform.com/s/%s?format=preview' % path + info_url = 'http://link.theplatform.%s/s/%s?format=preview' % (self._TP_TLD, path) return self._download_json(info_url, video_id) def _parse_theplatform_metadata(self, info): @@ -308,7 +318,7 @@ class ThePlatformIE(ThePlatformBaseIE, AdobePassIE): class ThePlatformFeedIE(ThePlatformBaseIE): _URL_TEMPLATE = '%s//feed.theplatform.com/f/%s/%s?form=json&%s' - _VALID_URL = r'https?://feed\.theplatform\.com/f/(?P<provider_id>[^/]+)/(?P<feed_id>[^?/]+)\?(?:[^&]+&)*(?P<filter>by(?:Gui|I)d=(?P<id>[\w-]+))' + _VALID_URL = r'https?://feed\.theplatform\.com/f/(?P<provider_id>[^/]+)/(?P<feed_id>[^?/]+)\?(?:[^&]+&)*(?P<filter>by(?:Gui|I)d=(?P<id>[^&]+))' _TESTS = [{ # From http://player.theplatform.com/p/7wvmTC/MSNBCEmbeddedOffSite?guid=n_hardball_5biden_140207 'url': 'http://feed.theplatform.com/f/7wvmTC/msnbc_video-p-test?form=json&pretty=true&range=-40&byGuid=n_hardball_5biden_140207', @@ -325,12 +335,15 @@ class ThePlatformFeedIE(ThePlatformBaseIE): 'categories': ['MSNBC/Issues/Democrats', 'MSNBC/Issues/Elections/Election 2016'], 'uploader': 'NBCU-NEWS', }, + }, { + 'url': 'http://feed.theplatform.com/f/2E2eJC/nnd_NBCNews?byGuid=nn_netcast_180306.Copy.01', + 'only_matching': True, }] def _extract_feed_info(self, provider_id, feed_id, filter_query, video_id, custom_fields=None, asset_types_query={}, account_id=None): real_url = self._URL_TEMPLATE % (self.http_scheme(), provider_id, feed_id, filter_query) entry = self._download_json(real_url, video_id)['entries'][0] - main_smil_url = 'http://link.theplatform.com/s/%s/media/guid/%d/%s' % (provider_id, account_id, entry['guid']) if account_id else None + main_smil_url = 'http://link.theplatform.com/s/%s/media/guid/%d/%s' % (provider_id, account_id, entry['guid']) if account_id else entry.get('plmedia$publicUrl') formats = [] subtitles = {} @@ -343,7 +356,8 @@ class ThePlatformFeedIE(ThePlatformBaseIE): if first_video_id is None: first_video_id = cur_video_id duration = float_or_none(item.get('plfile$duration')) - for asset_type in item['plfile$assetTypes']: + file_asset_types = item.get('plfile$assetTypes') or compat_parse_qs(compat_urllib_parse_urlparse(smil_url).query)['assetTypes'] + for asset_type in file_asset_types: if asset_type in asset_types: continue asset_types.append(asset_type) diff --git a/youtube_dl/extractor/tiktok.py b/youtube_dl/extractor/tiktok.py new file mode 100644 index 000000000..083e9f36d --- /dev/null +++ b/youtube_dl/extractor/tiktok.py @@ -0,0 +1,117 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + compat_str, + ExtractorError, + int_or_none, + str_or_none, + try_get, + url_or_none, +) + + +class TikTokBaseIE(InfoExtractor): + def _extract_aweme(self, data): + video = data['video'] + description = str_or_none(try_get(data, lambda x: x['desc'])) + width = int_or_none(try_get(data, lambda x: video['width'])) + height = int_or_none(try_get(data, lambda x: video['height'])) + + format_urls = set() + formats = [] + for format_id in ( + 'play_addr_lowbr', 'play_addr', 'play_addr_h264', + 'download_addr'): + for format in try_get( + video, lambda x: x[format_id]['url_list'], list) or []: + format_url = url_or_none(format) + if not format_url: + continue + if format_url in format_urls: + continue + format_urls.add(format_url) + formats.append({ + 'url': format_url, + 'ext': 'mp4', + 'height': height, + 'width': width, + }) + self._sort_formats(formats) + + thumbnail = url_or_none(try_get( + video, lambda x: x['cover']['url_list'][0], compat_str)) + uploader = try_get(data, lambda x: x['author']['nickname'], compat_str) + timestamp = int_or_none(data.get('create_time')) + comment_count = int_or_none(data.get('comment_count')) or int_or_none( + try_get(data, lambda x: x['statistics']['comment_count'])) + repost_count = int_or_none(try_get( + data, lambda x: x['statistics']['share_count'])) + + aweme_id = data['aweme_id'] + + return { + 'id': aweme_id, + 'title': uploader or aweme_id, + 'description': description, + 'thumbnail': thumbnail, + 'uploader': uploader, + 'timestamp': timestamp, + 'comment_count': comment_count, + 'repost_count': repost_count, + 'formats': formats, + } + + +class TikTokIE(TikTokBaseIE): + _VALID_URL = r'https?://(?:m\.)?tiktok\.com/v/(?P<id>\d+)' + _TEST = { + 'url': 'https://m.tiktok.com/v/6606727368545406213.html', + 'md5': 'd584b572e92fcd48888051f238022420', + 'info_dict': { + 'id': '6606727368545406213', + 'ext': 'mp4', + 'title': 'Zureeal', + 'description': '#bowsette#mario#cosplay#uk#lgbt#gaming#asian#bowsettecosplay', + 'thumbnail': r're:^https?://.*~noop.image', + 'uploader': 'Zureeal', + 'timestamp': 1538248586, + 'upload_date': '20180929', + 'comment_count': int, + 'repost_count': int, + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + data = self._parse_json(self._search_regex( + r'\bdata\s*=\s*({.+?})\s*;', webpage, 'data'), video_id) + return self._extract_aweme(data) + + +class TikTokUserIE(TikTokBaseIE): + _VALID_URL = r'https?://(?:m\.)?tiktok\.com/h5/share/usr/(?P<id>\d+)' + _TEST = { + 'url': 'https://m.tiktok.com/h5/share/usr/188294915489964032.html', + 'info_dict': { + 'id': '188294915489964032', + }, + 'playlist_mincount': 24, + } + + def _real_extract(self, url): + user_id = self._match_id(url) + data = self._download_json( + 'https://m.tiktok.com/h5/share/usr/list/%s/' % user_id, user_id, + query={'_signature': '_'}) + entries = [] + for aweme in data['aweme_list']: + try: + entry = self._extract_aweme(aweme) + except ExtractorError: + continue + entry['extractor_key'] = TikTokIE.ie_key() + entries.append(entry) + return self.playlist_result(entries, user_id) diff --git a/youtube_dl/extractor/tnaflix.py b/youtube_dl/extractor/tnaflix.py index 0c2f8f119..b3573c6e0 100644 --- a/youtube_dl/extractor/tnaflix.py +++ b/youtube_dl/extractor/tnaflix.py @@ -18,8 +18,9 @@ from ..utils import ( class TNAFlixNetworkBaseIE(InfoExtractor): # May be overridden in descendants if necessary _CONFIG_REGEX = [ - r'flashvars\.config\s*=\s*escape\("([^"]+)"', - r'<input[^>]+name="config\d?" value="([^"]+)"', + r'flashvars\.config\s*=\s*escape\("(?P<url>[^"]+)"', + r'<input[^>]+name="config\d?" value="(?P<url>[^"]+)"', + r'config\s*=\s*(["\'])(?P<url>(?:https?:)?//(?:(?!\1).)+)\1', ] _HOST = 'tna' _VKEY_SUFFIX = '' @@ -85,7 +86,8 @@ class TNAFlixNetworkBaseIE(InfoExtractor): webpage = self._download_webpage(url, display_id) cfg_url = self._proto_relative_url(self._html_search_regex( - self._CONFIG_REGEX, webpage, 'flashvars.config', default=None), 'http:') + self._CONFIG_REGEX, webpage, 'flashvars.config', default=None, + group='url'), 'http:') if not cfg_url: inputs = self._hidden_inputs(webpage) @@ -94,7 +96,7 @@ class TNAFlixNetworkBaseIE(InfoExtractor): cfg_xml = self._download_xml( cfg_url, display_id, 'Downloading metadata', - transform_source=fix_xml_ampersands) + transform_source=fix_xml_ampersands, headers={'Referer': url}) formats = [] diff --git a/youtube_dl/extractor/toutv.py b/youtube_dl/extractor/toutv.py index 2e7876cc5..f1ab91cf2 100644 --- a/youtube_dl/extractor/toutv.py +++ b/youtube_dl/extractor/toutv.py @@ -3,22 +3,19 @@ from __future__ import unicode_literals import re -from .common import InfoExtractor +from .radiocanada import RadioCanadaIE from ..utils import ( - int_or_none, - js_to_json, - urlencode_postdata, extract_attributes, - smuggle_url, + int_or_none, + merge_dicts, + urlencode_postdata, ) -class TouTvIE(InfoExtractor): +class TouTvIE(RadioCanadaIE): _NETRC_MACHINE = 'toutv' IE_NAME = 'tou.tv' _VALID_URL = r'https?://ici\.tou\.tv/(?P<id>[a-zA-Z0-9_-]+(?:/S[0-9]+[EC][0-9]+)?)' - _access_token = None - _claims = None _TESTS = [{ 'url': 'http://ici.tou.tv/garfield-tout-court/S2015E17', @@ -46,18 +43,14 @@ class TouTvIE(InfoExtractor): email, password = self._get_login_info() if email is None: return - state = 'http://ici.tou.tv/' - webpage = self._download_webpage(state, None, 'Downloading homepage') - toutvlogin = self._parse_json(self._search_regex( - r'(?s)toutvlogin\s*=\s*({.+?});', webpage, 'toutvlogin'), None, js_to_json) - authorize_url = toutvlogin['host'] + '/auth/oauth/v2/authorize' login_webpage = self._download_webpage( - authorize_url, None, 'Downloading login page', query={ - 'client_id': toutvlogin['clientId'], - 'redirect_uri': 'https://ici.tou.tv/login/loginCallback', + 'https://services.radio-canada.ca/auth/oauth/v2/authorize', + None, 'Downloading login page', query={ + 'client_id': '4dd36440-09d5-4468-8923-b6d91174ad36', + 'redirect_uri': 'https://ici.tou.tv/logincallback', 'response_type': 'token', - 'scope': 'media-drmt openid profile email id.write media-validation.read.privileged', - 'state': state, + 'scope': 'id.write media-validation.read', + 'state': '/', }) def extract_form_url_and_data(wp, default_form_url, form_spec_re=''): @@ -86,12 +79,7 @@ class TouTvIE(InfoExtractor): self._access_token = self._search_regex( r'access_token=([\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})', urlh.geturl(), 'access token') - self._claims = self._download_json( - 'https://services.radio-canada.ca/media/validation/v2/getClaims', - None, 'Extracting Claims', query={ - 'token': self._access_token, - 'access_token': self._access_token, - })['claims'] + self._claims = self._call_api('validation/v2/getClaims')['claims'] def _real_extract(self, url): path = self._match_id(url) @@ -102,19 +90,10 @@ class TouTvIE(InfoExtractor): self.report_warning('This video is probably DRM protected.', path) video_id = metadata['IdMedia'] details = metadata['Details'] - title = details['OriginalTitle'] - video_url = 'radiocanada:%s:%s' % (metadata.get('AppCode', 'toutv'), video_id) - if self._access_token and self._claims: - video_url = smuggle_url(video_url, { - 'access_token': self._access_token, - 'claims': self._claims, - }) - return { - '_type': 'url_transparent', - 'url': video_url, + return merge_dicts({ 'id': video_id, - 'title': title, + 'title': details.get('OriginalTitle'), 'thumbnail': details.get('ImageUrl'), 'duration': int_or_none(details.get('LengthInSeconds')), - } + }, self._extract_info(metadata.get('AppCode', 'toutv'), video_id)) diff --git a/youtube_dl/extractor/trunews.py b/youtube_dl/extractor/trunews.py new file mode 100644 index 000000000..b0c7caabf --- /dev/null +++ b/youtube_dl/extractor/trunews.py @@ -0,0 +1,75 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + dict_get, + float_or_none, + int_or_none, + unified_timestamp, + update_url_query, + url_or_none, +) + + +class TruNewsIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?trunews\.com/stream/(?P<id>[^/?#&]+)' + _TEST = { + 'url': 'https://www.trunews.com/stream/will-democrats-stage-a-circus-during-president-trump-s-state-of-the-union-speech', + 'md5': 'a19c024c3906ff954fac9b96ce66bb08', + 'info_dict': { + 'id': '5c5a21e65d3c196e1c0020cc', + 'display_id': 'will-democrats-stage-a-circus-during-president-trump-s-state-of-the-union-speech', + 'ext': 'mp4', + 'title': "Will Democrats Stage a Circus During President Trump's State of the Union Speech?", + 'description': 'md5:c583b72147cc92cf21f56a31aff7a670', + 'duration': 3685, + 'timestamp': 1549411440, + 'upload_date': '20190206', + }, + 'add_ie': ['Zype'], + } + + def _real_extract(self, url): + display_id = self._match_id(url) + + video = self._download_json( + 'https://api.zype.com/videos', display_id, query={ + 'app_key': 'PUVKp9WgGUb3-JUw6EqafLx8tFVP6VKZTWbUOR-HOm__g4fNDt1bCsm_LgYf_k9H', + 'per_page': 1, + 'active': 'true', + 'friendly_title': display_id, + })['response'][0] + + zype_id = video['_id'] + + thumbnails = [] + thumbnails_list = video.get('thumbnails') + if isinstance(thumbnails_list, list): + for thumbnail in thumbnails_list: + if not isinstance(thumbnail, dict): + continue + thumbnail_url = url_or_none(thumbnail.get('url')) + if not thumbnail_url: + continue + thumbnails.append({ + 'url': thumbnail_url, + 'width': int_or_none(thumbnail.get('width')), + 'height': int_or_none(thumbnail.get('height')), + }) + + return { + '_type': 'url_transparent', + 'url': update_url_query( + 'https://player.zype.com/embed/%s.js' % zype_id, + {'api_key': 'X5XnahkjCwJrT_l5zUqypnaLEObotyvtUKJWWlONxDoHVjP8vqxlArLV8llxMbyt'}), + 'ie_key': 'Zype', + 'id': zype_id, + 'display_id': display_id, + 'title': video.get('title'), + 'description': dict_get(video, ('description', 'ott_description', 'short_description')), + 'duration': int_or_none(video.get('duration')), + 'timestamp': unified_timestamp(video.get('published_at')), + 'average_rating': float_or_none(video.get('rating')), + 'view_count': int_or_none(video.get('request_count')), + 'thumbnails': thumbnails, + } diff --git a/youtube_dl/extractor/trutv.py b/youtube_dl/extractor/trutv.py index 3a5782525..ce892c8c5 100644 --- a/youtube_dl/extractor/trutv.py +++ b/youtube_dl/extractor/trutv.py @@ -4,44 +4,72 @@ from __future__ import unicode_literals import re from .turner import TurnerBaseIE +from ..utils import ( + int_or_none, + parse_iso8601, +) class TruTVIE(TurnerBaseIE): - _VALID_URL = r'https?://(?:www\.)?trutv\.com(?:(?P<path>/shows/[^/]+/videos/[^/?#]+?)\.html|/full-episodes/[^/]+/(?P<id>\d+))' + _VALID_URL = r'https?://(?:www\.)?trutv\.com/(?:shows|full-episodes)/(?P<series_slug>[0-9A-Za-z-]+)/(?:videos/(?P<clip_slug>[0-9A-Za-z-]+)|(?P<id>\d+))' _TEST = { - 'url': 'http://www.trutv.com/shows/10-things/videos/you-wont-believe-these-sports-bets.html', - 'md5': '2cdc844f317579fed1a7251b087ff417', + 'url': 'https://www.trutv.com/shows/the-carbonaro-effect/videos/sunlight-activated-flower.html', 'info_dict': { - 'id': '/shows/10-things/videos/you-wont-believe-these-sports-bets', + 'id': 'f16c03beec1e84cd7d1a51f11d8fcc29124cc7f1', 'ext': 'mp4', - 'title': 'You Won\'t Believe These Sports Bets', - 'description': 'Jamie Lee sits down with a bookie to discuss the bizarre world of illegal sports betting.', - 'upload_date': '20130305', - } + 'title': 'Sunlight-Activated Flower', + 'description': "A customer is stunned when he sees Michael's sunlight-activated flower.", + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, } def _real_extract(self, url): - path, video_id = re.match(self._VALID_URL, url).groups() - auth_required = False - if path: - data_src = 'http://www.trutv.com/video/cvp/v2/xml/content.xml?id=%s.xml' % path + series_slug, clip_slug, video_id = re.match(self._VALID_URL, url).groups() + + if video_id: + path = 'episode' + display_id = video_id else: - webpage = self._download_webpage(url, video_id) - video_id = self._search_regex( - r"TTV\.TVE\.episodeId\s*=\s*'([^']+)';", - webpage, 'video id', default=video_id) - auth_required = self._search_regex( - r'TTV\.TVE\.authRequired\s*=\s*(true|false);', - webpage, 'auth required', default='false') == 'true' - data_src = 'http://www.trutv.com/tveverywhere/services/cvpXML.do?titleId=' + video_id - return self._extract_cvp_info( - data_src, path, { - 'secure': { - 'media_src': 'http://androidhls-secure.cdn.turner.com/trutv/big', - 'tokenizer_src': 'http://www.trutv.com/tveverywhere/processors/services/token_ipadAdobe.do', - }, - }, { + path = 'series/clip' + display_id = clip_slug + + data = self._download_json( + 'https://api.trutv.com/v2/web/%s/%s/%s' % (path, series_slug, display_id), + display_id) + video_data = data['episode'] if video_id else data['info'] + media_id = video_data['mediaId'] + title = video_data['title'].strip() + + info = self._extract_ngtv_info( + media_id, {}, { 'url': url, 'site_name': 'truTV', - 'auth_required': auth_required, + 'auth_required': video_data.get('isAuthRequired'), }) + + thumbnails = [] + for image in video_data.get('images', []): + image_url = image.get('srcUrl') + if not image_url: + continue + thumbnails.append({ + 'url': image_url, + 'width': int_or_none(image.get('width')), + 'height': int_or_none(image.get('height')), + }) + + info.update({ + 'id': media_id, + 'display_id': display_id, + 'title': title, + 'description': video_data.get('description'), + 'thumbnails': thumbnails, + 'timestamp': parse_iso8601(video_data.get('publicationDate')), + 'series': video_data.get('showTitle'), + 'season_number': int_or_none(video_data.get('seasonNum')), + 'episode_number': int_or_none(video_data.get('episodeNum')), + }) + return info diff --git a/youtube_dl/extractor/tube8.py b/youtube_dl/extractor/tube8.py index 368c45729..db93b0182 100644 --- a/youtube_dl/extractor/tube8.py +++ b/youtube_dl/extractor/tube8.py @@ -45,7 +45,7 @@ class Tube8IE(KeezMoviesIE): r'videoTitle\s*=\s*"([^"]+)', webpage, 'title') description = self._html_search_regex( - r'>Description:</strong>\s*(.+?)\s*<', webpage, 'description', fatal=False) + r'(?s)Description:</dt>\s*<dd>(.+?)</dd>', webpage, 'description', fatal=False) uploader = self._html_search_regex( r'<span class="username">\s*(.+?)\s*<', webpage, 'uploader', fatal=False) @@ -55,19 +55,19 @@ class Tube8IE(KeezMoviesIE): dislike_count = int_or_none(self._search_regex( r'rdownVar\s*=\s*"(\d+)"', webpage, 'dislike count', fatal=False)) view_count = str_to_int(self._search_regex( - r'<strong>Views: </strong>([\d,\.]+)\s*</li>', + r'Views:\s*</dt>\s*<dd>([\d,\.]+)', webpage, 'view count', fatal=False)) comment_count = str_to_int(self._search_regex( r'<span id="allCommentsCount">(\d+)</span>', webpage, 'comment count', fatal=False)) category = self._search_regex( - r'Category:\s*</strong>\s*<a[^>]+href=[^>]+>([^<]+)', + r'Category:\s*</dt>\s*<dd>\s*<a[^>]+href=[^>]+>([^<]+)', webpage, 'category', fatal=False) categories = [category] if category else None tags_str = self._search_regex( - r'(?s)Tags:\s*</strong>(.+?)</(?!a)', + r'(?s)Tags:\s*</dt>\s*<dd>(.+?)</(?!a)', webpage, 'tags', fatal=False) tags = [t for t in re.findall( r'<a[^>]+href=[^>]+>([^<]+)', tags_str)] if tags_str else None diff --git a/youtube_dl/extractor/turner.py b/youtube_dl/extractor/turner.py index 2b7b0d6e1..4a6cbfbb8 100644 --- a/youtube_dl/extractor/turner.py +++ b/youtube_dl/extractor/turner.py @@ -15,6 +15,7 @@ from ..utils import ( update_url_query, ExtractorError, strip_or_none, + url_or_none, ) @@ -154,8 +155,8 @@ class TurnerBaseIE(AdobePassIE): subtitles = {} for source in video_data.findall('closedCaptions/source'): for track in source.findall('track'): - track_url = track.get('url') - if not isinstance(track_url, compat_str) or track_url.endswith('/big'): + track_url = url_or_none(track.get('url')) + if not track_url or track_url.endswith('/big'): continue lang = track.get('lang') or track.get('label') or 'en' subtitles.setdefault(lang, []).append({ diff --git a/youtube_dl/extractor/tv3.py b/youtube_dl/extractor/tv3.py deleted file mode 100644 index 3867ec90d..000000000 --- a/youtube_dl/extractor/tv3.py +++ /dev/null @@ -1,34 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor - - -class TV3IE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?tv3\.co\.nz/(?P<id>[^/]+)/tabid/\d+/articleID/\d+/MCat/\d+/Default\.aspx' - _TEST = { - 'url': 'http://www.tv3.co.nz/MOTORSPORT-SRS-SsangYong-Hampton-Downs-Round-3/tabid/3692/articleID/121615/MCat/2915/Default.aspx', - 'info_dict': { - 'id': '4659127992001', - 'ext': 'mp4', - 'title': 'CRC Motorsport: SRS SsangYong Hampton Downs Round 3 - S2015 Ep3', - 'description': 'SsangYong Racing Series returns for Round 3 with drivers from New Zealand and Australia taking to the grid at Hampton Downs raceway.', - 'uploader_id': '3812193411001', - 'upload_date': '20151213', - 'timestamp': 1449975272, - }, - 'expected_warnings': [ - 'Failed to download MPD manifest' - ], - 'params': { - # m3u8 download - 'skip_download': True, - }, - } - BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/3812193411001/default_default/index.html?videoId=%s' - - def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - brightcove_id = self._search_regex(r'<param\s*name="@videoPlayer"\s*value="(\d+)"', webpage, 'brightcove id') - return self.url_result(self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, 'BrightcoveNew', brightcove_id) diff --git a/youtube_dl/extractor/tvnet.py b/youtube_dl/extractor/tvnet.py index 2b2630b91..4222ff9ee 100644 --- a/youtube_dl/extractor/tvnet.py +++ b/youtube_dl/extractor/tvnet.py @@ -4,10 +4,10 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( int_or_none, unescapeHTML, + url_or_none, ) @@ -106,9 +106,8 @@ class TVNetIE(InfoExtractor): for stream in self._download_json(data_file, video_id): if not isinstance(stream, dict): continue - stream_url = stream.get('url') - if (stream_url in stream_urls or not stream_url or - not isinstance(stream_url, compat_str)): + stream_url = url_or_none(stream.get('url')) + if stream_url in stream_urls or not stream_url: continue stream_urls.add(stream_url) formats.extend(self._extract_m3u8_formats( diff --git a/youtube_dl/extractor/tvnow.py b/youtube_dl/extractor/tvnow.py index 808571ece..3c6a60c39 100644 --- a/youtube_dl/extractor/tvnow.py +++ b/youtube_dl/extractor/tvnow.py @@ -10,8 +10,9 @@ from ..utils import ( int_or_none, parse_iso8601, parse_duration, - try_get, + str_or_none, update_url_query, + urljoin, ) @@ -19,39 +20,53 @@ class TVNowBaseIE(InfoExtractor): _VIDEO_FIELDS = ( 'id', 'title', 'free', 'geoblocked', 'articleLong', 'articleShort', 'broadcastStartDate', 'isDrm', 'duration', 'season', 'episode', - 'manifest.dashclear', 'format.title', 'format.defaultImage169Format', - 'format.defaultImage169Logo') + 'manifest.dashclear', 'manifest.hlsclear', 'manifest.smoothclear', + 'format.title', 'format.defaultImage169Format', 'format.defaultImage169Logo') def _call_api(self, path, video_id, query): return self._download_json( - 'https://api.tvnow.de/v3/' + path, - video_id, query=query) + 'https://api.tvnow.de/v3/' + path, video_id, query=query) def _extract_video(self, info, display_id): video_id = compat_str(info['id']) title = info['title'] - mpd_url = info['manifest']['dashclear'] - if not mpd_url: + paths = [] + for manifest_url in (info.get('manifest') or {}).values(): + if not manifest_url: + continue + manifest_url = update_url_query(manifest_url, {'filter': ''}) + path = self._search_regex(r'https?://[^/]+/(.+?)\.ism/', manifest_url, 'path') + if path in paths: + continue + paths.append(path) + + def url_repl(proto, suffix): + return re.sub( + r'(?:hls|dash|hss)([.-])', proto + r'\1', re.sub( + r'\.ism/(?:[^.]*\.(?:m3u8|mpd)|[Mm]anifest)', + '.ism/' + suffix, manifest_url)) + + formats = self._extract_mpd_formats( + url_repl('dash', '.mpd'), video_id, + mpd_id='dash', fatal=False) + formats.extend(self._extract_ism_formats( + url_repl('hss', 'Manifest'), + video_id, ism_id='mss', fatal=False)) + formats.extend(self._extract_m3u8_formats( + url_repl('hls', '.m3u8'), video_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False)) + if formats: + break + else: if info.get('isDrm'): raise ExtractorError( 'Video %s is DRM protected' % video_id, expected=True) if info.get('geoblocked'): - raise ExtractorError( - 'Video %s is not available from your location due to geo restriction' % video_id, - expected=True) + raise self.raise_geo_restricted() if not info.get('free', True): raise ExtractorError( 'Video %s is not available for free' % video_id, expected=True) - - mpd_url = update_url_query(mpd_url, {'filter': ''}) - formats = self._extract_mpd_formats(mpd_url, video_id, mpd_id='dash', fatal=False) - formats.extend(self._extract_ism_formats( - mpd_url.replace('dash.', 'hss.').replace('/.mpd', '/Manifest'), - video_id, ism_id='mss', fatal=False)) - formats.extend(self._extract_m3u8_formats( - mpd_url.replace('dash.', 'hls.').replace('/.mpd', '/.m3u8'), - video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) self._sort_formats(formats) description = info.get('articleLong') or info.get('articleShort') @@ -88,11 +103,16 @@ class TVNowBaseIE(InfoExtractor): class TVNowIE(TVNowBaseIE): _VALID_URL = r'''(?x) https?:// - (?:www\.)?tvnow\.(?:de|at|ch)/[^/]+/ + (?:www\.)?tvnow\.(?:de|at|ch)/(?P<station>[^/]+)/ (?P<show_id>[^/]+)/ (?!(?:list|jahr)(?:/|$))(?P<id>[^/?\#&]+) ''' + @classmethod + def suitable(cls, url): + return (False if TVNowNewIE.suitable(url) or TVNowSeasonIE.suitable(url) or TVNowAnnualIE.suitable(url) or TVNowShowIE.suitable(url) + else super(TVNowIE, cls).suitable(url)) + _TESTS = [{ 'url': 'https://www.tvnow.de/rtl2/grip-das-motormagazin/der-neue-porsche-911-gt-3/player', 'info_dict': { @@ -101,7 +121,6 @@ class TVNowIE(TVNowBaseIE): 'ext': 'mp4', 'title': 'Der neue Porsche 911 GT 3', 'description': 'md5:6143220c661f9b0aae73b245e5d898bb', - 'thumbnail': r're:^https?://.*\.jpg$', 'timestamp': 1495994400, 'upload_date': '20170528', 'duration': 5283, @@ -140,7 +159,8 @@ class TVNowIE(TVNowBaseIE): }] def _real_extract(self, url): - display_id = '%s/%s' % re.match(self._VALID_URL, url).groups() + mobj = re.match(self._VALID_URL, url) + display_id = '%s/%s' % mobj.group(2, 3) info = self._call_api( 'movies/' + display_id, display_id, query={ @@ -150,130 +170,309 @@ class TVNowIE(TVNowBaseIE): return self._extract_video(info, display_id) -class TVNowListBaseIE(TVNowBaseIE): - _SHOW_VALID_URL = r'''(?x) - (?P<base_url> - https?:// - (?:www\.)?tvnow\.(?:de|at|ch)/[^/]+/ - (?P<show_id>[^/]+) - ) +class TVNowNewIE(InfoExtractor): + _VALID_URL = r'''(?x) + (?P<base_url>https?:// + (?:www\.)?tvnow\.(?:de|at|ch)/ + (?:shows|serien))/ + (?P<show>[^/]+)-\d+/ + [^/]+/ + episode-\d+-(?P<episode>[^/?$&]+)-(?P<id>\d+) ''' - def _extract_list_info(self, display_id, show_id): - fields = list(self._SHOW_FIELDS) - fields.extend('formatTabs.%s' % field for field in self._SEASON_FIELDS) - fields.extend( - 'formatTabs.formatTabPages.container.movies.%s' % field - for field in self._VIDEO_FIELDS) - return self._call_api( - 'formats/seo', display_id, query={ - 'fields': ','.join(fields), - 'name': show_id + '.php' - }) - - -class TVNowListIE(TVNowListBaseIE): - _VALID_URL = r'%s/(?:list|jahr)/(?P<id>[^?\#&]+)' % TVNowListBaseIE._SHOW_VALID_URL - - _SHOW_FIELDS = ('title', ) - _SEASON_FIELDS = ('id', 'headline', 'seoheadline', ) - _VIDEO_FIELDS = ('id', 'headline', 'seoUrl', ) - _TESTS = [{ - 'url': 'https://www.tvnow.de/rtl/30-minuten-deutschland/list/aktuell', - 'info_dict': { - 'id': '28296', - 'title': '30 Minuten Deutschland - Aktuell', - }, - 'playlist_mincount': 1, - }, { - 'url': 'https://www.tvnow.de/vox/ab-ins-beet/list/staffel-14', - 'only_matching': True, - }, { - 'url': 'https://www.tvnow.de/rtl2/grip-das-motormagazin/jahr/2018/3', + 'url': 'https://www.tvnow.de/shows/grip-das-motormagazin-1669/2017-05/episode-405-der-neue-porsche-911-gt-3-331082', 'only_matching': True, }] - @classmethod - def suitable(cls, url): - return (False if TVNowIE.suitable(url) - else super(TVNowListIE, cls).suitable(url)) + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + base_url = re.sub(r'(?:shows|serien)', '_', mobj.group('base_url')) + show, episode = mobj.group('show', 'episode') + return self.url_result( + # Rewrite new URLs to the old format and use extraction via old API + # at api.tvnow.de as a loophole for bypassing premium content checks + '%s/%s/%s' % (base_url, show, episode), + ie=TVNowIE.ie_key(), video_id=mobj.group('id')) + + +class TVNowNewBaseIE(InfoExtractor): + def _call_api(self, path, video_id, query={}): + result = self._download_json( + 'https://apigw.tvnow.de/module/' + path, video_id, query=query) + error = result.get('error') + if error: + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, error), expected=True) + return result + + +""" +TODO: new apigw.tvnow.de based version of TVNowIE. Replace old TVNowIE with it +when api.tvnow.de is shut down. This version can't bypass premium checks though. +class TVNowIE(TVNowNewBaseIE): + _VALID_URL = r'''(?x) + https?:// + (?:www\.)?tvnow\.(?:de|at|ch)/ + (?:shows|serien)/[^/]+/ + (?:[^/]+/)+ + (?P<display_id>[^/?$&]+)-(?P<id>\d+) + ''' + + _TESTS = [{ + # episode with annual navigation + 'url': 'https://www.tvnow.de/shows/grip-das-motormagazin-1669/2017-05/episode-405-der-neue-porsche-911-gt-3-331082', + 'info_dict': { + 'id': '331082', + 'display_id': 'grip-das-motormagazin/der-neue-porsche-911-gt-3', + 'ext': 'mp4', + 'title': 'Der neue Porsche 911 GT 3', + 'description': 'md5:6143220c661f9b0aae73b245e5d898bb', + 'thumbnail': r're:^https?://.*\.jpg$', + 'timestamp': 1495994400, + 'upload_date': '20170528', + 'duration': 5283, + 'series': 'GRIP - Das Motormagazin', + 'season_number': 14, + 'episode_number': 405, + 'episode': 'Der neue Porsche 911 GT 3', + }, + }, { + # rtl2, episode with season navigation + 'url': 'https://www.tvnow.de/shows/armes-deutschland-11471/staffel-3/episode-14-bernd-steht-seit-der-trennung-von-seiner-frau-allein-da-526124', + 'only_matching': True, + }, { + # rtlnitro + 'url': 'https://www.tvnow.de/serien/alarm-fuer-cobra-11-die-autobahnpolizei-1815/staffel-13/episode-5-auf-eigene-faust-pilot-366822', + 'only_matching': True, + }, { + # superrtl + 'url': 'https://www.tvnow.de/shows/die-lustigsten-schlamassel-der-welt-1221/staffel-2/episode-14-u-a-ketchup-effekt-364120', + 'only_matching': True, + }, { + # ntv + 'url': 'https://www.tvnow.de/shows/startup-news-10674/staffel-2/episode-39-goetter-in-weiss-387630', + 'only_matching': True, + }, { + # vox + 'url': 'https://www.tvnow.de/shows/auto-mobil-174/2017-11/episode-46-neues-vom-automobilmarkt-2017-11-19-17-00-00-380072', + 'only_matching': True, + }, { + 'url': 'https://www.tvnow.de/shows/grip-das-motormagazin-1669/2017-05/episode-405-der-neue-porsche-911-gt-3-331082', + 'only_matching': True, + }] + + def _extract_video(self, info, url, display_id): + config = info['config'] + source = config['source'] + + video_id = compat_str(info.get('id') or source['videoId']) + title = source['title'].strip() + + paths = [] + for manifest_url in (info.get('manifest') or {}).values(): + if not manifest_url: + continue + manifest_url = update_url_query(manifest_url, {'filter': ''}) + path = self._search_regex(r'https?://[^/]+/(.+?)\.ism/', manifest_url, 'path') + if path in paths: + continue + paths.append(path) + + def url_repl(proto, suffix): + return re.sub( + r'(?:hls|dash|hss)([.-])', proto + r'\1', re.sub( + r'\.ism/(?:[^.]*\.(?:m3u8|mpd)|[Mm]anifest)', + '.ism/' + suffix, manifest_url)) + + formats = self._extract_mpd_formats( + url_repl('dash', '.mpd'), video_id, + mpd_id='dash', fatal=False) + formats.extend(self._extract_ism_formats( + url_repl('hss', 'Manifest'), + video_id, ism_id='mss', fatal=False)) + formats.extend(self._extract_m3u8_formats( + url_repl('hls', '.m3u8'), video_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False)) + if formats: + break + else: + if try_get(info, lambda x: x['rights']['isDrm']): + raise ExtractorError( + 'Video %s is DRM protected' % video_id, expected=True) + if try_get(config, lambda x: x['boards']['geoBlocking']['block']): + raise self.raise_geo_restricted() + if not info.get('free', True): + raise ExtractorError( + 'Video %s is not available for free' % video_id, expected=True) + self._sort_formats(formats) + + description = source.get('description') + thumbnail = url_or_none(source.get('poster')) + timestamp = unified_timestamp(source.get('previewStart')) + duration = parse_duration(source.get('length')) + + series = source.get('format') + season_number = int_or_none(self._search_regex( + r'staffel-(\d+)', url, 'season number', default=None)) + episode_number = int_or_none(self._search_regex( + r'episode-(\d+)', url, 'episode number', default=None)) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'timestamp': timestamp, + 'duration': duration, + 'series': series, + 'season_number': season_number, + 'episode_number': episode_number, + 'episode': title, + 'formats': formats, + } def _real_extract(self, url): - base_url, show_id, season_id = re.match(self._VALID_URL, url).groups() + display_id, video_id = re.match(self._VALID_URL, url).groups() + info = self._call_api('player/' + video_id, video_id) + return self._extract_video(info, video_id, display_id) +""" - list_info = self._extract_list_info(season_id, show_id) - season = next( - season for season in list_info['formatTabs']['items'] - if season.get('seoheadline') == season_id) +class TVNowListBaseIE(TVNowNewBaseIE): + _SHOW_VALID_URL = r'''(?x) + (?P<base_url> + https?:// + (?:www\.)?tvnow\.(?:de|at|ch)/(?:shows|serien)/ + [^/?#&]+-(?P<show_id>\d+) + ) + ''' - title = list_info.get('title') - headline = season.get('headline') - if title and headline: - title = '%s - %s' % (title, headline) - else: - title = headline or title + @classmethod + def suitable(cls, url): + return (False if TVNowNewIE.suitable(url) + else super(TVNowListBaseIE, cls).suitable(url)) + + def _extract_items(self, url, show_id, list_id, query): + items = self._call_api( + 'teaserrow/format/episode/' + show_id, list_id, + query=query)['items'] entries = [] - for container in season['formatTabPages']['items']: - items = try_get( - container, lambda x: x['container']['movies']['items'], - list) or [] - for info in items: - seo_url = info.get('seoUrl') - if not seo_url: - continue - video_id = info.get('id') - entries.append(self.url_result( - '%s/%s/player' % (base_url, seo_url), TVNowIE.ie_key(), - compat_str(video_id) if video_id else None)) + for item in items: + if not isinstance(item, dict): + continue + item_url = urljoin(url, item.get('url')) + if not item_url: + continue + video_id = str_or_none(item.get('id') or item.get('videoId')) + item_title = item.get('subheadline') or item.get('text') + entries.append(self.url_result( + item_url, ie=TVNowNewIE.ie_key(), video_id=video_id, + video_title=item_title)) - return self.playlist_result( - entries, compat_str(season.get('id') or season_id), title) + return self.playlist_result(entries, '%s/%s' % (show_id, list_id)) + + +class TVNowSeasonIE(TVNowListBaseIE): + _VALID_URL = r'%s/staffel-(?P<id>\d+)' % TVNowListBaseIE._SHOW_VALID_URL + _TESTS = [{ + 'url': 'https://www.tvnow.de/serien/alarm-fuer-cobra-11-die-autobahnpolizei-1815/staffel-13', + 'info_dict': { + 'id': '1815/13', + }, + 'playlist_mincount': 22, + }] + + def _real_extract(self, url): + _, show_id, season_id = re.match(self._VALID_URL, url).groups() + return self._extract_items( + url, show_id, season_id, {'season': season_id}) + + +class TVNowAnnualIE(TVNowListBaseIE): + _VALID_URL = r'%s/(?P<year>\d{4})-(?P<month>\d{2})' % TVNowListBaseIE._SHOW_VALID_URL + _TESTS = [{ + 'url': 'https://www.tvnow.de/shows/grip-das-motormagazin-1669/2017-05', + 'info_dict': { + 'id': '1669/2017-05', + }, + 'playlist_mincount': 2, + }] + + def _real_extract(self, url): + _, show_id, year, month = re.match(self._VALID_URL, url).groups() + return self._extract_items( + url, show_id, '%s-%s' % (year, month), { + 'year': int(year), + 'month': int(month), + }) class TVNowShowIE(TVNowListBaseIE): _VALID_URL = TVNowListBaseIE._SHOW_VALID_URL - - _SHOW_FIELDS = ('id', 'title', ) - _SEASON_FIELDS = ('id', 'headline', 'seoheadline', ) - _VIDEO_FIELDS = () - _TESTS = [{ - 'url': 'https://www.tvnow.at/vox/ab-ins-beet', + # annual navigationType + 'url': 'https://www.tvnow.de/shows/grip-das-motormagazin-1669', 'info_dict': { - 'id': 'ab-ins-beet', - 'title': 'Ab ins Beet!', + 'id': '1669', }, - 'playlist_mincount': 7, + 'playlist_mincount': 73, }, { - 'url': 'https://www.tvnow.at/vox/ab-ins-beet/list', - 'only_matching': True, - }, { - 'url': 'https://www.tvnow.de/rtl2/grip-das-motormagazin/jahr/', - 'only_matching': True, + # season navigationType + 'url': 'https://www.tvnow.de/shows/armes-deutschland-11471', + 'info_dict': { + 'id': '11471', + }, + 'playlist_mincount': 3, }] @classmethod def suitable(cls, url): - return (False if TVNowIE.suitable(url) or TVNowListIE.suitable(url) + return (False if TVNowNewIE.suitable(url) or TVNowSeasonIE.suitable(url) or TVNowAnnualIE.suitable(url) else super(TVNowShowIE, cls).suitable(url)) def _real_extract(self, url): base_url, show_id = re.match(self._VALID_URL, url).groups() - list_info = self._extract_list_info(show_id, show_id) + result = self._call_api( + 'teaserrow/format/navigation/' + show_id, show_id) + + items = result['items'] entries = [] - for season_info in list_info['formatTabs']['items']: - season_url = season_info.get('seoheadline') - if not season_url: - continue - season_id = season_info.get('id') - entries.append(self.url_result( - '%s/list/%s' % (base_url, season_url), TVNowListIE.ie_key(), - compat_str(season_id) if season_id else None, - season_info.get('headline'))) + navigation = result.get('navigationType') + if navigation == 'annual': + for item in items: + if not isinstance(item, dict): + continue + year = int_or_none(item.get('year')) + if year is None: + continue + months = item.get('months') + if not isinstance(months, list): + continue + for month_dict in months: + if not isinstance(month_dict, dict) or not month_dict: + continue + month_number = int_or_none(list(month_dict.keys())[0]) + if month_number is None: + continue + entries.append(self.url_result( + '%s/%04d-%02d' % (base_url, year, month_number), + ie=TVNowAnnualIE.ie_key())) + elif navigation == 'season': + for item in items: + if not isinstance(item, dict): + continue + season_number = int_or_none(item.get('season')) + if season_number is None: + continue + entries.append(self.url_result( + '%s/staffel-%d' % (base_url, season_number), + ie=TVNowSeasonIE.ie_key())) + else: + raise ExtractorError('Unknown navigationType') - return self.playlist_result(entries, show_id, list_info.get('title')) + return self.playlist_result(entries, show_id) diff --git a/youtube_dl/extractor/tvp.py b/youtube_dl/extractor/tvp.py index 3954f0b93..accff75b5 100644 --- a/youtube_dl/extractor/tvp.py +++ b/youtube_dl/extractor/tvp.py @@ -1,14 +1,16 @@ # coding: utf-8 from __future__ import unicode_literals +import itertools import re from .common import InfoExtractor from ..utils import ( - determine_ext, clean_html, - get_element_by_attribute, + determine_ext, ExtractorError, + get_element_by_attribute, + orderedSet, ) @@ -19,12 +21,12 @@ class TVPIE(InfoExtractor): _TESTS = [{ 'url': 'https://vod.tvp.pl/video/czas-honoru,i-seria-odc-13,194536', - 'md5': '8aa518c15e5cc32dfe8db400dc921fbb', + 'md5': 'a21eb0aa862f25414430f15fdfb9e76c', 'info_dict': { 'id': '194536', 'ext': 'mp4', - 'title': 'Czas honoru, I seria – odc. 13', - 'description': 'md5:381afa5bca72655fe94b05cfe82bf53d', + 'title': 'Czas honoru, odc. 13 – Władek', + 'description': 'md5:437f48b93558370b031740546b696e24', }, }, { 'url': 'http://www.tvp.pl/there-can-be-anything-so-i-shortened-it/17916176', @@ -45,6 +47,7 @@ class TVPIE(InfoExtractor): 'title': 'Wiadomości, 28.09.2017, 19:30', 'description': 'Wydanie główne codziennego serwisu informacyjnego.' }, + 'skip': 'HTTP Error 404: Not Found', }, { 'url': 'http://vod.tvp.pl/seriale/obyczajowe/na-sygnale/sezon-2-27-/odc-39/17834272', 'only_matching': True, @@ -75,8 +78,10 @@ class TVPIE(InfoExtractor): return { '_type': 'url_transparent', 'url': 'tvp:' + video_id, - 'description': self._og_search_description(webpage, default=None), - 'thumbnail': self._og_search_thumbnail(webpage), + 'description': self._og_search_description( + webpage, default=None) or self._html_search_meta( + 'description', webpage, default=None), + 'thumbnail': self._og_search_thumbnail(webpage, default=None), 'ie_key': 'TVPEmbed', } @@ -87,6 +92,15 @@ class TVPEmbedIE(InfoExtractor): _VALID_URL = r'(?:tvp:|https?://[^/]+\.tvp\.(?:pl|info)/sess/tvplayer\.php\?.*?object_id=)(?P<id>\d+)' _TESTS = [{ + 'url': 'tvp:194536', + 'md5': 'a21eb0aa862f25414430f15fdfb9e76c', + 'info_dict': { + 'id': '194536', + 'ext': 'mp4', + 'title': 'Czas honoru, odc. 13 – Władek', + }, + }, { + # not available 'url': 'http://www.tvp.pl/sess/tvplayer.php?object_id=22670268', 'md5': '8c9cd59d16edabf39331f93bf8a766c7', 'info_dict': { @@ -94,6 +108,7 @@ class TVPEmbedIE(InfoExtractor): 'ext': 'mp4', 'title': 'Panorama, 07.12.2015, 15:40', }, + 'skip': 'Transmisja została zakończona lub materiał niedostępny', }, { 'url': 'tvp:22670268', 'only_matching': True, @@ -105,10 +120,13 @@ class TVPEmbedIE(InfoExtractor): webpage = self._download_webpage( 'http://www.tvp.pl/sess/tvplayer.php?object_id=%s' % video_id, video_id) - error_massage = get_element_by_attribute('class', 'msg error', webpage) - if error_massage: + error = self._html_search_regex( + r'(?s)<p[^>]+\bclass=["\']notAvailable__text["\'][^>]*>(.+?)</p>', + webpage, 'error', default=None) or clean_html( + get_element_by_attribute('class', 'msg error', webpage)) + if error: raise ExtractorError('%s said: %s' % ( - self.IE_NAME, clean_html(error_massage)), expected=True) + self.IE_NAME, clean_html(error)), expected=True) title = self._search_regex( r'name\s*:\s*([\'"])Title\1\s*,\s*value\s*:\s*\1(?P<title>.+?)\1', @@ -180,48 +198,55 @@ class TVPEmbedIE(InfoExtractor): } -class TVPSeriesIE(InfoExtractor): +class TVPWebsiteIE(InfoExtractor): IE_NAME = 'tvp:series' - _VALID_URL = r'https?://vod\.tvp\.pl/(?:[^/]+/){2}(?P<id>[^/]+)/?$' + _VALID_URL = r'https?://vod\.tvp\.pl/website/(?P<display_id>[^,]+),(?P<id>\d+)' _TESTS = [{ - 'url': 'http://vod.tvp.pl/filmy-fabularne/filmy-za-darmo/ogniem-i-mieczem', + # series + 'url': 'https://vod.tvp.pl/website/lzy-cennet,38678312/video', 'info_dict': { - 'title': 'Ogniem i mieczem', - 'id': '4278026', + 'id': '38678312', }, - 'playlist_count': 4, + 'playlist_count': 115, }, { - 'url': 'http://vod.tvp.pl/audycje/podroze/boso-przez-swiat', + # film + 'url': 'https://vod.tvp.pl/website/gloria,35139666', 'info_dict': { - 'title': 'Boso przez świat', - 'id': '9329207', + 'id': '36637049', + 'ext': 'mp4', + 'title': 'Gloria, Gloria', }, - 'playlist_count': 86, + 'params': { + 'skip_download': True, + }, + 'add_ie': ['TVPEmbed'], + }, { + 'url': 'https://vod.tvp.pl/website/lzy-cennet,38678312', + 'only_matching': True, }] + def _entries(self, display_id, playlist_id): + url = 'https://vod.tvp.pl/website/%s,%s/video' % (display_id, playlist_id) + for page_num in itertools.count(1): + page = self._download_webpage( + url, display_id, 'Downloading page %d' % page_num, + query={'page': page_num}) + + video_ids = orderedSet(re.findall( + r'<a[^>]+\bhref=["\']/video/%s,[^,]+,(\d+)' % display_id, + page)) + + if not video_ids: + break + + for video_id in video_ids: + yield self.url_result( + 'tvp:%s' % video_id, ie=TVPEmbedIE.ie_key(), + video_id=video_id) + def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id, tries=5) - - title = self._html_search_regex( - r'(?s) id=[\'"]path[\'"]>(?:.*? / ){2}(.*?)</span>', webpage, 'series') - playlist_id = self._search_regex(r'nodeId:\s*(\d+)', webpage, 'playlist id') - playlist = self._download_webpage( - 'http://vod.tvp.pl/vod/seriesAjax?type=series&nodeId=%s&recommend' - 'edId=0&sort=&page=0&pageSize=10000' % playlist_id, display_id, tries=5, - note='Downloading playlist') - - videos_paths = re.findall( - '(?s)class="shortTitle">.*?href="(/[^"]+)', playlist) - entries = [ - self.url_result('http://vod.tvp.pl%s' % v_path, ie=TVPIE.ie_key()) - for v_path in videos_paths] - - return { - '_type': 'playlist', - 'id': playlist_id, - 'display_id': display_id, - 'title': title, - 'entries': entries, - } + mobj = re.match(self._VALID_URL, url) + display_id, playlist_id = mobj.group('display_id', 'id') + return self.playlist_result( + self._entries(display_id, playlist_id), playlist_id) diff --git a/youtube_dl/extractor/tvplay.py b/youtube_dl/extractor/tvplay.py index e09b5f804..d82d48f94 100644 --- a/youtube_dl/extractor/tvplay.py +++ b/youtube_dl/extractor/tvplay.py @@ -19,6 +19,7 @@ from ..utils import ( try_get, unsmuggle_url, update_url_query, + url_or_none, ) @@ -31,12 +32,12 @@ class TVPlayIE(InfoExtractor): https?:// (?:www\.)? (?: - tvplay(?:\.skaties)?\.lv/parraides| - (?:tv3play|play\.tv3)\.lt/programos| + tvplay(?:\.skaties)?\.lv(?:/parraides)?| + (?:tv3play|play\.tv3)\.lt(?:/programos)?| tv3play(?:\.tv3)?\.ee/sisu| (?:tv(?:3|6|8|10)play|viafree)\.se/program| (?:(?:tv3play|viasat4play|tv6play|viafree)\.no|(?:tv3play|viafree)\.dk)/programmer| - play\.novatv\.bg/programi + play\.nova(?:tv)?\.bg/programi ) /(?:[^/]+/)+ ) @@ -202,10 +203,18 @@ class TVPlayIE(InfoExtractor): 'skip_download': True, }, }, + { + 'url': 'https://play.nova.bg/programi/zdravei-bulgariya/764300?autostart=true', + 'only_matching': True, + }, { 'url': 'http://tvplay.skaties.lv/parraides/vinas-melo-labak/418113?autostart=true', 'only_matching': True, }, + { + 'url': 'https://tvplay.skaties.lv/vinas-melo-labak/418113/?autostart=true', + 'only_matching': True, + }, { # views is null 'url': 'http://tvplay.skaties.lv/parraides/tv3-zinas/760183', @@ -255,7 +264,8 @@ class TVPlayIE(InfoExtractor): quality = qualities(['hls', 'medium', 'high']) formats = [] for format_id, video_url in streams.get('streams', {}).items(): - if not video_url or not isinstance(video_url, compat_str): + video_url = url_or_none(video_url) + if not video_url: continue ext = determine_ext(video_url) if ext == 'f4m': @@ -286,6 +296,7 @@ class TVPlayIE(InfoExtractor): 'url': m.group('url'), 'app': m.group('app'), 'play_path': m.group('playpath'), + 'preference': -1, }) else: fmt.update({ @@ -445,3 +456,102 @@ class ViafreeIE(InfoExtractor): 'skip_rtmp': True, }), ie=TVPlayIE.ie_key(), video_id=video_id) + + +class TVPlayHomeIE(InfoExtractor): + _VALID_URL = r'https?://tvplay\.(?:tv3\.lt|skaties\.lv|tv3\.ee)/[^/]+/[^/?#&]+-(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://tvplay.tv3.lt/aferistai-n-7/aferistai-10047125/', + 'info_dict': { + 'id': '366367', + 'ext': 'mp4', + 'title': 'Aferistai', + 'description': 'Aferistai. Kalėdinė pasaka.', + 'series': 'Aferistai [N-7]', + 'season': '1 sezonas', + 'season_number': 1, + 'duration': 464, + 'timestamp': 1394209658, + 'upload_date': '20140307', + 'age_limit': 18, + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': [TVPlayIE.ie_key()], + }, { + 'url': 'https://tvplay.skaties.lv/vinas-melo-labak/vinas-melo-labak-10280317/', + 'only_matching': True, + }, { + 'url': 'https://tvplay.tv3.ee/cool-d-ga-mehhikosse/cool-d-ga-mehhikosse-10044354/', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + video_id = self._search_regex( + r'data-asset-id\s*=\s*["\'](\d{5,})\b', webpage, 'video id') + + if len(video_id) < 8: + return self.url_result( + 'mtg:%s' % video_id, ie=TVPlayIE.ie_key(), video_id=video_id) + + m3u8_url = self._search_regex( + r'data-file\s*=\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, + 'm3u8 url', group='url') + + formats = self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls') + self._sort_formats(formats) + + title = self._search_regex( + r'data-title\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1', webpage, + 'title', default=None, group='value') or self._html_search_meta( + 'title', webpage, default=None) or self._og_search_title( + webpage) + + description = self._html_search_meta( + 'description', webpage, + default=None) or self._og_search_description(webpage) + + thumbnail = self._search_regex( + r'data-image\s*=\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, + 'thumbnail', default=None, group='url') or self._html_search_meta( + 'thumbnail', webpage, default=None) or self._og_search_thumbnail( + webpage) + + duration = int_or_none(self._search_regex( + r'data-duration\s*=\s*["\'](\d+)', webpage, 'duration', + fatal=False)) + + season = self._search_regex( + (r'data-series-title\s*=\s*(["\'])[^/]+/(?P<value>(?:(?!\1).)+)\1', + r'\bseason\s*:\s*(["\'])(?P<value>(?:(?!\1).)+)\1'), webpage, + 'season', default=None, group='value') + season_number = int_or_none(self._search_regex( + r'(\d+)(?:[.\s]+sezona|\s+HOOAEG)', season or '', 'season number', + default=None)) + episode = self._search_regex( + (r'\bepisode\s*:\s*(["\'])(?P<value>(?:(?!\1).)+)\1', + r'data-subtitle\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1'), webpage, + 'episode', default=None, group='value') + episode_number = int_or_none(self._search_regex( + r'(?:S[eē]rija|Osa)\s+(\d+)', episode or '', 'episode number', + default=None)) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'season': season, + 'season_number': season_number, + 'episode': episode, + 'episode_number': episode_number, + 'formats': formats, + } diff --git a/youtube_dl/extractor/twitcasting.py b/youtube_dl/extractor/twitcasting.py new file mode 100644 index 000000000..05f8aa9ce --- /dev/null +++ b/youtube_dl/extractor/twitcasting.py @@ -0,0 +1,60 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + +import re + + +class TwitCastingIE(InfoExtractor): + _VALID_URL = r'https?://(?:[^/]+\.)?twitcasting\.tv/(?P<uploader_id>[^/]+)/movie/(?P<id>\d+)' + _TEST = { + 'url': 'https://twitcasting.tv/ivetesangalo/movie/2357609', + 'md5': '745243cad58c4681dc752490f7540d7f', + 'info_dict': { + 'id': '2357609', + 'ext': 'mp4', + 'title': 'Recorded Live #2357609', + 'uploader_id': 'ivetesangalo', + 'description': "Moi! I'm live on TwitCasting from my iPhone.", + 'thumbnail': r're:^https?://.*\.jpg$', + }, + 'params': { + 'skip_download': True, + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + uploader_id = mobj.group('uploader_id') + + webpage = self._download_webpage(url, video_id) + + title = self._html_search_regex( + r'(?s)<[^>]+id=["\']movietitle[^>]+>(.+?)</', + webpage, 'title', default=None) or self._html_search_meta( + 'twitter:title', webpage, fatal=True) + + m3u8_url = self._search_regex( + (r'data-movie-url=(["\'])(?P<url>(?:(?!\1).)+)\1', + r'(["\'])(?P<url>http.+?\.m3u8.*?)\1'), + webpage, 'm3u8 url', group='url') + + formats = self._extract_m3u8_formats( + m3u8_url, video_id, ext='mp4', entry_protocol='m3u8_native', + m3u8_id='hls') + + thumbnail = self._og_search_thumbnail(webpage) + description = self._og_search_description( + webpage, default=None) or self._html_search_meta( + 'twitter:description', webpage) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'uploader_id': uploader_id, + 'formats': formats, + } diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index e01f11331..8c87f6dd3 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -4,10 +4,10 @@ from __future__ import unicode_literals import itertools import re import random +import json from .common import InfoExtractor from ..compat import ( - compat_HTTPError, compat_kwargs, compat_parse_qs, compat_str, @@ -26,7 +26,7 @@ from ..utils import ( try_get, unified_timestamp, update_url_query, - urlencode_postdata, + url_or_none, urljoin, ) @@ -36,8 +36,9 @@ class TwitchBaseIE(InfoExtractor): _API_BASE = 'https://api.twitch.tv' _USHER_BASE = 'https://usher.ttvnw.net' - _LOGIN_URL = 'https://www.twitch.tv/login' - _CLIENT_ID = 'jzkbprff40iqj646a697cyrvl0zt2m6' + _LOGIN_FORM_URL = 'https://www.twitch.tv/login' + _LOGIN_POST_URL = 'https://passport.twitch.tv/login' + _CLIENT_ID = 'kimne78kx3ncx6brgo4mv6wki5h1ko' _NETRC_MACHINE = 'twitch' def _handle_error(self, response): @@ -50,7 +51,9 @@ class TwitchBaseIE(InfoExtractor): expected=True) def _call_api(self, path, item_id, *args, **kwargs): - kwargs.setdefault('headers', {})['Client-ID'] = self._CLIENT_ID + headers = kwargs.get('headers', {}).copy() + headers['Client-ID'] = self._CLIENT_ID + kwargs['headers'] = headers response = self._download_json( '%s/%s' % (self._API_BASE, path), item_id, *args, **compat_kwargs(kwargs)) @@ -76,22 +79,21 @@ class TwitchBaseIE(InfoExtractor): page_url = urlh.geturl() post_url = self._search_regex( r'<form[^>]+action=(["\'])(?P<url>.+?)\1', page, - 'post url', default=page_url, group='url') + 'post url', default=self._LOGIN_POST_URL, group='url') post_url = urljoin(page_url, post_url) - headers = {'Referer': page_url} + headers = { + 'Referer': page_url, + 'Origin': page_url, + 'Content-Type': 'text/plain;charset=UTF-8', + } - try: - response = self._download_json( - post_url, None, note, - data=urlencode_postdata(form), - headers=headers) - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400: - response = self._parse_json( - e.cause.read().decode('utf-8'), None) - fail(response.get('message') or response['errors'][0]) - raise + response = self._download_json( + post_url, None, note, data=json.dumps(form).encode(), + headers=headers, expected_status=400) + error = response.get('error_description') or response.get('error_code') + if error: + fail(error) if 'Authenticated successfully' in response.get('message', ''): return None, None @@ -104,7 +106,7 @@ class TwitchBaseIE(InfoExtractor): headers=headers) login_page, handle = self._download_webpage_handle( - self._LOGIN_URL, None, 'Downloading login page') + self._LOGIN_FORM_URL, None, 'Downloading login page') # Some TOR nodes and public proxies are blocked completely if 'blacklist_message' in login_page: @@ -114,6 +116,7 @@ class TwitchBaseIE(InfoExtractor): login_page, handle, 'Logging in', { 'username': username, 'password': password, + 'client_id': self._CLIENT_ID, }) # Successful login @@ -133,7 +136,12 @@ class TwitchBaseIE(InfoExtractor): source = next(f for f in formats if f['format_id'] == 'Source') source['preference'] = 10 except StopIteration: - pass # No Source stream present + for f in formats: + if '/chunked/' in f['url']: + f.update({ + 'source_preference': 10, + 'format_note': 'Source', + }) self._sort_formats(formats) @@ -239,7 +247,7 @@ class TwitchVodIE(TwitchItemBaseIE): _VALID_URL = r'''(?x) https?:// (?: - (?:(?:www|go|m)\.)?twitch\.tv/(?:[^/]+/v|videos)/| + (?:(?:www|go|m)\.)?twitch\.tv/(?:[^/]+/v(?:ideo)?|videos)/| player\.twitch\.tv/\?.*?\bvideo=v ) (?P<id>\d+) @@ -295,6 +303,9 @@ class TwitchVodIE(TwitchItemBaseIE): }, { 'url': 'https://m.twitch.tv/beagsandjam/v/247478721', 'only_matching': True, + }, { + 'url': 'https://www.twitch.tv/northernlion/video/291940395', + 'only_matching': True, }] def _real_extract(self, url): @@ -555,7 +566,8 @@ class TwitchStreamIE(TwitchBaseIE): TwitchAllVideosIE, TwitchUploadsIE, TwitchPastBroadcastsIE, - TwitchHighlightsIE)) + TwitchHighlightsIE, + TwitchClipsIE)) else super(TwitchStreamIE, cls).suitable(url)) def _real_extract(self, url): @@ -629,7 +641,7 @@ class TwitchStreamIE(TwitchBaseIE): class TwitchClipsIE(TwitchBaseIE): IE_NAME = 'twitch:clips' - _VALID_URL = r'https?://clips\.twitch\.tv/(?:[^/]+/)*(?P<id>[^/?#&]+)' + _VALID_URL = r'https?://(?:clips\.twitch\.tv/(?:[^/]+/)*|(?:www\.)?twitch\.tv/[^/]+/clip/)(?P<id>[^/?#&]+)' _TESTS = [{ 'url': 'https://clips.twitch.tv/FaintLightGullWholeWheat', @@ -649,6 +661,9 @@ class TwitchClipsIE(TwitchBaseIE): # multiple formats 'url': 'https://clips.twitch.tv/rflegendary/UninterestedBeeDAESuppy', 'only_matching': True, + }, { + 'url': 'https://www.twitch.tv/sergeynixon/clip/StormyThankfulSproutFutureMan', + 'only_matching': True, }] def _real_extract(self, url): @@ -663,8 +678,8 @@ class TwitchClipsIE(TwitchBaseIE): for option in status['quality_options']: if not isinstance(option, dict): continue - source = option.get('source') - if not source or not isinstance(source, compat_str): + source = url_or_none(option.get('source')) + if not source: continue formats.append({ 'url': source, diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py index de41065d6..41d0b6be8 100644 --- a/youtube_dl/extractor/twitter.py +++ b/youtube_dl/extractor/twitter.py @@ -171,7 +171,8 @@ class TwitterCardIE(TwitterBaseIE): urls.append('https://twitter.com/i/videos/' + video_id) for u in urls: - webpage = self._download_webpage(u, video_id) + webpage = self._download_webpage( + u, video_id, headers={'Referer': 'https://twitter.com/'}) iframe_url = self._html_search_regex( r'<iframe[^>]+src="((?:https?:)?//(?:www\.youtube\.com/embed/[^"]+|(?:www\.)?vine\.co/v/\w+/card))"', diff --git a/youtube_dl/extractor/udemy.py b/youtube_dl/extractor/udemy.py index a7196997e..ae8de9897 100644 --- a/youtube_dl/extractor/udemy.py +++ b/youtube_dl/extractor/udemy.py @@ -20,6 +20,7 @@ from ..utils import ( sanitized_Request, try_get, unescapeHTML, + url_or_none, urlencode_postdata, ) @@ -28,7 +29,7 @@ class UdemyIE(InfoExtractor): IE_NAME = 'udemy' _VALID_URL = r'''(?x) https?:// - www\.udemy\.com/ + (?:[^/]+\.)?udemy\.com/ (?: [^#]+\#/lecture/| lecture/view/?\?lectureId=| @@ -63,6 +64,9 @@ class UdemyIE(InfoExtractor): # only outputs rendition 'url': 'https://www.udemy.com/how-you-can-help-your-local-community-5-amazing-examples/learn/v4/t/lecture/3225750?start=0', 'only_matching': True, + }, { + 'url': 'https://wipro.udemy.com/java-tutorial/#/lecture/172757', + 'only_matching': True, }] def _extract_course_info(self, webpage, video_id): @@ -121,9 +125,23 @@ class UdemyIE(InfoExtractor): raise ExtractorError(error_str, expected=True) def _download_webpage_handle(self, *args, **kwargs): - kwargs.setdefault('headers', {})['User-Agent'] = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/603.2.4 (KHTML, like Gecko) Version/10.1.1 Safari/603.2.4' - return super(UdemyIE, self)._download_webpage_handle( + headers = kwargs.get('headers', {}).copy() + headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36' + kwargs['headers'] = headers + ret = super(UdemyIE, self)._download_webpage_handle( *args, **compat_kwargs(kwargs)) + if not ret: + return ret + webpage, _ = ret + if any(p in webpage for p in ( + '>Please verify you are a human', + 'Access to this page has been denied because we believe you are using automation tools to browse the website', + '"_pxCaptcha"')): + raise ExtractorError( + 'Udemy asks you to solve a CAPTCHA. Login with browser, ' + 'solve CAPTCHA, then export cookies and pass cookie file to ' + 'youtube-dl with --cookies.', expected=True) + return ret def _download_json(self, url_or_request, *args, **kwargs): headers = { @@ -265,8 +283,8 @@ class UdemyIE(InfoExtractor): if not isinstance(source_list, list): return for source in source_list: - video_url = source.get('file') or source.get('src') - if not video_url or not isinstance(video_url, compat_str): + video_url = url_or_none(source.get('file') or source.get('src')) + if not video_url: continue if source.get('type') == 'application/x-mpegURL' or determine_ext(video_url) == 'm3u8': formats.extend(self._extract_m3u8_formats( @@ -293,8 +311,8 @@ class UdemyIE(InfoExtractor): continue if track.get('kind') != 'captions': continue - src = track.get('src') - if not src or not isinstance(src, compat_str): + src = url_or_none(track.get('src')) + if not src: continue lang = track.get('language') or track.get( 'srclang') or track.get('label') @@ -314,8 +332,8 @@ class UdemyIE(InfoExtractor): for cc in captions: if not isinstance(cc, dict): continue - cc_url = cc.get('url') - if not cc_url or not isinstance(cc_url, compat_str): + cc_url = url_or_none(cc.get('url')) + if not cc_url: continue lang = try_get(cc, lambda x: x['locale']['locale'], compat_str) sub_dict = (automatic_captions if cc.get('source') == 'auto' @@ -400,8 +418,14 @@ class UdemyIE(InfoExtractor): class UdemyCourseIE(UdemyIE): IE_NAME = 'udemy:course' - _VALID_URL = r'https?://(?:www\.)?udemy\.com/(?P<id>[^/?#&]+)' - _TESTS = [] + _VALID_URL = r'https?://(?:[^/]+\.)?udemy\.com/(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://www.udemy.com/java-tutorial/', + 'only_matching': True, + }, { + 'url': 'https://wipro.udemy.com/java-tutorial/', + 'only_matching': True, + }] @classmethod def suitable(cls, url): diff --git a/youtube_dl/extractor/uol.py b/youtube_dl/extractor/uol.py index e67083004..08f0c072e 100644 --- a/youtube_dl/extractor/uol.py +++ b/youtube_dl/extractor/uol.py @@ -61,7 +61,7 @@ class UOLIE(InfoExtractor): 'height': 360, }, '5': { - 'width': 1080, + 'width': 1280, 'height': 720, }, '6': { @@ -80,6 +80,10 @@ class UOLIE(InfoExtractor): 'width': 568, 'height': 320, }, + '11': { + 'width': 640, + 'height': 360, + } } def _real_extract(self, url): @@ -111,19 +115,31 @@ class UOLIE(InfoExtractor): 'ver': video_data.get('numRevision', 2), 'r': 'http://mais.uol.com.br', } + for k in ('token', 'sign'): + v = video_data.get(k) + if v: + query[k] = v + formats = [] for f in video_data.get('formats', []): f_url = f.get('url') or f.get('secureUrl') if not f_url: continue + f_url = update_url_query(f_url, query) format_id = str_or_none(f.get('id')) + if format_id == '10': + formats.extend(self._extract_m3u8_formats( + f_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + continue fmt = { 'format_id': format_id, - 'url': update_url_query(f_url, query), + 'url': f_url, + 'source_preference': 1, } fmt.update(self._FORMATS.get(format_id, {})) formats.append(fmt) - self._sort_formats(formats) + self._sort_formats(formats, ('height', 'width', 'source_preference', 'tbr', 'ext')) tags = [] for tag in video_data.get('tags', []): diff --git a/youtube_dl/extractor/usatoday.py b/youtube_dl/extractor/usatoday.py index e5678dc78..b2103448d 100644 --- a/youtube_dl/extractor/usatoday.py +++ b/youtube_dl/extractor/usatoday.py @@ -3,21 +3,23 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( + ExtractorError, get_element_by_attribute, parse_duration, + try_get, update_url_query, - ExtractorError, ) from ..compat import compat_str class USATodayIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?usatoday\.com/(?:[^/]+/)*(?P<id>[^?/#]+)' - _TEST = { + _TESTS = [{ + # Brightcove Partner ID = 29906170001 'url': 'http://www.usatoday.com/media/cinematic/video/81729424/us-france-warn-syrian-regime-ahead-of-new-peace-talks/', - 'md5': '4d40974481fa3475f8bccfd20c5361f8', + 'md5': '033587d2529dc3411a1ab3644c3b8827', 'info_dict': { - 'id': '81729424', + 'id': '4799374959001', 'ext': 'mp4', 'title': 'US, France warn Syrian regime ahead of new peace talks', 'timestamp': 1457891045, @@ -25,8 +27,20 @@ class USATodayIE(InfoExtractor): 'uploader_id': '29906170001', 'upload_date': '20160313', } - } - BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/29906170001/38a9eecc-bdd8-42a3-ba14-95397e48b3f8_default/index.html?videoId=%s' + }, { + # ui-video-data[asset_metadata][items][brightcoveaccount] = 28911775001 + 'url': 'https://www.usatoday.com/story/tech/science/2018/08/21/yellowstone-supervolcano-eruption-stop-worrying-its-blow/973633002/', + 'info_dict': { + 'id': '5824495846001', + 'ext': 'mp4', + 'title': 'Yellowstone more likely to crack rather than explode', + 'timestamp': 1534790612, + 'description': 'md5:3715e7927639a4f16b474e9391687c62', + 'uploader_id': '28911775001', + 'upload_date': '20180820', + } + }] + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/default_default/index.html?videoId=%s' def _real_extract(self, url): display_id = self._match_id(url) @@ -35,10 +49,11 @@ class USATodayIE(InfoExtractor): if not ui_video_data: raise ExtractorError('no video on the webpage', expected=True) video_data = self._parse_json(ui_video_data, display_id) + item = try_get(video_data, lambda x: x['asset_metadata']['items'], dict) or {} return { '_type': 'url_transparent', - 'url': self.BRIGHTCOVE_URL_TEMPLATE % video_data['brightcove_id'], + 'url': self.BRIGHTCOVE_URL_TEMPLATE % (item.get('brightcoveaccount', '29906170001'), item.get('brightcoveid') or video_data['brightcove_id']), 'id': compat_str(video_data['id']), 'title': video_data['title'], 'thumbnail': video_data.get('thumbnail'), diff --git a/youtube_dl/extractor/vgtv.py b/youtube_dl/extractor/vgtv.py index c21a09c01..fe7a26b62 100644 --- a/youtube_dl/extractor/vgtv.py +++ b/youtube_dl/extractor/vgtv.py @@ -24,6 +24,7 @@ class VGTVIE(XstreamIE): 'aftenposten.no/webtv': 'aptv', 'ap.vgtv.no/webtv': 'aptv', 'tv.aftonbladet.se/abtv': 'abtv', + 'www.aftonbladet.se/tv': 'abtv', } _APP_NAME_TO_VENDOR = { @@ -44,7 +45,7 @@ class VGTVIE(XstreamIE): (?: (?:\#!/)?(?:video|live)/| embed?.*id=| - articles/ + a(?:rticles)?/ )| (?P<appname> %s @@ -143,6 +144,10 @@ class VGTVIE(XstreamIE): 'url': 'http://tv.aftonbladet.se/abtv/articles/36015', 'only_matching': True, }, + { + 'url': 'https://www.aftonbladet.se/tv/a/36015', + 'only_matching': True, + }, { 'url': 'abtv:140026', 'only_matching': True, @@ -178,13 +183,15 @@ class VGTVIE(XstreamIE): streams = data['streamUrls'] stream_type = data.get('streamType') - + is_live = stream_type == 'live' formats = [] hls_url = streams.get('hls') if hls_url: formats.extend(self._extract_m3u8_formats( - hls_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) + hls_url, video_id, 'mp4', + entry_protocol='m3u8' if is_live else 'm3u8_native', + m3u8_id='hls', fatal=False)) hds_url = streams.get('hds') if hds_url: @@ -229,13 +236,13 @@ class VGTVIE(XstreamIE): info.update({ 'id': video_id, - 'title': self._live_title(data['title']) if stream_type == 'live' else data['title'], + 'title': self._live_title(data['title']) if is_live else data['title'], 'description': data['description'], 'thumbnail': data['images']['main'] + '?t[]=900x506q80', 'timestamp': data['published'], 'duration': float_or_none(data['duration'], 1000), 'view_count': data['displays'], - 'is_live': True if stream_type == 'live' else False, + 'is_live': is_live, }) return info diff --git a/youtube_dl/extractor/vice.py b/youtube_dl/extractor/vice.py index 538258617..8fdfd743d 100644 --- a/youtube_dl/extractor/vice.py +++ b/youtube_dl/extractor/vice.py @@ -94,7 +94,6 @@ class ViceIE(AdobePassIE): 'url': 'https://www.viceland.com/en_us/video/thursday-march-1-2018/5a8f2d7ff1cdb332dd446ec1', 'only_matching': True, }] - _PREPLAY_HOST = 'vms.vice' @staticmethod def _extract_urls(webpage): @@ -158,9 +157,8 @@ class ViceIE(AdobePassIE): }) try: - host = 'www.viceland' if is_locked else self._PREPLAY_HOST preplay = self._download_json( - 'https://%s.com/%s/video/preplay/%s' % (host, locale, video_id), + 'https://vms.vice.com/%s/video/preplay/%s' % (locale, video_id), video_id, query=query) except ExtractorError as e: if isinstance(e.cause, compat_HTTPError) and e.cause.code in (400, 401): diff --git a/youtube_dl/extractor/videomore.py b/youtube_dl/extractor/videomore.py index 9b56630de..e3eda3327 100644 --- a/youtube_dl/extractor/videomore.py +++ b/youtube_dl/extractor/videomore.py @@ -4,8 +4,14 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( int_or_none, + orderedSet, + parse_duration, + str_or_none, + unified_strdate, + url_or_none, xpath_element, xpath_text, ) @@ -13,7 +19,19 @@ from ..utils import ( class VideomoreIE(InfoExtractor): IE_NAME = 'videomore' - _VALID_URL = r'videomore:(?P<sid>\d+)$|https?://videomore\.ru/(?:(?:embed|[^/]+/[^/]+)/|[^/]+\?.*\btrack_id=)(?P<id>\d+)(?:[/?#&]|\.(?:xml|json)|$)' + _VALID_URL = r'''(?x) + videomore:(?P<sid>\d+)$| + https?://(?:player\.)?videomore\.ru/ + (?: + (?: + embed| + [^/]+/[^/]+ + )/| + [^/]*\?.*?\btrack_id= + ) + (?P<id>\d+) + (?:[/?#&]|\.(?:xml|json)|$) + ''' _TESTS = [{ 'url': 'http://videomore.ru/kino_v_detalayah/5_sezon/367617', 'md5': '44455a346edc0d509ac5b5a5b531dc35', @@ -79,6 +97,9 @@ class VideomoreIE(InfoExtractor): }, { 'url': 'videomore:367617', 'only_matching': True, + }, { + 'url': 'https://player.videomore.ru/?partner_id=97&track_id=736234&autoplay=0&userToken=', + 'only_matching': True, }] @staticmethod @@ -136,7 +157,7 @@ class VideomoreIE(InfoExtractor): class VideomoreVideoIE(InfoExtractor): IE_NAME = 'videomore:video' - _VALID_URL = r'https?://videomore\.ru/(?:(?:[^/]+/){2})?(?P<id>[^/?#&]+)[/?#&]*$' + _VALID_URL = r'https?://videomore\.ru/(?:(?:[^/]+/){2})?(?P<id>[^/?#&]+)(?:/*|[?#&].*?)$' _TESTS = [{ # single video with og:video:iframe 'url': 'http://videomore.ru/elki_3', @@ -176,6 +197,9 @@ class VideomoreVideoIE(InfoExtractor): 'params': { 'skip_download': True, }, + }, { + 'url': 'https://videomore.ru/molodezhka/6_sezon/29_seriya?utm_so', + 'only_matching': True, }] @classmethod @@ -196,13 +220,16 @@ class VideomoreVideoIE(InfoExtractor): r'track-id=["\'](\d+)', r'xcnt_product_id\s*=\s*(\d+)'), webpage, 'video id') video_url = 'videomore:%s' % video_id + else: + video_id = None - return self.url_result(video_url, VideomoreIE.ie_key()) + return self.url_result( + video_url, ie=VideomoreIE.ie_key(), video_id=video_id) class VideomoreSeasonIE(InfoExtractor): IE_NAME = 'videomore:season' - _VALID_URL = r'https?://videomore\.ru/(?!embed)(?P<id>[^/]+/[^/?#&]+)[/?#&]*$' + _VALID_URL = r'https?://videomore\.ru/(?!embed)(?P<id>[^/]+/[^/?#&]+)(?:/*|[?#&].*?)$' _TESTS = [{ 'url': 'http://videomore.ru/molodezhka/sezon_promo', 'info_dict': { @@ -210,8 +237,16 @@ class VideomoreSeasonIE(InfoExtractor): 'title': 'Молодежка Промо', }, 'playlist_mincount': 12, + }, { + 'url': 'http://videomore.ru/molodezhka/sezon_promo?utm_so', + 'only_matching': True, }] + @classmethod + def suitable(cls, url): + return (False if (VideomoreIE.suitable(url) or VideomoreVideoIE.suitable(url)) + else super(VideomoreSeasonIE, cls).suitable(url)) + def _real_extract(self, url): display_id = self._match_id(url) @@ -219,9 +254,54 @@ class VideomoreSeasonIE(InfoExtractor): title = self._og_search_title(webpage) - entries = [ - self.url_result(item) for item in re.findall( - r'<a[^>]+href="((?:https?:)?//videomore\.ru/%s/[^/]+)"[^>]+class="widget-item-desc"' - % display_id, webpage)] + data = self._parse_json( + self._html_search_regex( + r'\bclass=["\']seasons-tracks["\'][^>]+\bdata-custom-data=(["\'])(?P<value>{.+?})\1', + webpage, 'data', default='{}', group='value'), + display_id, fatal=False) + + entries = [] + + if data: + episodes = data.get('episodes') + if isinstance(episodes, list): + for ep in episodes: + if not isinstance(ep, dict): + continue + ep_id = int_or_none(ep.get('id')) + ep_url = url_or_none(ep.get('url')) + if ep_id: + e = { + 'url': 'videomore:%s' % ep_id, + 'id': compat_str(ep_id), + } + elif ep_url: + e = {'url': ep_url} + else: + continue + e.update({ + '_type': 'url', + 'ie_key': VideomoreIE.ie_key(), + 'title': str_or_none(ep.get('title')), + 'thumbnail': url_or_none(ep.get('image')), + 'duration': parse_duration(ep.get('duration')), + 'episode_number': int_or_none(ep.get('number')), + 'upload_date': unified_strdate(ep.get('date')), + }) + entries.append(e) + + if not entries: + entries = [ + self.url_result( + 'videomore:%s' % video_id, ie=VideomoreIE.ie_key(), + video_id=video_id) + for video_id in orderedSet(re.findall( + r':(?:id|key)=["\'](\d+)["\']', webpage))] + + if not entries: + entries = [ + self.url_result(item) for item in re.findall( + r'<a[^>]+href="((?:https?:)?//videomore\.ru/%s/[^/]+)"[^>]+class="widget-item-desc"' + % display_id, webpage)] return self.playlist_result(entries, display_id, title) diff --git a/youtube_dl/extractor/vidme.py b/youtube_dl/extractor/vidme.py index 59adb2377..174e69cd6 100644 --- a/youtube_dl/extractor/vidme.py +++ b/youtube_dl/extractor/vidme.py @@ -3,15 +3,13 @@ from __future__ import unicode_literals import itertools from .common import InfoExtractor -from ..compat import ( - compat_HTTPError, - compat_str, -) +from ..compat import compat_HTTPError from ..utils import ( ExtractorError, int_or_none, float_or_none, parse_iso8601, + url_or_none, ) @@ -166,8 +164,8 @@ class VidmeIE(InfoExtractor): formats = [] for f in video.get('formats', []): - format_url = f.get('uri') - if not format_url or not isinstance(format_url, compat_str): + format_url = url_or_none(f.get('uri')) + if not format_url: continue format_type = f.get('type') if format_type == 'dash': diff --git a/youtube_dl/extractor/vidzi.py b/youtube_dl/extractor/vidzi.py index 9026e778c..42ea4952c 100644 --- a/youtube_dl/extractor/vidzi.py +++ b/youtube_dl/extractor/vidzi.py @@ -13,7 +13,7 @@ from ..utils import ( class VidziIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?vidzi\.(?:tv|cc|si)/(?:embed-)?(?P<id>[0-9a-zA-Z]+)' + _VALID_URL = r'https?://(?:www\.)?vidzi\.(?:tv|cc|si|nu)/(?:embed-)?(?P<id>[0-9a-zA-Z]+)' _TESTS = [{ 'url': 'http://vidzi.tv/cghql9yq6emu.html', 'md5': '4f16c71ca0c8c8635ab6932b5f3f1660', @@ -35,6 +35,9 @@ class VidziIE(InfoExtractor): }, { 'url': 'https://vidzi.si/rph9gztxj1et.html', 'only_matching': True, + }, { + 'url': 'http://vidzi.nu/cghql9yq6emu.html', + 'only_matching': True, }] def _real_extract(self, url): @@ -54,7 +57,8 @@ class VidziIE(InfoExtractor): self._search_regex( r'setup\(([^)]+)\)', code, 'jwplayer data', default=NO_DEFAULT if num == len(codes) else '{}'), - video_id, transform_source=js_to_json) + video_id, transform_source=lambda s: js_to_json( + re.sub(r'\s*\+\s*window\[.+?\]', '', s))) if jwplayer_data: break diff --git a/youtube_dl/extractor/viewster.py b/youtube_dl/extractor/viewster.py index d5d5b4c69..6e318479c 100644 --- a/youtube_dl/extractor/viewster.py +++ b/youtube_dl/extractor/viewster.py @@ -130,16 +130,16 @@ class ViewsterIE(InfoExtractor): def concat(suffix, sep='-'): return (base_format_id + '%s%s' % (sep, suffix)) if base_format_id else suffix - for media_type in ('application/f4m+xml', 'application/x-mpegURL', 'video/mp4'): - media = self._download_json( - 'https://public-api.viewster.com/movies/%s/video' % entry_id, - video_id, 'Downloading %s JSON' % concat(media_type, ' '), fatal=False, query={ - 'mediaType': media_type, - 'language': audio, - 'subtitle': subtitle, - }) - if not media: - continue + medias = self._download_json( + 'https://public-api.viewster.com/movies/%s/videos' % entry_id, + video_id, fatal=False, query={ + 'mediaTypes': ['application/f4m+xml', 'application/x-mpegURL', 'video/mp4'], + 'language': audio, + 'subtitle': subtitle, + }) + if not medias: + continue + for media in medias: video_url = media.get('Uri') if not video_url: continue diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 3baa2d075..6215b3258 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -1,6 +1,7 @@ # coding: utf-8 from __future__ import unicode_literals +import base64 import json import re import itertools @@ -14,10 +15,13 @@ from ..compat import ( from ..utils import ( determine_ext, ExtractorError, + js_to_json, InAdvancePagedList, int_or_none, merge_dicts, NO_DEFAULT, + parse_filesize, + qualities, RegexNotFoundError, sanitized_Request, smuggle_url, @@ -27,7 +31,6 @@ from ..utils import ( unsmuggle_url, urlencode_postdata, unescapeHTML, - parse_filesize, ) @@ -299,10 +302,13 @@ class VimeoIE(VimeoBaseInfoExtractor): 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/atencio', 'uploader_id': 'atencio', 'uploader': 'Peter Atencio', + 'channel_id': 'keypeele', + 'channel_url': r're:https?://(?:www\.)?vimeo\.com/channels/keypeele', 'timestamp': 1380339469, 'upload_date': '20130928', 'duration': 187, }, + 'expected_warnings': ['Unable to download JSON metadata'], }, { 'url': 'http://vimeo.com/76979871', @@ -355,11 +361,13 @@ class VimeoIE(VimeoBaseInfoExtractor): 'url': 'https://vimeo.com/channels/tributes/6213729', 'info_dict': { 'id': '6213729', - 'ext': 'mov', + 'ext': 'mp4', 'title': 'Vimeo Tribute: The Shining', 'uploader': 'Casey Donahue', 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/caseydonahue', 'uploader_id': 'caseydonahue', + 'channel_url': r're:https?://(?:www\.)?vimeo\.com/channels/tributes', + 'channel_id': 'tributes', 'timestamp': 1250886430, 'upload_date': '20090821', 'description': 'md5:bdbf314014e58713e6e5b66eb252f4a6', @@ -385,6 +393,22 @@ class VimeoIE(VimeoBaseInfoExtractor): 'skip_download': True, }, }, + { + 'url': 'http://player.vimeo.com/video/68375962', + 'md5': 'aaf896bdb7ddd6476df50007a0ac0ae7', + 'info_dict': { + 'id': '68375962', + 'ext': 'mp4', + 'title': 'youtube-dl password protected test video', + 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/user18948128', + 'uploader_id': 'user18948128', + 'uploader': 'Jaime Marquínez Ferrándiz', + 'duration': 10, + }, + 'params': { + 'videopassword': 'youtube-dl', + }, + }, { 'url': 'http://vimeo.com/moogaloop.swf?clip_id=2539741', 'only_matching': True, @@ -411,6 +435,8 @@ class VimeoIE(VimeoBaseInfoExtractor): 'url': 'https://vimeo.com/160743502/abd0e13fb4', 'only_matching': True, } + # https://gettingthingsdone.com/workflowmap/ + # vimeo embed with check-password page protected by Referer header ] @staticmethod @@ -441,18 +467,22 @@ class VimeoIE(VimeoBaseInfoExtractor): urls = VimeoIE._extract_urls(url, webpage) return urls[0] if urls else None - def _verify_player_video_password(self, url, video_id): + def _verify_player_video_password(self, url, video_id, headers): password = self._downloader.params.get('videopassword') if password is None: raise ExtractorError('This video is protected by a password, use the --video-password option') - data = urlencode_postdata({'password': password}) - pass_url = url + '/check-password' - password_request = sanitized_Request(pass_url, data) - password_request.add_header('Content-Type', 'application/x-www-form-urlencoded') - password_request.add_header('Referer', url) - return self._download_json( - password_request, video_id, - 'Verifying the password', 'Wrong password') + data = urlencode_postdata({ + 'password': base64.b64encode(password.encode()), + }) + headers = merge_dicts(headers, { + 'Content-Type': 'application/x-www-form-urlencoded', + }) + checked = self._download_json( + url + '/check-password', video_id, + 'Verifying the password', data=data, headers=headers) + if checked is False: + raise ExtractorError('Wrong video password', expected=True) + return checked def _real_initialize(self): self._login() @@ -465,6 +495,9 @@ class VimeoIE(VimeoBaseInfoExtractor): if 'Referer' not in headers: headers['Referer'] = url + channel_id = self._search_regex( + r'vimeo\.com/channels/([^/]+)', url, 'channel id', default=None) + # Extract ID from URL mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') @@ -539,9 +572,11 @@ class VimeoIE(VimeoBaseInfoExtractor): # We try to find out to which variable is assigned the config dic m_variable_name = re.search(r'(\w)\.video\.id', webpage) if m_variable_name is not None: - config_re = r'%s=({[^}].+?});' % re.escape(m_variable_name.group(1)) + config_re = [r'%s=({[^}].+?});' % re.escape(m_variable_name.group(1))] else: config_re = [r' = {config:({.+?}),assets:', r'(?:[abc])=({.+?});'] + config_re.append(r'\bvar\s+r\s*=\s*({.+?})\s*;') + config_re.append(r'\bconfig\s*=\s*({.+?})\s*;') config = self._search_regex(config_re, webpage, 'info section', flags=re.DOTALL) config = json.loads(config) @@ -560,21 +595,25 @@ class VimeoIE(VimeoBaseInfoExtractor): cause=e) else: if config.get('view') == 4: - config = self._verify_player_video_password(redirect_url, video_id) + config = self._verify_player_video_password(redirect_url, video_id, headers) + + vod = config.get('video', {}).get('vod', {}) def is_rented(): if '>You rented this title.<' in webpage: return True if config.get('user', {}).get('purchased'): return True - label = try_get( - config, lambda x: x['video']['vod']['purchase_options'][0]['label_string'], compat_str) - if label and label.startswith('You rented this'): - return True + for purchase_option in vod.get('purchase_options', []): + if purchase_option.get('purchased'): + return True + label = purchase_option.get('label_string') + if label and (label.startswith('You rented this') or label.endswith(' remaining')): + return True return False - if is_rented(): - feature_id = config.get('video', {}).get('vod', {}).get('feature_id') + if is_rented() and vod.get('is_trailer'): + feature_id = vod.get('feature_id') if feature_id and not data.get('force_feature_id', False): return self.url_result(smuggle_url( 'https://player.vimeo.com/player/%s' % feature_id, @@ -651,6 +690,8 @@ class VimeoIE(VimeoBaseInfoExtractor): r'<link[^>]+rel=["\']license["\'][^>]+href=(["\'])(?P<license>(?:(?!\1).)+)\1', webpage, 'license', default=None, group='license') + channel_url = 'https://vimeo.com/channels/%s' % channel_id if channel_id else None + info_dict = { 'id': video_id, 'formats': formats, @@ -661,6 +702,8 @@ class VimeoIE(VimeoBaseInfoExtractor): 'like_count': like_count, 'comment_count': comment_count, 'license': cc_license, + 'channel_id': channel_id, + 'channel_url': channel_url, } info_dict = merge_dicts(info_dict, info_dict_config, json_ld) @@ -1045,3 +1088,96 @@ class VimeoLikesIE(InfoExtractor): 'description': description, 'entries': pl, } + + +class VHXEmbedIE(InfoExtractor): + IE_NAME = 'vhx:embed' + _VALID_URL = r'https?://embed\.vhx\.tv/videos/(?P<id>\d+)' + + def _call_api(self, video_id, access_token, path='', query=None): + return self._download_json( + 'https://api.vhx.tv/videos/' + video_id + path, video_id, headers={ + 'Authorization': 'Bearer ' + access_token, + }, query=query) + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + credentials = self._parse_json(self._search_regex( + r'(?s)credentials\s*:\s*({.+?}),', webpage, + 'config'), video_id, js_to_json) + access_token = credentials['access_token'] + + query = {} + for k, v in credentials.items(): + if k in ('authorization', 'authUserToken', 'ticket') and v and v != 'undefined': + if k == 'authUserToken': + query['auth_user_token'] = v + else: + query[k] = v + files = self._call_api(video_id, access_token, '/files', query) + + formats = [] + for f in files: + href = try_get(f, lambda x: x['_links']['source']['href']) + if not href: + continue + method = f.get('method') + if method == 'hls': + formats.extend(self._extract_m3u8_formats( + href, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + elif method == 'dash': + formats.extend(self._extract_mpd_formats( + href, video_id, mpd_id='dash', fatal=False)) + else: + fmt = { + 'filesize': int_or_none(try_get(f, lambda x: x['size']['bytes'])), + 'format_id': 'http', + 'preference': 1, + 'url': href, + 'vcodec': f.get('codec'), + } + quality = f.get('quality') + if quality: + fmt.update({ + 'format_id': 'http-' + quality, + 'height': int_or_none(self._search_regex(r'(\d+)p', quality, 'height', default=None)), + }) + formats.append(fmt) + self._sort_formats(formats) + + video_data = self._call_api(video_id, access_token) + title = video_data.get('title') or video_data['name'] + + subtitles = {} + for subtitle in try_get(video_data, lambda x: x['tracks']['subtitles'], list) or []: + lang = subtitle.get('srclang') or subtitle.get('label') + for _link in subtitle.get('_links', {}).values(): + href = _link.get('href') + if not href: + continue + subtitles.setdefault(lang, []).append({ + 'url': href, + }) + + q = qualities(['small', 'medium', 'large', 'source']) + thumbnails = [] + for thumbnail_id, thumbnail_url in video_data.get('thumbnail', {}).items(): + thumbnails.append({ + 'id': thumbnail_id, + 'url': thumbnail_url, + 'preference': q(thumbnail_id), + }) + + return { + 'id': video_id, + 'title': title, + 'description': video_data.get('description'), + 'duration': int_or_none(try_get(video_data, lambda x: x['duration']['seconds'])), + 'formats': formats, + 'subtitles': subtitles, + 'thumbnails': thumbnails, + 'timestamp': unified_timestamp(video_data.get('created_at')), + 'view_count': int_or_none(video_data.get('plays_count')), + } diff --git a/youtube_dl/extractor/viqeo.py b/youtube_dl/extractor/viqeo.py new file mode 100644 index 000000000..be7dfa814 --- /dev/null +++ b/youtube_dl/extractor/viqeo.py @@ -0,0 +1,99 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + str_or_none, + url_or_none, +) + + +class ViqeoIE(InfoExtractor): + _VALID_URL = r'''(?x) + (?: + viqeo:| + https?://cdn\.viqeo\.tv/embed/*\?.*?\bvid=| + https?://api\.viqeo\.tv/v\d+/data/startup?.*?\bvideo(?:%5B%5D|\[\])= + ) + (?P<id>[\da-f]+) + ''' + _TESTS = [{ + 'url': 'https://cdn.viqeo.tv/embed/?vid=cde96f09d25f39bee837', + 'md5': 'a169dd1a6426b350dca4296226f21e76', + 'info_dict': { + 'id': 'cde96f09d25f39bee837', + 'ext': 'mp4', + 'title': 'cde96f09d25f39bee837', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 76, + }, + }, { + 'url': 'viqeo:cde96f09d25f39bee837', + 'only_matching': True, + }, { + 'url': 'https://api.viqeo.tv/v1/data/startup?video%5B%5D=71bbec412ade45c3216c&profile=112', + 'only_matching': True, + }] + + @staticmethod + def _extract_urls(webpage): + return [ + mobj.group('url') + for mobj in re.finditer( + r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//cdn\.viqeo\.tv/embed/*\?.*?\bvid=[\da-f]+.*?)\1', + webpage)] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage( + 'https://cdn.viqeo.tv/embed/?vid=%s' % video_id, video_id) + + data = self._parse_json( + self._search_regex( + r'SLOT_DATA\s*=\s*({.+?})\s*;', webpage, 'slot data'), + video_id) + + formats = [] + thumbnails = [] + for media_file in data['mediaFiles']: + if not isinstance(media_file, dict): + continue + media_url = url_or_none(media_file.get('url')) + if not media_url or not media_url.startswith(('http', '//')): + continue + media_type = str_or_none(media_file.get('type')) + if not media_type: + continue + media_kind = media_type.split('/')[0].lower() + f = { + 'url': media_url, + 'width': int_or_none(media_file.get('width')), + 'height': int_or_none(media_file.get('height')), + } + format_id = str_or_none(media_file.get('quality')) + if media_kind == 'image': + f['id'] = format_id + thumbnails.append(f) + elif media_kind in ('video', 'audio'): + is_audio = media_kind == 'audio' + f.update({ + 'format_id': 'audio' if is_audio else format_id, + 'fps': int_or_none(media_file.get('fps')), + 'vcodec': 'none' if is_audio else None, + }) + formats.append(f) + self._sort_formats(formats) + + duration = int_or_none(data.get('duration')) + + return { + 'id': video_id, + 'title': video_id, + 'duration': duration, + 'thumbnails': thumbnails, + 'formats': formats, + } diff --git a/youtube_dl/extractor/viu.py b/youtube_dl/extractor/viu.py index 5cf93591c..3bd37525b 100644 --- a/youtube_dl/extractor/viu.py +++ b/youtube_dl/extractor/viu.py @@ -195,16 +195,29 @@ class ViuOTTIE(InfoExtractor): 'skip': 'Geo-restricted to Hong Kong', }] + _AREA_ID = { + 'HK': 1, + 'SG': 2, + 'TH': 4, + 'PH': 5, + } + def _real_extract(self, url): country_code, video_id = re.match(self._VALID_URL, url).groups() + query = { + 'r': 'vod/ajax-detail', + 'platform_flag_label': 'web', + 'product_id': video_id, + } + + area_id = self._AREA_ID.get(country_code.upper()) + if area_id: + query['area_id'] = area_id + product_data = self._download_json( 'http://www.viu.com/ott/%s/index.php' % country_code, video_id, - 'Downloading video info', query={ - 'r': 'vod/ajax-detail', - 'platform_flag_label': 'web', - 'product_id': video_id, - })['data'] + 'Downloading video info', query=query)['data'] video_data = product_data.get('current_product') if not video_data: @@ -214,6 +227,9 @@ class ViuOTTIE(InfoExtractor): 'https://d1k2us671qcoau.cloudfront.net/distribute_web_%s.php' % country_code, video_id, 'Downloading stream info', query={ 'ccs_product_id': video_data['ccs_product_id'], + }, headers={ + 'Referer': url, + 'Origin': re.search(r'https?://[^/]+', url).group(0), })['data']['stream'] stream_sizes = stream_data.get('size', {}) diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index 29002b35f..b52d15ac6 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -17,9 +17,11 @@ from ..utils import ( int_or_none, orderedSet, remove_start, + str_or_none, str_to_int, unescapeHTML, unified_timestamp, + url_or_none, urlencode_postdata, ) from .dailymotion import DailymotionIE @@ -105,10 +107,10 @@ class VKIE(VKBaseIE): 'ext': 'mp4', 'title': 'ProtivoGunz - Хуёвая песня', 'uploader': 're:(?:Noize MC|Alexander Ilyashenko).*', + 'uploader_id': '-77521', 'duration': 195, - 'timestamp': 1329060660, + 'timestamp': 1329049880, 'upload_date': '20120212', - 'view_count': int, }, }, { @@ -117,12 +119,12 @@ class VKIE(VKBaseIE): 'info_dict': { 'id': '165548505', 'ext': 'mp4', - 'uploader': 'Tom Cruise', 'title': 'No name', + 'uploader': 'Tom Cruise', + 'uploader_id': '205387401', 'duration': 9, - 'timestamp': 1374374880, - 'upload_date': '20130721', - 'view_count': int, + 'timestamp': 1374364108, + 'upload_date': '20130720', } }, { @@ -206,10 +208,10 @@ class VKIE(VKBaseIE): 'id': 'V3K4mi0SYkc', 'ext': 'webm', 'title': "DSWD Awards 'Children's Joy Foundation, Inc.' Certificate of Registration and License to Operate", - 'description': 'md5:d9903938abdc74c738af77f527ca0596', - 'duration': 178, + 'description': 'md5:bf9c26cfa4acdfb146362682edd3827a', + 'duration': 179, 'upload_date': '20130116', - 'uploader': "Children's Joy Foundation", + 'uploader': "Children's Joy Foundation Inc.", 'uploader_id': 'thecjf', 'view_count': int, }, @@ -221,6 +223,7 @@ class VKIE(VKBaseIE): 'id': 'k3lz2cmXyRuJQSjGHUv', 'ext': 'mp4', 'title': 'md5:d52606645c20b0ddbb21655adaa4f56f', + # TODO: fix test by fixing dailymotion description extraction 'description': 'md5:c651358f03c56f1150b555c26d90a0fd', 'uploader': 'AniLibria.Tv', 'upload_date': '20160914', @@ -240,9 +243,12 @@ class VKIE(VKBaseIE): 'ext': 'mp4', 'title': 'S-Dance, репетиции к The way show', 'uploader': 'THE WAY SHOW | 17 апреля', - 'timestamp': 1454870100, + 'uploader_id': '-110305615', + 'timestamp': 1454859345, 'upload_date': '20160207', - 'view_count': int, + }, + 'params': { + 'skip_download': True, }, }, { @@ -287,15 +293,19 @@ class VKIE(VKBaseIE): # This video is no longer available, because its author has been blocked. 'url': 'https://vk.com/video-10639516_456240611', 'only_matching': True, - } - ] + }, + { + # The video is not available in your region. + 'url': 'https://vk.com/video-51812607_171445436', + 'only_matching': True, + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('videoid') if video_id: - info_url = 'https://vk.com/al_video.php?act=show&al=1&module=video&video=%s' % video_id + info_url = 'https://vk.com/al_video.php?act=show_inline&al=1&video=' + video_id # Some videos (removed?) can only be downloaded with list id specified list_id = mobj.group('list_id') if list_id: @@ -345,6 +355,12 @@ class VKIE(VKBaseIE): r'<!>This video is no longer available, because its author has been blocked.': 'Video %s is no longer available, because its author has been blocked.', + + r'<!>This video is no longer available, because it has been deleted.': + 'Video %s is no longer available, because it has been deleted.', + + r'<!>The video .+? is not available in your region.': + 'Video %s is not available in your region.', } for error_re, error_msg in ERRORS.items(): @@ -393,7 +409,8 @@ class VKIE(VKBaseIE): if not data: data = self._parse_json( self._search_regex( - r'<!json>\s*({.+?})\s*<!>', info_page, 'json', default='{}'), + [r'<!json>\s*({.+?})\s*<!>', r'<!json>\s*({.+})'], + info_page, 'json', default='{}'), video_id) if data: data = data['player']['params'][0] @@ -415,7 +432,7 @@ class VKIE(VKBaseIE): timestamp = unified_timestamp(self._html_search_regex( r'class=["\']mv_info_date[^>]+>([^<]+)(?:<|from)', info_page, - 'upload date', fatal=False)) + 'upload date', default=None)) or int_or_none(data.get('date')) view_count = str_to_int(self._search_regex( r'class=["\']mv_views_count[^>]+>\s*([\d,.]+)', @@ -423,7 +440,8 @@ class VKIE(VKBaseIE): formats = [] for format_id, format_url in data.items(): - if not isinstance(format_url, compat_str) or not format_url.startswith(('http', '//', 'rtmp')): + format_url = url_or_none(format_url) + if not format_url or not format_url.startswith(('http', '//', 'rtmp')): continue if (format_id.startswith(('url', 'cache')) or format_id in ('extra_data', 'live_mp4', 'postlive_mp4')): @@ -452,9 +470,12 @@ class VKIE(VKBaseIE): 'title': title, 'thumbnail': data.get('jpg'), 'uploader': data.get('md_author'), + 'uploader_id': str_or_none(data.get('author_id')), 'duration': data.get('duration'), 'timestamp': timestamp, 'view_count': view_count, + 'like_count': int_or_none(data.get('liked')), + 'dislike_count': int_or_none(data.get('nolikes')), 'is_live': is_live, } diff --git a/youtube_dl/extractor/vlive.py b/youtube_dl/extractor/vlive.py index 64d0224e6..0b5165fd0 100644 --- a/youtube_dl/extractor/vlive.py +++ b/youtube_dl/extractor/vlive.py @@ -57,7 +57,7 @@ class VLiveIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage( - 'http://www.vlive.tv/video/%s' % video_id, video_id) + 'https://www.vlive.tv/video/%s' % video_id, video_id) VIDEO_PARAMS_RE = r'\bvlive\.video\.init\(([^)]+)' VIDEO_PARAMS_FIELD = 'video params' @@ -108,11 +108,11 @@ class VLiveIE(InfoExtractor): def _live(self, video_id, webpage): init_page = self._download_webpage( - 'http://www.vlive.tv/video/init/view', + 'https://www.vlive.tv/video/init/view', video_id, note='Downloading live webpage', data=urlencode_postdata({'videoSeq': video_id}), headers={ - 'Referer': 'http://www.vlive.tv/video/%s' % video_id, + 'Referer': 'https://www.vlive.tv/video/%s' % video_id, 'Content-Type': 'application/x-www-form-urlencoded' }) diff --git a/youtube_dl/extractor/vporn.py b/youtube_dl/extractor/vporn.py deleted file mode 100644 index 858ac9e71..000000000 --- a/youtube_dl/extractor/vporn.py +++ /dev/null @@ -1,123 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - parse_duration, - str_to_int, - urljoin, -) - - -class VpornIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?vporn\.com/[^/]+/(?P<display_id>[^/]+)/(?P<id>\d+)' - _TESTS = [ - { - 'url': 'http://www.vporn.com/masturbation/violet-on-her-th-birthday/497944/', - 'md5': 'facf37c1b86546fa0208058546842c55', - 'info_dict': { - 'id': '497944', - 'display_id': 'violet-on-her-th-birthday', - 'ext': 'mp4', - 'title': 'Violet on her 19th birthday', - 'description': 'Violet dances in front of the camera which is sure to get you horny.', - 'thumbnail': r're:^https?://.*\.jpg$', - 'uploader': 'kileyGrope', - 'categories': ['Masturbation', 'Teen'], - 'duration': 393, - 'age_limit': 18, - 'view_count': int, - }, - 'skip': 'video removed', - }, - { - 'url': 'http://www.vporn.com/female/hana-shower/523564/', - 'md5': 'ced35a4656198a1664cf2cda1575a25f', - 'info_dict': { - 'id': '523564', - 'display_id': 'hana-shower', - 'ext': 'mp4', - 'title': 'Hana Shower', - 'description': 'Hana showers at the bathroom.', - 'thumbnail': r're:^https?://.*\.jpg$', - 'uploader': 'Hmmmmm', - 'categories': ['Big Boobs', 'Erotic', 'Teen', 'Female', '720p'], - 'duration': 588, - 'age_limit': 18, - 'view_count': int, - } - }, - ] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - display_id = mobj.group('display_id') - - webpage = self._download_webpage(url, display_id) - - errmsg = 'This video has been deleted due to Copyright Infringement or by the account owner!' - if errmsg in webpage: - raise ExtractorError('%s said: %s' % (self.IE_NAME, errmsg), expected=True) - - title = self._html_search_regex( - r'videoname\s*=\s*\'([^\']+)\'', webpage, 'title').strip() - description = self._html_search_regex( - r'class="(?:descr|description_txt)">(.*?)</div>', - webpage, 'description', fatal=False) - thumbnail = urljoin('http://www.vporn.com', self._html_search_regex( - r'flashvars\.imageUrl\s*=\s*"([^"]+)"', webpage, 'description', - default=None)) - - uploader = self._html_search_regex( - r'(?s)Uploaded by:.*?<a href="/user/[^"]+"[^>]*>(.+?)</a>', - webpage, 'uploader', fatal=False) - - categories = re.findall(r'<a href="/cat/[^"]+"[^>]*>([^<]+)</a>', webpage) - - duration = parse_duration(self._search_regex( - r'Runtime:\s*</span>\s*(\d+ min \d+ sec)', - webpage, 'duration', fatal=False)) - - view_count = str_to_int(self._search_regex( - r'class="views">([\d,\.]+) [Vv]iews<', - webpage, 'view count', fatal=False)) - comment_count = str_to_int(self._html_search_regex( - r"'Comments \(([\d,\.]+)\)'", - webpage, 'comment count', default=None)) - - formats = [] - - for video in re.findall(r'flashvars\.videoUrl([^=]+?)\s*=\s*"(https?://[^"]+)"', webpage): - video_url = video[1] - fmt = { - 'url': video_url, - 'format_id': video[0], - } - m = re.search(r'_(?P<width>\d+)x(?P<height>\d+)_(?P<vbr>\d+)k\.mp4$', video_url) - if m: - fmt.update({ - 'width': int(m.group('width')), - 'height': int(m.group('height')), - 'vbr': int(m.group('vbr')), - }) - formats.append(fmt) - - self._sort_formats(formats) - - return { - 'id': video_id, - 'display_id': display_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'uploader': uploader, - 'categories': categories, - 'duration': duration, - 'view_count': view_count, - 'comment_count': comment_count, - 'age_limit': 18, - 'formats': formats, - } diff --git a/youtube_dl/extractor/vrv.py b/youtube_dl/extractor/vrv.py index 64b13f0ed..6c060ae76 100644 --- a/youtube_dl/extractor/vrv.py +++ b/youtube_dl/extractor/vrv.py @@ -11,10 +11,12 @@ import time from .common import InfoExtractor from ..compat import ( + compat_HTTPError, compat_urllib_parse_urlencode, compat_urllib_parse, ) from ..utils import ( + ExtractorError, float_or_none, int_or_none, ) @@ -24,29 +26,41 @@ class VRVBaseIE(InfoExtractor): _API_DOMAIN = None _API_PARAMS = {} _CMS_SIGNING = {} + _TOKEN = None + _TOKEN_SECRET = '' def _call_api(self, path, video_id, note, data=None): + # https://tools.ietf.org/html/rfc5849#section-3 base_url = self._API_DOMAIN + '/core/' + path - encoded_query = compat_urllib_parse_urlencode({ - 'oauth_consumer_key': self._API_PARAMS['oAuthKey'], - 'oauth_nonce': ''.join([random.choice(string.ascii_letters) for _ in range(32)]), - 'oauth_signature_method': 'HMAC-SHA1', - 'oauth_timestamp': int(time.time()), - 'oauth_version': '1.0', - }) + query = [ + ('oauth_consumer_key', self._API_PARAMS['oAuthKey']), + ('oauth_nonce', ''.join([random.choice(string.ascii_letters) for _ in range(32)])), + ('oauth_signature_method', 'HMAC-SHA1'), + ('oauth_timestamp', int(time.time())), + ] + if self._TOKEN: + query.append(('oauth_token', self._TOKEN)) + encoded_query = compat_urllib_parse_urlencode(query) headers = self.geo_verification_headers() if data: data = json.dumps(data).encode() headers['Content-Type'] = 'application/json' - method = 'POST' if data else 'GET' - base_string = '&'.join([method, compat_urllib_parse.quote(base_url, ''), compat_urllib_parse.quote(encoded_query, '')]) + base_string = '&'.join([ + 'POST' if data else 'GET', + compat_urllib_parse.quote(base_url, ''), + compat_urllib_parse.quote(encoded_query, '')]) oauth_signature = base64.b64encode(hmac.new( - (self._API_PARAMS['oAuthSecret'] + '&').encode('ascii'), + (self._API_PARAMS['oAuthSecret'] + '&' + self._TOKEN_SECRET).encode('ascii'), base_string.encode(), hashlib.sha1).digest()).decode() encoded_query += '&oauth_signature=' + compat_urllib_parse.quote(oauth_signature, '') - return self._download_json( - '?'.join([base_url, encoded_query]), video_id, - note='Downloading %s JSON metadata' % note, headers=headers, data=data) + try: + return self._download_json( + '?'.join([base_url, encoded_query]), video_id, + note='Downloading %s JSON metadata' % note, headers=headers, data=data) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: + raise ExtractorError(json.loads(e.cause.read().decode())['message'], expected=True) + raise def _call_cms(self, path, video_id, note): if not self._CMS_SIGNING: @@ -55,24 +69,27 @@ class VRVBaseIE(InfoExtractor): self._API_DOMAIN + path, video_id, query=self._CMS_SIGNING, note='Downloading %s JSON metadata' % note, headers=self.geo_verification_headers()) - def _set_api_params(self, webpage, video_id): - if not self._API_PARAMS: - self._API_PARAMS = self._parse_json(self._search_regex( - r'window\.__APP_CONFIG__\s*=\s*({.+?})</script>', - webpage, 'api config'), video_id)['cxApiParams'] - self._API_DOMAIN = self._API_PARAMS.get('apiDomain', 'https://api.vrv.co') - def _get_cms_resource(self, resource_key, video_id): return self._call_api( 'cms_resource', video_id, 'resource path', data={ 'resource_key': resource_key, })['__links__']['cms_resource']['href'] + def _real_initialize(self): + webpage = self._download_webpage( + 'https://vrv.co/', None, headers=self.geo_verification_headers()) + self._API_PARAMS = self._parse_json(self._search_regex( + [ + r'window\.__APP_CONFIG__\s*=\s*({.+?})(?:</script>|;)', + r'window\.__APP_CONFIG__\s*=\s*({.+})' + ], webpage, 'app config'), None)['cxApiParams'] + self._API_DOMAIN = self._API_PARAMS.get('apiDomain', 'https://api.vrv.co') + class VRVIE(VRVBaseIE): IE_NAME = 'vrv' _VALID_URL = r'https?://(?:www\.)?vrv\.co/watch/(?P<id>[A-Z0-9]+)' - _TEST = { + _TESTS = [{ 'url': 'https://vrv.co/watch/GR9PNZ396/Hidden-America-with-Jonah-Ray:BOSTON-WHERE-THE-PAST-IS-THE-PRESENT', 'info_dict': { 'id': 'GR9PNZ396', @@ -85,56 +102,72 @@ class VRVIE(VRVBaseIE): # m3u8 download 'skip_download': True, }, - } + }] + _NETRC_MACHINE = 'vrv' + + def _real_initialize(self): + super(VRVIE, self)._real_initialize() + + email, password = self._get_login_info() + if email is None: + return + + token_credentials = self._call_api( + 'authenticate/by:credentials', None, 'Token Credentials', data={ + 'email': email, + 'password': password, + }) + self._TOKEN = token_credentials['oauth_token'] + self._TOKEN_SECRET = token_credentials['oauth_token_secret'] + + def _extract_vrv_formats(self, url, video_id, stream_format, audio_lang, hardsub_lang): + if not url or stream_format not in ('hls', 'dash'): + return [] + assert audio_lang or hardsub_lang + stream_id_list = [] + if audio_lang: + stream_id_list.append('audio-%s' % audio_lang) + if hardsub_lang: + stream_id_list.append('hardsub-%s' % hardsub_lang) + stream_id = '-'.join(stream_id_list) + format_id = '%s-%s' % (stream_format, stream_id) + if stream_format == 'hls': + adaptive_formats = self._extract_m3u8_formats( + url, video_id, 'mp4', m3u8_id=format_id, + note='Downloading %s m3u8 information' % stream_id, + fatal=False) + elif stream_format == 'dash': + adaptive_formats = self._extract_mpd_formats( + url, video_id, mpd_id=format_id, + note='Downloading %s MPD information' % stream_id, + fatal=False) + if audio_lang: + for f in adaptive_formats: + if f.get('acodec') != 'none': + f['language'] = audio_lang + return adaptive_formats def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage( - url, video_id, - headers=self.geo_verification_headers()) - media_resource = self._parse_json(self._search_regex( - r'window\.__INITIAL_STATE__\s*=\s*({.+?})</script>', - webpage, 'inital state'), video_id).get('watch', {}).get('mediaResource') or {} - video_data = media_resource.get('json') - if not video_data: - self._set_api_params(webpage, video_id) - episode_path = self._get_cms_resource( - 'cms:/episodes/' + video_id, video_id) - video_data = self._call_cms(episode_path, video_id, 'video') + episode_path = self._get_cms_resource( + 'cms:/episodes/' + video_id, video_id) + video_data = self._call_cms(episode_path, video_id, 'video') title = video_data['title'] - streams_json = media_resource.get('streams', {}).get('json', {}) - if not streams_json: - self._set_api_params(webpage, video_id) - streams_path = video_data['__links__']['streams']['href'] - streams_json = self._call_cms(streams_path, video_id, 'streams') + streams_path = video_data['__links__'].get('streams', {}).get('href') + if not streams_path: + self.raise_login_required() + streams_json = self._call_cms(streams_path, video_id, 'streams') audio_locale = streams_json.get('audio_locale') formats = [] for stream_type, streams in streams_json.get('streams', {}).items(): if stream_type in ('adaptive_hls', 'adaptive_dash'): for stream in streams.values(): - stream_url = stream.get('url') - if not stream_url: - continue - stream_id = stream.get('hardsub_locale') or audio_locale - format_id = '%s-%s' % (stream_type.split('_')[1], stream_id) - if stream_type == 'adaptive_hls': - adaptive_formats = self._extract_m3u8_formats( - stream_url, video_id, 'mp4', m3u8_id=format_id, - note='Downloading %s m3u8 information' % stream_id, - fatal=False) - else: - adaptive_formats = self._extract_mpd_formats( - stream_url, video_id, mpd_id=format_id, - note='Downloading %s MPD information' % stream_id, - fatal=False) - if audio_locale: - for f in adaptive_formats: - if f.get('acodec') != 'none': - f['language'] = audio_locale - formats.extend(adaptive_formats) + formats.extend(self._extract_vrv_formats( + stream.get('url'), video_id, stream_type.split('_')[1], + audio_locale, stream.get('hardsub_locale'))) self._sort_formats(formats) subtitles = {} @@ -190,11 +223,7 @@ class VRVSeriesIE(VRVBaseIE): def _real_extract(self, url): series_id = self._match_id(url) - webpage = self._download_webpage( - url, series_id, - headers=self.geo_verification_headers()) - self._set_api_params(webpage, series_id) seasons_path = self._get_cms_resource( 'cms:/seasons?series_id=' + series_id, series_id) seasons_data = self._call_cms(seasons_path, series_id, 'seasons') diff --git a/youtube_dl/extractor/vshare.py b/youtube_dl/extractor/vshare.py index e4ec77889..c631ac1fa 100644 --- a/youtube_dl/extractor/vshare.py +++ b/youtube_dl/extractor/vshare.py @@ -48,7 +48,7 @@ class VShareIE(InfoExtractor): webpage = self._download_webpage( 'https://vshare.io/v/%s/width-650/height-430/1' % video_id, - video_id) + video_id, headers={'Referer': url}) title = self._html_search_regex( r'<title>([^<]+)', webpage, 'title') diff --git a/youtube_dl/extractor/vzaar.py b/youtube_dl/extractor/vzaar.py index 02fcd52c7..6000671c3 100644 --- a/youtube_dl/extractor/vzaar.py +++ b/youtube_dl/extractor/vzaar.py @@ -4,15 +4,19 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( int_or_none, float_or_none, + unified_timestamp, + url_or_none, ) class VzaarIE(InfoExtractor): _VALID_URL = r'https?://(?:(?:www|view)\.)?vzaar\.com/(?:videos/)?(?P\d+)' _TESTS = [{ + # HTTP and HLS 'url': 'https://vzaar.com/videos/1152805', 'md5': 'bde5ddfeb104a6c56a93a06b04901dbf', 'info_dict': { @@ -40,24 +44,48 @@ class VzaarIE(InfoExtractor): video_id = self._match_id(url) video_data = self._download_json( 'http://view.vzaar.com/v2/%s/video' % video_id, video_id) - source_url = video_data['sourceUrl'] - info = { + title = video_data['videoTitle'] + + formats = [] + + source_url = url_or_none(video_data.get('sourceUrl')) + if source_url: + f = { + 'url': source_url, + 'format_id': 'http', + } + if 'audio' in source_url: + f.update({ + 'vcodec': 'none', + 'ext': 'mp3', + }) + else: + f.update({ + 'width': int_or_none(video_data.get('width')), + 'height': int_or_none(video_data.get('height')), + 'ext': 'mp4', + 'fps': float_or_none(video_data.get('fps')), + }) + formats.append(f) + + video_guid = video_data.get('guid') + usp = video_data.get('usp') + if isinstance(video_guid, compat_str) and isinstance(usp, dict): + m3u8_url = ('http://fable.vzaar.com/v4/usp/%s/%s.ism/.m3u8?' + % (video_guid, video_id)) + '&'.join( + '%s=%s' % (k, v) for k, v in usp.items()) + formats.extend(self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + + self._sort_formats(formats) + + return { 'id': video_id, - 'title': video_data['videoTitle'], - 'url': source_url, + 'title': title, 'thumbnail': self._proto_relative_url(video_data.get('poster')), 'duration': float_or_none(video_data.get('videoDuration')), + 'timestamp': unified_timestamp(video_data.get('ts')), + 'formats': formats, } - if 'audio' in source_url: - info.update({ - 'vcodec': 'none', - 'ext': 'mp3', - }) - else: - info.update({ - 'width': int_or_none(video_data.get('width')), - 'height': int_or_none(video_data.get('height')), - 'ext': 'mp4', - }) - return info diff --git a/youtube_dl/extractor/wakanim.py b/youtube_dl/extractor/wakanim.py new file mode 100644 index 000000000..f9a2395d9 --- /dev/null +++ b/youtube_dl/extractor/wakanim.py @@ -0,0 +1,66 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + merge_dicts, + urljoin, +) + + +class WakanimIE(InfoExtractor): + _VALID_URL = r'https://(?:www\.)?wakanim\.tv/[^/]+/v2/catalogue/episode/(?P\d+)' + _TESTS = [{ + 'url': 'https://www.wakanim.tv/de/v2/catalogue/episode/2997/the-asterisk-war-omu-staffel-1-episode-02-omu', + 'info_dict': { + 'id': '2997', + 'ext': 'mp4', + 'title': 'Episode 02', + 'description': 'md5:2927701ea2f7e901de8bfa8d39b2852d', + 'series': 'The Asterisk War (OmU.)', + 'season_number': 1, + 'episode': 'Episode 02', + 'episode_number': 2, + }, + 'params': { + 'format': 'bestvideo', + 'skip_download': True, + }, + }, { + # DRM Protected + 'url': 'https://www.wakanim.tv/de/v2/catalogue/episode/7843/sword-art-online-alicization-omu-arc-2-folge-15-omu', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + m3u8_url = urljoin(url, self._search_regex( + r'file\s*:\s*(["\'])(?P(?:(?!\1).)+)\1', webpage, 'm3u8 url', + group='url')) + # https://docs.microsoft.com/en-us/azure/media-services/previous/media-services-content-protection-overview#streaming-urls + encryption = self._search_regex( + r'encryption%3D(c(?:enc|bc(?:s-aapl)?))', + m3u8_url, 'encryption', default=None) + if encryption and encryption in ('cenc', 'cbcs-aapl'): + raise ExtractorError('This video is DRM protected.', expected=True) + + formats = self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls') + + info = self._search_json_ld(webpage, video_id, default={}) + + title = self._search_regex( + (r']+\bclass=["\']episode_h1[^>]+\btitle=(["\'])(?P(?:(?!\1).)+)\1', + r'<span[^>]+\bclass=["\']episode_title["\'][^>]*>(?P<title>[^<]+)'), + webpage, 'title', default=None, group='title') + + return merge_dicts(info, { + 'id': video_id, + 'title': title, + 'formats': formats, + }) diff --git a/youtube_dl/extractor/wat.py b/youtube_dl/extractor/wat.py index 20fef1f04..8ef3e0906 100644 --- a/youtube_dl/extractor/wat.py +++ b/youtube_dl/extractor/wat.py @@ -19,7 +19,6 @@ class WatIE(InfoExtractor): _TESTS = [ { 'url': 'http://www.wat.tv/video/soupe-figues-l-orange-aux-epices-6z1uz_2hvf7_.html', - 'md5': '83d882d9de5c9d97f0bb2c6273cde56a', 'info_dict': { 'id': '11713067', 'ext': 'mp4', @@ -28,10 +27,15 @@ class WatIE(InfoExtractor): 'upload_date': '20140819', 'duration': 120, }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'expected_warnings': ['HTTP Error 404'], }, { 'url': 'http://www.wat.tv/video/gregory-lemarchal-voix-ange-6z1v7_6ygkj_.html', - 'md5': '34bdfa5ca9fd3c7eb88601b635b0424c', + 'md5': 'b16574df2c3cd1a36ca0098f2a791925', 'info_dict': { 'id': '11713075', 'ext': 'mp4', @@ -98,38 +102,25 @@ class WatIE(InfoExtractor): formats = [] try: + alt_urls = lambda manifest_url: [re.sub(r'(?:wdv|ssm)?\.ism/', repl + '.ism/', manifest_url) for repl in ('', 'ssm')] manifest_urls = self._download_json( 'http://www.wat.tv/get/webhtml/' + video_id, video_id) m3u8_url = manifest_urls.get('hls') if m3u8_url: m3u8_url = remove_bitrate_limit(m3u8_url) - m3u8_formats = self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) - if m3u8_formats: - formats.extend(m3u8_formats) + for m3u8_alt_url in alt_urls(m3u8_url): + formats.extend(self._extract_m3u8_formats( + m3u8_alt_url, video_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False)) formats.extend(self._extract_f4m_formats( - m3u8_url.replace('ios', 'web').replace('.m3u8', '.f4m'), + m3u8_alt_url.replace('ios', 'web').replace('.m3u8', '.f4m'), video_id, f4m_id='hds', fatal=False)) - http_url = extract_url('android5/%s.mp4', 'http') - if http_url: - for m3u8_format in m3u8_formats: - vbr, abr = m3u8_format.get('vbr'), m3u8_format.get('abr') - if not vbr or not abr: - continue - format_id = m3u8_format['format_id'].replace('hls', 'http') - fmt_url = re.sub(r'%s-\d+00-\d+' % video_id, '%s-%d00-%d' % (video_id, round(vbr / 100), round(abr)), http_url) - if self._is_valid_url(fmt_url, video_id, format_id): - f = m3u8_format.copy() - f.update({ - 'url': fmt_url, - 'format_id': format_id, - 'protocol': 'http', - }) - formats.append(f) mpd_url = manifest_urls.get('mpd') if mpd_url: - formats.extend(self._extract_mpd_formats(remove_bitrate_limit( - mpd_url), video_id, mpd_id='dash', fatal=False)) + mpd_url = remove_bitrate_limit(mpd_url) + for mpd_alt_url in alt_urls(mpd_url): + formats.extend(self._extract_mpd_formats( + mpd_alt_url, video_id, mpd_id='dash', fatal=False)) self._sort_formats(formats) except ExtractorError: abr = 64 diff --git a/youtube_dl/extractor/watchbox.py b/youtube_dl/extractor/watchbox.py index be0bcba15..5a4e46e73 100644 --- a/youtube_dl/extractor/watchbox.py +++ b/youtube_dl/extractor/watchbox.py @@ -10,6 +10,7 @@ from ..utils import ( js_to_json, strip_or_none, try_get, + unescapeHTML, unified_timestamp, ) @@ -67,11 +68,20 @@ class WatchBoxIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - source = self._parse_json( + player_config = self._parse_json( self._search_regex( - r'(?s)source["\']?\s*:\s*({.+?})\s*[,}]', webpage, 'source', - default='{}'), - video_id, transform_source=js_to_json, fatal=False) or {} + r'data-player-conf=(["\'])(?P<data>{.+?})\1', webpage, + 'player config', default='{}', group='data'), + video_id, transform_source=unescapeHTML, fatal=False) + + if not player_config: + player_config = self._parse_json( + self._search_regex( + r'playerConf\s*=\s*({.+?})\s*;', webpage, 'player config', + default='{}'), + video_id, transform_source=js_to_json, fatal=False) or {} + + source = player_config.get('source') or {} video_id = compat_str(source.get('videoId') or video_id) diff --git a/youtube_dl/extractor/webofstories.py b/youtube_dl/extractor/webofstories.py index 1eb1f6702..f2b8d19b4 100644 --- a/youtube_dl/extractor/webofstories.py +++ b/youtube_dl/extractor/webofstories.py @@ -4,7 +4,10 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import int_or_none +from ..utils import ( + int_or_none, + orderedSet, +) class WebOfStoriesIE(InfoExtractor): @@ -133,8 +136,10 @@ class WebOfStoriesPlaylistIE(InfoExtractor): webpage = self._download_webpage(url, playlist_id) entries = [ - self.url_result('http://www.webofstories.com/play/%s' % video_number, 'WebOfStories') - for video_number in set(re.findall(r'href="/playAll/%s\?sId=(\d+)"' % playlist_id, webpage)) + self.url_result( + 'http://www.webofstories.com/play/%s' % video_id, + 'WebOfStories', video_id=video_id) + for video_id in orderedSet(re.findall(r'\bid=["\']td_(\d+)', webpage)) ] title = self._search_regex( diff --git a/youtube_dl/extractor/wimp.py b/youtube_dl/extractor/wimp.py index 3dab9145b..ea234e3c5 100644 --- a/youtube_dl/extractor/wimp.py +++ b/youtube_dl/extractor/wimp.py @@ -40,11 +40,7 @@ class WimpIE(InfoExtractor): r'data-id=["\']([0-9A-Za-z_-]{11})'), webpage, 'video URL', default=None) if youtube_id: - return { - '_type': 'url', - 'url': youtube_id, - 'ie_key': YoutubeIE.ie_key(), - } + return self.url_result(youtube_id, YoutubeIE.ie_key()) info_dict = self._extract_jwplayer_data( webpage, video_id, require_title=False) diff --git a/youtube_dl/extractor/wistia.py b/youtube_dl/extractor/wistia.py index 2182d6fd4..fa142b974 100644 --- a/youtube_dl/extractor/wistia.py +++ b/youtube_dl/extractor/wistia.py @@ -12,7 +12,7 @@ from ..utils import ( class WistiaIE(InfoExtractor): - _VALID_URL = r'(?:wistia:|https?://(?:fast\.)?wistia\.net/embed/iframe/)(?P<id>[a-z0-9]+)' + _VALID_URL = r'(?:wistia:|https?://(?:fast\.)?wistia\.(?:net|com)/embed/(?:iframe|medias)/)(?P<id>[a-z0-9]+)' _API_URL = 'http://fast.wistia.com/embed/medias/%s.json' _IFRAME_URL = 'http://fast.wistia.net/embed/iframe/%s' @@ -35,12 +35,18 @@ class WistiaIE(InfoExtractor): # with hls video 'url': 'wistia:807fafadvk', 'only_matching': True, + }, { + 'url': 'http://fast.wistia.com/embed/iframe/sh7fpupwlt', + 'only_matching': True, + }, { + 'url': 'http://fast.wistia.net/embed/medias/sh7fpupwlt.json', + 'only_matching': True, }] @staticmethod def _extract_url(webpage): match = re.search( - r'<(?:meta[^>]+?content|iframe[^>]+?src)=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage) + r'<(?:meta[^>]+?content|iframe[^>]+?src)=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.(?:net|com)/embed/iframe/.+?)\1', webpage) if match: return unescapeHTML(match.group('url')) diff --git a/youtube_dl/extractor/wwe.py b/youtube_dl/extractor/wwe.py new file mode 100644 index 000000000..bebc77bb5 --- /dev/null +++ b/youtube_dl/extractor/wwe.py @@ -0,0 +1,140 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + try_get, + unescapeHTML, + url_or_none, + urljoin, +) + + +class WWEBaseIE(InfoExtractor): + _SUBTITLE_LANGS = { + 'English': 'en', + 'Deutsch': 'de', + } + + def _extract_entry(self, data, url, video_id=None): + video_id = compat_str(video_id or data['nid']) + title = data['title'] + + formats = self._extract_m3u8_formats( + data['file'], video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls') + + description = data.get('description') + thumbnail = urljoin(url, data.get('image')) + series = data.get('show_name') + episode = data.get('episode_name') + + subtitles = {} + tracks = data.get('tracks') + if isinstance(tracks, list): + for track in tracks: + if not isinstance(track, dict): + continue + if track.get('kind') != 'captions': + continue + track_file = url_or_none(track.get('file')) + if not track_file: + continue + label = track.get('label') + lang = self._SUBTITLE_LANGS.get(label, label) or 'en' + subtitles.setdefault(lang, []).append({ + 'url': track_file, + }) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'series': series, + 'episode': episode, + 'formats': formats, + 'subtitles': subtitles, + } + + +class WWEIE(WWEBaseIE): + _VALID_URL = r'https?://(?:[^/]+\.)?wwe\.com/(?:[^/]+/)*videos/(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://www.wwe.com/videos/daniel-bryan-vs-andrade-cien-almas-smackdown-live-sept-4-2018', + 'md5': '92811c6a14bfc206f7a6a9c5d9140184', + 'info_dict': { + 'id': '40048199', + 'ext': 'mp4', + 'title': 'Daniel Bryan vs. Andrade "Cien" Almas: SmackDown LIVE, Sept. 4, 2018', + 'description': 'md5:2d7424dbc6755c61a0e649d2a8677f67', + 'thumbnail': r're:^https?://.*\.jpg$', + } + }, { + 'url': 'https://de.wwe.com/videos/gran-metalik-vs-tony-nese-wwe-205-live-sept-4-2018', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + landing = self._parse_json( + self._html_search_regex( + r'(?s)Drupal\.settings\s*,\s*({.+?})\s*\)\s*;', + webpage, 'drupal settings'), + display_id)['WWEVideoLanding'] + + data = landing['initialVideo']['playlist'][0] + video_id = landing.get('initialVideoId') + + info = self._extract_entry(data, url, video_id) + info['display_id'] = display_id + return info + + +class WWEPlaylistIE(WWEBaseIE): + _VALID_URL = r'https?://(?:[^/]+\.)?wwe\.com/(?:[^/]+/)*(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://www.wwe.com/shows/raw/2018-11-12', + 'info_dict': { + 'id': '2018-11-12', + }, + 'playlist_mincount': 11, + }, { + 'url': 'http://www.wwe.com/article/walk-the-prank-wwe-edition', + 'only_matching': True, + }, { + 'url': 'https://www.wwe.com/shows/wwenxt/article/matt-riddle-interview', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if WWEIE.suitable(url) else super(WWEPlaylistIE, cls).suitable(url) + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + entries = [] + for mobj in re.finditer( + r'data-video\s*=\s*(["\'])(?P<data>{.+?})\1', webpage): + video = self._parse_json( + mobj.group('data'), display_id, transform_source=unescapeHTML, + fatal=False) + if not video: + continue + data = try_get(video, lambda x: x['playlist'][0], dict) + if not data: + continue + try: + entry = self._extract_entry(data, url) + except Exception: + continue + entry['extractor_key'] = WWEIE.ie_key() + entries.append(entry) + + return self.playlist_result(entries, display_id) diff --git a/youtube_dl/extractor/xfileshare.py b/youtube_dl/extractor/xfileshare.py index bc3239f68..b38c7a7b3 100644 --- a/youtube_dl/extractor/xfileshare.py +++ b/youtube_dl/extractor/xfileshare.py @@ -23,7 +23,7 @@ class XFileShareIE(InfoExtractor): (r'powerwatch\.pw', 'PowerWatch'), (r'rapidvideo\.ws', 'Rapidvideo.ws'), (r'thevideobee\.to', 'TheVideoBee'), - (r'vidto\.me', 'Vidto'), + (r'vidto\.(?:me|se)', 'Vidto'), (r'streamin\.to', 'Streamin.To'), (r'xvidstage\.com', 'XVIDSTAGE'), (r'vidabc\.com', 'Vid ABC'), @@ -115,7 +115,10 @@ class XFileShareIE(InfoExtractor): 'only_matching': True, }, { 'url': 'http://www.fastvideo.me/k8604r8nk8sn/FAST_FURIOUS_8_-_Trailer_italiano_ufficiale.mp4.html', - 'only_matching': True + 'only_matching': True, + }, { + 'url': 'http://vidto.se/1tx1pf6t12cg.html', + 'only_matching': True, }] @staticmethod diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py index d1bc992fd..68a48034e 100644 --- a/youtube_dl/extractor/xhamster.py +++ b/youtube_dl/extractor/xhamster.py @@ -13,6 +13,7 @@ from ..utils import ( parse_duration, try_get, unified_strdate, + url_or_none, ) @@ -137,7 +138,8 @@ class XHamsterIE(InfoExtractor): else: format_url = format_item filesize = None - if not isinstance(format_url, compat_str): + format_url = url_or_none(format_url) + if not format_url: continue formats.append({ 'format_id': '%s-%s' % (format_id, quality), @@ -198,7 +200,8 @@ class XHamsterIE(InfoExtractor): default='{}'), video_id, fatal=False) for format_id, format_url in sources.items(): - if not isinstance(format_url, compat_str): + format_url = url_or_none(format_url) + if not format_url: continue if format_url in format_urls: continue diff --git a/youtube_dl/extractor/xvideos.py b/youtube_dl/extractor/xvideos.py index efee95651..ec2d913fc 100644 --- a/youtube_dl/extractor/xvideos.py +++ b/youtube_dl/extractor/xvideos.py @@ -45,7 +45,7 @@ class XVideosIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage( - 'http://www.xvideos.com/video%s/' % video_id, video_id) + 'https://www.xvideos.com/video%s/' % video_id, video_id) mobj = re.search(r'<h1 class="inlineError">(.+?)</h1>', webpage) if mobj: diff --git a/youtube_dl/extractor/yapfiles.py b/youtube_dl/extractor/yapfiles.py index 7fafbf596..cfb368de9 100644 --- a/youtube_dl/extractor/yapfiles.py +++ b/youtube_dl/extractor/yapfiles.py @@ -4,12 +4,12 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( ExtractorError, int_or_none, qualities, unescapeHTML, + url_or_none, ) @@ -80,9 +80,9 @@ class YapFilesIE(InfoExtractor): formats = [] for format_id in QUALITIES: is_hd = format_id == 'hd' - format_url = playlist.get( - 'file%s' % ('_hd' if is_hd else '')) - if not format_url or not isinstance(format_url, compat_str): + format_url = url_or_none(playlist.get( + 'file%s' % ('_hd' if is_hd else ''))) + if not format_url: continue formats.append({ 'url': format_url, diff --git a/youtube_dl/extractor/youjizz.py b/youtube_dl/extractor/youjizz.py index f33fabe19..dff69fcb7 100644 --- a/youtube_dl/extractor/youjizz.py +++ b/youtube_dl/extractor/youjizz.py @@ -3,11 +3,11 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( determine_ext, int_or_none, parse_duration, + url_or_none, ) @@ -50,8 +50,8 @@ class YouJizzIE(InfoExtractor): for encoding in encodings: if not isinstance(encoding, dict): continue - format_url = encoding.get('filename') - if not isinstance(format_url, compat_str): + format_url = url_or_none(encoding.get('filename')) + if not format_url: continue if determine_ext(format_url) == 'm3u8': formats.extend(self._extract_m3u8_formats( diff --git a/youtube_dl/extractor/youporn.py b/youtube_dl/extractor/youporn.py index 547adefeb..d4eccb4b2 100644 --- a/youtube_dl/extractor/youporn.py +++ b/youtube_dl/extractor/youporn.py @@ -3,13 +3,13 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( int_or_none, sanitized_Request, str_to_int, unescapeHTML, unified_strdate, + url_or_none, ) from ..aes import aes_decrypt_text @@ -68,11 +68,9 @@ class YouPornIE(InfoExtractor): request.add_header('Cookie', 'age_verified=1') webpage = self._download_webpage(request, display_id) - title = self._search_regex( - [r'(?:video_titles|videoTitle)\s*[:=]\s*(["\'])(?P<title>(?:(?!\1).)+)\1', - r'<h1[^>]+class=["\']heading\d?["\'][^>]*>(?P<title>[^<]+)<'], - webpage, 'title', group='title', - default=None) or self._og_search_title( + title = self._html_search_regex( + r'(?s)<div[^>]+class=["\']watchVideoTitle[^>]+>(.+?)</div>', + webpage, 'title', default=None) or self._og_search_title( webpage, default=None) or self._html_search_meta( 'title', webpage, fatal=True) @@ -88,8 +86,8 @@ class YouPornIE(InfoExtractor): for definition in definitions: if not isinstance(definition, dict): continue - video_url = definition.get('videoUrl') - if isinstance(video_url, compat_str) and video_url: + video_url = url_or_none(definition.get('videoUrl')) + if video_url: links.append(video_url) # Fallback #1, this also contains extra low quality 180p format @@ -134,7 +132,11 @@ class YouPornIE(InfoExtractor): formats.append(f) self._sort_formats(formats) - description = self._og_search_description(webpage, default=None) + description = self._html_search_regex( + r'(?s)<div[^>]+\bid=["\']description["\'][^>]*>(.+?)</div>', + webpage, 'description', + default=None) or self._og_search_description( + webpage, default=None) thumbnail = self._search_regex( r'(?:imageurl\s*=|poster\s*:)\s*(["\'])(?P<thumbnail>.+?)\1', webpage, 'thumbnail', fatal=False, group='thumbnail') diff --git a/youtube_dl/extractor/yourporn.py b/youtube_dl/extractor/yourporn.py new file mode 100644 index 000000000..2c63f9752 --- /dev/null +++ b/youtube_dl/extractor/yourporn.py @@ -0,0 +1,54 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + parse_duration, + urljoin, +) + + +class YourPornIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?yourporn\.sexy/post/(?P<id>[^/?#&.]+)' + _TEST = { + 'url': 'https://yourporn.sexy/post/57ffcb2e1179b.html', + 'md5': '6f8682b6464033d87acaa7a8ff0c092e', + 'info_dict': { + 'id': '57ffcb2e1179b', + 'ext': 'mp4', + 'title': 'md5:c9f43630bd968267672651ba905a7d35', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 165, + 'age_limit': 18, + }, + 'params': { + 'skip_download': True, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + video_url = urljoin(url, self._parse_json( + self._search_regex( + r'data-vnfo=(["\'])(?P<data>{.+?})\1', webpage, 'data info', + group='data'), + video_id)[video_id]).replace('/cdn/', '/cdn4/') + + title = (self._search_regex( + r'<[^>]+\bclass=["\']PostEditTA[^>]+>([^<]+)', webpage, 'title', + default=None) or self._og_search_description(webpage)).strip() + thumbnail = self._og_search_thumbnail(webpage) + duration = parse_duration(self._search_regex( + r'duration\s*:\s*<[^>]+>([\d:]+)', webpage, 'duration', + default=None)) + + return { + 'id': video_id, + 'url': video_url, + 'title': title, + 'thumbnail': thumbnail, + 'duration': duration, + 'age_limit': 18, + } diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 89c8b7f8d..c8bf98b58 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -41,12 +41,14 @@ from ..utils import ( remove_quotes, remove_start, smuggle_url, + str_or_none, str_to_int, try_get, unescapeHTML, unified_strdate, unsmuggle_url, uppercase_escape, + url_or_none, urlencode_postdata, ) @@ -64,7 +66,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): # If True it will raise an error if no login info is provided _LOGIN_REQUIRED = False - _PLAYLIST_ID_RE = r'(?:PL|LL|EC|UU|FL|RD|UL|TL)[0-9A-Za-z-_]{10,}' + _PLAYLIST_ID_RE = r'(?:PL|LL|EC|UU|FL|RD|UL|TL|OLAK5uy_)[0-9A-Za-z-_]{10,}' def _set_language(self): self._set_cookie( @@ -178,13 +180,13 @@ class YoutubeBaseInfoExtractor(InfoExtractor): warn('Unable to extract result entry') return False - tfa = try_get(res, lambda x: x[0][0], list) - if tfa: - tfa_str = try_get(tfa, lambda x: x[2], compat_str) - if tfa_str == 'TWO_STEP_VERIFICATION': + login_challenge = try_get(res, lambda x: x[0][0], list) + if login_challenge: + challenge_str = try_get(login_challenge, lambda x: x[2], compat_str) + if challenge_str == 'TWO_STEP_VERIFICATION': # SEND_SUCCESS - TFA code has been successfully sent to phone # QUOTA_EXCEEDED - reached the limit of TFA codes - status = try_get(tfa, lambda x: x[5], compat_str) + status = try_get(login_challenge, lambda x: x[5], compat_str) if status == 'QUOTA_EXCEEDED': warn('Exceeded the limit of TFA codes, try later') return False @@ -228,6 +230,17 @@ class YoutubeBaseInfoExtractor(InfoExtractor): check_cookie_url = try_get( tfa_results, lambda x: x[0][-1][2], compat_str) + else: + CHALLENGES = { + 'LOGIN_CHALLENGE': "This device isn't recognized. For your security, Google wants to make sure it's really you.", + 'USERNAME_RECOVERY': 'Please provide additional information to aid in the recovery process.', + 'REAUTH': "There is something unusual about your activity. For your security, Google wants to make sure it's really you.", + } + challenge = CHALLENGES.get( + challenge_str, + '%s returned error %s.' % (self.IE_NAME, challenge_str)) + warn('%s\nGo to https://accounts.google.com/, login and solve a challenge.' % challenge) + return False else: check_cookie_url = try_get(res, lambda x: x[2], compat_str) @@ -248,7 +261,9 @@ class YoutubeBaseInfoExtractor(InfoExtractor): return True def _download_webpage_handle(self, *args, **kwargs): - kwargs.setdefault('query', {})['disable_polymer'] = 'true' + query = kwargs.get('query', {}).copy() + query['disable_polymer'] = 'true' + kwargs['query'] = query return super(YoutubeBaseInfoExtractor, self)._download_webpage_handle( *args, **compat_kwargs(kwargs)) @@ -336,6 +351,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): (?:www\.)?hooktube\.com/| (?:www\.)?yourepeat\.com/| tube\.majestyc\.net/| + (?:www\.)?invidio\.us/| youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains (?:.*?\#/)? # handle anchor (#/) redirect urls (?: # the various things that can precede the ID: @@ -479,12 +495,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'Philipp Hagemeister', 'uploader_id': 'phihag', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag', + 'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q', + 'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q', 'upload_date': '20121002', - 'license': 'Standard YouTube License', 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .', 'categories': ['Science & Technology'], 'tags': ['youtube-dl'], 'duration': 10, + 'view_count': int, 'like_count': int, 'dislike_count': int, 'start_time': 1, @@ -508,7 +526,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'Icona Pop', 'uploader_id': 'IconaPop', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IconaPop', - 'license': 'Standard YouTube License', 'creator': 'Icona Pop', 'track': 'I Love It (feat. Charli XCX)', 'artist': 'Icona Pop', @@ -521,14 +538,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'id': '07FYdnEawAQ', 'ext': 'mp4', 'upload_date': '20130703', - 'title': 'Justin Timberlake - Tunnel Vision (Explicit)', + 'title': 'Justin Timberlake - Tunnel Vision (Official Music Video) (Explicit)', 'alt_title': 'Tunnel Vision', - 'description': 'md5:64249768eec3bc4276236606ea996373', + 'description': 'md5:07dab3356cde4199048e4c7cd93471e1', 'duration': 419, 'uploader': 'justintimberlakeVEVO', 'uploader_id': 'justintimberlakeVEVO', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/justintimberlakeVEVO', - 'license': 'Standard YouTube License', 'creator': 'Justin Timberlake', 'track': 'Tunnel Vision', 'artist': 'Justin Timberlake', @@ -547,7 +563,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'SET India', 'uploader_id': 'setindia', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/setindia', - 'license': 'Standard YouTube License', 'age_limit': 18, } }, @@ -562,11 +577,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_id': 'phihag', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag', 'upload_date': '20121002', - 'license': 'Standard YouTube License', 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .', 'categories': ['Science & Technology'], 'tags': ['youtube-dl'], 'duration': 10, + 'view_count': int, 'like_count': int, 'dislike_count': int, }, @@ -585,7 +600,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/8KVIDEO', 'description': '', 'uploader': '8KVIDEO', - 'license': 'Standard YouTube License', 'title': 'UHDTV TEST 8K VIDEO.mp4' }, 'params': { @@ -600,13 +614,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'info_dict': { 'id': 'IB3lcPjvWLA', 'ext': 'm4a', - 'title': 'Afrojack, Spree Wilson - The Spark ft. Spree Wilson', - 'description': 'md5:1900ed86ee514927b9e00fbead6969a5', + 'title': 'Afrojack, Spree Wilson - The Spark (Official Music Video) ft. Spree Wilson', + 'description': 'md5:8f5e2b82460520b619ccac1f509d43bf', 'duration': 244, 'uploader': 'AfrojackVEVO', 'uploader_id': 'AfrojackVEVO', 'upload_date': '20131011', - 'license': 'Standard YouTube License', }, 'params': { 'youtube_include_dash_manifest': True, @@ -620,13 +633,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'id': 'nfWlot6h_JM', 'ext': 'm4a', 'title': 'Taylor Swift - Shake It Off', - 'alt_title': 'Shake It Off', - 'description': 'md5:95f66187cd7c8b2c13eb78e1223b63c3', + 'description': 'md5:bec2185232c05479482cb5a9b82719bf', 'duration': 242, 'uploader': 'TaylorSwiftVEVO', 'uploader_id': 'TaylorSwiftVEVO', 'upload_date': '20140818', - 'license': 'Standard YouTube License', 'creator': 'Taylor Swift', }, 'params': { @@ -642,10 +653,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'ext': 'mp4', 'duration': 219, 'upload_date': '20100909', - 'uploader': 'TJ Kirk', + 'uploader': 'Amazing Atheist', 'uploader_id': 'TheAmazingAtheist', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheAmazingAtheist', - 'license': 'Standard YouTube License', 'title': 'Burning Everyone\'s Koran', 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html', } @@ -663,7 +673,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_id': 'WitcherGame', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame', 'upload_date': '20140605', - 'license': 'Standard YouTube License', 'age_limit': 18, }, }, @@ -672,7 +681,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'url': 'https://www.youtube.com/watch?v=6kLq3WMV1nU', 'info_dict': { 'id': '6kLq3WMV1nU', - 'ext': 'webm', + 'ext': 'mp4', 'title': 'Dedication To My Ex (Miss That) (Lyric Video)', 'description': 'md5:33765bb339e1b47e7e72b5490139bb41', 'duration': 246, @@ -680,7 +689,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_id': 'LloydVEVO', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/LloydVEVO', 'upload_date': '20110629', - 'license': 'Standard YouTube License', 'age_limit': 18, }, }, @@ -698,7 +706,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'creator': 'deadmau5', 'description': 'md5:12c56784b8032162bb936a5f76d55360', 'uploader': 'deadmau5', - 'license': 'Standard YouTube License', 'title': 'Deadmau5 - Some Chords (HD)', 'alt_title': 'Some Chords', }, @@ -716,7 +723,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'upload_date': '20150827', 'uploader_id': 'olympic', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/olympic', - 'license': 'Standard YouTube License', 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games', 'uploader': 'Olympic', 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games', @@ -738,7 +744,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/AllenMeow', 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯', 'uploader': '孫ᄋᄅ', - 'license': 'Standard YouTube License', 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人', }, }, @@ -772,7 +777,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_id': 'dorappi2000', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/dorappi2000', 'uploader': 'dorappi2000', - 'license': 'Standard YouTube License', 'formats': 'mincount:31', }, 'skip': 'not actual anymore', @@ -788,7 +792,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'Airtek', 'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.', 'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ', - 'license': 'Standard YouTube License', 'title': 'Retransmisión XVIII Media maratón Zaragoza 2015', }, 'params': { @@ -861,6 +864,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'params': { 'skip_download': True, }, + 'skip': 'This video is not available.', }, { # Multifeed video with comma in title (see https://github.com/rg3/youtube-dl/issues/8536) @@ -897,7 +901,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_id': 'IronSoulElf', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IronSoulElf', 'uploader': 'IronSoulElf', - 'license': 'Standard YouTube License', 'creator': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan', 'track': 'Dark Walk - Position Music', 'artist': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan', @@ -1001,13 +1004,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'id': 'iqKdEhx-dD4', 'ext': 'mp4', 'title': 'Isolation - Mind Field (Ep 1)', - 'description': 'md5:25b78d2f64ae81719f5c96319889b736', + 'description': 'md5:46a29be4ceffa65b92d277b93f463c0f', 'duration': 2085, 'upload_date': '20170118', 'uploader': 'Vsauce', 'uploader_id': 'Vsauce', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Vsauce', - 'license': 'Standard YouTube License', 'series': 'Mind Field', 'season_number': 1, 'episode_number': 1, @@ -1033,7 +1035,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'New Century Foundation', 'uploader_id': 'UCEJYpZGqgUob0zVVEaLhvVg', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCEJYpZGqgUob0zVVEaLhvVg', - 'license': 'Standard YouTube License', }, 'params': { 'skip_download': True, @@ -1053,6 +1054,35 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM', 'only_matching': True, }, + { + 'url': 'https://invidio.us/watch?v=BaW_jenozKc', + 'only_matching': True, + }, + { + # DRM protected + 'url': 'https://www.youtube.com/watch?v=s7_qI6_mIXc', + 'only_matching': True, + }, + { + # Video with unsupported adaptive stream type formats + 'url': 'https://www.youtube.com/watch?v=Z4Vy8R84T1U', + 'info_dict': { + 'id': 'Z4Vy8R84T1U', + 'ext': 'mp4', + 'title': 'saman SMAN 53 Jakarta(Sancety) opening COFFEE4th at SMAN 53 Jakarta', + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'duration': 433, + 'upload_date': '20130923', + 'uploader': 'Amelia Putri Harwita', + 'uploader_id': 'UCpOxM49HJxmC1qCalXyB3_Q', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCpOxM49HJxmC1qCalXyB3_Q', + 'formats': 'maxcount:10', + }, + 'params': { + 'skip_download': True, + 'youtube_include_dash_manifest': False, + }, + } ] def __init__(self, *args, **kwargs): @@ -1081,7 +1111,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): def _extract_signature_function(self, video_id, player_url, example_sig): id_m = re.match( - r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player(?:-new)?|(?:/[a-z]{2}_[A-Z]{2})?/base)?\.(?P<ext>[a-z]+)$', + r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player(?:-new)?|(?:/[a-z]{2,3}_[A-Z]{2})?/base)?\.(?P<ext>[a-z]+)$', player_url) if not id_m: raise ExtractorError('Cannot identify player %r' % player_url) @@ -1167,7 +1197,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): def _parse_sig_js(self, jscode): funcname = self._search_regex( (r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(', - r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\('), + r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(', + r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*c\s*&&\s*d\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?(?P<sig>[a-zA-Z0-9$]+)\(', + r'\bc\s*&&\s*d\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?\s*(?P<sig>[a-zA-Z0-9$]+)\(', + r'\bc\s*&&\s*d\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\('), jscode, 'Initial JS player signature function name', group='sig') jsi = JSInterpreter(jscode) @@ -1360,8 +1393,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): self._downloader.report_warning(err_msg) return {} - def _mark_watched(self, video_id, video_info): - playback_url = video_info.get('videostats_playback_base_url', [None])[0] + def _mark_watched(self, video_id, video_info, player_response): + playback_url = url_or_none(try_get( + player_response, + lambda x: x['playbackTracking']['videostatsPlaybackUrl']['baseUrl']) or try_get( + video_info, lambda x: x['videostats_playback_base_url'][0])) if not playback_url: return parsed_playback_url = compat_urlparse.urlparse(playback_url) @@ -1510,12 +1546,21 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if dash_mpd and dash_mpd[0] not in dash_mpds: dash_mpds.append(dash_mpd[0]) + def add_dash_mpd_pr(pl_response): + dash_mpd = url_or_none(try_get( + pl_response, lambda x: x['streamingData']['dashManifestUrl'], + compat_str)) + if dash_mpd and dash_mpd not in dash_mpds: + dash_mpds.append(dash_mpd) + is_live = None view_count = None def extract_view_count(v_info): return int_or_none(try_get(v_info, lambda x: x['view_count'][0])) + player_response = {} + # Get video info embed_webpage = None if re.search(r'player-age-gate-content">', video_webpage) is not None: @@ -1558,7 +1603,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if args.get('livestream') == '1' or args.get('live_playback') == 1: is_live = True sts = ytplayer_config.get('sts') + if not player_response: + pl_response = str_or_none(args.get('player_response')) + if pl_response: + pl_response = self._parse_json(pl_response, video_id, fatal=False) + if isinstance(pl_response, dict): + player_response = pl_response if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True): + add_dash_mpd_pr(player_response) # We also try looking in get_video_info since it may contain different dashmpd # URL that points to a DASH manifest with possibly different itag set (some itags # are missing from DASH manifest pointed by webpage's dashmpd, some - from DASH @@ -1586,6 +1638,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if not video_info_webpage: continue get_video_info = compat_parse_qs(video_info_webpage) + if not player_response: + pl_response = get_video_info.get('player_response', [None])[0] + if isinstance(pl_response, dict): + player_response = pl_response + add_dash_mpd_pr(player_response) add_dash_mpd(get_video_info) if view_count is None: view_count = extract_view_count(get_video_info) @@ -1631,9 +1688,17 @@ class YoutubeIE(YoutubeBaseInfoExtractor): '"token" parameter not in video info for unknown reason', video_id=video_id) + if video_info.get('license_info'): + raise ExtractorError('This video is DRM protected.', expected=True) + + video_details = try_get( + player_response, lambda x: x['videoDetails'], dict) or {} + # title if 'title' in video_info: video_title = video_info['title'][0] + elif 'title' in player_response: + video_title = video_details['title'] else: self._downloader.report_warning('Unable to extract video title') video_title = '_' @@ -1669,33 +1734,41 @@ class YoutubeIE(YoutubeBaseInfoExtractor): else: video_description = '' - if 'multifeed_metadata_list' in video_info and not smuggled_data.get('force_singlefeed', False): + if not smuggled_data.get('force_singlefeed', False): if not self._downloader.params.get('noplaylist'): - entries = [] - feed_ids = [] - multifeed_metadata_list = video_info['multifeed_metadata_list'][0] - for feed in multifeed_metadata_list.split(','): - # Unquote should take place before split on comma (,) since textual - # fields may contain comma as well (see - # https://github.com/rg3/youtube-dl/issues/8536) - feed_data = compat_parse_qs(compat_urllib_parse_unquote_plus(feed)) - entries.append({ - '_type': 'url_transparent', - 'ie_key': 'Youtube', - 'url': smuggle_url( - '%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]), - {'force_singlefeed': True}), - 'title': '%s (%s)' % (video_title, feed_data['title'][0]), - }) - feed_ids.append(feed_data['id'][0]) - self.to_screen( - 'Downloading multifeed video (%s) - add --no-playlist to just download video %s' - % (', '.join(feed_ids), video_id)) - return self.playlist_result(entries, video_id, video_title, video_description) - self.to_screen('Downloading just video %s because of --no-playlist' % video_id) + multifeed_metadata_list = try_get( + player_response, + lambda x: x['multicamera']['playerLegacyMulticameraRenderer']['metadataList'], + compat_str) or try_get( + video_info, lambda x: x['multifeed_metadata_list'][0], compat_str) + if multifeed_metadata_list: + entries = [] + feed_ids = [] + for feed in multifeed_metadata_list.split(','): + # Unquote should take place before split on comma (,) since textual + # fields may contain comma as well (see + # https://github.com/rg3/youtube-dl/issues/8536) + feed_data = compat_parse_qs(compat_urllib_parse_unquote_plus(feed)) + entries.append({ + '_type': 'url_transparent', + 'ie_key': 'Youtube', + 'url': smuggle_url( + '%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]), + {'force_singlefeed': True}), + 'title': '%s (%s)' % (video_title, feed_data['title'][0]), + }) + feed_ids.append(feed_data['id'][0]) + self.to_screen( + 'Downloading multifeed video (%s) - add --no-playlist to just download video %s' + % (', '.join(feed_ids), video_id)) + return self.playlist_result(entries, video_id, video_title, video_description) + else: + self.to_screen('Downloading just video %s because of --no-playlist' % video_id) if view_count is None: view_count = extract_view_count(video_info) + if view_count is None and video_details: + view_count = int_or_none(video_details.get('viewCount')) # Check for "rental" videos if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info: @@ -1731,11 +1804,34 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'height': int_or_none(width_height[1]), } q = qualities(['small', 'medium', 'hd720']) + streaming_formats = try_get(player_response, lambda x: x['streamingData']['formats'], list) + if streaming_formats: + for fmt in streaming_formats: + itag = str_or_none(fmt.get('itag')) + if not itag: + continue + quality = fmt.get('quality') + quality_label = fmt.get('qualityLabel') or quality + formats_spec[itag] = { + 'asr': int_or_none(fmt.get('audioSampleRate')), + 'filesize': int_or_none(fmt.get('contentLength')), + 'format_note': quality_label, + 'fps': int_or_none(fmt.get('fps')), + 'height': int_or_none(fmt.get('height')), + 'quality': q(quality), + # bitrate for itag 43 is always 2147483647 + 'tbr': float_or_none(fmt.get('averageBitrate') or fmt.get('bitrate'), 1000) if itag != '43' else None, + 'width': int_or_none(fmt.get('width')), + } formats = [] for url_data_str in encoded_url_map.split(','): url_data = compat_parse_qs(url_data_str) if 'itag' not in url_data or 'url' not in url_data: continue + stream_type = int_or_none(try_get(url_data, lambda x: x['stream_type'][0])) + # Unsupported FORMAT_STREAM_TYPE_OTF + if stream_type == 3: + continue format_id = url_data['itag'][0] url = url_data['url'][0] @@ -1779,7 +1875,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): else: player_version = self._search_regex( [r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js', - r'(?:www|player)-([^/]+)(?:/[a-z]{2}_[A-Z]{2})?/base\.js'], + r'(?:www|player(?:_ias)?)-([^/]+)(?:/[a-z]{2,3}_[A-Z]{2})?/base\.js'], player_url, 'html5 player', fatal=False) player_desc = 'html5 player %s' % player_version @@ -1813,7 +1909,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): filesize = int_or_none(url_data.get( 'clen', [None])[0]) or _extract_filesize(url) - quality = url_data.get('quality_label', [None])[0] or url_data.get('quality', [None])[0] + quality = url_data.get('quality', [None])[0] more_fields = { 'filesize': filesize, @@ -1821,7 +1917,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'width': width, 'height': height, 'fps': int_or_none(url_data.get('fps', [None])[0]), - 'format_note': quality, + 'format_note': url_data.get('quality_label', [None])[0] or quality, 'quality': q(quality), } for key, value in more_fields.items(): @@ -1849,34 +1945,43 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'http_chunk_size': 10485760, } formats.append(dct) - elif video_info.get('hlsvp'): - manifest_url = video_info['hlsvp'][0] - formats = [] - m3u8_formats = self._extract_m3u8_formats( - manifest_url, video_id, 'mp4', fatal=False) - for a_format in m3u8_formats: - itag = self._search_regex( - r'/itag/(\d+)/', a_format['url'], 'itag', default=None) - if itag: - a_format['format_id'] = itag - if itag in self._formats: - dct = self._formats[itag].copy() - dct.update(a_format) - a_format = dct - a_format['player_url'] = player_url - # Accept-Encoding header causes failures in live streams on Youtube and Youtube Gaming - a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True' - formats.append(a_format) else: - error_message = clean_html(video_info.get('reason', [None])[0]) - if not error_message: - error_message = extract_unavailable_message() - if error_message: - raise ExtractorError(error_message, expected=True) - raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info') + manifest_url = ( + url_or_none(try_get( + player_response, + lambda x: x['streamingData']['hlsManifestUrl'], + compat_str)) or + url_or_none(try_get( + video_info, lambda x: x['hlsvp'][0], compat_str))) + if manifest_url: + formats = [] + m3u8_formats = self._extract_m3u8_formats( + manifest_url, video_id, 'mp4', fatal=False) + for a_format in m3u8_formats: + itag = self._search_regex( + r'/itag/(\d+)/', a_format['url'], 'itag', default=None) + if itag: + a_format['format_id'] = itag + if itag in self._formats: + dct = self._formats[itag].copy() + dct.update(a_format) + a_format = dct + a_format['player_url'] = player_url + # Accept-Encoding header causes failures in live streams on Youtube and Youtube Gaming + a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True' + formats.append(a_format) + else: + error_message = clean_html(video_info.get('reason', [None])[0]) + if not error_message: + error_message = extract_unavailable_message() + if error_message: + raise ExtractorError(error_message, expected=True) + raise ExtractorError('no conn, hlsvp, hlsManifestUrl or url_encoded_fmt_stream_map information found in video info') # uploader - video_uploader = try_get(video_info, lambda x: x['author'][0], compat_str) + video_uploader = try_get( + video_info, lambda x: x['author'][0], + compat_str) or str_or_none(video_details.get('author')) if video_uploader: video_uploader = compat_urllib_parse_unquote_plus(video_uploader) else: @@ -1894,6 +1999,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): else: self._downloader.report_warning('unable to extract uploader nickname') + channel_id = self._html_search_meta( + 'channelId', video_webpage, 'channel id') + channel_url = 'http://www.youtube.com/channel/%s' % channel_id if channel_id else None + # thumbnail image # We try first to get a high quality image: m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">', @@ -1955,7 +2064,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): r'<div[^>]+id="watch7-headline"[^>]*>\s*<span[^>]*>.*?>(?P<series>[^<]+)</a></b>\s*S(?P<season>\d+)\s*•\s*E(?P<episode>\d+)</span>', video_webpage) if m_episode: - series = m_episode.group('series') + series = unescapeHTML(m_episode.group('series')) season_number = int(m_episode.group('season')) episode_number = int(m_episode.group('episode')) else: @@ -1985,12 +2094,19 @@ class YoutubeIE(YoutubeBaseInfoExtractor): like_count = _extract_count('like') dislike_count = _extract_count('dislike') + if view_count is None: + view_count = str_to_int(self._search_regex( + r'<[^>]+class=["\']watch-view-count[^>]+>\s*([\d,\s]+)', video_webpage, + 'view count', default=None)) + # subtitles video_subtitles = self.extract_subtitles(video_id, video_webpage) automatic_captions = self.extract_automatic_captions(video_id, video_webpage) video_duration = try_get( video_info, lambda x: int_or_none(x['length_seconds'][0])) + if not video_duration: + video_duration = int_or_none(video_details.get('lengthSeconds')) if not video_duration: video_duration = parse_duration(self._html_search_meta( 'duration', video_webpage, 'video duration')) @@ -2058,13 +2174,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor): self._sort_formats(formats) - self.mark_watched(video_id, video_info) + self.mark_watched(video_id, video_info, player_response) return { 'id': video_id, 'uploader': video_uploader, 'uploader_id': video_uploader_id, 'uploader_url': video_uploader_url, + 'channel_id': channel_id, + 'channel_url': channel_url, 'upload_date': upload_date, 'license': video_license, 'creator': video_creator or artist, @@ -2103,7 +2221,11 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): (?:https?://)? (?:\w+\.)? (?: - youtube\.com/ + (?: + youtube\.com| + invidio\.us + ) + / (?: (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/(?:videoseries|[0-9A-Za-z_-]{11})) \? (?:.*?[&;])*? (?:p|a|list)= @@ -2112,7 +2234,7 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): youtu\.be/[0-9A-Za-z_-]{11}\?.*?\blist= ) ( - (?:PL|LL|EC|UU|FL|RD|UL|TL)?[0-9A-Za-z-_]{10,} + (?:PL|LL|EC|UU|FL|RD|UL|TL|OLAK5uy_)?[0-9A-Za-z-_]{10,} # Top tracks, they can also include dots |(?:MC)[\w\.]* ) @@ -2216,6 +2338,7 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): 'description': 'md5:507cdcb5a49ac0da37a920ece610be80', 'categories': ['People & Blogs'], 'tags': list, + 'view_count': int, 'like_count': int, 'dislike_count': int, }, @@ -2250,6 +2373,13 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): }, { 'url': 'TLGGrESM50VT6acwMjAyMjAxNw', 'only_matching': True, + }, { + # music album playlist + 'url': 'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM', + 'only_matching': True, + }, { + 'url': 'https://invidio.us/playlist?list=PLDIoUOhQQPlXr63I_vwF9GD8sAKh77dWU', + 'only_matching': True, }] def _real_initialize(self): @@ -2392,7 +2522,7 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor): IE_DESC = 'YouTube.com channels' - _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/(?P<id>[0-9A-Za-z_-]+)' + _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com|(?:www\.)?invidio\.us)/channel/(?P<id>[0-9A-Za-z_-]+)' _TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos' _VIDEO_RE = r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?' IE_NAME = 'youtube:channel' @@ -2413,6 +2543,9 @@ class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor): 'id': 'UUs0ifCMCm1icqRbqhUINa0w', 'title': 'Uploads from Deus Ex', }, + }, { + 'url': 'https://invidio.us/channel/UC23qupoDRn9YOAVzeoxjOQA', + 'only_matching': True, }] @classmethod diff --git a/youtube_dl/extractor/zattoo.py b/youtube_dl/extractor/zattoo.py index b5a3a0716..ee514666b 100644 --- a/youtube_dl/extractor/zattoo.py +++ b/youtube_dl/extractor/zattoo.py @@ -13,16 +13,17 @@ from ..utils import ( ExtractorError, int_or_none, try_get, + url_or_none, urlencode_postdata, ) -class ZattooBaseIE(InfoExtractor): - _NETRC_MACHINE = 'zattoo' - _HOST_URL = 'https://zattoo.com' - +class ZattooPlatformBaseIE(InfoExtractor): _power_guide_hash = None + def _host_url(self): + return 'https://%s' % (self._API_HOST if hasattr(self, '_API_HOST') else self._HOST) + def _login(self): username, password = self._get_login_info() if not username or not password: @@ -32,13 +33,13 @@ class ZattooBaseIE(InfoExtractor): try: data = self._download_json( - '%s/zapi/v2/account/login' % self._HOST_URL, None, 'Logging in', + '%s/zapi/v2/account/login' % self._host_url(), None, 'Logging in', data=urlencode_postdata({ 'login': username, 'password': password, 'remember': 'true', }), headers={ - 'Referer': '%s/login' % self._HOST_URL, + 'Referer': '%s/login' % self._host_url(), 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', }) except ExtractorError as e: @@ -52,7 +53,7 @@ class ZattooBaseIE(InfoExtractor): def _real_initialize(self): webpage = self._download_webpage( - self._HOST_URL, None, 'Downloading app token') + self._host_url(), None, 'Downloading app token') app_token = self._html_search_regex( r'appToken\s*=\s*(["\'])(?P<token>(?:(?!\1).)+?)\1', webpage, 'app token', group='token') @@ -61,7 +62,7 @@ class ZattooBaseIE(InfoExtractor): # Will setup appropriate cookies self._request_webpage( - '%s/zapi/v2/session/hello' % self._HOST_URL, None, + '%s/zapi/v2/session/hello' % self._host_url(), None, 'Opening session', data=urlencode_postdata({ 'client_app_token': app_token, 'uuid': compat_str(uuid4()), @@ -74,7 +75,7 @@ class ZattooBaseIE(InfoExtractor): def _extract_cid(self, video_id, channel_name): channel_groups = self._download_json( - '%s/zapi/v2/cached/channels/%s' % (self._HOST_URL, + '%s/zapi/v2/cached/channels/%s' % (self._host_url(), self._power_guide_hash), video_id, 'Downloading channel list', query={'details': False})['channel_groups'] @@ -92,28 +93,30 @@ class ZattooBaseIE(InfoExtractor): def _extract_cid_and_video_info(self, video_id): data = self._download_json( - '%s/zapi/program/details' % self._HOST_URL, + '%s/zapi/v2/cached/program/power_details/%s' % ( + self._host_url(), self._power_guide_hash), video_id, 'Downloading video information', query={ - 'program_id': video_id, - 'complete': True + 'program_ids': video_id, + 'complete': True, }) - p = data['program'] + p = data['programs'][0] cid = p['cid'] info_dict = { 'id': video_id, - 'title': p.get('title') or p['episode_title'], - 'description': p.get('description'), - 'thumbnail': p.get('image_url'), + 'title': p.get('t') or p['et'], + 'description': p.get('d'), + 'thumbnail': p.get('i_url'), 'creator': p.get('channel_name'), - 'episode': p.get('episode_title'), - 'episode_number': int_or_none(p.get('episode_number')), - 'season_number': int_or_none(p.get('season_number')), + 'episode': p.get('et'), + 'episode_number': int_or_none(p.get('e_no')), + 'season_number': int_or_none(p.get('s_no')), 'release_year': int_or_none(p.get('year')), - 'categories': try_get(p, lambda x: x['categories'], list), + 'categories': try_get(p, lambda x: x['c'], list), + 'tags': try_get(p, lambda x: x['g'], list) } return cid, info_dict @@ -125,11 +128,11 @@ class ZattooBaseIE(InfoExtractor): if is_live: postdata_common.update({'timeshift': 10800}) - url = '%s/zapi/watch/live/%s' % (self._HOST_URL, cid) + url = '%s/zapi/watch/live/%s' % (self._host_url(), cid) elif record_id: - url = '%s/zapi/watch/recording/%s' % (self._HOST_URL, record_id) + url = '%s/zapi/watch/recording/%s' % (self._host_url(), record_id) else: - url = '%s/zapi/watch/recall/%s/%s' % (self._HOST_URL, cid, video_id) + url = '%s/zapi/watch/recall/%s/%s' % (self._host_url(), cid, video_id) formats = [] for stream_type in ('dash', 'hls', 'hls5', 'hds'): @@ -150,8 +153,8 @@ class ZattooBaseIE(InfoExtractor): for watch in watch_urls: if not isinstance(watch, dict): continue - watch_url = watch.get('url') - if not watch_url or not isinstance(watch_url, compat_str): + watch_url = url_or_none(watch.get('url')) + if not watch_url: continue format_id_list = [stream_type] maxrate = watch.get('maxrate') @@ -200,13 +203,13 @@ class ZattooBaseIE(InfoExtractor): return info_dict -class QuicklineBaseIE(ZattooBaseIE): +class QuicklineBaseIE(ZattooPlatformBaseIE): _NETRC_MACHINE = 'quickline' - _HOST_URL = 'https://mobiltv.quickline.com' + _HOST = 'mobiltv.quickline.com' class QuicklineIE(QuicklineBaseIE): - _VALID_URL = r'https?://(?:www\.)?mobiltv\.quickline\.com/watch/(?P<channel>[^/]+)/(?P<id>[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?%s/watch/(?P<channel>[^/]+)/(?P<id>[0-9]+)' % re.escape(QuicklineBaseIE._HOST) _TEST = { 'url': 'https://mobiltv.quickline.com/watch/prosieben/130671867-maze-runner-die-auserwaehlten-in-der-brandwueste', @@ -219,7 +222,7 @@ class QuicklineIE(QuicklineBaseIE): class QuicklineLiveIE(QuicklineBaseIE): - _VALID_URL = r'https?://(?:www\.)?mobiltv\.quickline\.com/watch/(?P<id>[^/]+)' + _VALID_URL = r'https?://(?:www\.)?%s/watch/(?P<id>[^/]+)' % re.escape(QuicklineBaseIE._HOST) _TEST = { 'url': 'https://mobiltv.quickline.com/watch/srf1', @@ -235,8 +238,18 @@ class QuicklineLiveIE(QuicklineBaseIE): return self._extract_video(channel_name, video_id, is_live=True) +class ZattooBaseIE(ZattooPlatformBaseIE): + _NETRC_MACHINE = 'zattoo' + _HOST = 'zattoo.com' + + +def _make_valid_url(tmpl, host): + return tmpl % re.escape(host) + + class ZattooIE(ZattooBaseIE): - _VALID_URL = r'https?://(?:www\.)?zattoo\.com/watch/(?P<channel>[^/]+?)/(?P<id>[0-9]+)[^/]+(?:/(?P<recid>[0-9]+))?' + _VALID_URL_TEMPLATE = r'https?://(?:www\.)?%s/watch/(?P<channel>[^/]+?)/(?P<id>[0-9]+)[^/]+(?:/(?P<recid>[0-9]+))?' + _VALID_URL = _make_valid_url(_VALID_URL_TEMPLATE, ZattooBaseIE._HOST) # Since regular videos are only available for 7 days and recorded videos # are only available for a specific user, we cannot have detailed tests. @@ -268,3 +281,153 @@ class ZattooLiveIE(ZattooBaseIE): def _real_extract(self, url): channel_name = video_id = self._match_id(url) return self._extract_video(channel_name, video_id, is_live=True) + + +class NetPlusIE(ZattooIE): + _NETRC_MACHINE = 'netplus' + _HOST = 'netplus.tv' + _API_HOST = 'www.%s' % _HOST + _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) + + _TESTS = [{ + 'url': 'https://www.netplus.tv/watch/abc/123-abc', + 'only_matching': True, + }] + + +class MNetTVIE(ZattooIE): + _NETRC_MACHINE = 'mnettv' + _HOST = 'tvplus.m-net.de' + _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) + + _TESTS = [{ + 'url': 'https://tvplus.m-net.de/watch/abc/123-abc', + 'only_matching': True, + }] + + +class WalyTVIE(ZattooIE): + _NETRC_MACHINE = 'walytv' + _HOST = 'player.waly.tv' + _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) + + _TESTS = [{ + 'url': 'https://player.waly.tv/watch/abc/123-abc', + 'only_matching': True, + }] + + +class BBVTVIE(ZattooIE): + _NETRC_MACHINE = 'bbvtv' + _HOST = 'bbv-tv.net' + _API_HOST = 'www.%s' % _HOST + _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) + + _TESTS = [{ + 'url': 'https://www.bbv-tv.net/watch/abc/123-abc', + 'only_matching': True, + }] + + +class VTXTVIE(ZattooIE): + _NETRC_MACHINE = 'vtxtv' + _HOST = 'vtxtv.ch' + _API_HOST = 'www.%s' % _HOST + _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) + + _TESTS = [{ + 'url': 'https://www.vtxtv.ch/watch/abc/123-abc', + 'only_matching': True, + }] + + +class MyVisionTVIE(ZattooIE): + _NETRC_MACHINE = 'myvisiontv' + _HOST = 'myvisiontv.ch' + _API_HOST = 'www.%s' % _HOST + _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) + + _TESTS = [{ + 'url': 'https://www.myvisiontv.ch/watch/abc/123-abc', + 'only_matching': True, + }] + + +class GlattvisionTVIE(ZattooIE): + _NETRC_MACHINE = 'glattvisiontv' + _HOST = 'iptv.glattvision.ch' + _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) + + _TESTS = [{ + 'url': 'https://iptv.glattvision.ch/watch/abc/123-abc', + 'only_matching': True, + }] + + +class SAKTVIE(ZattooIE): + _NETRC_MACHINE = 'saktv' + _HOST = 'saktv.ch' + _API_HOST = 'www.%s' % _HOST + _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) + + _TESTS = [{ + 'url': 'https://www.saktv.ch/watch/abc/123-abc', + 'only_matching': True, + }] + + +class EWETVIE(ZattooIE): + _NETRC_MACHINE = 'ewetv' + _HOST = 'tvonline.ewe.de' + _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) + + _TESTS = [{ + 'url': 'https://tvonline.ewe.de/watch/abc/123-abc', + 'only_matching': True, + }] + + +class QuantumTVIE(ZattooIE): + _NETRC_MACHINE = 'quantumtv' + _HOST = 'quantum-tv.com' + _API_HOST = 'www.%s' % _HOST + _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) + + _TESTS = [{ + 'url': 'https://www.quantum-tv.com/watch/abc/123-abc', + 'only_matching': True, + }] + + +class OsnatelTVIE(ZattooIE): + _NETRC_MACHINE = 'osnateltv' + _HOST = 'tvonline.osnatel.de' + _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) + + _TESTS = [{ + 'url': 'https://tvonline.osnatel.de/watch/abc/123-abc', + 'only_matching': True, + }] + + +class EinsUndEinsTVIE(ZattooIE): + _NETRC_MACHINE = '1und1tv' + _HOST = '1und1.tv' + _API_HOST = 'www.%s' % _HOST + _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) + + _TESTS = [{ + 'url': 'https://www.1und1.tv/watch/abc/123-abc', + 'only_matching': True, + }] + + +class SaltTVIE(ZattooIE): + _NETRC_MACHINE = 'salttv' + _HOST = 'tv.salt.ch' + _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) + + _TESTS = [{ + 'url': 'https://tv.salt.ch/watch/abc/123-abc', + 'only_matching': True, + }] diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py index bb9020c91..afa3f6c47 100644 --- a/youtube_dl/extractor/zdf.py +++ b/youtube_dl/extractor/zdf.py @@ -15,6 +15,7 @@ from ..utils import ( try_get, unified_timestamp, update_url_query, + url_or_none, urljoin, ) @@ -67,8 +68,8 @@ class ZDFIE(ZDFBaseIE): def _extract_subtitles(src): subtitles = {} for caption in try_get(src, lambda x: x['captions'], list) or []: - subtitle_url = caption.get('uri') - if subtitle_url and isinstance(subtitle_url, compat_str): + subtitle_url = url_or_none(caption.get('uri')) + if subtitle_url: lang = caption.get('language', 'deu') subtitles.setdefault(lang, []).append({ 'url': subtitle_url, @@ -76,8 +77,8 @@ class ZDFIE(ZDFBaseIE): return subtitles def _extract_format(self, video_id, formats, format_urls, meta): - format_url = meta.get('url') - if not format_url or not isinstance(format_url, compat_str): + format_url = url_or_none(meta.get('url')) + if not format_url: return if format_url in format_urls: return @@ -152,7 +153,8 @@ class ZDFIE(ZDFBaseIE): content, lambda x: x['teaserImageRef']['layouts'], dict) if layouts: for layout_key, layout_url in layouts.items(): - if not isinstance(layout_url, compat_str): + layout_url = url_or_none(layout_url) + if not layout_url: continue thumbnail = { 'url': layout_url, diff --git a/youtube_dl/extractor/zype.py b/youtube_dl/extractor/zype.py new file mode 100644 index 000000000..3b16e703b --- /dev/null +++ b/youtube_dl/extractor/zype.py @@ -0,0 +1,57 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class ZypeIE(InfoExtractor): + _VALID_URL = r'https?://player\.zype\.com/embed/(?P<id>[\da-fA-F]+)\.js\?.*?api_key=[^&]+' + _TEST = { + 'url': 'https://player.zype.com/embed/5b400b834b32992a310622b9.js?api_key=jZ9GUhRmxcPvX7M3SlfejB6Hle9jyHTdk2jVxG7wOHPLODgncEKVdPYBhuz9iWXQ&autoplay=false&controls=true&da=false', + 'md5': 'eaee31d474c76a955bdaba02a505c595', + 'info_dict': { + 'id': '5b400b834b32992a310622b9', + 'ext': 'mp4', + 'title': 'Smoky Barbecue Favorites', + 'thumbnail': r're:^https?://.*\.jpe?g', + }, + } + + @staticmethod + def _extract_urls(webpage): + return [ + mobj.group('url') + for mobj in re.finditer( + r'<script[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//player\.zype\.com/embed/[\da-fA-F]+\.js\?.*?api_key=.+?)\1', + webpage)] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + title = self._search_regex( + r'video_title\s*[:=]\s*(["\'])(?P<value>(?:(?!\1).)+)\1', webpage, + 'title', group='value') + + m3u8_url = self._search_regex( + r'(["\'])(?P<url>(?:(?!\1).)+\.m3u8(?:(?!\1).)*)\1', webpage, + 'm3u8 url', group='url') + + formats = self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls') + self._sort_formats(formats) + + thumbnail = self._search_regex( + r'poster\s*[:=]\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, 'thumbnail', + default=False, group='url') + + return { + 'id': video_id, + 'title': title, + 'thumbnail': thumbnail, + 'formats': formats, + } diff --git a/youtube_dl/options.py b/youtube_dl/options.py index e83d546a0..e7d8e8910 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -841,11 +841,11 @@ def parseOpts(overrideArguments=None): postproc.add_option( '--prefer-avconv', action='store_false', dest='prefer_ffmpeg', - help='Prefer avconv over ffmpeg for running the postprocessors (default)') + help='Prefer avconv over ffmpeg for running the postprocessors') postproc.add_option( '--prefer-ffmpeg', action='store_true', dest='prefer_ffmpeg', - help='Prefer ffmpeg over avconv for running the postprocessors') + help='Prefer ffmpeg over avconv for running the postprocessors (default)') postproc.add_option( '--ffmpeg-location', '--avconv-location', metavar='PATH', dest='ffmpeg_location', diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py index 3ea1afcf3..5bcb00ac0 100644 --- a/youtube_dl/postprocessor/ffmpeg.py +++ b/youtube_dl/postprocessor/ffmpeg.py @@ -9,9 +9,6 @@ import re from .common import AudioConversionError, PostProcessor -from ..compat import ( - compat_subprocess_get_DEVNULL, -) from ..utils import ( encodeArgument, encodeFilename, @@ -77,7 +74,21 @@ class FFmpegPostProcessor(PostProcessor): def _determine_executables(self): programs = ['avprobe', 'avconv', 'ffmpeg', 'ffprobe'] - prefer_ffmpeg = False + prefer_ffmpeg = True + + def get_ffmpeg_version(path): + ver = get_exe_version(path, args=['-version']) + if ver: + regexs = [ + r'(?:\d+:)?([0-9.]+)-[0-9]+ubuntu[0-9.]+$', # Ubuntu, see [1] + r'n([0-9.]+)$', # Arch Linux + # 1. http://www.ducea.com/2006/06/17/ubuntu-package-version-naming-explanation/ + ] + for regex in regexs: + mobj = re.match(regex, ver) + if mobj: + ver = mobj.group(1) + return ver self.basename = None self.probe_basename = None @@ -85,7 +96,7 @@ class FFmpegPostProcessor(PostProcessor): self._paths = None self._versions = None if self._downloader: - prefer_ffmpeg = self._downloader.params.get('prefer_ffmpeg', False) + prefer_ffmpeg = self._downloader.params.get('prefer_ffmpeg', True) location = self._downloader.params.get('ffmpeg_location') if location is not None: if not os.path.exists(location): @@ -110,26 +121,25 @@ class FFmpegPostProcessor(PostProcessor): self._paths = dict( (p, os.path.join(location, p)) for p in programs) self._versions = dict( - (p, get_exe_version(self._paths[p], args=['-version'])) - for p in programs) + (p, get_ffmpeg_version(self._paths[p])) for p in programs) if self._versions is None: self._versions = dict( - (p, get_exe_version(p, args=['-version'])) for p in programs) + (p, get_ffmpeg_version(p)) for p in programs) self._paths = dict((p, p) for p in programs) - if prefer_ffmpeg: - prefs = ('ffmpeg', 'avconv') - else: + if prefer_ffmpeg is False: prefs = ('avconv', 'ffmpeg') + else: + prefs = ('ffmpeg', 'avconv') for p in prefs: if self._versions[p]: self.basename = p break - if prefer_ffmpeg: - prefs = ('ffprobe', 'avprobe') - else: + if prefer_ffmpeg is False: prefs = ('avprobe', 'ffprobe') + else: + prefs = ('ffprobe', 'avprobe') for p in prefs: if self._versions[p]: self.probe_basename = p @@ -152,27 +162,45 @@ class FFmpegPostProcessor(PostProcessor): return self._paths[self.probe_basename] def get_audio_codec(self, path): - if not self.probe_available: - raise PostProcessingError('ffprobe or avprobe not found. Please install one.') + if not self.probe_available and not self.available: + raise PostProcessingError('ffprobe/avprobe and ffmpeg/avconv not found. Please install one.') try: - cmd = [ - encodeFilename(self.probe_executable, True), - encodeArgument('-show_streams'), - encodeFilename(self._ffmpeg_filename_argument(path), True)] + if self.probe_available: + cmd = [ + encodeFilename(self.probe_executable, True), + encodeArgument('-show_streams')] + else: + cmd = [ + encodeFilename(self.executable, True), + encodeArgument('-i')] + cmd.append(encodeFilename(self._ffmpeg_filename_argument(path), True)) if self._downloader.params.get('verbose', False): - self._downloader.to_screen('[debug] %s command line: %s' % (self.basename, shell_quote(cmd))) - handle = subprocess.Popen(cmd, stderr=compat_subprocess_get_DEVNULL(), stdout=subprocess.PIPE, stdin=subprocess.PIPE) - output = handle.communicate()[0] - if handle.wait() != 0: + self._downloader.to_screen( + '[debug] %s command line: %s' % (self.basename, shell_quote(cmd))) + handle = subprocess.Popen( + cmd, stderr=subprocess.PIPE, + stdout=subprocess.PIPE, stdin=subprocess.PIPE) + stdout_data, stderr_data = handle.communicate() + expected_ret = 0 if self.probe_available else 1 + if handle.wait() != expected_ret: return None except (IOError, OSError): return None - audio_codec = None - for line in output.decode('ascii', 'ignore').split('\n'): - if line.startswith('codec_name='): - audio_codec = line.split('=')[1].strip() - elif line.strip() == 'codec_type=audio' and audio_codec is not None: - return audio_codec + output = (stdout_data if self.probe_available else stderr_data).decode('ascii', 'ignore') + if self.probe_available: + audio_codec = None + for line in output.split('\n'): + if line.startswith('codec_name='): + audio_codec = line.split('=')[1].strip() + elif line.strip() == 'codec_type=audio' and audio_codec is not None: + return audio_codec + else: + # Stream #FILE_INDEX:STREAM_INDEX[STREAM_ID](LANGUAGE): CODEC_TYPE: CODEC_NAME + mobj = re.search( + r'Stream\s*#\d+:\d+(?:\[0x[0-9a-f]+\])?(?:\([a-z]{3}\))?:\s*Audio:\s*([0-9a-z]+)', + output) + if mobj: + return mobj.group(1) return None def run_ffmpeg_multiple_files(self, input_paths, out_path, opts): @@ -189,10 +217,13 @@ class FFmpegPostProcessor(PostProcessor): encodeArgument('-i'), encodeFilename(self._ffmpeg_filename_argument(path), True) ]) - cmd = ([encodeFilename(self.executable, True), encodeArgument('-y')] + - files_cmd + - [encodeArgument(o) for o in opts] + - [encodeFilename(self._ffmpeg_filename_argument(out_path), True)]) + cmd = [encodeFilename(self.executable, True), encodeArgument('-y')] + # avconv does not have repeat option + if self.basename == 'ffmpeg': + cmd += [encodeArgument('-loglevel'), encodeArgument('repeat+info')] + cmd += (files_cmd + + [encodeArgument(o) for o in opts] + + [encodeFilename(self._ffmpeg_filename_argument(out_path), True)]) if self._downloader.params.get('verbose', False): self._downloader.to_screen('[debug] ffmpeg command line: %s' % shell_quote(cmd)) @@ -379,14 +410,16 @@ class FFmpegEmbedSubtitlePP(FFmpegPostProcessor): # Don't copy the existing subtitles, we may be running the # postprocessor a second time '-map', '-0:s', + # Don't copy Apple TV chapters track, bin_data (see #19042, #19024, + # https://trac.ffmpeg.org/ticket/6016) + '-map', '-0:d', ] if information['ext'] == 'mp4': opts += ['-c:s', 'mov_text'] for (i, lang) in enumerate(sub_langs): opts.extend(['-map', '%d:0' % (i + 1)]) - lang_code = ISO639Utils.short2long(lang) - if lang_code is not None: - opts.extend(['-metadata:s:s:%d' % i, 'language=%s' % lang_code]) + lang_code = ISO639Utils.short2long(lang) or lang + opts.extend(['-metadata:s:s:%d' % i, 'language=%s' % lang_code]) temp_filename = prepend_extension(filename, 'temp') self._downloader.to_screen('[ffmpeg] Embedding subtitles in \'%s\'' % filename) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 6a3199fb9..f5a0bb4b0 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -39,6 +39,7 @@ from .compat import ( compat_HTMLParser, compat_basestring, compat_chr, + compat_cookiejar, compat_ctypes_WINFUNCTYPE, compat_etree_fromstring, compat_expanduser, @@ -49,7 +50,6 @@ from .compat import ( compat_os_name, compat_parse_qs, compat_shlex_quote, - compat_socket_create_connection, compat_str, compat_struct_pack, compat_struct_unpack, @@ -82,7 +82,7 @@ def register_socks_protocols(): compiled_regex_type = type(re.compile('')) std_headers = { - 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:59.0) Gecko/20100101 Firefox/59.0 (Chrome)', + 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:59.0) Gecko/20100101 Firefox/59.0', 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Encoding': 'gzip, deflate', @@ -184,6 +184,7 @@ DATE_FORMATS_MONTH_FIRST.extend([ ]) PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)" +JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>' def preferredencoding(): @@ -881,13 +882,51 @@ def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs): kwargs['strict'] = True hc = http_class(*args, **compat_kwargs(kwargs)) source_address = ydl_handler._params.get('source_address') + if source_address is not None: + # This is to workaround _create_connection() from socket where it will try all + # address data from getaddrinfo() including IPv6. This filters the result from + # getaddrinfo() based on the source_address value. + # This is based on the cpython socket.create_connection() function. + # https://github.com/python/cpython/blob/master/Lib/socket.py#L691 + def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None): + host, port = address + err = None + addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM) + af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6 + ip_addrs = [addr for addr in addrs if addr[0] == af] + if addrs and not ip_addrs: + ip_version = 'v4' if af == socket.AF_INET else 'v6' + raise socket.error( + "No remote IP%s addresses available for connect, can't use '%s' as source address" + % (ip_version, source_address[0])) + for res in ip_addrs: + af, socktype, proto, canonname, sa = res + sock = None + try: + sock = socket.socket(af, socktype, proto) + if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT: + sock.settimeout(timeout) + sock.bind(source_address) + sock.connect(sa) + err = None # Explicitly break reference cycle + return sock + except socket.error as _: + err = _ + if sock is not None: + sock.close() + if err is not None: + raise err + else: + raise socket.error('getaddrinfo returns an empty list') + if hasattr(hc, '_create_connection'): + hc._create_connection = _create_connection sa = (source_address, 0) if hasattr(hc, 'source_address'): # Python 2.7+ hc.source_address = sa else: # Python 2.6 def _hc_connect(self, *args, **kwargs): - sock = compat_socket_create_connection( + sock = _create_connection( (self.host, self.port), self.timeout, sa) if is_https: self.sock = ssl.wrap_socket( @@ -1101,6 +1140,33 @@ class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler): req, **kwargs) +class YoutubeDLCookieJar(compat_cookiejar.MozillaCookieJar): + def save(self, filename=None, ignore_discard=False, ignore_expires=False): + # Store session cookies with `expires` set to 0 instead of an empty + # string + for cookie in self: + if cookie.expires is None: + cookie.expires = 0 + compat_cookiejar.MozillaCookieJar.save(self, filename, ignore_discard, ignore_expires) + + def load(self, filename=None, ignore_discard=False, ignore_expires=False): + compat_cookiejar.MozillaCookieJar.load(self, filename, ignore_discard, ignore_expires) + # Session cookies are denoted by either `expires` field set to + # an empty string or 0. MozillaCookieJar only recognizes the former + # (see [1]). So we need force the latter to be recognized as session + # cookies on our own. + # Session cookies may be important for cookies-based authentication, + # e.g. usually, when user does not check 'Remember me' check box while + # logging in on a site, some important cookies are stored as session + # cookies so that not recognizing them will result in failed login. + # 1. https://bugs.python.org/issue17164 + for cookie in self: + # Treat `expires=0` cookies as session cookies + if cookie.expires == 0: + cookie.expires = None + cookie.discard = True + + class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor): def __init__(self, cookiejar=None): compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar) @@ -1802,7 +1868,7 @@ def urljoin(base, path): path = path.decode('utf-8') if not isinstance(path, compat_str) or not path: return None - if re.match(r'^(?:https?:)?//', path): + if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path): return path if isinstance(base, bytes): base = base.decode('utf-8') @@ -1865,6 +1931,13 @@ def strip_or_none(v): return None if v is None else v.strip() +def url_or_none(url): + if not url or not isinstance(url, compat_str): + return None + url = url.strip() + return url if re.match(r'^(?:[a-zA-Z][\da-zA-Z.+-]*:)?//', url) else None + + def parse_duration(s): if not isinstance(s, compat_basestring): return None @@ -2281,7 +2354,7 @@ def parse_age_limit(s): def strip_jsonp(code): return re.sub( r'''(?sx)^ - (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]+) + (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*) (?:\s*&&\s*(?P=func_name))? \s*\(\s*(?P<callback_data>.*)\);? \s*?(?://[^\n]*)*$''', @@ -2432,7 +2505,7 @@ def parse_codecs(codecs_str): vcodec, acodec = None, None for full_codec in splited_codecs: codec = full_codec.split('.')[0] - if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v', 'hvc1'): + if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v', 'hvc1', 'av01'): if not vcodec: vcodec = full_codec elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'): @@ -2895,6 +2968,7 @@ class ISO639Utils(object): 'gv': 'glv', 'ha': 'hau', 'he': 'heb', + 'iw': 'heb', # Replaced by he in 1989 revision 'hi': 'hin', 'ho': 'hmo', 'hr': 'hrv', @@ -2904,6 +2978,7 @@ class ISO639Utils(object): 'hz': 'her', 'ia': 'ina', 'id': 'ind', + 'in': 'ind', # Replaced by id in 1989 revision 'ie': 'ile', 'ig': 'ibo', 'ii': 'iii', @@ -3018,6 +3093,7 @@ class ISO639Utils(object): 'wo': 'wol', 'xh': 'xho', 'yi': 'yid', + 'ji': 'yid', # Replaced by yi in 1989 revision 'yo': 'yor', 'za': 'zha', 'zh': 'zho', @@ -3561,7 +3637,7 @@ class PerRequestProxyHandler(compat_urllib_request.ProxyHandler): setattr(self, '%s_open' % type, lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open: meth(r, proxy, type)) - return compat_urllib_request.ProxyHandler.__init__(self, proxies) + compat_urllib_request.ProxyHandler.__init__(self, proxies) def proxy_open(self, req, proxy, type): req_proxy = req.headers.get('Ytdl-request-proxy') @@ -3903,8 +3979,12 @@ def write_xattr(path, key, value): def random_birthday(year_field, month_field, day_field): + start_date = datetime.date(1950, 1, 1) + end_date = datetime.date(1995, 12, 31) + offset = random.randint(0, (end_date - start_date).days) + random_date = start_date + datetime.timedelta(offset) return { - year_field: str(random.randint(1950, 1995)), - month_field: str(random.randint(1, 12)), - day_field: str(random.randint(1, 31)), + year_field: str(random_date.year), + month_field: str(random_date.month), + day_field: str(random_date.day), } diff --git a/youtube_dl/version.py b/youtube_dl/version.py index e72f42cf2..ea1d5a4a5 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2018.06.11' +__version__ = '2019.02.18'