diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 7241840c5..8914569b6 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.08.06*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.08.06** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.02.01*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.02.01** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ $ youtube-dl -v [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2016.08.06 +[debug] youtube-dl version 2017.02.01 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} @@ -50,9 +50,11 @@ $ youtube-dl -v - Single video: https://youtu.be/BaW_jenozKc - Playlist: https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc +Note that **youtube-dl does not support sites dedicated to [copyright infringement](https://github.com/rg3/youtube-dl#can-you-add-support-for-this-anime-video-site-or-site-which-shows-current-movies-for-free)**. In order for site support request to be accepted all provided example URLs should not violate any copyrights. + --- ### Description of your *issue*, suggested solution and other information Explanation of your *issue* in arbitrary form goes here. Please make sure the [description is worded well enough to be understood](https://github.com/rg3/youtube-dl#is-the-description-of-the-issue-itself-sufficient). Provide as much context and examples as possible. -If work on your *issue* required an account credentials please provide them or explain how one can obtain them. +If work on your *issue* requires account credentials please provide them or explain how one can obtain them. diff --git a/.github/ISSUE_TEMPLATE_tmpl.md b/.github/ISSUE_TEMPLATE_tmpl.md index a5e6a4233..df79503d3 100644 --- a/.github/ISSUE_TEMPLATE_tmpl.md +++ b/.github/ISSUE_TEMPLATE_tmpl.md @@ -50,9 +50,11 @@ $ youtube-dl -v - Single video: https://youtu.be/BaW_jenozKc - Playlist: https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc +Note that **youtube-dl does not support sites dedicated to [copyright infringement](https://github.com/rg3/youtube-dl#can-you-add-support-for-this-anime-video-site-or-site-which-shows-current-movies-for-free)**. In order for site support request to be accepted all provided example URLs should not violate any copyrights. + --- ### Description of your *issue*, suggested solution and other information Explanation of your *issue* in arbitrary form goes here. Please make sure the [description is worded well enough to be understood](https://github.com/rg3/youtube-dl#is-the-description-of-the-issue-itself-sufficient). Provide as much context and examples as possible. -If work on your *issue* required an account credentials please provide them or explain how one can obtain them. +If work on your *issue* requires account credentials please provide them or explain how one can obtain them. diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index f24bb4b09..46fa26f02 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -10,8 +10,13 @@ - [ ] At least skimmed through [adding new extractor tutorial](https://github.com/rg3/youtube-dl#adding-support-for-a-new-site) and [youtube-dl coding conventions](https://github.com/rg3/youtube-dl#youtube-dl-coding-conventions) sections - [ ] [Searched](https://github.com/rg3/youtube-dl/search?q=is%3Apr&type=Issues) the bugtracker for similar pull requests +### In order to be accepted and merged into youtube-dl each piece of code must be in public domain or released under [Unlicense](http://unlicense.org/). Check one of the following options: +- [ ] I am the original author of this code and I am willing to release it under [Unlicense](http://unlicense.org/) +- [ ] I am not the original author of this code but it is in public domain or released under [Unlicense](http://unlicense.org/) (provide reliable evidence) + ### What is the purpose of your *pull request*? - [ ] Bug fix +- [ ] Improvement - [ ] New extractor - [ ] New feature diff --git a/.gitignore b/.gitignore index a802c75a1..9ce4b5e2d 100644 --- a/.gitignore +++ b/.gitignore @@ -29,6 +29,11 @@ updates_key.pem *.m4a *.m4v *.mp3 +*.3gp +*.wav +*.ape +*.mkv +*.swf *.part *.swp test/testdata diff --git a/AUTHORS b/AUTHORS index 890c827a0..f2875d504 100644 --- a/AUTHORS +++ b/AUTHORS @@ -26,7 +26,7 @@ Albert Kim Pierre Rudloff Huarong Huo Ismael Mejía -Steffan 'Ruirize' James +Steffan Donal Andras Elso Jelle van der Waa Marcin Cieślak @@ -179,3 +179,25 @@ Jakub Adam Wieczorek Aleksandar Topuzović Nehal Patel Rob van Bekkum +Petr Zvoníček +Pratyush Singh +Aleksander Nitecki +Sebastian Blunt +Matěj Cepl +Xie Yanbo +Philip Xu +John Hawkinson +Rich Leeper +Zhong Jianxin +Thor77 +Mattias Wadman +Arjan Verwer +Costy Petrisor +Logan B +Alex Seiler +Vijay Singh +Paul Hartmann +Stephen Chen +Fabian Stahl +Bagira +Odd Stråbø diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index fbf0ab7e8..d606eab0e 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -12,7 +12,7 @@ $ youtube-dl -v [debug] Proxy map: {} ... ``` -**Do not post screenshots of verbose log only plain text is acceptable.** +**Do not post screenshots of verbose logs; only plain text is acceptable.** The output (including the first lines) contains important debugging information. Issues without the full output are often not reproducible and therefore do not get solved in short order, if ever. @@ -46,7 +46,7 @@ Make sure that someone has not already opened the issue you're trying to open. S ### Why are existing options not enough? -Before requesting a new feature, please have a quick peek at [the list of supported options](https://github.com/rg3/youtube-dl/blob/master/README.md#synopsis). Many feature requests are for features that actually exist already! Please, absolutely do show off your work in the issue report and detail how the existing similar options do *not* solve your problem. +Before requesting a new feature, please have a quick peek at [the list of supported options](https://github.com/rg3/youtube-dl/blob/master/README.md#options). Many feature requests are for features that actually exist already! Please, absolutely do show off your work in the issue report and detail how the existing similar options do *not* solve your problem. ### Is there enough context in your bug report? @@ -58,7 +58,7 @@ We are then presented with a very complicated request when the original problem Some of our users seem to think there is a limit of issues they can or should open. There is no limit of issues they can or should open. While it may seem appealing to be able to dump all your issues into one ticket, that means that someone who solves one of your issues cannot mark the issue as closed. Typically, reporting a bunch of issues leads to the ticket lingering since nobody wants to attack that behemoth, until someone mercifully splits the issue into multiple ones. -In particular, every site support request issue should only pertain to services at one site (generally under a common domain, but always using the same backend technology). Do not request support for vimeo user videos, Whitehouse podcasts, and Google Plus pages in the same issue. Also, make sure that you don't post bug reports alongside feature requests. As a rule of thumb, a feature request does not include outputs of youtube-dl that are not immediately related to the feature at hand. Do not post reports of a network error alongside the request for a new video service. +In particular, every site support request issue should only pertain to services at one site (generally under a common domain, but always using the same backend technology). Do not request support for vimeo user videos, White house podcasts, and Google Plus pages in the same issue. Also, make sure that you don't post bug reports alongside feature requests. As a rule of thumb, a feature request does not include outputs of youtube-dl that are not immediately related to the feature at hand. Do not post reports of a network error alongside the request for a new video service. ### Is anyone going to need the feature? @@ -66,7 +66,7 @@ Only post features that you (or an incapacitated friend you can personally talk ### Is your question about youtube-dl? -It may sound strange, but some bug reports we receive are completely unrelated to youtube-dl and relate to a different or even the reporter's own application. Please make sure that you are actually using youtube-dl. If you are using a UI for youtube-dl, report the bug to the maintainer of the actual application providing the UI. On the other hand, if your UI for youtube-dl fails in some way you believe is related to youtube-dl, by all means, go ahead and report the bug. +It may sound strange, but some bug reports we receive are completely unrelated to youtube-dl and relate to a different, or even the reporter's own, application. Please make sure that you are actually using youtube-dl. If you are using a UI for youtube-dl, report the bug to the maintainer of the actual application providing the UI. On the other hand, if your UI for youtube-dl fails in some way you believe is related to youtube-dl, by all means, go ahead and report the bug. # DEVELOPER INSTRUCTIONS @@ -85,16 +85,16 @@ To run the test, simply invoke your favorite test runner, or execute a test file If you want to create a build of youtube-dl yourself, you'll need * python -* make (both GNU make and BSD make are supported) +* make (only GNU make is supported) * pandoc * zip * nosetests ### Adding support for a new site -If you want to add support for a new site, first of all **make sure** this site is **not dedicated to [copyright infringement](#can-you-add-support-for-this-anime-video-site-or-site-which-shows-current-movies-for-free)**. youtube-dl does **not support** such sites thus pull requests adding support for them **will be rejected**. +If you want to add support for a new site, first of all **make sure** this site is **not dedicated to [copyright infringement](README.md#can-you-add-support-for-this-anime-video-site-or-site-which-shows-current-movies-for-free)**. youtube-dl does **not support** such sites thus pull requests adding support for them **will be rejected**. -After you have ensured this site is distributing it's content legally, you can follow this quick list (assuming your service is called `yourextractor`): +After you have ensured this site is distributing its content legally, you can follow this quick list (assuming your service is called `yourextractor`): 1. [Fork this repository](https://github.com/rg3/youtube-dl/fork) 2. Check out the source code with: @@ -124,7 +124,7 @@ After you have ensured this site is distributing it's content legally, you can f 'id': '42', 'ext': 'mp4', 'title': 'Video title goes here', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', # TODO more properties, either as: # * A value # * MD5 checksum; start the string with md5: @@ -167,19 +167,19 @@ In any case, thank you very much for your contributions! This section introduces a guide lines for writing idiomatic, robust and future-proof extractor code. -Extractors are very fragile by nature since they depend on the layout of the source data provided by 3rd party media hoster out of your control and this layout tend to change. As an extractor implementer your task is not only to write code that will extract media links and metadata correctly but also to minimize code dependency on source's layout changes and even to make the code foresee potential future changes and be ready for that. This is important because it will allow extractor not to break on minor layout changes thus keeping old youtube-dl versions working. Even though this breakage issue is easily fixed by emitting a new version of youtube-dl with fix incorporated all the previous version become broken in all repositories and distros' packages that may not be so prompt in fetching the update from us. Needless to say some may never receive an update at all that is possible for non rolling release distros. +Extractors are very fragile by nature since they depend on the layout of the source data provided by 3rd party media hosters out of your control and this layout tends to change. As an extractor implementer your task is not only to write code that will extract media links and metadata correctly but also to minimize dependency on the source's layout and even to make the code foresee potential future changes and be ready for that. This is important because it will allow the extractor not to break on minor layout changes thus keeping old youtube-dl versions working. Even though this breakage issue is easily fixed by emitting a new version of youtube-dl with a fix incorporated, all the previous versions become broken in all repositories and distros' packages that may not be so prompt in fetching the update from us. Needless to say, some non rolling release distros may never receive an update at all. ### Mandatory and optional metafields -For extraction to work youtube-dl relies on metadata your extractor extracts and provides to youtube-dl expressed by [information dictionary](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L75-L257) or simply *info dict*. Only the following meta fields in *info dict* are considered mandatory for successful extraction process by youtube-dl: +For extraction to work youtube-dl relies on metadata your extractor extracts and provides to youtube-dl expressed by an [information dictionary](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L75-L257) or simply *info dict*. Only the following meta fields in the *info dict* are considered mandatory for a successful extraction process by youtube-dl: - `id` (media identifier) - `title` (media title) - `url` (media download URL) or `formats` -In fact only the last option is technically mandatory (i.e. if you can't figure out the download location of the media the extraction does not make any sense). But by convention youtube-dl also treats `id` and `title` to be mandatory. Thus aforementioned metafields are the critical data the extraction does not make any sense without and if any of them fail to be extracted then extractor is considered completely broken. +In fact only the last option is technically mandatory (i.e. if you can't figure out the download location of the media the extraction does not make any sense). But by convention youtube-dl also treats `id` and `title` as mandatory. Thus the aforementioned metafields are the critical data that the extraction does not make any sense without and if any of them fail to be extracted then the extractor is considered completely broken. -[Any field](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L149-L257) apart from the aforementioned ones are considered **optional**. That means that extraction should be **tolerate** to situations when sources for these fields can potentially be unavailable (even if they are always available at the moment) and **future-proof** in order not to break the extraction of general purpose mandatory fields. +[Any field](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L149-L257) apart from the aforementioned ones are considered **optional**. That means that extraction should be **tolerant** to situations when sources for these fields can potentially be unavailable (even if they are always available at the moment) and **future-proof** in order not to break the extraction of general purpose mandatory fields. #### Example @@ -199,7 +199,7 @@ Assume at this point `meta`'s layout is: } ``` -Assume you want to extract `summary` and put into resulting info dict as `description`. Since `description` is optional metafield you should be ready that this key may be missing from the `meta` dict, so that you should extract it like: +Assume you want to extract `summary` and put it into the resulting info dict as `description`. Since `description` is an optional meta field you should be ready that this key may be missing from the `meta` dict, so that you should extract it like: ```python description = meta.get('summary') # correct @@ -211,7 +211,7 @@ and not like: description = meta['summary'] # incorrect ``` -The latter will break extraction process with `KeyError` if `summary` disappears from `meta` at some time later but with former approach extraction will just go ahead with `description` set to `None` that is perfectly fine (remember `None` is equivalent for absence of data). +The latter will break extraction process with `KeyError` if `summary` disappears from `meta` at some later time but with the former approach extraction will just go ahead with `description` set to `None` which is perfectly fine (remember `None` is equivalent to the absence of data). Similarly, you should pass `fatal=False` when extracting optional data from a webpage with `_search_regex`, `_html_search_regex` or similar methods, for instance: @@ -231,21 +231,21 @@ description = self._search_regex( webpage, 'description', default=None) ``` -On failure this code will silently continue the extraction with `description` set to `None`. That is useful for metafields that are known to may or may not be present. +On failure this code will silently continue the extraction with `description` set to `None`. That is useful for metafields that may or may not be present. ### Provide fallbacks -When extracting metadata try to provide several scenarios for that. For example if `title` is present in several places/sources try extracting from at least some of them. This would make it more future-proof in case some of the sources became unavailable. +When extracting metadata try to do so from multiple sources. For example if `title` is present in several places, try extracting from at least some of them. This makes it more future-proof in case some of the sources become unavailable. #### Example -Say `meta` from previous example has a `title` and you are about to extract it. Since `title` is mandatory meta field you should end up with something like: +Say `meta` from the previous example has a `title` and you are about to extract it. Since `title` is a mandatory meta field you should end up with something like: ```python title = meta['title'] ``` -If `title` disappeares from `meta` in future due to some changes on hoster's side the extraction would fail since `title` is mandatory. That's expected. +If `title` disappears from `meta` in future due to some changes on the hoster's side the extraction would fail since `title` is mandatory. That's expected. Assume that you have some another source you can extract `title` from, for example `og:title` HTML meta of a `webpage`. In this case you can provide a fallback scenario: @@ -282,7 +282,7 @@ title = self._search_regex( webpage, 'title', group='title') ``` -Note how you tolerate potential changes in `style` attribute's value or switch from using double quotes to single for `class` attribute: +Note how you tolerate potential changes in the `style` attribute's value or switch from using double quotes to single for `class` attribute: The code definitely should not look like: diff --git a/ChangeLog b/ChangeLog index 9fd78cdda..947590b94 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,1049 @@ +version + +Core ++ Add --playlist-random to shuffle playlists (#11889, #11901) + +Extractors ++ [infoq] Add audio only formats (#11565) +* [youtube] Fix ytsearch when cookies are provided (#11924) ++ [bilibili] Support new Bangumi URLs (#11845) + + +version 2017.02.01 + +Extractors ++ [facebook] Add another fallback extraction scenario (#11926) +* [prosiebensat1] Fix extraction of descriptions (#11810, #11929) +- [crunchyroll] Remove ScaledBorderAndShadow settings (#9028) ++ [vimeo] Extract upload timestamp ++ [vimeo] Extract license (#8726, #11880) ++ [nrk:series] Add support for series (#11571, #11711) ++ [elpais] Fix extraction for some URLs (#11765) + + +version 2017.01.31 + +Core ++ [compat] Add compat_etree_register_namespace + +Extractors +* [youtube] Fix extraction for domainless player URLs (#11890, #11891, #11892, + #11894, #11895, #11897, #11900, #11903, #11904, #11906, #11907, #11909, + #11913, #11914, #11915, #11916, #11917, #11918, #11919) ++ [vimeo] Extract both mixed and separated DASH formats ++ [ruutu] Extract DASH formats +* [itv] Fix extraction for python 2.6 + + +version 2017.01.29 + +Core +* [extractor/common] Fix initialization template (#11605, #11825) ++ [extractor/common] Document fragment_base_url and fragment's path fields +* [extractor/common] Fix duration per DASH segment (#11868) ++ Introduce --autonumber-start option for initial value of %(autonumber)s + template (#727, #2702, #9362, #10457, #10529, #11862) + +Extractors ++ [azmedien:playlist] Add support for topic and themen playlists (#11817) +* [npo] Fix subtitles extraction ++ [itv] Extract subtitles ++ [itv] Add support for itv.com (#9240) ++ [mtv81] Add support for mtv81.com (#7619) ++ [vlive] Add support for channels (#11826) ++ [kaltura] Add fallback for fileExt ++ [kaltura] Improve uploader_id extraction ++ [konserthusetplay] Add support for rspoplay.se (#11828) + + +version 2017.01.28 + +Core +* [utils] Improve parse_duration + +Extractors +* [crunchyroll] Improve series and season metadata extraction (#11832) +* [soundcloud] Improve formats extraction and extract audio bitrate ++ [soundcloud] Extract HLS formats +* [soundcloud] Fix track URL extraction (#11852) ++ [twitch:vod] Expand URL regular expressions (#11846) +* [aenetworks] Fix season episodes extraction (#11669) ++ [tva] Add support for videos.tva.ca (#11842) +* [jamendo] Improve and extract more metadata (#11836) ++ [disney] Add support for Disney sites (#7409, #11801, #4975, #11000) +* [vevo] Remove request to old API and catch API v2 errors ++ [cmt,mtv,southpark] Add support for episode URLs (#11837) ++ [youtube] Add fallback for duration extraction (#11841) + + +version 2017.01.25 + +Extractors ++ [openload] Fallback video extension to mp4 ++ [extractor/generic] Add support for Openload embeds (#11536, #11812) +* [srgssr] Fix rts video extraction (#11831) ++ [afreecatv:global] Add support for afreeca.tv (#11807) ++ [crackle] Extract vtt subtitles ++ [crackle] Extract multiple resolutions for thumbnails ++ [crackle] Add support for mobile URLs ++ [konserthusetplay] Extract subtitles (#11823) ++ [konserthusetplay] Add support for HLS videos (#11823) +* [vimeo:review] Fix config URL extraction (#11821) + + +version 2017.01.24 + +Extractors +* [pluralsight] Fix extraction (#11820) ++ [nextmedia] Add support for NextTV (壹電視) +* [24video] Fix extraction (#11811) +* [youtube:playlist] Fix nonexistent and private playlist detection (#11604) ++ [chirbit] Extract uploader (#11809) + + +version 2017.01.22 + +Extractors ++ [pornflip] Add support for pornflip.com (#11556, #11795) +* [chaturbate] Fix extraction (#11797, #11802) ++ [azmedien] Add support for AZ Medien sites (#11784, #11785) ++ [nextmedia] Support redirected URLs ++ [vimeo:channel] Extract videos' titles for playlist entries (#11796) ++ [youtube] Extract episode metadata (#9695, #11774) ++ [cspan] Support Ustream embedded videos (#11547) ++ [1tv] Add support for HLS videos (#11786) +* [uol] Fix extraction (#11770) +* [mtv] Relax triforce feed regular expression (#11766) + + +version 2017.01.18 + +Extractors +* [bilibili] Fix extraction (#11077) ++ [canalplus] Add fallback for video id (#11764) +* [20min] Fix extraction (#11683, #11751) +* [imdb] Extend URL regular expression (#11744) ++ [naver] Add support for tv.naver.com links (#11743) + + +version 2017.01.16 + +Core +* [options] Apply custom config to final composite configuration (#11741) +* [YoutubeDL] Improve protocol auto determining (#11720) + +Extractors +* [xiami] Relax URL regular expressions +* [xiami] Improve track metadata extraction (#11699) ++ [limelight] Check hand-make direct HTTP links ++ [limelight] Add support for direct HTTP links at video.llnw.net (#11737) ++ [brightcove] Recognize another player ID pattern (#11688) ++ [niconico] Support login via cookies (#7968) +* [yourupload] Fix extraction (#11601) ++ [beam:live] Add support for beam.pro live streams (#10702, #11596) +* [vevo] Improve geo restriction detection ++ [dramafever] Add support for URLs with language code (#11714) +* [cbc] Improve playlist support (#11704) + + +version 2017.01.14 + +Core ++ [common] Add ability to customize akamai manifest host ++ [utils] Add more date formats + +Extractors +- [mtv] Eliminate _transform_rtmp_url +* [mtv] Generalize triforce mgid extraction ++ [cmt] Add support for full episodes and video clips (#11623) ++ [mitele] Extract DASH formats ++ [ooyala] Add support for videos with embedToken (#11684) +* [mixcloud] Fix extraction (#11674) +* [openload] Fix extraction (#10408) +* [tv4] Improve extraction (#11698) +* [freesound] Fix and improve extraction (#11602) ++ [nick] Add support for beta.nick.com (#11655) +* [mtv,cc] Use HLS by default with native HLS downloader (#11641) +* [mtv] Fix non-HLS extraction + + +version 2017.01.10 + +Extractors +* [youtube] Fix extraction (#11663, #11664) ++ [inc] Add support for inc.com (#11277, #11647) ++ [youtube] Add itag 212 (#11575) ++ [egghead:course] Add support for egghead.io courses + + +version 2017.01.08 + +Core +* Fix "invalid escape sequence" errors under Python 3.6 (#11581) + +Extractors ++ [hitrecord] Add support for hitrecord.org (#10867, #11626) +- [videott] Remove extractor +* [swrmediathek] Improve extraction +- [sharesix] Remove extractor +- [aol:features] Remove extractor +* [sendtonews] Improve info extraction +* [3sat,phoenix] Fix extraction (#11619) +* [comedycentral/mtv] Add support for HLS videos (#11600) +* [discoverygo] Fix JSON data parsing (#11219, #11522) + + +version 2017.01.05 + +Extractors ++ [zdf] Fix extraction (#11055, #11063) +* [pornhub:playlist] Improve extraction (#11594) ++ [cctv] Add support for ncpa-classic.com (#11591) ++ [tunein] Add support for embeds (#11579) + + +version 2017.01.02 + +Extractors +* [cctv] Improve extraction (#879, #6753, #8541) ++ [nrktv:episodes] Add support for episodes (#11571) ++ [arkena] Add support for video.arkena.com (#11568) + + +version 2016.12.31 + +Core ++ Introduce --config-location option for custom configuration files (#6745, + #10648) + +Extractors ++ [twitch] Add support for player.twitch.tv (#11535, #11537) ++ [videa] Add support for videa.hu (#8181, #11133) +* [vk] Fix postlive videos extraction +* [vk] Extract from playerParams (#11555) +- [freevideo] Remove extractor (#11515) ++ [showroomlive] Add support for showroom-live.com (#11458) +* [xhamster] Fix duration extraction (#11549) +* [rtve:live] Fix extraction (#11529) +* [brightcove:legacy] Improve embeds detection (#11523) ++ [twitch] Add support for rechat messages (#11524) +* [acast] Fix audio and timestamp extraction (#11521) + + +version 2016.12.22 + +Core +* [extractor/common] Improve detection of video-only formats in m3u8 + manifests (#11507) + +Extractors ++ [theplatform] Pass geo verification headers to SMIL request (#10146) ++ [viu] Pass geo verification headers to auth request +* [rtl2] Extract more formats and metadata +* [vbox7] Skip malformed JSON-LD (#11501) +* [uplynk] Force downloading using native HLS downloader (#11496) ++ [laola1] Add support for another extraction scenario (#11460) + + +version 2016.12.20 + +Core +* [extractor/common] Improve fragment URL construction for DASH media +* [extractor/common] Fix codec information extraction for mixed audio/video + DASH media (#11490) + +Extractors +* [vbox7] Fix extraction (#11494) ++ [uktvplay] Add support for uktvplay.uktv.co.uk (#11027) ++ [piksel] Add support for player.piksel.com (#11246) ++ [vimeo] Add support for DASH formats +* [vimeo] Fix extraction for HLS formats (#11490) +* [kaltura] Fix wrong widget ID in some cases (#11480) ++ [nrktv:direkte] Add support for live streams (#11488) +* [pbs] Fix extraction for geo restricted videos (#7095) +* [brightcove:new] Skip widevine classic videos ++ [viu] Add support for viu.com (#10607, #11329) + + +version 2016.12.18 + +Core ++ [extractor/common] Recognize DASH formats in html5 media entries + +Extractors ++ [ccma] Add support for ccma.cat (#11359) +* [laola1tv] Improve extraction ++ [laola1tv] Add support embed URLs (#11460) +* [nbc] Fix extraction for MSNBC videos (#11466) +* [twitch] Adapt to new videos pages URL schema (#11469) ++ [meipai] Add support for meipai.com (#10718) +* [jwplatform] Improve subtitles and duration extraction ++ [ondemandkorea] Add support for ondemandkorea.com (#10772) ++ [vvvvid] Add support for vvvvid.it (#5915) + + +version 2016.12.15 + +Core ++ [utils] Add convenience urljoin + +Extractors ++ [openload] Recognize oload.tv URLs (#10408) ++ [facebook] Recognize .onion URLs (#11443) +* [vlive] Fix extraction (#11375, #11383) ++ [canvas] Extract DASH formats ++ [melonvod] Add support for vod.melon.com (#11419) + + +version 2016.12.12 + +Core ++ [utils] Add common user agents map ++ [common] Recognize HLS manifests that contain video only formats (#11394) + +Extractors ++ [dplay] Use Safari user agent for HLS (#11418) ++ [facebook] Detect login required error message +* [facebook] Improve video selection (#11390) ++ [canalplus] Add another video id pattern (#11399) +* [mixcloud] Relax URL regular expression (#11406) +* [ctvnews] Relax URL regular expression (#11394) ++ [rte] Capture and output error message (#7746, #10498) ++ [prosiebensat1] Add support for DASH formats +* [srgssr] Improve extraction for geo restricted videos (#11089) +* [rts] Improve extraction for geo restricted videos (#4989) + + +version 2016.12.09 + +Core +* [socks] Fix error reporting (#11355) + +Extractors +* [openload] Fix extraction (#10408) +* [pandoratv] Fix extraction (#11023) ++ [telebruxelles] Add support for emission URLs +* [telebruxelles] Extract all formats ++ [bloomberg] Add another video id regular expression (#11371) +* [fusion] Update ooyala id regular expression (#11364) ++ [1tv] Add support for playlists (#11335) +* [1tv] Improve extraction (#11335) ++ [aenetworks] Extract more formats (#11321) ++ [thisoldhouse] Recognize /tv-episode/ URLs (#11271) + + +version 2016.12.01 + +Extractors +* [soundcloud] Update client id (#11327) +* [ruutu] Detect DRM protected videos ++ [liveleak] Add support for youtube embeds (#10688) +* [spike] Fix full episodes support (#11312) +* [comedycentral] Fix full episodes support +* [normalboots] Rewrite in terms of JWPlatform (#11184) +* [teamfourstar] Rewrite in terms of JWPlatform (#11184) +- [screenwavemedia] Remove extractor (#11184) + + +version 2016.11.27 + +Extractors ++ [webcaster] Add support for webcaster.pro ++ [azubu] Add support for azubu.uol.com.br (#11305) +* [viki] Prefer hls formats +* [viki] Fix rtmp formats extraction (#11255) +* [puls4] Relax URL regular expression (#11267) +* [vevo] Improve artist extraction (#10911) +* [mitele] Relax URL regular expression and extract more metadata (#11244) ++ [cbslocal] Recognize New York site (#11285) ++ [youtube:playlist] Pass disable_polymer in URL query (#11193) + + +version 2016.11.22 + +Extractors +* [hellporno] Fix video extension extraction (#11247) ++ [hellporno] Add support for hellporno.net (#11247) ++ [amcnetworks] Recognize more BBC America URLs (#11263) +* [funnyordie] Improve extraction (#11208) +* [extractor/generic] Improve limelight embeds support +- [crunchyroll] Remove ScaledBorderAndShadow from ASS subtitles (#8207, #9028) +* [bandcamp] Fix free downloads extraction and extract all formats (#11067) +* [twitter:card] Relax URL regular expression (#11225) ++ [tvanouvelles] Add support for tvanouvelles.ca (#10616) + + +version 2016.11.18 + +Extractors +* [youtube:live] Relax URL regular expression (#11164) +* [openload] Fix extraction (#10408, #11122) +* [vlive] Prefer locale over language for subtitles id (#11203) + + +version 2016.11.14.1 + +Core ++ [downoader/fragment,f4m,hls] Respect HTTP headers from info dict +* [extractor/common] Fix media templates with Bandwidth substitution pattern in + MPD manifests (#11175) +* [extractor/common] Improve thumbnail extraction from JSON-LD + +Extractors ++ [nrk] Workaround geo restriction ++ [nrk] Improve error detection and messages ++ [afreecatv] Add support for vod.afreecatv.com (#11174) +* [cda] Fix and improve extraction (#10929, #10936) +* [plays] Fix extraction (#11165) +* [eagleplatform] Fix extraction (#11160) ++ [audioboom] Recognize /posts/ URLs (#11149) + + +version 2016.11.08.1 + +Extractors +* [espn:article] Fix support for espn.com articles +* [franceculture] Fix extraction (#11140) + + +version 2016.11.08 + +Extractors +* [tmz:article] Fix extraction (#11052) +* [espn] Fix extraction (#11041) +* [mitele] Fix extraction after website redesign (#10824) +- [ard] Remove age restriction check (#11129) +* [generic] Improve support for pornhub.com embeds (#11100) ++ [generic] Add support for redtube.com embeds (#11099) ++ [generic] Add support for drtuber.com embeds (#11098) ++ [redtube] Add support for embed URLs ++ [drtuber] Add support for embed URLs ++ [yahoo] Improve content id extraction (#11088) +* [toutv] Relax URL regular expression (#11121) + + +version 2016.11.04 + +Core +* [extractor/common] Tolerate malformed RESOLUTION attribute in m3u8 + manifests (#11113) +* [downloader/ism] Fix AVC Decoder Configuration Record + +Extractors ++ [fox9] Add support for fox9.com (#11110) ++ [anvato] Extract more metadata and improve formats extraction +* [vodlocker] Improve removed videos detection (#11106) ++ [vzaar] Add support for vzaar.com (#11093) ++ [vice] Add support for uplynk preplay videos (#11101) +* [tubitv] Fix extraction (#11061) ++ [shahid] Add support for authentication (#11091) ++ [radiocanada] Add subtitles support (#11096) ++ [generic] Add support for ISM manifests + + +version 2016.11.02 + +Core ++ Add basic support for Smooth Streaming protocol (#8118, #10969) +* Improve MPD manifest base URL extraction (#10909, #11079) +* Fix --match-filter for int-like strings (#11082) + +Extractors ++ [mva] Add support for ISM formats ++ [msn] Add support for ISM formats ++ [onet] Add support for ISM formats ++ [tvp] Add support for ISM formats ++ [nicknight] Add support for nicknight sites (#10769) + + +version 2016.10.30 + +Extractors +* [facebook] Improve 1080P video detection (#11073) +* [imgur] Recognize /r/ URLs (#11071) +* [beeg] Fix extraction (#11069) +* [openload] Fix extraction (#10408) +* [gvsearch] Modernize and fix search request (#11051) +* [adultswim] Fix extraction (#10979) ++ [nobelprize] Add support for nobelprize.org (#9999) +* [hornbunny] Fix extraction (#10981) +* [tvp] Improve video id extraction (#10585) + + +version 2016.10.26 + +Extractors ++ [rentv] Add support for ren.tv (#10620) ++ [ard] Detect unavailable videos (#11018) +* [vk] Fix extraction (#11022) + + +version 2016.10.25 + +Core +* Running youtube-dl in the background is fixed (#10996, #10706, #955) + +Extractors ++ [jamendo] Add support for jamendo.com (#10132, #10736) ++ [pandatv] Add support for panda.tv (#10736) ++ [dotsub] Support Vimeo embed (#10964) +* [litv] Fix extraction ++ [vimeo] Delegate ondemand redirects to ondemand extractor (#10994) +* [vivo] Fix extraction (#11003) ++ [twitch:stream] Add support for rebroadcasts (#10995) +* [pluralsight] Fix subtitles conversion (#10990) + + +version 2016.10.21.1 + +Extractors ++ [pluralsight] Process all clip URLs (#10984) + + +version 2016.10.21 + +Core +- Disable thumbnails embedding in mkv ++ Add support for Comcast multiple-system operator (#10819) + +Extractors +* [pluralsight] Adapt to new API (#10972) +* [openload] Fix extraction (#10408, #10971) ++ [natgeo] Extract m3u8 formats (#10959) + + +version 2016.10.19 + +Core ++ [utils] Expose PACKED_CODES_RE ++ [extractor/common] Extract non smil wowza mpd manifests ++ [extractor/common] Detect f4m audio-only formats + +Extractors +* [vidzi] Fix extraction (#10908, #10952) +* [urplay] Fix subtitles extraction ++ [urplay] Add support for urskola.se (#10915) ++ [orf] Add subtitles support (#10939) +* [youtube] Fix --no-playlist behavior for youtu.be/id URLs (#10896) +* [nrk] Relax URL regular expression (#10928) ++ [nytimes] Add support for podcasts (#10926) +* [pluralsight] Relax URL regular expression (#10941) + + +version 2016.10.16 + +Core +* [postprocessor/ffmpeg] Return correct filepath and ext in updated information + in FFmpegExtractAudioPP (#10879) + +Extractors ++ [ruutu] Add support for supla.fi (#10849) ++ [theoperaplatform] Add support for theoperaplatform.eu (#10914) +* [lynda] Fix height for prioritized streams ++ [lynda] Add fallback extraction scenario +* [lynda] Switch to https (#10916) ++ [huajiao] New extractor (#10917) +* [cmt] Fix mgid extraction (#10813) ++ [safari:course] Add support for techbus.safaribooksonline.com +* [orf:tvthek] Fix extraction and modernize (#10898) +* [chirbit] Fix extraction of user profile pages +* [carambatv] Fix extraction +* [canalplus] Fix extraction for some videos +* [cbsinteractive] Fix extraction for cnet.com +* [parliamentliveuk] Lower case URLs are now recognized (#10912) + + +version 2016.10.12 + +Core ++ Support HTML media elements without child nodes +* [Makefile] Support for GNU make < 4 is fixed; BSD make dropped (#9387) + +Extractors +* [dailymotion] Fix extraction (#10901) +* [vimeo:review] Fix extraction (#10900) +* [nhl] Correctly handle invalid formats (#10713) +* [footyroom] Fix extraction (#10810) +* [abc.net.au:iview] Fix for standalone (non series) videos (#10895) ++ [hbo] Add support for episode pages (#10892) +* [allocine] Fix extraction (#10860) ++ [nextmedia] Recognize action news on AppleDaily +* [lego] Improve info extraction and bypass geo restriction (#10872) + + +version 2016.10.07 + +Extractors ++ [iprima] Detect geo restriction +* [facebook] Fix video extraction (#10846) ++ [commonprotocols] Support direct MMS links (#10838) ++ [generic] Add support for multiple vimeo embeds (#10862) ++ [nzz] Add support for nzz.ch (#4407) ++ [npo] Detect geo restriction ++ [npo] Add support for 2doc.nl (#10842) ++ [lego] Add support for lego.com (#10369) ++ [tonline] Add support for t-online.de (#10376) +* [techtalks] Relax URL regular expression (#10840) +* [youtube:live] Extend URL regular expression (#10839) ++ [theweatherchannel] Add support for weather.com (#7188) ++ [thisoldhouse] Add support for thisoldhouse.com (#10837) ++ [nhl] Add support for wch2016.com (#10833) +* [pornoxo] Use JWPlatform to improve metadata extraction + + +version 2016.10.02 + +Core +* Fix possibly lost extended attributes during post-processing ++ Support pyxattr as well as python-xattr for --xattrs and + --xattr-set-filesize (#9054) + +Extractors ++ [jwplatform] Support DASH streams in JWPlayer ++ [jwplatform] Support old-style JWPlayer playlists ++ [byutv:event] Add extractor +* [periscope:user] Fix extraction (#10820) +* [dctp] Fix extraction (#10734) ++ [instagram] Extract video dimensions (#10790) ++ [tvland] Extend URL regular expression (#10812) ++ [vgtv] Add support for tv.aftonbladet.se (#10800) +- [aftonbladet] Remove extractor +* [vk] Fix timestamp and view count extraction (#10760) ++ [vk] Add support for running and finished live streams (#10799) ++ [leeco] Recognize more Le Sports URLs (#10794) ++ [instagram] Extract comments (#10788) ++ [ketnet] Extract mzsource formats (#10770) +* [limelight:media] Improve HTTP formats extraction + + +version 2016.09.27 + +Core ++ Add hdcore query parameter to akamai f4m formats ++ Delegate HLS live streams downloading to ffmpeg ++ Improved support for HTML5 subtitles + +Extractors ++ [vk] Add support for dailymotion embeds (#10661) +* [promptfile] Fix extraction (#10634) +* [kaltura] Speed up embed regular expressions (#10764) ++ [npo] Add support for anderetijden.nl (#10754) ++ [prosiebensat1] Add support for advopedia sites +* [mwave] Relax URL regular expression (#10735, #10748) +* [prosiebensat1] Fix playlist support (#10745) ++ [prosiebensat1] Add support for sat1gold sites (#10745) ++ [cbsnews:livevideo] Fix extraction and extract m3u8 formats ++ [brightcove:new] Add support for live streams +* [soundcloud] Generalize playlist entries extraction (#10733) ++ [mtv] Add support for new URL schema (#8169, #9808) +* [einthusan] Fix extraction (#10714) ++ [twitter] Support Periscope embeds (#10737) ++ [openload] Support subtitles (#10625) + + +version 2016.09.24 + +Core ++ Add support for watchTVeverywhere.com authentication provider based MSOs for + Adobe Pass authentication (#10709) + +Extractors ++ [soundcloud:playlist] Provide video id for early playlist entries (#10733) ++ [prosiebensat1] Add support for kabeleinsdoku (#10732) +* [cbs] Extract info from thunder videoPlayerService (#10728) +* [openload] Fix extraction (#10408) ++ [ustream] Support the new HLS streams (#10698) ++ [ooyala] Extract all HLS formats ++ [cartoonnetwork] Add support for Adobe Pass authentication ++ [soundcloud] Extract license metadata ++ [fox] Add support for Adobe Pass authentication (#8584) ++ [tbs] Add support for Adobe Pass authentication (#10642, #10222) ++ [trutv] Add support for Adobe Pass authentication (#10519) ++ [turner] Add support for Adobe Pass authentication + + +version 2016.09.19 + +Extractors ++ [crunchyroll] Check if already authenticated (#10700) +- [twitch:stream] Remove fallback to profile extraction when stream is offline +* [thisav] Improve title extraction (#10682) +* [vyborymos] Improve station info extraction + + +version 2016.09.18 + +Core ++ Introduce manifest_url and fragments fields in formats dictionary for + fragmented media ++ Provide manifest_url field for DASH segments, HLS and HDS ++ Provide fragments field for DASH segments +* Rework DASH segments downloader to use fragments field ++ Add helper method for Wowza Streaming Engine formats extraction + +Extractors ++ [vyborymos] Add extractor for vybory.mos.ru (#10692) ++ [xfileshare] Add title regular expression for streamin.to (#10646) ++ [globo:article] Add support for multiple videos (#10653) ++ [thisav] Recognize HTML5 videos (#10447) +* [jwplatform] Improve JWPlayer detection ++ [mangomolo] Add support for Mangomolo embeds ++ [toutv] Add support for authentication (#10669) +* [franceinter] Fix upload date extraction +* [tv4] Fix HLS and HDS formats extraction (#10659) + + +version 2016.09.15 + +Core +* Improve _hidden_inputs ++ Introduce improved explicit Adobe Pass support ++ Add --ap-mso to provide multiple-system operator identifier ++ Add --ap-username to provide MSO account username ++ Add --ap-password to provide MSO account password ++ Add --ap-list-mso to list all supported MSOs ++ Add support for Rogers Cable multiple-system operator (#10606) + +Extractors +* [crunchyroll] Fix authentication (#10655) +* [twitch] Fix API calls (#10654, #10660) ++ [bellmedia] Add support for more Bell Media Television sites +* [franceinter] Fix extraction (#10538, #2105) +* [kuwo] Improve error detection (#10650) ++ [go] Add support for free full episodes (#10439) +* [bilibili] Fix extraction for specific videos (#10647) +* [nhk] Fix extraction (#10633) +* [kaltura] Improve audio detection +* [kaltura] Skip chun format ++ [vimeo:ondemand] Pass Referer along with embed URL (#10624) ++ [nbc] Add support for NBC Olympics (#10361) + + +version 2016.09.11.1 + +Extractors ++ [tube8] Extract categories and tags (#10579) ++ [pornhub] Extract categories and tags (#10499) +* [openload] Temporary fix (#10408) ++ [foxnews] Add support Fox News articles (#10598) +* [viafree] Improve video id extraction (#10615) +* [iwara] Fix extraction after relaunch (#10462, #3215) ++ [tfo] Add extractor for tfo.org +* [lrt] Fix audio extraction (#10566) +* [9now] Fix extraction (#10561) ++ [canalplus] Add support for c8.fr (#10577) +* [newgrounds] Fix uploader extraction (#10584) ++ [polskieradio:category] Add support for category lists (#10576) ++ [ketnet] Add extractor for ketnet.be (#10343) ++ [canvas] Add support for een.be (#10605) ++ [telequebec] Add extractor for telequebec.tv (#1999) +* [parliamentliveuk] Fix extraction (#9137) + + +version 2016.09.08 + +Extractors ++ [jwplatform] Extract height from format label ++ [yahoo] Extract Brightcove Legacy Studio embeds (#9345) +* [videomore] Fix extraction (#10592) +* [foxgay] Fix extraction (#10480) ++ [rmcdecouverte] Add extractor for rmcdecouverte.bfmtv.com (#9709) +* [gamestar] Fix metadata extraction (#10479) +* [puls4] Fix extraction (#10583) ++ [cctv] Add extractor for CCTV and CNTV (#8153) ++ [lci] Add extractor for lci.fr (#10573) ++ [wat] Extract DASH formats ++ [viafree] Improve video id detection (#10569) ++ [trutv] Add extractor for trutv.com (#10519) ++ [nick] Add support for nickelodeon.nl (#10559) ++ [abcotvs:clips] Add support for clips.abcotvs.com ++ [abcotvs] Add support for ABC Owned Television Stations sites (#9551) ++ [miaopai] Add extractor for miaopai.com (#10556) +* [gamestar] Fix metadata extraction (#10479) ++ [bilibili] Add support for episodes (#10190) ++ [tvnoe] Add extractor for tvnoe.cz (#10524) + + +version 2016.09.04.1 + +Core +* In DASH downloader if the first segment fails, abort the whole download + process to prevent throttling (#10497) ++ Add support for --skip-unavailable-fragments and --fragment retries in + hlsnative downloader (#10165, #10448). ++ Add support for --skip-unavailable-fragments in DASH downloader ++ Introduce --skip-unavailable-fragments option for fragment based downloaders + that allows to skip fragments unavailable due to a HTTP error +* Fix extraction of video/audio entries with src attribute in + _parse_html5_media_entries (#10540) + +Extractors +* [theplatform] Relax URL regular expression (#10546) +* [youtube:playlist] Extend URL regular expression +* [rottentomatoes] Delegate extraction to internetvideoarchive extractor +* [internetvideoarchive] Extract all formats +* [pornvoisines] Fix extraction (#10469) +* [rottentomatoes] Fix extraction (#10467) +* [espn] Extend URL regular expression (#10549) +* [vimple] Extend URL regular expression (#10547) +* [youtube:watchlater] Fix extraction (#10544) +* [youjizz] Fix extraction (#10437) ++ [foxnews] Add support for FoxNews Insider (#10445) ++ [fc2] Recognize Flash player URLs (#10512) + + +version 2016.09.03 + +Core +* Restore usage of NAME attribute from EXT-X-MEDIA tag for formats codes in + _extract_m3u8_formats (#10522) +* Handle semicolon in mimetype2ext + +Extractors ++ [youtube] Add support for rental videos' previews (#10532) +* [youtube:playlist] Fallback to video extraction for video/playlist URLs when + no playlist is actually served (#10537) ++ [drtv] Add support for dr.dk/nyheder (#10536) ++ [facebook:plugins:video] Add extractor (#10530) ++ [go] Add extractor for *.go.com sites +* [adobepass] Check for authz_token expiration (#10527) +* [nytimes] improve extraction +* [thestar] Fix extraction (#10465) +* [glide] Fix extraction (#10478) +- [exfm] Remove extractor (#10482) +* [youporn] Fix categories and tags extraction (#10521) ++ [curiositystream] Add extractor for app.curiositystream.com +- [thvideo] Remove extractor (#10464) +* [movingimage] Fix for the new site name (#10466) ++ [cbs] Add support for once formats (#10515) +* [limelight] Skip ism snd duplicate manifests ++ [porncom] Extract categories and tags (#10510) ++ [facebook] Extract timestamp (#10508) ++ [yahoo] Extract more formats + + +version 2016.08.31 + +Extractors +* [soundcloud] Fix URL regular expression to avoid clashes with sets (#10505) +* [bandcamp:album] Fix title extraction (#10455) +* [pyvideo] Fix extraction (#10468) ++ [ctv] Add support for tsn.ca, bnn.ca and thecomedynetwork.ca (#10016) +* [9c9media] Extract more metadata +* [9c9media] Fix multiple stacks extraction (#10016) +* [adultswim] Improve video info extraction (#10492) +* [vodplatform] Improve embed regular expression +- [played] Remove extractor (#10470) ++ [tbs] Add extractor for tbs.com and tntdrama.com (#10222) ++ [cartoonnetwork] Add extractor for cartoonnetwork.com (#10110) +* [adultswim] Rework in terms of turner extractor +* [cnn] Rework in terms of turner extractor +* [nba] Rework in terms of turner extractor ++ [turner] Add base extractor for Turner Broadcasting System based sites +* [bilibili] Fix extraction (#10375) +* [openload] Fix extraction (#10408) + + +version 2016.08.28 + +Core ++ Add warning message that ffmpeg doesn't support SOCKS +* Improve thumbnail sorting ++ Extract formats from #EXT-X-MEDIA tags in _extract_m3u8_formats +* Fill IV with leading zeros for IVs shorter than 16 octets in hlsnative ++ Add ac-3 to the list of audio codecs in parse_codecs + +Extractors +* [periscope:user] Fix extraction (#10453) +* [douyutv] Fix extraction (#10153, #10318, #10444) ++ [nhk:vod] Add extractor for www3.nhk.or.jp on demand (#4437, #10424) +- [trutube] Remove extractor (#10438) ++ [usanetwork] Add extractor for usanetwork.com +* [crackle] Fix extraction (#10333) +* [spankbang] Fix description and uploader extraction (#10339) +* [discoverygo] Detect cable provider restricted videos (#10425) ++ [cbc] Add support for watch.cbc.ca +* [kickstarter] Silent the warning for og:description (#10415) +* [mtvservices:embedded] Fix extraction for the new 'edge' player (#10363) + + +version 2016.08.24.1 + +Extractors ++ [pluralsight] Add support for subtitles (#9681) + + +version 2016.08.24 + +Extractors +* [youtube] Fix authentication (#10392) +* [openload] Fix extraction (#10408) ++ [bravotv] Add support for Adobe Pass (#10407) +* [bravotv] Fix clip info extraction (#10407) +* [eagleplatform] Improve embedded videos detection (#10409) +* [awaan] Fix extraction +* [mtvservices:embedded] Update config URL ++ [abc:iview] Add extractor (#6148) + + +version 2016.08.22 + +Core +* Improve formats and subtitles extension auto calculation ++ Recognize full unit names in parse_filesize ++ Add support for m3u8 manifests in HTML5 multimedia tags +* Fix octal/hexadecimal number detection in js_to_json + +Extractors ++ [ivi] Add support for 720p and 1080p ++ [charlierose] Add new extractor (#10382) +* [1tv] Fix extraction (#9249) +* [twitch] Renew authentication +* [kaltura] Improve subtitles extension calculation ++ [zingmp3] Add support for video clips +* [zingmp3] Fix extraction (#10041) +* [kaltura] Improve subtitles extraction (#10279) +* [cultureunplugged] Fix extraction (#10330) ++ [cnn] Add support for money.cnn.com (#2797) +* [cbsnews] Fix extraction (#10362) +* [cbs] Fix extraction (#10393) ++ [litv] Support 'promo' URLs (#10385) +* [snotr] Fix extraction (#10338) +* [n-tv.de] Fix extraction (#10331) +* [globo:article] Relax URL and video id regular expressions (#10379) + + +version 2016.08.19 + +Core +- Remove output template description from --help +* Recognize lowercase units in parse_filesize + +Extractors ++ [porncom] Add extractor for porn.com (#2251, #10251) ++ [generic] Add support for DBTV embeds +* [vk:wallpost] Fix audio extraction for new site layout +* [vk] Fix authentication ++ [hgtvcom:show] Add extractor for hgtv.com shows (#10365) ++ [discoverygo] Add support for another GO network sites + + +version 2016.08.17 + +Core ++ Add _get_netrc_login_info + +Extractors +* [mofosex] Extract all formats (#10335) ++ [generic] Add support for vbox7 embeds ++ [vbox7] Add support for embed URLs ++ [viafree] Add extractor (#10358) ++ [mtg] Add support for viafree URLs (#10358) +* [theplatform] Extract all subtitles per language ++ [xvideos] Fix HLS extraction (#10356) ++ [amcnetworks] Add extractor ++ [bbc:playlist] Add support for pagination (#10349) ++ [fxnetworks] Add extractor (#9462) +* [cbslocal] Fix extraction for SendtoNews-based videos +* [sendtonews] Fix extraction +* [jwplatform] Extract video id from JWPlayer data +- [zippcast] Remove extractor (#10332) ++ [viceland] Add extractor (#8799) ++ [adobepass] Add base extractor for Adobe Pass Authentication +* [life:embed] Improve extraction +* [vgtv] Detect geo restricted videos (#10348) ++ [uplynk] Add extractor +* [xiami] Fix extraction (#10342) + + +version 2016.08.13 + +Core +* Show progress for curl external downloader +* Forward more options to curl external downloader + +Extractors +* [pbs] Fix description extraction +* [franceculture] Fix extraction (#10324) +* [pornotube] Fix extraction (#10322) +* [4tube] Fix metadata extraction (#10321) +* [imgur] Fix width and height extraction (#10325) +* [expotv] Improve extraction ++ [vbox7] Fix extraction (#10309) +- [tapely] Remove extractor (#10323) +* [muenchentv] Fix extraction (#10313) ++ [24video] Add support for .me and .xxx TLDs +* [24video] Fix comment count extraction +* [sunporno] Add support for embed URLs +* [sunporno] Fix metadata extraction (#10316) ++ [hgtv] Add extractor for hgtv.ca (#3999) +- [pbs] Remove request to unavailable API ++ [pbs] Add support for high quality HTTP formats ++ [crunchyroll] Add support for HLS formats (#10301) + + +version 2016.08.12 + +Core +* Subtitles are now written as is. Newline conversions are disabled. (#10268) ++ Recognize more formats in unified_timestamp + +Extractors +- [goldenmoustache] Remove extractor (#10298) +* [drtuber] Improve title extraction +* [drtuber] Make dislike count optional (#10297) +* [chirbit] Fix extraction (#10296) +* [francetvinfo] Relax URL regular expression +* [rtlnl] Relax URL regular expression (#10282) +* [formula1] Relax URL regular expression (#10283) +* [wat] Improve extraction (#10281) +* [ctsnews] Fix extraction + + +version 2016.08.10 + +Core +* Make --metadata-from-title non fatal when title does not match the pattern +* Introduce options for randomized sleep before each download + --min-sleep-interval and --max-sleep-interval (#9930) +* Respect default in _search_json_ld + +Extractors ++ [uol] Add extractor for uol.com.br (#4263) +* [rbmaradio] Fix extraction and extract all formats (#10242) ++ [sonyliv] Add extractor for sonyliv.com (#10258) +* [aparat] Fix extraction +* [cwtv] Extract HTTP formats ++ [rozhlas] Add extractor for prehravac.rozhlas.cz (#10253) +* [kuwo:singer] Fix extraction + + +version 2016.08.07 + +Core ++ Add support for TV Parental Guidelines ratings in parse_age_limit ++ Add decode_png (#9706) ++ Add support for partOfTVSeries in JSON-LD +* Lower master M3U8 manifest preference for better format sorting + +Extractors ++ [discoverygo] Add extractor (#10245) +* [flipagram] Make JSON-LD extraction non fatal +* [generic] Make JSON-LD extraction non fatal ++ [bbc] Add support for morph embeds (#10239) +* [tnaflixnetworkbase] Improve title extraction +* [tnaflix] Fix metadata extraction (#10249) +* [fox] Fix theplatform release URL query +* [openload] Fix extraction (#9706) +* [bbc] Skip duplicate manifest URLs +* [bbc] Improve format code ++ [bbc] Add support for DASH and F4M +* [bbc] Improve format sorting and listing +* [bbc] Improve playlist extraction ++ [pokemon] Add extractor (#10093) ++ [condenast] Add fallback scenario for video info extraction + + version 2016.08.06 Core diff --git a/Makefile b/Makefile index 354052c50..9d1ddc9d1 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ all: youtube-dl README.md CONTRIBUTING.md README.txt youtube-dl.1 youtube-dl.bash-completion youtube-dl.zsh youtube-dl.fish supportedsites clean: - rm -rf youtube-dl.1.temp.md youtube-dl.1 youtube-dl.bash-completion README.txt MANIFEST build/ dist/ .coverage cover/ youtube-dl.tar.gz youtube-dl.zsh youtube-dl.fish youtube_dl/extractor/lazy_extractors.py *.dump *.part *.info.json *.mp4 *.m4a *.flv *.mp3 *.avi *.mkv *.webm *.jpg *.png CONTRIBUTING.md.tmp ISSUE_TEMPLATE.md.tmp youtube-dl youtube-dl.exe + rm -rf youtube-dl.1.temp.md youtube-dl.1 youtube-dl.bash-completion README.txt MANIFEST build/ dist/ .coverage cover/ youtube-dl.tar.gz youtube-dl.zsh youtube-dl.fish youtube_dl/extractor/lazy_extractors.py *.dump *.part* *.info.json *.mp4 *.m4a *.flv *.mp3 *.avi *.mkv *.webm *.3gp *.wav *.ape *.swf *.jpg *.png CONTRIBUTING.md.tmp ISSUE_TEMPLATE.md.tmp youtube-dl youtube-dl.exe find . -name "*.pyc" -delete find . -name "*.class" -delete @@ -12,7 +12,7 @@ SHAREDIR ?= $(PREFIX)/share PYTHON ?= /usr/bin/env python # set SYSCONFDIR to /etc if PREFIX=/usr or PREFIX=/usr/local -SYSCONFDIR != if [ $(PREFIX) = /usr -o $(PREFIX) = /usr/local ]; then echo /etc; else echo $(PREFIX)/etc; fi +SYSCONFDIR = $(shell if [ $(PREFIX) = /usr -o $(PREFIX) = /usr/local ]; then echo /etc; else echo $(PREFIX)/etc; fi) install: youtube-dl youtube-dl.1 youtube-dl.bash-completion youtube-dl.zsh youtube-dl.fish install -d $(DESTDIR)$(BINDIR) @@ -90,7 +90,7 @@ fish-completion: youtube-dl.fish lazy-extractors: youtube_dl/extractor/lazy_extractors.py -_EXTRACTOR_FILES != find youtube_dl/extractor -iname '*.py' -and -not -iname 'lazy_extractors.py' +_EXTRACTOR_FILES = $(shell find youtube_dl/extractor -iname '*.py' -and -not -iname 'lazy_extractors.py') youtube_dl/extractor/lazy_extractors.py: devscripts/make_lazy_extractors.py devscripts/lazy_load_template.py $(_EXTRACTOR_FILES) $(PYTHON) devscripts/make_lazy_extractors.py $@ diff --git a/README.md b/README.md index a9f3001a6..2ee00f515 100644 --- a/README.md +++ b/README.md @@ -29,7 +29,7 @@ Windows users can [download an .exe file](https://yt-dl.org/latest/youtube-dl.ex You can also use pip: - sudo pip install --upgrade youtube-dl + sudo -H pip install --upgrade youtube-dl This command will update youtube-dl if you have already installed it. See the [pypi page](https://pypi.python.org/pypi/youtube_dl) for more information. @@ -44,11 +44,7 @@ Or with [MacPorts](https://www.macports.org/): Alternatively, refer to the [developer instructions](#developer-instructions) for how to check out and work with the git repository. For further options, including PGP signatures, see the [youtube-dl Download Page](https://rg3.github.io/youtube-dl/download.html). # DESCRIPTION -**youtube-dl** is a command-line program to download videos from -YouTube.com and a few more sites. It requires the Python interpreter, version -2.6, 2.7, or 3.2+, and it is not platform specific. It should work on -your Unix box, on Windows or on Mac OS X. It is released to the public domain, -which means you can modify it, redistribute it or use it however you like. +**youtube-dl** is a command-line program to download videos from YouTube.com and a few more sites. It requires the Python interpreter, version 2.6, 2.7, or 3.2+, and it is not platform specific. It should work on your Unix box, on Windows or on Mac OS X. It is released to the public domain, which means you can modify it, redistribute it or use it however you like. youtube-dl [OPTIONS] URL [URL...] @@ -84,6 +80,9 @@ which means you can modify it, redistribute it or use it however you like. configuration in ~/.config/youtube- dl/config (%APPDATA%/youtube-dl/config.txt on Windows) + --config-location PATH Location of the configuration file; either + the path to the config or its containing + directory. --flat-playlist Do not extract the videos of a playlist, only list them. --mark-watched Mark videos watched (YouTube only) @@ -98,16 +97,13 @@ which means you can modify it, redistribute it or use it however you like. string (--proxy "") for direct connection --socket-timeout SECONDS Time to wait before giving up, in seconds --source-address IP Client-side IP address to bind to - (experimental) -4, --force-ipv4 Make all connections via IPv4 - (experimental) -6, --force-ipv6 Make all connections via IPv6 - (experimental) --geo-verification-proxy URL Use this proxy to verify the IP address for some geo-restricted sites. The default proxy specified by --proxy (or none, if the options is not present) is used for the - actual downloading. (experimental) + actual downloading. ## Video Selection: --playlist-start NUMBER Playlist video to start at (default is 1) @@ -138,23 +134,23 @@ which means you can modify it, redistribute it or use it however you like. COUNT views --max-views COUNT Do not download any videos with more than COUNT views - --match-filter FILTER Generic video filter (experimental). - Specify any key (see help for -o for a list - of available keys) to match if the key is - present, !key to check if the key is not - present,key > NUMBER (like "comment_count > - 12", also works with >=, <, <=, !=, =) to - compare against a number, and & to require - multiple matches. Values which are not - known are excluded unless you put a - question mark (?) after the operator.For - example, to only match videos that have - been liked more than 100 times and disliked - less than 50 times (or the dislike - functionality is not available at the given - service), but who also have a description, - use --match-filter "like_count > 100 & - dislike_count + NUMBER (like "comment_count > 12", also + works with >=, <, <=, !=, =) to compare + against a number, and & to require multiple + matches. Values which are not known are + excluded unless you put a question mark (?) + after the operator.For example, to only + match videos that have been liked more than + 100 times and disliked less than 50 times + (or the dislike functionality is not + available at the given service), but who + also have a description, use --match-filter + "like_count > 100 & dislike_count login password ``` @@ -542,13 +550,13 @@ Available for the media that is a track or a part of a music album: - `disc_number`: Number of the disc or other physical medium the track belongs to - `release_year`: Year (YYYY) when the album was released -Each aforementioned sequence when referenced in output template will be replaced by the actual value corresponding to the sequence name. Note that some of the sequences are not guaranteed to be present since they depend on the metadata obtained by particular extractor, such sequences will be replaced with `NA`. +Each aforementioned sequence when referenced in an output template will be replaced by the actual value corresponding to the sequence name. Note that some of the sequences are not guaranteed to be present since they depend on the metadata obtained by a particular extractor. Such sequences will be replaced with `NA`. -For example for `-o %(title)s-%(id)s.%(ext)s` and mp4 video with title `youtube-dl test video` and id `BaW_jenozKcj` this will result in a `youtube-dl test video-BaW_jenozKcj.mp4` file created in the current directory. +For example for `-o %(title)s-%(id)s.%(ext)s` and an mp4 video with title `youtube-dl test video` and id `BaW_jenozKcj`, this will result in a `youtube-dl test video-BaW_jenozKcj.mp4` file created in the current directory. -Output template can also contain arbitrary hierarchical path, e.g. `-o '%(playlist)s/%(playlist_index)s - %(title)s.%(ext)s'` that will result in downloading each video in a directory corresponding to this path template. Any missing directory will be automatically created for you. +Output templates can also contain arbitrary hierarchical path, e.g. `-o '%(playlist)s/%(playlist_index)s - %(title)s.%(ext)s'` which will result in downloading each video in a directory corresponding to this path template. Any missing directory will be automatically created for you. -To specify percent literal in output template use `%%`. To output to stdout use `-o -`. +To use percent literals in an output template use `%%`. To output to stdout use `-o -`. The current default template is `%(title)s-%(id)s.%(ext)s`. @@ -556,7 +564,7 @@ In some cases, you don't want special characters such as 中, spaces, or &, such #### Output template and Windows batch files -If you are using output template inside a Windows batch file then you must escape plain percent characters (`%`) by doubling, so that `-o "%(title)s-%(id)s.%(ext)s"` should become `-o "%%(title)s-%%(id)s.%%(ext)s"`. However you should not touch `%`'s that are not plain characters, e.g. environment variables for expansion should stay intact: `-o "C:\%HOMEPATH%\Desktop\%%(title)s.%%(ext)s"`. +If you are using an output template inside a Windows batch file then you must escape plain percent characters (`%`) by doubling, so that `-o "%(title)s-%(id)s.%(ext)s"` should become `-o "%%(title)s-%%(id)s.%%(ext)s"`. However you should not touch `%`'s that are not plain characters, e.g. environment variables for expansion should stay intact: `-o "C:\%HOMEPATH%\Desktop\%%(title)s.%%(ext)s"`. #### Output template examples @@ -589,7 +597,7 @@ $ youtube-dl -o - BaW_jenozKc By default youtube-dl tries to download the best available quality, i.e. if you want the best quality you **don't need** to pass any special options, youtube-dl will guess it for you by **default**. -But sometimes you may want to download in a different format, for example when you are on a slow or intermittent connection. The key mechanism for achieving this is so called *format selection* based on which you can explicitly specify desired format, select formats based on some criterion or criteria, setup precedence and much more. +But sometimes you may want to download in a different format, for example when you are on a slow or intermittent connection. The key mechanism for achieving this is so-called *format selection* based on which you can explicitly specify desired format, select formats based on some criterion or criteria, setup precedence and much more. The general syntax for format selection is `--format FORMAT` or shorter `-f FORMAT` where `FORMAT` is a *selector expression*, i.e. an expression that describes format or formats you would like to download. @@ -597,21 +605,21 @@ The general syntax for format selection is `--format FORMAT` or shorter `-f FORM The simplest case is requesting a specific format, for example with `-f 22` you can download the format with format code equal to 22. You can get the list of available format codes for particular video using `--list-formats` or `-F`. Note that these format codes are extractor specific. -You can also use a file extension (currently `3gp`, `aac`, `flv`, `m4a`, `mp3`, `mp4`, `ogg`, `wav`, `webm` are supported) to download best quality format of particular file extension served as a single file, e.g. `-f webm` will download best quality format with `webm` extension served as a single file. +You can also use a file extension (currently `3gp`, `aac`, `flv`, `m4a`, `mp3`, `mp4`, `ogg`, `wav`, `webm` are supported) to download the best quality format of a particular file extension served as a single file, e.g. `-f webm` will download the best quality format with the `webm` extension served as a single file. -You can also use special names to select particular edge case format: - - `best`: Select best quality format represented by single file with video and audio - - `worst`: Select worst quality format represented by single file with video and audio - - `bestvideo`: Select best quality video only format (e.g. DASH video), may not be available - - `worstvideo`: Select worst quality video only format, may not be available - - `bestaudio`: Select best quality audio only format, may not be available - - `worstaudio`: Select worst quality audio only format, may not be available +You can also use special names to select particular edge case formats: + - `best`: Select the best quality format represented by a single file with video and audio. + - `worst`: Select the worst quality format represented by a single file with video and audio. + - `bestvideo`: Select the best quality video-only format (e.g. DASH video). May not be available. + - `worstvideo`: Select the worst quality video-only format. May not be available. + - `bestaudio`: Select the best quality audio only-format. May not be available. + - `worstaudio`: Select the worst quality audio only-format. May not be available. -For example, to download worst quality video only format you can use `-f worstvideo`. +For example, to download the worst quality video-only format you can use `-f worstvideo`. If you want to download multiple videos and they don't have the same formats available, you can specify the order of preference using slashes. Note that slash is left-associative, i.e. formats on the left hand side are preferred, for example `-f 22/17/18` will download format 22 if it's available, otherwise it will download format 17 if it's available, otherwise it will download format 18 if it's available, otherwise it will complain that no suitable formats are available for download. -If you want to download several formats of the same video use comma as a separator, e.g. `-f 22,17,18` will download all these three formats, of course if they are available. Or more sophisticated example combined with precedence feature `-f 136/137/mp4/bestvideo,140/m4a/bestaudio`. +If you want to download several formats of the same video use a comma as a separator, e.g. `-f 22,17,18` will download all these three formats, of course if they are available. Or a more sophisticated example combined with the precedence feature: `-f 136/137/mp4/bestvideo,140/m4a/bestaudio`. You can also filter the video formats by putting a condition in brackets, as in `-f "best[height=720]"` (or `-f "[filesize>10M]"`). @@ -630,18 +638,18 @@ Also filtering work for comparisons `=` (equals), `!=` (not equals), `^=` (begin - `acodec`: Name of the audio codec in use - `vcodec`: Name of the video codec in use - `container`: Name of the container format - - `protocol`: The protocol that will be used for the actual download, lower-case. `http`, `https`, `rtsp`, `rtmp`, `rtmpe`, `m3u8`, or `m3u8_native` + - `protocol`: The protocol that will be used for the actual download, lower-case (`http`, `https`, `rtsp`, `rtmp`, `rtmpe`, `mms`, `f4m`, `ism`, `m3u8`, or `m3u8_native`) - `format_id`: A short description of the format -Note that none of the aforementioned meta fields are guaranteed to be present since this solely depends on the metadata obtained by particular extractor, i.e. the metadata offered by video hoster. +Note that none of the aforementioned meta fields are guaranteed to be present since this solely depends on the metadata obtained by particular extractor, i.e. the metadata offered by the video hoster. Formats for which the value is not known are excluded unless you put a question mark (`?`) after the operator. You can combine format filters, so `-f "[height <=? 720][tbr>500]"` selects up to 720p videos (or videos where the height is not known) with a bitrate of at least 500 KBit/s. -You can merge the video and audio of two formats into a single file using `-f +` (requires ffmpeg or avconv installed), for example `-f bestvideo+bestaudio` will download best video only format, best audio only format and mux them together with ffmpeg/avconv. +You can merge the video and audio of two formats into a single file using `-f +` (requires ffmpeg or avconv installed), for example `-f bestvideo+bestaudio` will download the best video-only format, the best audio-only format and mux them together with ffmpeg/avconv. Format selectors can also be grouped using parentheses, for example if you want to download the best mp4 and webm formats with a height lower than 480 you can use `-f '(mp4,webm)[height<480]'`. -Since the end of April 2015 and version 2015.04.26 youtube-dl uses `-f bestvideo+bestaudio/best` as default format selection (see [#5447](https://github.com/rg3/youtube-dl/issues/5447), [#5456](https://github.com/rg3/youtube-dl/issues/5456)). If ffmpeg or avconv are installed this results in downloading `bestvideo` and `bestaudio` separately and muxing them together into a single file giving the best overall quality available. Otherwise it falls back to `best` and results in downloading the best available quality served as a single file. `best` is also needed for videos that don't come from YouTube because they don't provide the audio and video in two different files. If you want to only download some DASH formats (for example if you are not interested in getting videos with a resolution higher than 1080p), you can add `-f bestvideo[height<=?1080]+bestaudio/best` to your configuration file. Note that if you use youtube-dl to stream to `stdout` (and most likely to pipe it to your media player then), i.e. you explicitly specify output template as `-o -`, youtube-dl still uses `-f best` format selection in order to start content delivery immediately to your player and not to wait until `bestvideo` and `bestaudio` are downloaded and muxed. +Since the end of April 2015 and version 2015.04.26, youtube-dl uses `-f bestvideo+bestaudio/best` as the default format selection (see [#5447](https://github.com/rg3/youtube-dl/issues/5447), [#5456](https://github.com/rg3/youtube-dl/issues/5456)). If ffmpeg or avconv are installed this results in downloading `bestvideo` and `bestaudio` separately and muxing them together into a single file giving the best overall quality available. Otherwise it falls back to `best` and results in downloading the best available quality served as a single file. `best` is also needed for videos that don't come from YouTube because they don't provide the audio and video in two different files. If you want to only download some DASH formats (for example if you are not interested in getting videos with a resolution higher than 1080p), you can add `-f bestvideo[height<=?1080]+bestaudio/best` to your configuration file. Note that if you use youtube-dl to stream to `stdout` (and most likely to pipe it to your media player then), i.e. you explicitly specify output template as `-o -`, youtube-dl still uses `-f best` format selection in order to start content delivery immediately to your player and not to wait until `bestvideo` and `bestaudio` are downloaded and muxed. If you want to preserve the old format selection behavior (prior to youtube-dl 2015.04.26), i.e. you want to download the best available quality media served as a single file, you should explicitly specify your choice with `-f best`. You may want to add it to the [configuration file](#configuration) in order not to type it every time you run youtube-dl. @@ -656,12 +664,16 @@ $ youtube-dl -f 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best' # Download best format available but not better that 480p $ youtube-dl -f 'bestvideo[height<=480]+bestaudio/best[height<=480]' -# Download best video only format but no bigger that 50 MB +# Download best video only format but no bigger than 50 MB $ youtube-dl -f 'best[filesize<50M]' # Download best format available via direct link over HTTP/HTTPS protocol $ youtube-dl -f '(bestvideo+bestaudio/best)[protocol^=http]' + +# Download the best video format and the best audio format without merging them +$ youtube-dl -f 'bestvideo,bestaudio' -o '%(title)s.f%(format_id)s.%(ext)s' ``` +Note that in the last example, an output template is recommended as bestvideo and bestaudio may have the same file name. # VIDEO SELECTION @@ -716,7 +728,7 @@ Add a file exclusion for `youtube-dl.exe` in Windows Defender settings. YouTube changed their playlist format in March 2014 and later on, so you'll need at least youtube-dl 2014.07.25 to download all YouTube videos. -If you have installed youtube-dl with a package manager, pip, setup.py or a tarball, please use that to update. Note that Ubuntu packages do not seem to get updated anymore. Since we are not affiliated with Ubuntu, there is little we can do. Feel free to [report bugs](https://bugs.launchpad.net/ubuntu/+source/youtube-dl/+filebug) to the [Ubuntu packaging guys](mailto:ubuntu-motu@lists.ubuntu.com?subject=outdated%20version%20of%20youtube-dl) - all they have to do is update the package to a somewhat recent version. See above for a way to update. +If you have installed youtube-dl with a package manager, pip, setup.py or a tarball, please use that to update. Note that Ubuntu packages do not seem to get updated anymore. Since we are not affiliated with Ubuntu, there is little we can do. Feel free to [report bugs](https://bugs.launchpad.net/ubuntu/+source/youtube-dl/+filebug) to the [Ubuntu packaging people](mailto:ubuntu-motu@lists.ubuntu.com?subject=outdated%20version%20of%20youtube-dl) - all they have to do is update the package to a somewhat recent version. See above for a way to update. ### I'm getting an error when trying to use output template: `error: using output template conflicts with using title, video ID or auto number` @@ -732,7 +744,7 @@ Most people asking this question are not aware that youtube-dl now defaults to d ### I get HTTP error 402 when trying to download a video. What's this? -Apparently YouTube requires you to pass a CAPTCHA test if you download too much. We're [considering to provide a way to let you solve the CAPTCHA](https://github.com/rg3/youtube-dl/issues/154), but at the moment, your best course of action is pointing a webbrowser to the youtube URL, solving the CAPTCHA, and restart youtube-dl. +Apparently YouTube requires you to pass a CAPTCHA test if you download too much. We're [considering to provide a way to let you solve the CAPTCHA](https://github.com/rg3/youtube-dl/issues/154), but at the moment, your best course of action is pointing a web browser to the youtube URL, solving the CAPTCHA, and restart youtube-dl. ### Do I need any other programs? @@ -742,11 +754,11 @@ Videos or video formats streamed via RTMP protocol can only be downloaded when [ ### I have downloaded a video but how can I play it? -Once the video is fully downloaded, use any video player, such as [mpv](https://mpv.io/), [vlc](http://www.videolan.org) or [mplayer](http://www.mplayerhq.hu/). +Once the video is fully downloaded, use any video player, such as [mpv](https://mpv.io/), [vlc](http://www.videolan.org/) or [mplayer](http://www.mplayerhq.hu/). -### I extracted a video URL with `-g`, but it does not play on another machine / in my webbrowser. +### I extracted a video URL with `-g`, but it does not play on another machine / in my web browser. -It depends a lot on the service. In many cases, requests for the video (to download/play it) must come from the same IP address and with the same cookies. Use the `--cookies` option to write the required cookies into a file, and advise your downloader to read cookies from that file. Some sites also require a common user agent to be used, use `--dump-user-agent` to see the one in use by youtube-dl. +It depends a lot on the service. In many cases, requests for the video (to download/play it) must come from the same IP address and with the same cookies and/or HTTP headers. Use the `--cookies` option to write the required cookies into a file, and advise your downloader to read cookies from that file. Some sites also require a common user agent to be used, use `--dump-user-agent` to see the one in use by youtube-dl. You can also get necessary cookies and HTTP headers from JSON output obtained with `--dump-json`. It may be beneficial to use IPv6; in some cases, the restrictions are only applied to IPv4. Some services (sometimes only for a subset of videos) do not restrict the video URL by IP address, cookie, or user-agent, but these are the exception rather than the rule. @@ -824,10 +836,42 @@ Either prepend `http://www.youtube.com/watch?v=` or separate the ID from the opt ### How do I pass cookies to youtube-dl? -Use the `--cookies` option, for example `--cookies /path/to/cookies/file.txt`. Note that the cookies file must be in Mozilla/Netscape format and the first line of the cookies file must be either `# HTTP Cookie File` or `# Netscape HTTP Cookie File`. Make sure you have correct [newline format](https://en.wikipedia.org/wiki/Newline) in the cookies file and convert newlines if necessary to correspond with your OS, namely `CRLF` (`\r\n`) for Windows, `LF` (`\n`) for Linux and `CR` (`\r`) for Mac OS. `HTTP Error 400: Bad Request` when using `--cookies` is a good sign of invalid newline format. +Use the `--cookies` option, for example `--cookies /path/to/cookies/file.txt`. + +In order to extract cookies from browser use any conforming browser extension for exporting cookies. For example, [cookies.txt](https://chrome.google.com/webstore/detail/cookiestxt/njabckikapfpffapmjgojcnbfjonfjfg) (for Chrome) or [Export Cookies](https://addons.mozilla.org/en-US/firefox/addon/export-cookies/) (for Firefox). + +Note that the cookies file must be in Mozilla/Netscape format and the first line of the cookies file must be either `# HTTP Cookie File` or `# Netscape HTTP Cookie File`. Make sure you have correct [newline format](https://en.wikipedia.org/wiki/Newline) in the cookies file and convert newlines if necessary to correspond with your OS, namely `CRLF` (`\r\n`) for Windows and `LF` (`\n`) for Unix and Unix-like systems (Linux, Mac OS, etc.). `HTTP Error 400: Bad Request` when using `--cookies` is a good sign of invalid newline format. Passing cookies to youtube-dl is a good way to workaround login when a particular extractor does not implement it explicitly. Another use case is working around [CAPTCHA](https://en.wikipedia.org/wiki/CAPTCHA) some websites require you to solve in particular cases in order to get access (e.g. YouTube, CloudFlare). +### How do I stream directly to media player? + +You will first need to tell youtube-dl to stream media to stdout with `-o -`, and also tell your media player to read from stdin (it must be capable of this for streaming) and then pipe former to latter. For example, streaming to [vlc](http://www.videolan.org/) can be achieved with: + + youtube-dl -o - "http://www.youtube.com/watch?v=BaW_jenozKcj" | vlc - + +### How do I download only new videos from a playlist? + +Use download-archive feature. With this feature you should initially download the complete playlist with `--download-archive /path/to/download/archive/file.txt` that will record identifiers of all the videos in a special file. Each subsequent run with the same `--download-archive` will download only new videos and skip all videos that have been downloaded before. Note that only successful downloads are recorded in the file. + +For example, at first, + + youtube-dl --download-archive archive.txt "https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re" + +will download the complete `PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re` playlist and create a file `archive.txt`. Each subsequent run will only download new videos if any: + + youtube-dl --download-archive archive.txt "https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re" + +### Should I add `--hls-prefer-native` into my config? + +When youtube-dl detects an HLS video, it can download it either with the built-in downloader or ffmpeg. Since many HLS streams are slightly invalid and ffmpeg/youtube-dl each handle some invalid cases better than the other, there is an option to switch the downloader if needed. + +When youtube-dl knows that one particular downloader works better for a given website, that downloader will be picked. Otherwise, youtube-dl will pick the best downloader for general compatibility, which at the moment happens to be ffmpeg. This choice may change in future versions of youtube-dl, with improvements of the built-in downloader and/or ffmpeg. + +In particular, the generic extractor (used when your website is not in the [list of supported sites by youtube-dl](http://rg3.github.io/youtube-dl/supportedsites.html) cannot mandate one specific downloader. + +If you put either `--hls-prefer-native` or `--hls-prefer-ffmpeg` into your configuration, a different subset of videos will fail to download correctly. Instead, it is much better to [file an issue](https://yt-dl.org/bug) or a pull request which details why the native or the ffmpeg HLS downloader is a better choice for your use case. + ### Can you add support for this anime video site, or site which shows current movies for free? As a matter of policy (as well as legality), youtube-dl does not include support for services that specialize in infringing copyright. As a rule of thumb, if you cannot easily find a video that the service is quite obviously allowed to distribute (i.e. that has been uploaded by the creator, the creator's distributor, or is published under a free license), the service is probably unfit for inclusion to youtube-dl. @@ -858,7 +902,7 @@ If you want to find out whether a given URL is supported, simply call youtube-dl # Why do I need to go through that much red tape when filing bugs? -Before we had the issue template, despite our extensive [bug reporting instructions](#bugs), about 80% of the issue reports we got were useless, for instance because people used ancient versions hundreds of releases old, because of simple syntactic errors (not in youtube-dl but in general shell usage), because the problem was alrady reported multiple times before, because people did not actually read an error message, even if it said "please install ffmpeg", because people did not mention the URL they were trying to download and many more simple, easy-to-avoid problems, many of whom were totally unrelated to youtube-dl. +Before we had the issue template, despite our extensive [bug reporting instructions](#bugs), about 80% of the issue reports we got were useless, for instance because people used ancient versions hundreds of releases old, because of simple syntactic errors (not in youtube-dl but in general shell usage), because the problem was already reported multiple times before, because people did not actually read an error message, even if it said "please install ffmpeg", because people did not mention the URL they were trying to download and many more simple, easy-to-avoid problems, many of whom were totally unrelated to youtube-dl. youtube-dl is an open-source project manned by too few volunteers, so we'd rather spend time fixing bugs where we are certain none of those simple problems apply, and where we can be reasonably confident to be able to reproduce the issue without asking the reporter repeatedly. As such, the output of `youtube-dl -v YOUR_URL_HERE` is really all that's required to file an issue. The issue template also guides you through some basic steps you can do, such as checking that your version of youtube-dl is current. @@ -879,16 +923,16 @@ To run the test, simply invoke your favorite test runner, or execute a test file If you want to create a build of youtube-dl yourself, you'll need * python -* make (both GNU make and BSD make are supported) +* make (only GNU make is supported) * pandoc * zip * nosetests ### Adding support for a new site -If you want to add support for a new site, first of all **make sure** this site is **not dedicated to [copyright infringement](#can-you-add-support-for-this-anime-video-site-or-site-which-shows-current-movies-for-free)**. youtube-dl does **not support** such sites thus pull requests adding support for them **will be rejected**. +If you want to add support for a new site, first of all **make sure** this site is **not dedicated to [copyright infringement](README.md#can-you-add-support-for-this-anime-video-site-or-site-which-shows-current-movies-for-free)**. youtube-dl does **not support** such sites thus pull requests adding support for them **will be rejected**. -After you have ensured this site is distributing it's content legally, you can follow this quick list (assuming your service is called `yourextractor`): +After you have ensured this site is distributing its content legally, you can follow this quick list (assuming your service is called `yourextractor`): 1. [Fork this repository](https://github.com/rg3/youtube-dl/fork) 2. Check out the source code with: @@ -918,7 +962,7 @@ After you have ensured this site is distributing it's content legally, you can f 'id': '42', 'ext': 'mp4', 'title': 'Video title goes here', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', # TODO more properties, either as: # * A value # * MD5 checksum; start the string with md5: @@ -961,19 +1005,19 @@ In any case, thank you very much for your contributions! This section introduces a guide lines for writing idiomatic, robust and future-proof extractor code. -Extractors are very fragile by nature since they depend on the layout of the source data provided by 3rd party media hoster out of your control and this layout tend to change. As an extractor implementer your task is not only to write code that will extract media links and metadata correctly but also to minimize code dependency on source's layout changes and even to make the code foresee potential future changes and be ready for that. This is important because it will allow extractor not to break on minor layout changes thus keeping old youtube-dl versions working. Even though this breakage issue is easily fixed by emitting a new version of youtube-dl with fix incorporated all the previous version become broken in all repositories and distros' packages that may not be so prompt in fetching the update from us. Needless to say some may never receive an update at all that is possible for non rolling release distros. +Extractors are very fragile by nature since they depend on the layout of the source data provided by 3rd party media hosters out of your control and this layout tends to change. As an extractor implementer your task is not only to write code that will extract media links and metadata correctly but also to minimize dependency on the source's layout and even to make the code foresee potential future changes and be ready for that. This is important because it will allow the extractor not to break on minor layout changes thus keeping old youtube-dl versions working. Even though this breakage issue is easily fixed by emitting a new version of youtube-dl with a fix incorporated, all the previous versions become broken in all repositories and distros' packages that may not be so prompt in fetching the update from us. Needless to say, some non rolling release distros may never receive an update at all. ### Mandatory and optional metafields -For extraction to work youtube-dl relies on metadata your extractor extracts and provides to youtube-dl expressed by [information dictionary](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L75-L257) or simply *info dict*. Only the following meta fields in *info dict* are considered mandatory for successful extraction process by youtube-dl: +For extraction to work youtube-dl relies on metadata your extractor extracts and provides to youtube-dl expressed by an [information dictionary](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L75-L257) or simply *info dict*. Only the following meta fields in the *info dict* are considered mandatory for a successful extraction process by youtube-dl: - `id` (media identifier) - `title` (media title) - `url` (media download URL) or `formats` -In fact only the last option is technically mandatory (i.e. if you can't figure out the download location of the media the extraction does not make any sense). But by convention youtube-dl also treats `id` and `title` to be mandatory. Thus aforementioned metafields are the critical data the extraction does not make any sense without and if any of them fail to be extracted then extractor is considered completely broken. +In fact only the last option is technically mandatory (i.e. if you can't figure out the download location of the media the extraction does not make any sense). But by convention youtube-dl also treats `id` and `title` as mandatory. Thus the aforementioned metafields are the critical data that the extraction does not make any sense without and if any of them fail to be extracted then the extractor is considered completely broken. -[Any field](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L149-L257) apart from the aforementioned ones are considered **optional**. That means that extraction should be **tolerate** to situations when sources for these fields can potentially be unavailable (even if they are always available at the moment) and **future-proof** in order not to break the extraction of general purpose mandatory fields. +[Any field](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L149-L257) apart from the aforementioned ones are considered **optional**. That means that extraction should be **tolerant** to situations when sources for these fields can potentially be unavailable (even if they are always available at the moment) and **future-proof** in order not to break the extraction of general purpose mandatory fields. #### Example @@ -993,7 +1037,7 @@ Assume at this point `meta`'s layout is: } ``` -Assume you want to extract `summary` and put into resulting info dict as `description`. Since `description` is optional metafield you should be ready that this key may be missing from the `meta` dict, so that you should extract it like: +Assume you want to extract `summary` and put it into the resulting info dict as `description`. Since `description` is an optional meta field you should be ready that this key may be missing from the `meta` dict, so that you should extract it like: ```python description = meta.get('summary') # correct @@ -1005,7 +1049,7 @@ and not like: description = meta['summary'] # incorrect ``` -The latter will break extraction process with `KeyError` if `summary` disappears from `meta` at some time later but with former approach extraction will just go ahead with `description` set to `None` that is perfectly fine (remember `None` is equivalent for absence of data). +The latter will break extraction process with `KeyError` if `summary` disappears from `meta` at some later time but with the former approach extraction will just go ahead with `description` set to `None` which is perfectly fine (remember `None` is equivalent to the absence of data). Similarly, you should pass `fatal=False` when extracting optional data from a webpage with `_search_regex`, `_html_search_regex` or similar methods, for instance: @@ -1025,21 +1069,21 @@ description = self._search_regex( webpage, 'description', default=None) ``` -On failure this code will silently continue the extraction with `description` set to `None`. That is useful for metafields that are known to may or may not be present. +On failure this code will silently continue the extraction with `description` set to `None`. That is useful for metafields that may or may not be present. ### Provide fallbacks -When extracting metadata try to provide several scenarios for that. For example if `title` is present in several places/sources try extracting from at least some of them. This would make it more future-proof in case some of the sources became unavailable. +When extracting metadata try to do so from multiple sources. For example if `title` is present in several places, try extracting from at least some of them. This makes it more future-proof in case some of the sources become unavailable. #### Example -Say `meta` from previous example has a `title` and you are about to extract it. Since `title` is mandatory meta field you should end up with something like: +Say `meta` from the previous example has a `title` and you are about to extract it. Since `title` is a mandatory meta field you should end up with something like: ```python title = meta['title'] ``` -If `title` disappeares from `meta` in future due to some changes on hoster's side the extraction would fail since `title` is mandatory. That's expected. +If `title` disappears from `meta` in future due to some changes on the hoster's side the extraction would fail since `title` is mandatory. That's expected. Assume that you have some another source you can extract `title` from, for example `og:title` HTML meta of a `webpage`. In this case you can provide a fallback scenario: @@ -1076,7 +1120,7 @@ title = self._search_regex( webpage, 'title', group='title') ``` -Note how you tolerate potential changes in `style` attribute's value or switch from using double quotes to single for `class` attribute: +Note how you tolerate potential changes in the `style` attribute's value or switch from using double quotes to single for `class` attribute: The code definitely should not look like: @@ -1105,7 +1149,7 @@ with youtube_dl.YoutubeDL(ydl_opts) as ydl: ydl.download(['http://www.youtube.com/watch?v=BaW_jenozKc']) ``` -Most likely, you'll want to use various options. For a list of options available, have a look at [`youtube_dl/YoutubeDL.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/YoutubeDL.py#L128-L278). For a start, if you want to intercept youtube-dl's output, set a `logger` object. +Most likely, you'll want to use various options. For a list of options available, have a look at [`youtube_dl/YoutubeDL.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/YoutubeDL.py#L129-L279). For a start, if you want to intercept youtube-dl's output, set a `logger` object. Here's a more complete example of a program that outputs only errors (and a short message after the download is finished), and downloads/converts the video to an mp3 file: @@ -1146,7 +1190,7 @@ with youtube_dl.YoutubeDL(ydl_opts) as ydl: # BUGS -Bugs and suggestions should be reported at: . Unless you were prompted so or there is another pertinent reason (e.g. GitHub fails to accept the bug report), please do not send bug reports via personal email. For discussions, join us in the IRC channel [#youtube-dl](irc://chat.freenode.net/#youtube-dl) on freenode ([webchat](http://webchat.freenode.net/?randomnick=1&channels=youtube-dl)). +Bugs and suggestions should be reported at: . Unless you were prompted to or there is another pertinent reason (e.g. GitHub fails to accept the bug report), please do not send bug reports via personal email. For discussions, join us in the IRC channel [#youtube-dl](irc://chat.freenode.net/#youtube-dl) on freenode ([webchat](http://webchat.freenode.net/?randomnick=1&channels=youtube-dl)). **Please include the full output of youtube-dl when run with `-v`**, i.e. **add** `-v` flag to **your command line**, copy the **whole** output and post it in the issue body wrapped in \`\`\` for better formatting. It should look similar to this: ``` @@ -1162,7 +1206,7 @@ $ youtube-dl -v [debug] Proxy map: {} ... ``` -**Do not post screenshots of verbose log only plain text is acceptable.** +**Do not post screenshots of verbose logs; only plain text is acceptable.** The output (including the first lines) contains important debugging information. Issues without the full output are often not reproducible and therefore do not get solved in short order, if ever. @@ -1196,7 +1240,7 @@ Make sure that someone has not already opened the issue you're trying to open. S ### Why are existing options not enough? -Before requesting a new feature, please have a quick peek at [the list of supported options](https://github.com/rg3/youtube-dl/blob/master/README.md#synopsis). Many feature requests are for features that actually exist already! Please, absolutely do show off your work in the issue report and detail how the existing similar options do *not* solve your problem. +Before requesting a new feature, please have a quick peek at [the list of supported options](https://github.com/rg3/youtube-dl/blob/master/README.md#options). Many feature requests are for features that actually exist already! Please, absolutely do show off your work in the issue report and detail how the existing similar options do *not* solve your problem. ### Is there enough context in your bug report? @@ -1208,7 +1252,7 @@ We are then presented with a very complicated request when the original problem Some of our users seem to think there is a limit of issues they can or should open. There is no limit of issues they can or should open. While it may seem appealing to be able to dump all your issues into one ticket, that means that someone who solves one of your issues cannot mark the issue as closed. Typically, reporting a bunch of issues leads to the ticket lingering since nobody wants to attack that behemoth, until someone mercifully splits the issue into multiple ones. -In particular, every site support request issue should only pertain to services at one site (generally under a common domain, but always using the same backend technology). Do not request support for vimeo user videos, Whitehouse podcasts, and Google Plus pages in the same issue. Also, make sure that you don't post bug reports alongside feature requests. As a rule of thumb, a feature request does not include outputs of youtube-dl that are not immediately related to the feature at hand. Do not post reports of a network error alongside the request for a new video service. +In particular, every site support request issue should only pertain to services at one site (generally under a common domain, but always using the same backend technology). Do not request support for vimeo user videos, White house podcasts, and Google Plus pages in the same issue. Also, make sure that you don't post bug reports alongside feature requests. As a rule of thumb, a feature request does not include outputs of youtube-dl that are not immediately related to the feature at hand. Do not post reports of a network error alongside the request for a new video service. ### Is anyone going to need the feature? @@ -1216,7 +1260,7 @@ Only post features that you (or an incapacitated friend you can personally talk ### Is your question about youtube-dl? -It may sound strange, but some bug reports we receive are completely unrelated to youtube-dl and relate to a different or even the reporter's own application. Please make sure that you are actually using youtube-dl. If you are using a UI for youtube-dl, report the bug to the maintainer of the actual application providing the UI. On the other hand, if your UI for youtube-dl fails in some way you believe is related to youtube-dl, by all means, go ahead and report the bug. +It may sound strange, but some bug reports we receive are completely unrelated to youtube-dl and relate to a different, or even the reporter's own, application. Please make sure that you are actually using youtube-dl. If you are using a UI for youtube-dl, report the bug to the maintainer of the actual application providing the UI. On the other hand, if your UI for youtube-dl fails in some way you believe is related to youtube-dl, by all means, go ahead and report the bug. # COPYRIGHT diff --git a/devscripts/bash-completion.py b/devscripts/bash-completion.py index ce68f26f9..3d1391334 100755 --- a/devscripts/bash-completion.py +++ b/devscripts/bash-completion.py @@ -25,5 +25,6 @@ def build_completion(opt_parser): filled_template = template.replace("{{flags}}", " ".join(opts_flag)) f.write(filled_template) + parser = youtube_dl.parseOpts()[0] build_completion(parser) diff --git a/devscripts/buildserver.py b/devscripts/buildserver.py index fc99c3213..1344b4d87 100644 --- a/devscripts/buildserver.py +++ b/devscripts/buildserver.py @@ -424,8 +424,6 @@ class BuildHTTPRequestHandler(compat_http_server.BaseHTTPRequestHandler): self.send_header('Content-Length', len(msg)) self.end_headers() self.wfile.write(msg) - except HTTPError as e: - self.send_response(e.code, str(e)) else: self.send_response(500, 'Unknown build method "%s"' % action) else: diff --git a/devscripts/create-github-release.py b/devscripts/create-github-release.py index 3b8021e74..30716ad8e 100644 --- a/devscripts/create-github-release.py +++ b/devscripts/create-github-release.py @@ -2,11 +2,13 @@ from __future__ import unicode_literals import base64 +import io import json import mimetypes import netrc import optparse import os +import re import sys sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) @@ -90,16 +92,23 @@ class GitHubReleaser(object): def main(): - parser = optparse.OptionParser(usage='%prog VERSION BUILDPATH') + parser = optparse.OptionParser(usage='%prog CHANGELOG VERSION BUILDPATH') options, args = parser.parse_args() - if len(args) != 2: + if len(args) != 3: parser.error('Expected a version and a build directory') - version, build_path = args + changelog_file, version, build_path = args + + with io.open(changelog_file, encoding='utf-8') as inf: + changelog = inf.read() + + mobj = re.search(r'(?s)version %s\n{2}(.+?)\n{3}' % version, changelog) + body = mobj.group(1) if mobj else '' releaser = GitHubReleaser() - new_release = releaser.create_release(version, name='youtube-dl %s' % version) + new_release = releaser.create_release( + version, name='youtube-dl %s' % version, body=body) release_id = new_release['id'] for asset in os.listdir(build_path): diff --git a/devscripts/fish-completion.py b/devscripts/fish-completion.py index 41629d87d..51d19dd33 100755 --- a/devscripts/fish-completion.py +++ b/devscripts/fish-completion.py @@ -44,5 +44,6 @@ def build_completion(opt_parser): with open(FISH_COMPLETION_FILE, 'w') as f: f.write(filled_template) + parser = youtube_dl.parseOpts()[0] build_completion(parser) diff --git a/devscripts/generate_aes_testdata.py b/devscripts/generate_aes_testdata.py index 2e389fc8e..e3df42cc2 100644 --- a/devscripts/generate_aes_testdata.py +++ b/devscripts/generate_aes_testdata.py @@ -23,6 +23,7 @@ def openssl_encode(algo, key, iv): out, _ = prog.communicate(secret_msg) return out + iv = key = [0x20, 0x15] + 14 * [0] r = openssl_encode('aes-128-cbc', key, iv) diff --git a/devscripts/gh-pages/update-sites.py b/devscripts/gh-pages/update-sites.py index 503c1372f..531c93c70 100755 --- a/devscripts/gh-pages/update-sites.py +++ b/devscripts/gh-pages/update-sites.py @@ -32,5 +32,6 @@ def main(): with open('supportedsites.html', 'w', encoding='utf-8') as sitesf: sitesf.write(template) + if __name__ == '__main__': main() diff --git a/devscripts/lazy_load_template.py b/devscripts/lazy_load_template.py index 2e6e6641b..c4e5fc1f4 100644 --- a/devscripts/lazy_load_template.py +++ b/devscripts/lazy_load_template.py @@ -1,4 +1,4 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals import re diff --git a/devscripts/make_contributing.py b/devscripts/make_contributing.py index 5e454a429..226d1a5d6 100755 --- a/devscripts/make_contributing.py +++ b/devscripts/make_contributing.py @@ -28,5 +28,6 @@ def main(): with io.open(outfile, 'w', encoding='utf-8') as outf: outf.write(out) + if __name__ == '__main__': main() diff --git a/devscripts/make_lazy_extractors.py b/devscripts/make_lazy_extractors.py index 9a79c2bc5..19114d30d 100644 --- a/devscripts/make_lazy_extractors.py +++ b/devscripts/make_lazy_extractors.py @@ -59,6 +59,7 @@ def build_lazy_ie(ie, name): s += make_valid_template.format(valid_url=ie._make_valid_url()) return s + # find the correct sorting and add the required base classes so that sublcasses # can be correctly created classes = _ALL_CLASSES[:-1] diff --git a/devscripts/make_supportedsites.py b/devscripts/make_supportedsites.py index 8cb4a4638..764795bc5 100644 --- a/devscripts/make_supportedsites.py +++ b/devscripts/make_supportedsites.py @@ -41,5 +41,6 @@ def main(): with io.open(outfile, 'w', encoding='utf-8') as outf: outf.write(out) + if __name__ == '__main__': main() diff --git a/devscripts/prepare_manpage.py b/devscripts/prepare_manpage.py index e3f6339b5..f9fe63f1f 100644 --- a/devscripts/prepare_manpage.py +++ b/devscripts/prepare_manpage.py @@ -54,21 +54,26 @@ def filter_options(readme): if in_options: if line.lstrip().startswith('-'): - option, description = re.split(r'\s{2,}', line.lstrip()) - split_option = option.split(' ') + split = re.split(r'\s{2,}', line.lstrip()) + # Description string may start with `-` as well. If there is + # only one piece then it's a description bit not an option. + if len(split) > 1: + option, description = split + split_option = option.split(' ') - if not split_option[-1].startswith('-'): # metavar - option = ' '.join(split_option[:-1] + ['*%s*' % split_option[-1]]) + if not split_option[-1].startswith('-'): # metavar + option = ' '.join(split_option[:-1] + ['*%s*' % split_option[-1]]) - # Pandoc's definition_lists. See http://pandoc.org/README.html - # for more information. - ret += '\n%s\n: %s\n' % (option, description) - else: - ret += line.lstrip() + '\n' + # Pandoc's definition_lists. See http://pandoc.org/README.html + # for more information. + ret += '\n%s\n: %s\n' % (option, description) + continue + ret += line.lstrip() + '\n' else: ret += line + '\n' return ret + if __name__ == '__main__': main() diff --git a/devscripts/release.sh b/devscripts/release.sh index ca6ae1b49..4db5def5d 100755 --- a/devscripts/release.sh +++ b/devscripts/release.sh @@ -60,6 +60,9 @@ if ! type pandoc >/dev/null 2>/dev/null; then echo 'ERROR: pandoc is missing'; e if ! python3 -c 'import rsa' 2>/dev/null; then echo 'ERROR: python3-rsa is missing'; exit 1; fi if ! python3 -c 'import wheel' 2>/dev/null; then echo 'ERROR: wheel is missing'; exit 1; fi +read -p "Is ChangeLog up to date? (y/n) " -n 1 +if [[ ! $REPLY =~ ^[Yy]$ ]]; then exit 1; fi + /bin/echo -e "\n### First of all, testing..." make clean if $skip_tests ; then @@ -107,7 +110,7 @@ RELEASE_FILES="youtube-dl youtube-dl.exe youtube-dl-$version.tar.gz" for f in $RELEASE_FILES; do gpg --passphrase-repeat 5 --detach-sig "build/$version/$f"; done ROOT=$(pwd) -python devscripts/create-github-release.py $version "$ROOT/build/$version" +python devscripts/create-github-release.py ChangeLog $version "$ROOT/build/$version" ssh ytdl@yt-dl.org "sh html/update_latest.sh $version" diff --git a/devscripts/zsh-completion.py b/devscripts/zsh-completion.py index 04728e8e2..60aaf76cc 100755 --- a/devscripts/zsh-completion.py +++ b/devscripts/zsh-completion.py @@ -44,5 +44,6 @@ def build_completion(opt_parser): with open(ZSH_COMPLETION_FILE, "w") as f: f.write(template) + parser = youtube_dl.parseOpts()[0] build_completion(parser) diff --git a/docs/conf.py b/docs/conf.py index 594ca61a6..0aaf1b8fc 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -1,4 +1,4 @@ -# -*- coding: utf-8 -*- +# coding: utf-8 # # youtube-dl documentation build configuration file, created by # sphinx-quickstart on Fri Mar 14 21:05:43 2014. diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 2c8f950df..d900f5e12 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -13,12 +13,16 @@ - **5min** - **8tracks** - **91porn** + - **9c9media** + - **9c9media:stack** - **9gag** - **9now.com.au** - **abc.net.au** - - **Abc7News** + - **abc.net.au:iview** - **abcnews** - **abcnews:video** + - **abcotvs**: ABC Owned Television Stations + - **abcotvs:clips** - **AcademicEarth:Course** - **acast** - **acast:channel** @@ -29,12 +33,14 @@ - **AdobeTVVideo** - **AdultSwim** - **aenetworks**: A+E Networks: A&E, Lifetime, History.com, FYI Network - - **AfreecaTV**: afreecatv.com - - **Aftonbladet** + - **afreecatv**: afreecatv.com + - **afreecatv:global**: afreecatv.com - **AirMozilla** - **AlJazeera** - **Allocine** - **AlphaPorno** + - **AMCNetworks** + - **anderetijden**: npo.nl and ntr.nl - **AnimeOnDemand** - **anitube.se** - **AnySex** @@ -65,6 +71,12 @@ - **audiomack** - **audiomack:album** - **auroravid**: AuroraVid + - **AWAAN** + - **awaan:live** + - **awaan:season** + - **awaan:video** + - **AZMedien**: AZ Medien videos + - **AZMedienPlaylist**: AZ Medien playlists - **Azubu** - **AzubuLive** - **BaiduVideo**: 百度视频 @@ -77,9 +89,11 @@ - **bbc.co.uk:article**: BBC articles - **bbc.co.uk:iplayer:playlist** - **bbc.co.uk:playlist** - - **BeatportPro** + - **Beam:live** + - **Beatport** - **Beeg** - **BehindKink** + - **BellMedia** - **Bet** - **Bigflix** - **Bild**: Bild.de @@ -101,6 +115,7 @@ - **bt:vestlendingen**: Bergens Tidende - Vestlendingen - **BuzzFeed** - **BYUtv** + - **BYUtvEvent** - **Camdemy** - **CamdemyFolder** - **CamWithHer** @@ -109,17 +124,23 @@ - **Canvas** - **CarambaTV** - **CarambaTVPage** - - **CBC** - - **CBCPlayer** + - **CartoonNetwork** + - **cbc.ca** + - **cbc.ca:player** + - **cbc.ca:watch** + - **cbc.ca:watch:video** - **CBS** - **CBSInteractive** - **CBSLocal** - - **CBSNews**: CBS News - - **CBSNewsLiveVideo**: CBS News Live Videos + - **cbsnews**: CBS News + - **cbsnews:livevideo**: CBS News Live Videos - **CBSSports** + - **CCMA** + - **CCTV**: 央视网 - **CDA** - **CeskaTelevize** - **channel9**: Channel 9 + - **CharlieRose** - **Chaturbate** - **Chilloutzone** - **chirbit** @@ -142,6 +163,7 @@ - **CollegeRama** - **ComCarCoff** - **ComedyCentral** + - **ComedyCentralFullEpisodes** - **ComedyCentralShortname** - **ComedyCentralTV** - **CondeNast**: Condé Nast media group: Allure, Architectural Digest, Ars Technica, Bon Appétit, Brides, Condé Nast, Condé Nast Traveler, Details, Epicurious, GQ, Glamour, Golf Digest, SELF, Teen Vogue, The New Yorker, Vanity Fair, Vogue, W Magazine, WIRED @@ -155,10 +177,11 @@ - **CSNNE** - **CSpan**: C-SPAN - **CtsNews**: 華視新聞 - - **CTV** - **CTVNews** - **culturebox.francetvinfo.fr** - **CultureUnplugged** + - **curiositystream** + - **curiositystream:collection** - **CWTV** - **DailyMail** - **dailymotion** @@ -170,10 +193,6 @@ - **daum.net:playlist** - **daum.net:user** - **DBTV** - - **DCN** - - **dcn:live** - - **dcn:season** - - **dcn:video** - **DctpTv** - **DeezerPlaylist** - **defense.gouv.fr** @@ -182,6 +201,8 @@ - **DigitallySpeaking** - **Digiteka** - **Discovery** + - **DiscoveryGo** + - **Disney** - **Dotsub** - **DouyuTV**: 斗鱼 - **DPlay** @@ -198,6 +219,7 @@ - **EaglePlatform** - **EbaumsWorld** - **EchoMsk** + - **egghead:course**: egghead.io course - **eHow** - **Einthusan** - **eitb.tv** @@ -211,18 +233,19 @@ - **EroProfile** - **Escapist** - **ESPN** + - **ESPNArticle** - **EsriVideo** - **Europa** - **EveryonesMixtape** - - **exfm**: ex.fm - **ExpoTV** - **ExtremeTube** - **EyedoTV** - **facebook** + - **FacebookPluginsVideo** - **faz.net** - **fc2** + - **fc2:embed** - **Fczenit** - - **features.aol.com** - **fernsehkritik.tv** - **Firstpost** - **FiveTV** @@ -232,21 +255,23 @@ - **FootyRoom** - **Formula1** - **FOX** + - **FOX9** - **Foxgay** - - **FoxNews**: Fox News and Fox Business Video + - **foxnews**: Fox News and Fox Business Video + - **foxnews:article** + - **foxnews:insider** - **FoxSports** - **france2.fr:generation-quoi** - **FranceCulture** - - **FranceCultureEmission** - **FranceInter** - **francetv**: France 2, 3, 4, 5 and Ô - **francetvinfo.fr** - **Freesound** - **freespeech.org** - - **FreeVideo** - **Funimation** - **FunnyOrDie** - **Fusion** + - **FXNetworks** - **GameInformer** - **GameOne** - **gameone:playlist** @@ -262,9 +287,9 @@ - **Glide**: Glide mobile video messages (glide.me) - **Globo** - **GloboArticle** + - **Go** - **GodTube** - **GodTV** - - **GoldenMoustache** - **Golem** - **GoogleDrive** - **Goshgay** @@ -272,15 +297,19 @@ - **Groupon** - **Hark** - **HBO** + - **HBOEpisode** - **HearThisAt** - **Heise** - **HellPorno** - **Helsinki**: helsinki.fi - **HentaiStigma** + - **HGTV** + - **hgtv.com:show** - **HistoricFilms** - **history:topic**: History.com Topic - **hitbox** - **hitbox:live** + - **HitRecord** - **HornBunny** - **HotNewHipHop** - **HotStar** @@ -288,6 +317,7 @@ - **HowStuffWorks** - **HRTi** - **HRTiPlaylist** + - **Huajiao**: 花椒直播 - **HuffPost**: Huffington Post - **Hypem** - **Iconosquare** @@ -297,6 +327,7 @@ - **Imgur** - **ImgurAlbum** - **Ina** + - **Inc** - **Indavideo** - **IndavideoEmbed** - **InfoQ** @@ -306,10 +337,14 @@ - **IPrima** - **iqiyi**: 爱奇艺 - **Ir90Tv** + - **ITV** - **ivi**: ivi.ru - **ivi:compilation**: ivi.ru compilations - **ivideon**: Ivideon TV + - **Iwara** - **Izlesene** + - **Jamendo** + - **JamendoAlbum** - **JeuxVideo** - **Jove** - **jpopsuki.tv** @@ -322,6 +357,7 @@ - **KarriereVideos** - **keek** - **KeezMovies** + - **Ketnet** - **KhanAcademy** - **KickStarter** - **KonserthusetPlay** @@ -336,12 +372,15 @@ - **kuwo:singer**: 酷我音乐 - 歌手 - **kuwo:song**: 酷我音乐 - **la7.it** - - **Laola1Tv** + - **laola1tv** + - **laola1tv:embed** + - **LCI** - **Lcp** - **LcpPlay** - **Le**: 乐视网 - **Learnr** - **Lecture2Go** + - **LEGO** - **Lemonde** - **LePlaylist** - **LetvCloud**: 乐视云 @@ -367,14 +406,19 @@ - **mailru**: Видео@Mail.Ru - **MakersChannel** - **MakerTV** + - **mangomolo:live** + - **mangomolo:video** - **MatchTV** - **MDR**: MDR.DE and KiKA - **media.ccc.de** + - **Meipai**: 美拍 + - **MelonVOD** - **META** - **metacafe** - **Metacritic** - **Mgoon** - **MGTV**: 芒果TV + - **MiaoPai** - **Minhateca** - **MinistryGrid** - **Minoto** @@ -396,10 +440,14 @@ - **MovieClips** - **MovieFap** - **Moviezine** + - **MovingImage** - **MPORA** - **MSN** - - **MTV** + - **mtg**: MTG services + - **mtv** - **mtv.de** + - **mtv81** + - **mtv:video** - **mtvservices:embedded** - **MuenchenTV**: münchen.tv - **MusicPlayOn** @@ -421,6 +469,7 @@ - **NBA** - **NBC** - **NBCNews** + - **NBCOlympics** - **NBCSports** - **NBCSportsVPlayer** - **ndr**: NDR.de - Norddeutscher Rundfunk @@ -440,20 +489,23 @@ - **Newstube** - **NextMedia**: 蘋果日報 - **NextMediaActionNews**: 蘋果日報 - 動新聞 + - **NextTV**: 壹電視 - **nfb**: National Film Board of Canada - **nfl.com** + - **NhkVod** - **nhl.com** - **nhl.com:news**: NHL news - **nhl.com:videocenter** - **nhl.com:videocenter:category**: NHL videocenter category - **nick.com** - **nick.de** + - **nicknight** - **niconico**: ニコニコ動画 - **NiconicoPlaylist** - - **NineCNineMedia** - **Nintendo** - **njoy**: N-JOY - **njoy:embed** + - **NobelPrize** - **Noco** - **Normalboots** - **NosVideo** @@ -474,15 +526,20 @@ - **NRKPlaylist** - **NRKSkole**: NRK Skole - **NRKTV**: NRK TV and NRK Radio + - **NRKTVDirekte**: NRK TV Direkte and NRK Radio Direkte + - **NRKTVEpisodes** + - **NRKTVSeries** - **ntv.ru** - **Nuvid** - **NYTimes** - **NYTimesArticle** + - **NZZ** - **ocw.mit.edu** - **OdaTV** - **Odnoklassniki** - **OktoberfestTV** - **on.aol.com** + - **OnDemandKorea** - **onet.tv** - **onet.tv:channel** - **OnionStudios** @@ -494,6 +551,7 @@ - **orf:iptv**: iptv.ORF.at - **orf:oe1**: Radio Österreich 1 - **orf:tvthek**: ORF TVthek + - **PandaTV**: 熊猫TV - **pandora.tv**: 판도라TV - **parliamentlive.tv**: UK parliament videos - **Patreon** @@ -505,10 +563,10 @@ - **PhilharmonieDeParis**: Philharmonie de Paris - **phoenix.de** - **Photobucket** + - **Piksel** - **Pinkbike** - **Pladform** - **play.fm** - - **played.to** - **PlaysTV** - **Playtvak**: Playtvak.cz, iDNES.cz and Lidovky.cz - **Playvid** @@ -518,7 +576,11 @@ - **plus.google**: Google Plus - **pluzz.francetv.fr** - **podomatic** + - **Pokemon** - **PolskieRadio** + - **PolskieRadioCategory** + - **PornCom** + - **PornFlip** - **PornHd** - **PornHub**: PornHub and Thumbzilla - **PornHubPlaylist** @@ -551,6 +613,8 @@ - **RDS**: RDS.ca - **RedTube** - **RegioTV** + - **RENTV** + - **RENTVArticle** - **Restudy** - **Reuters** - **ReverbNation** @@ -558,10 +622,12 @@ - **revision3:embed** - **RICE** - **RingTV** + - **RMCDecouverte** - **RockstarGames** - **RoosterTeeth** - **RottenTomatoes** - **Roxwel** + - **Rozhlas** - **RTBF** - **rte**: Raidió Teilifís Éireann TV - **rte:radio**: Raidió Teilifís Éireann radio @@ -596,16 +662,14 @@ - **screen.yahoo:search**: Yahoo screen search - **Screencast** - **ScreencastOMatic** - - **ScreenJunkies** - - **ScreenwaveMedia** - **Seeker** - **SenateISVP** - **SendtoNews** - **ServingSys** - **Sexu** - **Shahid** - - **Shared**: shared.sx and vivo.sx - - **ShareSix** + - **Shared**: shared.sx + - **ShowRoomLive** - **Sina** - **SixPlay** - **skynewsarabia:article** @@ -619,6 +683,7 @@ - **smotri:user**: Smotri.com user videos - **Snotr** - **Sohu** + - **SonyLIV** - **soundcloud** - **soundcloud:playlist** - **soundcloud:search**: Soundcloud search @@ -645,7 +710,6 @@ - **sr:mediathek**: Saarländischer Rundfunk - **SRGSSR** - **SRGSSRPlay**: srf.ch, rts.ch, rsi.ch, rtr.ch and swissinfo.ch play sites - - **SSA** - **stanfordoc**: Stanford Open ClassRoom - **Steam** - **Stitcher** @@ -659,16 +723,17 @@ - **SWRMediathek** - **Syfy** - **SztvHu** + - **t-online.de** - **Tagesschau** - **tagesschau:player** - - **Tapely** - **Tass** + - **TBS** - **TDSLifeway** - **teachertube**: teachertube.com videos - **teachertube:user:collection**: teachertube.com user and collection videos - **TeachingChannel** - **Teamcoco** - - **TeamFour** + - **TeamFourStar** - **TechTalks** - **techtv.mit.edu** - **ted** @@ -677,19 +742,22 @@ - **Telecinco**: telecinco.es, cuatro.com and mediaset.es - **Telegraaf** - **TeleMB** + - **TeleQuebec** - **TeleTask** - **Telewebion** - **TF1** + - **TFO** - **TheIntercept** + - **theoperaplatform** - **ThePlatform** - **ThePlatformFeed** - **TheScene** - **TheSixtyOne** - **TheStar** + - **TheWeatherChannel** - **ThisAmericanLife** - **ThisAV** - - **THVideo** - - **THVideoPlaylist** + - **ThisOldHouse** - **tinypic**: tinypic.com videos - **tlc.de** - **TMZ** @@ -703,8 +771,7 @@ - **ToypicsUser**: Toypics user profile - **TrailerAddict** (Currently broken) - **Trilulilu** - - **trollvids** - - **TruTube** + - **TruTV** - **Tube8** - **TubiTv** - **tudou** @@ -722,21 +789,27 @@ - **TV2Article** - **TV3** - **TV4**: tv4.se and tv4play.se + - **TVA** + - **TVANouvelles** + - **TVANouvellesArticle** - **TVC** - **TVCArticle** - **tvigle**: Интернет-телевидение Tvigle.ru - **tvland.com** + - **TVNoe** - **tvp**: Telewizja Polska - **tvp:embed**: Telewizja Polska - **tvp:series** - - **TVPlay**: TV3Play and related services - **Tweakers** - **twitch:chapter** - **twitch:clips** - - **twitch:past_broadcasts** - **twitch:profile** - **twitch:stream** - **twitch:video** + - **twitch:videos:all** + - **twitch:videos:highlights** + - **twitch:videos:past-broadcasts** + - **twitch:videos:uploads** - **twitch:vod** - **twitter** - **twitter:amplify** @@ -744,9 +817,14 @@ - **udemy** - **udemy:course** - **UDNEmbed**: 聯合影音 + - **UKTVPlay** - **Unistra** + - **uol.com.br** + - **uplynk** + - **uplynk:preplay** - **Urort**: NRK P3 Urørt - **URPlay** + - **USANetwork** - **USAToday** - **ustream** - **ustream:channel** @@ -762,10 +840,13 @@ - **VevoPlaylist** - **VGTV**: VGTV, BTTV, FTV, Aftenposten and Aftonbladet - **vh1.com** + - **Viafree** - **Vice** + - **Viceland** - **ViceShow** - **Vidbit** - **Viddler** + - **Videa** - **video.google:search**: Google Video search - **video.mit.edu** - **VideoDetective** @@ -775,7 +856,6 @@ - **videomore:season** - **videomore:video** - **VideoPremium** - - **VideoTt**: video.tt - Your True Tube (Currently broken) - **videoweed**: VideoWeed - **Vidio** - **vidme** @@ -802,10 +882,15 @@ - **Vimple**: Vimple - one-click video hosting - **Vine** - **vine:user** + - **Viu** + - **viu:ott** + - **viu:playlist** + - **Vivo**: vivo.sx - **vk**: VK - **vk:uservideos**: VK - User's Videos - **vk:wallpost** - **vlive** + - **vlive:channel** - **Vodlocker** - **VODPlatform** - **VoiceRepublic** @@ -815,6 +900,9 @@ - **VRT** - **vube**: Vube.com - **VuClip** + - **VVVVID** + - **VyboryMos** + - **Vzaar** - **Walla** - **washingtonpost** - **washingtonpost:article** @@ -822,13 +910,15 @@ - **WatchIndianPorn**: Watch Indian Porn - **WDR** - **wdr:mobile** + - **Webcaster** + - **WebcasterFeed** - **WebOfStories** - **WebOfStoriesPlaylist** - **WeiqiTV**: WQTV - **wholecloud**: WholeCloud - **Wimp** - **Wistia** - - **WNL** + - **wnl**: npo.nl and ntr.nl - **WorldStarHipHop** - **wrzuta.pl** - **wrzuta.pl:playlist** @@ -882,6 +972,4 @@ - **Zapiks** - **ZDF** - **ZDFChannel** - - **zingmp3:album**: mp3.zing.vn albums - - **zingmp3:song**: mp3.zing.vn songs - - **ZippCast** + - **zingmp3**: mp3.zing.vn diff --git a/setup.py b/setup.py index 508b27f37..ce6dd1870 100644 --- a/setup.py +++ b/setup.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*- +# coding: utf-8 from __future__ import print_function diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py index a98305c74..437c7270e 100644 --- a/test/test_InfoExtractor.py +++ b/test/test_InfoExtractor.py @@ -84,5 +84,6 @@ class TestInfoExtractor(unittest.TestCase): self.assertRaises(ExtractorError, self.ie._download_json, uri, None) self.assertEqual(self.ie._download_json(uri, None, fatal=False), None) + if __name__ == '__main__': unittest.main() diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index 0dfe25c00..8bf00bea9 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -605,6 +605,7 @@ class TestYoutubeDL(unittest.TestCase): 'extractor': 'TEST', 'duration': 30, 'filesize': 10 * 1024, + 'playlist_id': '42', } second = { 'id': '2', @@ -614,6 +615,7 @@ class TestYoutubeDL(unittest.TestCase): 'duration': 10, 'description': 'foo', 'filesize': 5 * 1024, + 'playlist_id': '43', } videos = [first, second] @@ -650,6 +652,10 @@ class TestYoutubeDL(unittest.TestCase): res = get_videos(f) self.assertEqual(res, ['1']) + f = match_filter_func('playlist_id = 42') + res = get_videos(f) + self.assertEqual(res, ['1']) + def test_playlist_items_selection(self): entries = [{ 'id': compat_str(i), diff --git a/test/test_aes.py b/test/test_aes.py index 315a3f5ae..54078a66d 100644 --- a/test/test_aes.py +++ b/test/test_aes.py @@ -51,5 +51,6 @@ class TestAES(unittest.TestCase): decrypted = (aes_decrypt_text(encrypted, password, 32)) self.assertEqual(decrypted, self.secret_msg) + if __name__ == '__main__': unittest.main() diff --git a/test/test_download.py b/test/test_download.py index a3f1c0644..463952989 100644 --- a/test/test_download.py +++ b/test/test_download.py @@ -60,6 +60,7 @@ def _file_md5(fn): with open(fn, 'rb') as f: return hashlib.md5(f.read()).hexdigest() + defs = gettestcases() @@ -217,6 +218,7 @@ def generator(test_case): return test_template + # And add them to TestDownload for n, test_case in enumerate(defs): test_method = generator(test_case) diff --git a/test/test_execution.py b/test/test_execution.py index 620db080e..11661bb68 100644 --- a/test/test_execution.py +++ b/test/test_execution.py @@ -39,5 +39,6 @@ class TestExecution(unittest.TestCase): _, stderr = p.communicate() self.assertFalse(stderr) + if __name__ == '__main__': unittest.main() diff --git a/test/test_http.py b/test/test_http.py index fdc68ccb4..7a7a3510f 100644 --- a/test/test_http.py +++ b/test/test_http.py @@ -87,7 +87,7 @@ class TestHTTP(unittest.TestCase): ydl = YoutubeDL({'logger': FakeLogger()}) r = ydl.extract_info('http://localhost:%d/302' % self.port) - self.assertEqual(r['url'], 'http://localhost:%d/vid.mp4' % self.port) + self.assertEqual(r['entries'][0]['url'], 'http://localhost:%d/vid.mp4' % self.port) class TestHTTPS(unittest.TestCase): @@ -111,7 +111,7 @@ class TestHTTPS(unittest.TestCase): ydl = YoutubeDL({'logger': FakeLogger(), 'nocheckcertificate': True}) r = ydl.extract_info('https://localhost:%d/video.html' % self.port) - self.assertEqual(r['url'], 'https://localhost:%d/vid.mp4' % self.port) + self.assertEqual(r['entries'][0]['url'], 'https://localhost:%d/vid.mp4' % self.port) def _build_proxy_handler(name): @@ -169,5 +169,6 @@ class TestProxy(unittest.TestCase): # b'xn--fiq228c' is '中文'.encode('idna') self.assertEqual(response, 'normal: http://xn--fiq228c.tw/') + if __name__ == '__main__': unittest.main() diff --git a/test/test_iqiyi_sdk_interpreter.py b/test/test_iqiyi_sdk_interpreter.py index 9d95cb606..789059dbe 100644 --- a/test/test_iqiyi_sdk_interpreter.py +++ b/test/test_iqiyi_sdk_interpreter.py @@ -43,5 +43,6 @@ class TestIqiyiSDKInterpreter(unittest.TestCase): ie._login() self.assertTrue('unable to log in:' in logger.messages[0]) + if __name__ == '__main__': unittest.main() diff --git a/test/test_jsinterp.py b/test/test_jsinterp.py index 63c350b8f..c24b8ca74 100644 --- a/test/test_jsinterp.py +++ b/test/test_jsinterp.py @@ -104,6 +104,14 @@ class TestJSInterpreter(unittest.TestCase): }''') self.assertEqual(jsi.call_function('x'), [20, 20, 30, 40, 50]) + def test_call(self): + jsi = JSInterpreter(''' + function x() { return 2; } + function y(a) { return x() + a; } + function z() { return y(3); } + ''') + self.assertEqual(jsi.call_function('z'), 5) + if __name__ == '__main__': unittest.main() diff --git a/test/test_utils.py b/test/test_utils.py index 5a2ae4a1e..edc712f07 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -39,9 +39,12 @@ from youtube_dl.utils import ( is_html, js_to_json, limit_length, + mimetype2ext, + month_by_name, ohdave_rsa_encrypt, OnDemandPagedList, orderedSet, + parse_age_limit, parse_duration, parse_filesize, parse_count, @@ -66,6 +69,8 @@ from youtube_dl.utils import ( uppercase_escape, lowercase_escape, url_basename, + base_url, + urljoin, urlencode_postdata, urshift, update_url_query, @@ -289,6 +294,10 @@ class TestUtil(unittest.TestCase): self.assertEqual(unified_strdate('25-09-2014'), '20140925') self.assertEqual(unified_strdate('27.02.2016 17:30'), '20160227') self.assertEqual(unified_strdate('UNKNOWN DATE FORMAT'), None) + self.assertEqual(unified_strdate('Feb 7, 2016 at 6:35 pm'), '20160207') + self.assertEqual(unified_strdate('July 15th, 2013'), '20130715') + self.assertEqual(unified_strdate('September 1st, 2013'), '20130901') + self.assertEqual(unified_strdate('Sep 2nd, 2013'), '20130902') def test_unified_timestamps(self): self.assertEqual(unified_timestamp('December 21, 2010'), 1292889600) @@ -309,6 +318,7 @@ class TestUtil(unittest.TestCase): self.assertEqual(unified_timestamp('27.02.2016 17:30'), 1456594200) self.assertEqual(unified_timestamp('UNKNOWN DATE FORMAT'), None) self.assertEqual(unified_timestamp('May 16, 2016 11:15 PM'), 1463440500) + self.assertEqual(unified_timestamp('Feb 7, 2016 at 6:35 pm'), 1454870100) def test_determine_ext(self): self.assertEqual(determine_ext('http://example.com/foo/bar.mp4/?download'), 'mp4') @@ -432,6 +442,44 @@ class TestUtil(unittest.TestCase): url_basename('http://media.w3.org/2010/05/sintel/trailer.mp4'), 'trailer.mp4') + def test_base_url(self): + self.assertEqual(base_url('http://foo.de/'), 'http://foo.de/') + self.assertEqual(base_url('http://foo.de/bar'), 'http://foo.de/') + self.assertEqual(base_url('http://foo.de/bar/'), 'http://foo.de/bar/') + self.assertEqual(base_url('http://foo.de/bar/baz'), 'http://foo.de/bar/') + self.assertEqual(base_url('http://foo.de/bar/baz?x=z/x/c'), 'http://foo.de/bar/') + + def test_urljoin(self): + self.assertEqual(urljoin('http://foo.de/', '/a/b/c.txt'), 'http://foo.de/a/b/c.txt') + self.assertEqual(urljoin('//foo.de/', '/a/b/c.txt'), '//foo.de/a/b/c.txt') + self.assertEqual(urljoin('http://foo.de/', 'a/b/c.txt'), 'http://foo.de/a/b/c.txt') + self.assertEqual(urljoin('http://foo.de', '/a/b/c.txt'), 'http://foo.de/a/b/c.txt') + self.assertEqual(urljoin('http://foo.de', 'a/b/c.txt'), 'http://foo.de/a/b/c.txt') + self.assertEqual(urljoin('http://foo.de/', 'http://foo.de/a/b/c.txt'), 'http://foo.de/a/b/c.txt') + self.assertEqual(urljoin('http://foo.de/', '//foo.de/a/b/c.txt'), '//foo.de/a/b/c.txt') + self.assertEqual(urljoin(None, 'http://foo.de/a/b/c.txt'), 'http://foo.de/a/b/c.txt') + self.assertEqual(urljoin(None, '//foo.de/a/b/c.txt'), '//foo.de/a/b/c.txt') + self.assertEqual(urljoin('', 'http://foo.de/a/b/c.txt'), 'http://foo.de/a/b/c.txt') + self.assertEqual(urljoin(['foobar'], 'http://foo.de/a/b/c.txt'), 'http://foo.de/a/b/c.txt') + self.assertEqual(urljoin('http://foo.de/', None), None) + self.assertEqual(urljoin('http://foo.de/', ''), None) + self.assertEqual(urljoin('http://foo.de/', ['foobar']), None) + self.assertEqual(urljoin('http://foo.de/a/b/c.txt', '.././../d.txt'), 'http://foo.de/d.txt') + + def test_parse_age_limit(self): + self.assertEqual(parse_age_limit(None), None) + self.assertEqual(parse_age_limit(False), None) + self.assertEqual(parse_age_limit('invalid'), None) + self.assertEqual(parse_age_limit(0), 0) + self.assertEqual(parse_age_limit(18), 18) + self.assertEqual(parse_age_limit(21), 21) + self.assertEqual(parse_age_limit(22), None) + self.assertEqual(parse_age_limit('18'), 18) + self.assertEqual(parse_age_limit('18+'), 18) + self.assertEqual(parse_age_limit('PG-13'), 13) + self.assertEqual(parse_age_limit('TV-14'), 14) + self.assertEqual(parse_age_limit('TV-MA'), 17) + def test_parse_duration(self): self.assertEqual(parse_duration(None), None) self.assertEqual(parse_duration(False), None) @@ -462,6 +510,7 @@ class TestUtil(unittest.TestCase): self.assertEqual(parse_duration('1 hour 3 minutes'), 3780) self.assertEqual(parse_duration('87 Min.'), 5220) self.assertEqual(parse_duration('PT1H0.040S'), 3600.04) + self.assertEqual(parse_duration('PT00H03M30SZ'), 210) def test_fix_xml_ampersands(self): self.assertEqual( @@ -610,6 +659,22 @@ class TestUtil(unittest.TestCase): limit_length('foo bar baz asd', 12).startswith('foo bar')) self.assertTrue('...' in limit_length('foo bar baz asd', 12)) + def test_mimetype2ext(self): + self.assertEqual(mimetype2ext(None), None) + self.assertEqual(mimetype2ext('video/x-flv'), 'flv') + self.assertEqual(mimetype2ext('application/x-mpegURL'), 'm3u8') + self.assertEqual(mimetype2ext('text/vtt'), 'vtt') + self.assertEqual(mimetype2ext('text/vtt;charset=utf-8'), 'vtt') + self.assertEqual(mimetype2ext('text/html; charset=utf-8'), 'html') + + def test_month_by_name(self): + self.assertEqual(month_by_name(None), None) + self.assertEqual(month_by_name('December', 'en'), 12) + self.assertEqual(month_by_name('décembre', 'fr'), 12) + self.assertEqual(month_by_name('December'), 12) + self.assertEqual(month_by_name('décembre'), None) + self.assertEqual(month_by_name('Unknown', 'unknown'), None) + def test_parse_codecs(self): self.assertEqual(parse_codecs(''), {}) self.assertEqual(parse_codecs('avc1.77.30, mp4a.40.2'), { @@ -697,6 +762,9 @@ class TestUtil(unittest.TestCase): inp = '''{"foo":101}''' self.assertEqual(js_to_json(inp), '''{"foo":101}''') + inp = '''{"duration": "00:01:07"}''' + self.assertEqual(js_to_json(inp), '''{"duration": "00:01:07"}''') + def test_js_to_json_edgecases(self): on = js_to_json("{abc_def:'1\\'\\\\2\\\\\\'3\"4'}") self.assertEqual(json.loads(on), {"abc_def": "1'\\2\\'3\"4"}) @@ -717,12 +785,27 @@ class TestUtil(unittest.TestCase): on = js_to_json('["abc", "def",]') self.assertEqual(json.loads(on), ['abc', 'def']) + on = js_to_json('[/*comment\n*/"abc"/*comment\n*/,/*comment\n*/"def",/*comment\n*/]') + self.assertEqual(json.loads(on), ['abc', 'def']) + + on = js_to_json('[//comment\n"abc" //comment\n,//comment\n"def",//comment\n]') + self.assertEqual(json.loads(on), ['abc', 'def']) + on = js_to_json('{"abc": "def",}') self.assertEqual(json.loads(on), {'abc': 'def'}) + on = js_to_json('{/*comment\n*/"abc"/*comment\n*/:/*comment\n*/"def"/*comment\n*/,/*comment\n*/}') + self.assertEqual(json.loads(on), {'abc': 'def'}) + on = js_to_json('{ 0: /* " \n */ ",]" , }') self.assertEqual(json.loads(on), {'0': ',]'}) + on = js_to_json('{ /*comment\n*/0/*comment\n*/: /* " \n */ ",]" , }') + self.assertEqual(json.loads(on), {'0': ',]'}) + + on = js_to_json('{ 0: // comment\n1 }') + self.assertEqual(json.loads(on), {'0': 1}) + on = js_to_json(r'["

x<\/p>"]') self.assertEqual(json.loads(on), ['

x

']) @@ -732,15 +815,27 @@ class TestUtil(unittest.TestCase): on = js_to_json("['a\\\nb']") self.assertEqual(json.loads(on), ['ab']) + on = js_to_json("/*comment\n*/[/*comment\n*/'a\\\nb'/*comment\n*/]/*comment\n*/") + self.assertEqual(json.loads(on), ['ab']) + on = js_to_json('{0xff:0xff}') self.assertEqual(json.loads(on), {'255': 255}) + on = js_to_json('{/*comment\n*/0xff/*comment\n*/:/*comment\n*/0xff/*comment\n*/}') + self.assertEqual(json.loads(on), {'255': 255}) + on = js_to_json('{077:077}') self.assertEqual(json.loads(on), {'63': 63}) + on = js_to_json('{/*comment\n*/077/*comment\n*/:/*comment\n*/077/*comment\n*/}') + self.assertEqual(json.loads(on), {'63': 63}) + on = js_to_json('{42:42}') self.assertEqual(json.loads(on), {'42': 42}) + on = js_to_json('{/*comment\n*/42/*comment\n*/:/*comment\n*/42/*comment\n*/}') + self.assertEqual(json.loads(on), {'42': 42}) + def test_extract_attributes(self): self.assertEqual(extract_attributes(''), {'x': 'y'}) self.assertEqual(extract_attributes(""), {'x': 'y'}) @@ -802,7 +897,10 @@ class TestUtil(unittest.TestCase): self.assertEqual(parse_filesize('2 MiB'), 2097152) self.assertEqual(parse_filesize('5 GB'), 5000000000) self.assertEqual(parse_filesize('1.2Tb'), 1200000000000) + self.assertEqual(parse_filesize('1.2tb'), 1200000000000) self.assertEqual(parse_filesize('1,24 KB'), 1240) + self.assertEqual(parse_filesize('1,24 kb'), 1240) + self.assertEqual(parse_filesize('8.5 megabytes'), 8500000) def test_parse_count(self): self.assertEqual(parse_count(None), None) @@ -953,6 +1051,7 @@ The first line self.assertEqual(cli_option({'proxy': '127.0.0.1:3128'}, '--proxy', 'proxy'), ['--proxy', '127.0.0.1:3128']) self.assertEqual(cli_option({'proxy': None}, '--proxy', 'proxy'), []) self.assertEqual(cli_option({}, '--proxy', 'proxy'), []) + self.assertEqual(cli_option({'retries': 10}, '--retries', 'retries'), ['--retries', '10']) def test_cli_valueless_option(self): self.assertEqual(cli_valueless_option( @@ -1025,5 +1124,6 @@ The first line self.assertEqual(get_element_by_class('foo', html), 'nice') self.assertEqual(get_element_by_class('no-such-class', html), None) + if __name__ == '__main__': unittest.main() diff --git a/test/test_verbose_output.py b/test/test_verbose_output.py index 4c77df242..c1465fe8c 100644 --- a/test/test_verbose_output.py +++ b/test/test_verbose_output.py @@ -22,10 +22,10 @@ class TestVerboseOutput(unittest.TestCase): '--password', 'secret', ], cwd=rootDir, stdout=subprocess.PIPE, stderr=subprocess.PIPE) sout, serr = outp.communicate() - self.assertTrue('--username' in serr) - self.assertTrue('johnsmith' not in serr) - self.assertTrue('--password' in serr) - self.assertTrue('secret' not in serr) + self.assertTrue(b'--username' in serr) + self.assertTrue(b'johnsmith' not in serr) + self.assertTrue(b'--password' in serr) + self.assertTrue(b'secret' not in serr) def test_private_info_shortarg(self): outp = subprocess.Popen( @@ -35,10 +35,10 @@ class TestVerboseOutput(unittest.TestCase): '-p', 'secret', ], cwd=rootDir, stdout=subprocess.PIPE, stderr=subprocess.PIPE) sout, serr = outp.communicate() - self.assertTrue('-u' in serr) - self.assertTrue('johnsmith' not in serr) - self.assertTrue('-p' in serr) - self.assertTrue('secret' not in serr) + self.assertTrue(b'-u' in serr) + self.assertTrue(b'johnsmith' not in serr) + self.assertTrue(b'-p' in serr) + self.assertTrue(b'secret' not in serr) def test_private_info_eq(self): outp = subprocess.Popen( @@ -48,10 +48,10 @@ class TestVerboseOutput(unittest.TestCase): '--password=secret', ], cwd=rootDir, stdout=subprocess.PIPE, stderr=subprocess.PIPE) sout, serr = outp.communicate() - self.assertTrue('--username' in serr) - self.assertTrue('johnsmith' not in serr) - self.assertTrue('--password' in serr) - self.assertTrue('secret' not in serr) + self.assertTrue(b'--username' in serr) + self.assertTrue(b'johnsmith' not in serr) + self.assertTrue(b'--password' in serr) + self.assertTrue(b'secret' not in serr) def test_private_info_shortarg_eq(self): outp = subprocess.Popen( @@ -61,10 +61,11 @@ class TestVerboseOutput(unittest.TestCase): '-p=secret', ], cwd=rootDir, stdout=subprocess.PIPE, stderr=subprocess.PIPE) sout, serr = outp.communicate() - self.assertTrue('-u' in serr) - self.assertTrue('johnsmith' not in serr) - self.assertTrue('-p' in serr) - self.assertTrue('secret' not in serr) + self.assertTrue(b'-u' in serr) + self.assertTrue(b'johnsmith' not in serr) + self.assertTrue(b'-p' in serr) + self.assertTrue(b'secret' not in serr) + if __name__ == '__main__': unittest.main() diff --git a/test/test_write_annotations.py b/test/test_write_annotations.py index 8de08f2d6..41abdfe3b 100644 --- a/test/test_write_annotations.py +++ b/test/test_write_annotations.py @@ -24,6 +24,7 @@ class YoutubeDL(youtube_dl.YoutubeDL): super(YoutubeDL, self).__init__(*args, **kwargs) self.to_stderr = self.to_screen + params = get_params({ 'writeannotations': True, 'skip_download': True, @@ -74,5 +75,6 @@ class TestAnnotations(unittest.TestCase): def tearDown(self): try_rm(ANNOTATIONS_FILE) + if __name__ == '__main__': unittest.main() diff --git a/test/test_youtube_lists.py b/test/test_youtube_lists.py index af1c45421..7a33dbf88 100644 --- a/test/test_youtube_lists.py +++ b/test/test_youtube_lists.py @@ -66,5 +66,6 @@ class TestYoutubeLists(unittest.TestCase): for entry in result['entries']: self.assertTrue(entry.get('title')) + if __name__ == '__main__': unittest.main() diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index 060864434..f0c370eee 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -114,6 +114,7 @@ def make_tfunc(url, stype, sig_input, expected_sig): test_func.__name__ = str('test_signature_' + stype + '_' + test_id) setattr(TestSignature, test_func.__name__, test_func) + for test_spec in _TESTS: make_tfunc(*test_spec) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 6551f086f..a7bf5a1b0 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*- +# coding: utf-8 from __future__ import absolute_import, unicode_literals @@ -24,6 +24,7 @@ import sys import time import tokenize import traceback +import random from .compat import ( compat_basestring, @@ -131,6 +132,9 @@ class YoutubeDL(object): username: Username for authentication purposes. password: Password for authentication purposes. videopassword: Password for accessing a video. + ap_mso: Adobe Pass multiple-system operator identifier. + ap_username: Multiple-system operator account username. + ap_password: Multiple-system operator account password. usenetrc: Use netrc for authentication instead. verbose: Print additional info to stdout. quiet: Do not print messages to stdout. @@ -156,6 +160,7 @@ class YoutubeDL(object): playlistend: Playlist item to end at. playlist_items: Specific indices of playlist to download. playlistreverse: Download playlist items in reverse order. + playlistrandom: Download playlist items in random order. matchtitle: Download only matching titles. rejecttitle: Reject downloads for matching titles. logger: Log messages to a logging.Logger instance. @@ -249,7 +254,16 @@ class YoutubeDL(object): source_address: (Experimental) Client-side IP address to bind to. call_home: Boolean, true iff we are allowed to contact the youtube-dl servers for debugging. - sleep_interval: Number of seconds to sleep before each download. + sleep_interval: Number of seconds to sleep before each download when + used alone or a lower bound of a range for randomized + sleep before each download (minimum possible number + of seconds to sleep) when used along with + max_sleep_interval. + max_sleep_interval:Upper bound of a range for randomized sleep before each + download (maximum possible number of seconds to sleep). + Must only be used along with sleep_interval. + Actual sleep time will be a random float from range + [sleep_interval; max_sleep_interval]. listformats: Print an overview of available video formats and exit. list_thumbnails: Print a table of all thumbnails and exit. match_filter: A function that gets called with the info_dict of @@ -572,7 +586,7 @@ class YoutubeDL(object): if autonumber_size is None: autonumber_size = 5 autonumber_templ = '%0' + str(autonumber_size) + 'd' - template_dict['autonumber'] = autonumber_templ % self._num_downloads + template_dict['autonumber'] = autonumber_templ % (self.params.get('autonumber_start', 1) - 1 + self._num_downloads) if template_dict.get('playlist_index') is not None: template_dict['playlist_index'] = '%0*d' % (len(str(template_dict['n_entries'])), template_dict['playlist_index']) if template_dict.get('resolution') is None: @@ -830,6 +844,9 @@ class YoutubeDL(object): if self.params.get('playlistreverse', False): entries = entries[::-1] + if self.params.get('playlistrandom', False): + random.shuffle(entries) + for i, entry in enumerate(entries, 1): self.to_screen('[download] Downloading video %s of %s' % (i, n_entries)) extra = { @@ -1247,8 +1264,10 @@ class YoutubeDL(object): info_dict['thumbnails'] = thumbnails = [{'url': thumbnail}] if thumbnails: thumbnails.sort(key=lambda t: ( - t.get('preference'), t.get('width'), t.get('height'), - t.get('id'), t.get('url'))) + t.get('preference') if t.get('preference') is not None else -1, + t.get('width') if t.get('width') is not None else -1, + t.get('height') if t.get('height') is not None else -1, + t.get('id') if t.get('id') is not None else '', t.get('url'))) for i, t in enumerate(thumbnails): t['url'] = sanitize_url(t['url']) if t.get('width') and t.get('height'): @@ -1290,7 +1309,7 @@ class YoutubeDL(object): for subtitle_format in subtitle: if subtitle_format.get('url'): subtitle_format['url'] = sanitize_url(subtitle_format['url']) - if 'ext' not in subtitle_format: + if subtitle_format.get('ext') is None: subtitle_format['ext'] = determine_ext(subtitle_format['url']).lower() if self.params.get('listsubtitles', False): @@ -1325,7 +1344,7 @@ class YoutubeDL(object): format['format_id'] = compat_str(i) else: # Sanitize format_id from characters used in format selector expression - format['format_id'] = re.sub('[\s,/+\[\]()]', '_', format['format_id']) + format['format_id'] = re.sub(r'[\s,/+\[\]()]', '_', format['format_id']) format_id = format['format_id'] if format_id not in formats_dict: formats_dict[format_id] = [] @@ -1345,11 +1364,11 @@ class YoutubeDL(object): note=' ({0})'.format(format['format_note']) if format.get('format_note') is not None else '', ) # Automatically determine file extension if missing - if 'ext' not in format: + if format.get('ext') is None: format['ext'] = determine_ext(format['url']).lower() # Automatically determine protocol if missing (useful for format # selection purposes) - if 'protocol' not in format: + if format.get('protocol') is None: format['protocol'] = determine_protocol(format) # Add HTTP headers, so that external programs can use them from the # json output @@ -1594,7 +1613,9 @@ class YoutubeDL(object): self.to_screen('[info] Video subtitle %s.%s is already_present' % (sub_lang, sub_format)) else: self.to_screen('[info] Writing video subtitles to: ' + sub_filename) - with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8') as subfile: + # Use newline='' to prevent conversion of newline characters + # See https://github.com/rg3/youtube-dl/issues/10268 + with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8', newline='') as subfile: subfile.write(sub_data) except (OSError, IOError): self.report_error('Cannot write subtitles file ' + sub_filename) @@ -1642,7 +1663,7 @@ class YoutubeDL(object): video_ext, audio_ext = audio.get('ext'), video.get('ext') if video_ext and audio_ext: COMPATIBLE_EXTS = ( - ('mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v'), + ('mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma'), ('webm') ) for exts in COMPATIBLE_EXTS: diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 2b34bf9c2..5c5b8094b 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*- +# coding: utf-8 from __future__ import unicode_literals @@ -34,12 +34,14 @@ from .utils import ( setproctitle, std_headers, write_string, + render_table, ) from .update import update_self from .downloader import ( FileDownloader, ) from .extractor import gen_extractors, list_extractors +from .extractor.adobepass import MSO_INFO from .YoutubeDL import YoutubeDL @@ -93,8 +95,7 @@ def _real_main(argv=None): write_string('[debug] Batch file urls: ' + repr(batch_urls) + '\n') except IOError: sys.exit('ERROR: batch file could not be read') - all_urls = batch_urls + args - all_urls = [url.strip() for url in all_urls] + all_urls = batch_urls + [url.strip() for url in args] # batch_urls are already striped in read_batch_urls _enc = preferredencoding() all_urls = [url.decode(_enc, 'ignore') if isinstance(url, bytes) else url for url in all_urls] @@ -118,18 +119,32 @@ def _real_main(argv=None): desc += ' (Example: "%s%s:%s" )' % (ie.SEARCH_KEY, random.choice(_COUNTS), random.choice(_SEARCHES)) write_string(desc + '\n', out=sys.stdout) sys.exit(0) + if opts.ap_list_mso: + table = [[mso_id, mso_info['name']] for mso_id, mso_info in MSO_INFO.items()] + write_string('Supported TV Providers:\n' + render_table(['mso', 'mso name'], table) + '\n', out=sys.stdout) + sys.exit(0) # Conflicting, missing and erroneous options if opts.usenetrc and (opts.username is not None or opts.password is not None): parser.error('using .netrc conflicts with giving username/password') if opts.password is not None and opts.username is None: parser.error('account username missing\n') + if opts.ap_password is not None and opts.ap_username is None: + parser.error('TV Provider account username missing\n') if opts.outtmpl is not None and (opts.usetitle or opts.autonumber or opts.useid): parser.error('using output template conflicts with using title, video ID or auto number') + if opts.autonumber_size is not None: + if opts.autonumber_size <= 0: + parser.error('auto number size must be positive') + if opts.autonumber_start is not None: + if opts.autonumber_start < 0: + parser.error('auto number start must be positive or 0') if opts.usetitle and opts.useid: parser.error('using title conflicts with using video ID') if opts.username is not None and opts.password is None: opts.password = compat_getpass('Type account password and press [Return]: ') + if opts.ap_username is not None and opts.ap_password is None: + opts.ap_password = compat_getpass('Type TV provider account password and press [Return]: ') if opts.ratelimit is not None: numeric_limit = FileDownloader.parse_bytes(opts.ratelimit) if numeric_limit is None: @@ -145,6 +160,18 @@ def _real_main(argv=None): if numeric_limit is None: parser.error('invalid max_filesize specified') opts.max_filesize = numeric_limit + if opts.sleep_interval is not None: + if opts.sleep_interval < 0: + parser.error('sleep interval must be positive or 0') + if opts.max_sleep_interval is not None: + if opts.max_sleep_interval < 0: + parser.error('max sleep interval must be positive or 0') + if opts.max_sleep_interval < opts.sleep_interval: + parser.error('max sleep interval must be greater than or equal to min sleep interval') + else: + opts.max_sleep_interval = opts.sleep_interval + if opts.ap_mso and opts.ap_mso not in MSO_INFO: + parser.error('Unsupported TV Provider, use --ap-list-mso to get a list of supported TV Providers') def parse_retries(retries): if retries in ('inf', 'infinite'): @@ -244,8 +271,6 @@ def _real_main(argv=None): postprocessors.append({ 'key': 'FFmpegEmbedSubtitle', }) - if opts.xattrs: - postprocessors.append({'key': 'XAttrMetadata'}) if opts.embedthumbnail: already_have_thumbnail = opts.writethumbnail or opts.write_all_thumbnails postprocessors.append({ @@ -254,6 +279,10 @@ def _real_main(argv=None): }) if not already_have_thumbnail: opts.writethumbnail = True + # XAttrMetadataPP should be run after post-processors that may change file + # contents + if opts.xattrs: + postprocessors.append({'key': 'XAttrMetadata'}) # Please keep ExecAfterDownload towards the bottom as it allows the user to modify the final file in any way. # So if the user is able to remove the file before your postprocessor runs it might cause a few problems. if opts.exec_cmd: @@ -261,12 +290,6 @@ def _real_main(argv=None): 'key': 'ExecAfterDownload', 'exec_cmd': opts.exec_cmd, }) - if opts.xattr_set_filesize: - try: - import xattr - xattr # Confuse flake8 - except ImportError: - parser.error('setting filesize xattr requested but python-xattr is not available') external_downloader_args = None if opts.external_downloader_args: external_downloader_args = compat_shlex_split(opts.external_downloader_args) @@ -283,6 +306,9 @@ def _real_main(argv=None): 'password': opts.password, 'twofactor': opts.twofactor, 'videopassword': opts.videopassword, + 'ap_mso': opts.ap_mso, + 'ap_username': opts.ap_username, + 'ap_password': opts.ap_password, 'quiet': (opts.quiet or any_getting or any_printing), 'no_warnings': opts.no_warnings, 'forceurl': opts.geturl, @@ -301,6 +327,7 @@ def _real_main(argv=None): 'listformats': opts.listformats, 'outtmpl': outtmpl, 'autonumber_size': opts.autonumber_size, + 'autonumber_start': opts.autonumber_start, 'restrictfilenames': opts.restrictfilenames, 'ignoreerrors': opts.ignoreerrors, 'force_generic_extractor': opts.force_generic_extractor, @@ -308,6 +335,7 @@ def _real_main(argv=None): 'nooverwrites': opts.nooverwrites, 'retries': opts.retries, 'fragment_retries': opts.fragment_retries, + 'skip_unavailable_fragments': opts.skip_unavailable_fragments, 'buffersize': opts.buffersize, 'noresizebuffer': opts.noresizebuffer, 'continuedl': opts.continue_dl, @@ -316,6 +344,7 @@ def _real_main(argv=None): 'playliststart': opts.playliststart, 'playlistend': opts.playlistend, 'playlistreverse': opts.playlist_reverse, + 'playlistrandom': opts.playlist_random, 'noplaylist': opts.noplaylist, 'logtostderr': opts.outtmpl == '-', 'consoletitle': opts.consoletitle, @@ -370,6 +399,7 @@ def _real_main(argv=None): 'source_address': opts.source_address, 'call_home': opts.call_home, 'sleep_interval': opts.sleep_interval, + 'max_sleep_interval': opts.max_sleep_interval, 'external_downloader': opts.external_downloader, 'list_thumbnails': opts.list_thumbnails, 'playlist_items': opts.playlist_items, @@ -383,7 +413,7 @@ def _real_main(argv=None): 'postprocessor_args': postprocessor_args, 'cn_verification_proxy': opts.cn_verification_proxy, 'geo_verification_proxy': opts.geo_verification_proxy, - + 'config_location': opts.config_location, } with YoutubeDL(ydl_opts) as ydl: @@ -427,4 +457,5 @@ def main(argv=None): except KeyboardInterrupt: sys.exit('\nERROR: Interrupted by user') + __all__ = ['main', 'YoutubeDL', 'gen_extractors', 'list_extractors'] diff --git a/youtube_dl/aes.py b/youtube_dl/aes.py index a01c367de..b8ff45481 100644 --- a/youtube_dl/aes.py +++ b/youtube_dl/aes.py @@ -174,6 +174,7 @@ def aes_decrypt_text(data, password, key_size_bytes): return plaintext + RCON = (0x8d, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36) SBOX = (0x63, 0x7C, 0x77, 0x7B, 0xF2, 0x6B, 0x6F, 0xC5, 0x30, 0x01, 0x67, 0x2B, 0xFE, 0xD7, 0xAB, 0x76, 0xCA, 0x82, 0xC9, 0x7D, 0xFA, 0x59, 0x47, 0xF0, 0xAD, 0xD4, 0xA2, 0xAF, 0x9C, 0xA4, 0x72, 0xC0, @@ -328,4 +329,5 @@ def inc(data): break return data + __all__ = ['aes_encrypt', 'key_expansion', 'aes_ctr_decrypt', 'aes_cbc_decrypt', 'aes_decrypt_text'] diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index b8aaf5a46..718902019 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -2344,7 +2344,7 @@ try: from urllib.parse import unquote_plus as compat_urllib_parse_unquote_plus except ImportError: # Python 2 _asciire = (compat_urllib_parse._asciire if hasattr(compat_urllib_parse, '_asciire') - else re.compile('([\x00-\x7f]+)')) + else re.compile(r'([\x00-\x7f]+)')) # HACK: The following are the correct unquote_to_bytes, unquote and unquote_plus # implementations from cpython 3.4.3's stdlib. Python 2's version @@ -2491,6 +2491,7 @@ class _TreeBuilder(etree.TreeBuilder): def doctype(self, name, pubid, system): pass + if sys.version_info[0] >= 3: def compat_etree_fromstring(text): return etree.XML(text, parser=etree.XMLParser(target=_TreeBuilder())) @@ -2528,6 +2529,24 @@ else: el.text = el.text.decode('utf-8') return doc +if hasattr(etree, 'register_namespace'): + compat_etree_register_namespace = etree.register_namespace +else: + def compat_etree_register_namespace(prefix, uri): + """Register a namespace prefix. + The registry is global, and any existing mapping for either the + given prefix or the namespace URI will be removed. + *prefix* is the namespace prefix, *uri* is a namespace uri. Tags and + attributes in this namespace will be serialized with prefix if possible. + ValueError is raised if prefix is reserved or is invalid. + """ + if re.match(r"ns\d+$", prefix): + raise ValueError("Prefix format reserved for internal use") + for k, v in list(etree._namespace_map.items()): + if k == uri or v == prefix: + del etree._namespace_map[k] + etree._namespace_map[uri] = prefix + if sys.version_info < (2, 7): # Here comes the crazy part: In 2.6, if the xpath is a unicode, # .//node does not match if a node is a direct child of . ! @@ -2787,6 +2806,7 @@ def workaround_optparse_bug9161(): return real_add_option(self, *bargs, **bkwargs) optparse.OptionGroup.add_option = _compat_add_option + if hasattr(shutil, 'get_terminal_size'): # Python >= 3.3 compat_get_terminal_size = shutil.get_terminal_size else: @@ -2863,6 +2883,7 @@ __all__ = [ 'compat_cookiejar', 'compat_cookies', 'compat_etree_fromstring', + 'compat_etree_register_namespace', 'compat_expanduser', 'compat_get_terminal_size', 'compat_getenv', diff --git a/youtube_dl/downloader/__init__.py b/youtube_dl/downloader/__init__.py index 817591d97..16952e359 100644 --- a/youtube_dl/downloader/__init__.py +++ b/youtube_dl/downloader/__init__.py @@ -7,6 +7,7 @@ from .http import HttpFD from .rtmp import RtmpFD from .dash import DashSegmentsFD from .rtsp import RtspFD +from .ism import IsmFD from .external import ( get_external_downloader, FFmpegFD, @@ -24,6 +25,7 @@ PROTOCOL_MAP = { 'rtsp': RtspFD, 'f4m': F4mFD, 'http_dash_segments': DashSegmentsFD, + 'ism': IsmFD, } diff --git a/youtube_dl/downloader/common.py b/youtube_dl/downloader/common.py index 1dba9f49a..3dc144b4e 100644 --- a/youtube_dl/downloader/common.py +++ b/youtube_dl/downloader/common.py @@ -4,6 +4,7 @@ import os import re import sys import time +import random from ..compat import compat_os_name from ..utils import ( @@ -342,8 +343,10 @@ class FileDownloader(object): }) return True - sleep_interval = self.params.get('sleep_interval') - if sleep_interval: + min_sleep_interval = self.params.get('sleep_interval') + if min_sleep_interval: + max_sleep_interval = self.params.get('max_sleep_interval', min_sleep_interval) + sleep_interval = random.uniform(min_sleep_interval, max_sleep_interval) self.to_screen('[download] Sleeping %s seconds...' % sleep_interval) time.sleep(sleep_interval) diff --git a/youtube_dl/downloader/dash.py b/youtube_dl/downloader/dash.py index 8bbab9dbc..8437dde30 100644 --- a/youtube_dl/downloader/dash.py +++ b/youtube_dl/downloader/dash.py @@ -1,7 +1,6 @@ from __future__ import unicode_literals import os -import re from .fragment import FragmentFD from ..compat import compat_urllib_error @@ -19,32 +18,32 @@ class DashSegmentsFD(FragmentFD): FD_NAME = 'dashsegments' def real_download(self, filename, info_dict): - base_url = info_dict['url'] - segment_urls = [info_dict['segment_urls'][0]] if self.params.get('test', False) else info_dict['segment_urls'] - initialization_url = info_dict.get('initialization_url') + segments = info_dict['fragments'][:1] if self.params.get( + 'test', False) else info_dict['fragments'] ctx = { 'filename': filename, - 'total_frags': len(segment_urls) + (1 if initialization_url else 0), + 'total_frags': len(segments), } self._prepare_and_start_frag_download(ctx) - def combine_url(base_url, target_url): - if re.match(r'^https?://', target_url): - return target_url - return '%s%s%s' % (base_url, '' if base_url.endswith('/') else '/', target_url) - segments_filenames = [] fragment_retries = self.params.get('fragment_retries', 0) + skip_unavailable_fragments = self.params.get('skip_unavailable_fragments', True) - def append_url_to_file(target_url, tmp_filename, segment_name): + def process_segment(segment, tmp_filename, num): + segment_url = segment['url'] + segment_name = 'Frag%d' % num target_filename = '%s-%s' % (tmp_filename, segment_name) + # In DASH, the first segment contains necessary headers to + # generate a valid MP4 file, so always abort for the first segment + fatal = num == 0 or not skip_unavailable_fragments count = 0 while count <= fragment_retries: try: - success = ctx['dl'].download(target_filename, {'url': combine_url(base_url, target_url)}) + success = ctx['dl'].download(target_filename, {'url': segment_url}) if not success: return False down, target_sanitized = sanitize_open(target_filename, 'rb') @@ -52,26 +51,27 @@ class DashSegmentsFD(FragmentFD): down.close() segments_filenames.append(target_sanitized) break - except (compat_urllib_error.HTTPError, ) as err: + except compat_urllib_error.HTTPError as err: # YouTube may often return 404 HTTP error for a fragment causing the # whole download to fail. However if the same fragment is immediately # retried with the same request data this usually succeeds (1-2 attemps # is usually enough) thus allowing to download the whole file successfully. - # So, we will retry all fragments that fail with 404 HTTP error for now. - if err.code != 404: - raise - # Retry fragment + # To be future-proof we will retry all fragments that fail with any + # HTTP error. count += 1 if count <= fragment_retries: - self.report_retry_fragment(segment_name, count, fragment_retries) + self.report_retry_fragment(err, segment_name, count, fragment_retries) if count > fragment_retries: + if not fatal: + self.report_skip_fragment(segment_name) + return True self.report_error('giving up after %s fragment retries' % fragment_retries) return False + return True - if initialization_url: - append_url_to_file(initialization_url, ctx['tmpfilename'], 'Init') - for i, segment_url in enumerate(segment_urls): - append_url_to_file(segment_url, ctx['tmpfilename'], 'Seg%d' % i) + for i, segment in enumerate(segments): + if not process_segment(segment, ctx['tmpfilename'], i): + return False self._finish_frag_download(ctx) diff --git a/youtube_dl/downloader/external.py b/youtube_dl/downloader/external.py index fae245024..138f353ef 100644 --- a/youtube_dl/downloader/external.py +++ b/youtube_dl/downloader/external.py @@ -17,6 +17,7 @@ from ..utils import ( encodeArgument, handle_youtubedl_headers, check_executable, + is_outdated_version, ) @@ -96,6 +97,12 @@ class CurlFD(ExternalFD): cmd = [self.exe, '--location', '-o', tmpfilename] for key, val in info_dict['http_headers'].items(): cmd += ['--header', '%s: %s' % (key, val)] + cmd += self._bool_option('--continue-at', 'continuedl', '-', '0') + cmd += self._valueless_option('--silent', 'noprogress') + cmd += self._valueless_option('--verbose', 'verbose') + cmd += self._option('--limit-rate', 'ratelimit') + cmd += self._option('--retry', 'retries') + cmd += self._option('--max-filesize', 'max_filesize') cmd += self._option('--interface', 'source_address') cmd += self._option('--proxy', 'proxy') cmd += self._valueless_option('--insecure', 'nocheckcertificate') @@ -103,6 +110,16 @@ class CurlFD(ExternalFD): cmd += ['--', info_dict['url']] return cmd + def _call_downloader(self, tmpfilename, info_dict): + cmd = [encodeArgument(a) for a in self._make_cmd(tmpfilename, info_dict)] + + self._debug_cmd(cmd) + + # curl writes the progress to stderr so don't capture it. + p = subprocess.Popen(cmd) + p.communicate() + return p.returncode + class AxelFD(ExternalFD): AVAILABLE_OPT = '-V' @@ -204,6 +221,12 @@ class FFmpegFD(ExternalFD): if proxy: if not re.match(r'^[\da-zA-Z]+://', proxy): proxy = 'http://%s' % proxy + + if proxy.startswith('socks'): + self.report_warning( + '%s does not support SOCKS proxies. Downloading is likely to fail. ' + 'Consider adding --hls-prefer-native to your command.' % self.get_basename()) + # Since December 2015 ffmpeg supports -http_proxy option (see # http://git.videolan.org/?p=ffmpeg.git;a=commit;h=b4eb1f29ebddd60c41a2eb39f5af701e38e0d3fd) # We could switch to the following code if we are able to detect version properly @@ -242,7 +265,9 @@ class FFmpegFD(ExternalFD): if self.params.get('hls_use_mpegts', False) or tmpfilename == '-': args += ['-f', 'mpegts'] else: - args += ['-f', 'mp4', '-bsf:a', 'aac_adtstoasc'] + args += ['-f', 'mp4'] + if (ffpp.basename == 'ffmpeg' and is_outdated_version(ffpp._versions['ffmpeg'], '3.2')) and (not info_dict.get('acodec') or info_dict['acodec'].split('.')[0] in ('aac', 'mp4a')): + args += ['-bsf:a', 'aac_adtstoasc'] elif protocol == 'rtmp': args += ['-f', 'flv'] else: @@ -271,6 +296,7 @@ class FFmpegFD(ExternalFD): class AVconvFD(FFmpegFD): pass + _BY_NAME = dict( (klass.get_basename(), klass) for name, klass in globals().items() diff --git a/youtube_dl/downloader/f4m.py b/youtube_dl/downloader/f4m.py index 80c21d40b..688e086eb 100644 --- a/youtube_dl/downloader/f4m.py +++ b/youtube_dl/downloader/f4m.py @@ -314,7 +314,8 @@ class F4mFD(FragmentFD): man_url = info_dict['url'] requested_bitrate = info_dict.get('tbr') self.to_screen('[%s] Downloading f4m manifest' % self.FD_NAME) - urlh = self.ydl.urlopen(man_url) + + urlh = self.ydl.urlopen(self._prepare_url(info_dict, man_url)) man_url = urlh.geturl() # Some manifests may be malformed, e.g. prosiebensat1 generated manifests # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244 @@ -387,7 +388,10 @@ class F4mFD(FragmentFD): url_parsed = base_url_parsed._replace(path=base_url_parsed.path + name, query='&'.join(query)) frag_filename = '%s-%s' % (ctx['tmpfilename'], name) try: - success = ctx['dl'].download(frag_filename, {'url': url_parsed.geturl()}) + success = ctx['dl'].download(frag_filename, { + 'url': url_parsed.geturl(), + 'http_headers': info_dict.get('http_headers'), + }) if not success: return False (down, frag_sanitized) = sanitize_open(frag_filename, 'rb') diff --git a/youtube_dl/downloader/fragment.py b/youtube_dl/downloader/fragment.py index ba903ae10..60df627a6 100644 --- a/youtube_dl/downloader/fragment.py +++ b/youtube_dl/downloader/fragment.py @@ -6,8 +6,10 @@ import time from .common import FileDownloader from .http import HttpFD from ..utils import ( + error_to_compat_str, encodeFilename, sanitize_open, + sanitized_Request, ) @@ -22,13 +24,23 @@ class FragmentFD(FileDownloader): Available options: - fragment_retries: Number of times to retry a fragment for HTTP error (DASH only) + fragment_retries: Number of times to retry a fragment for HTTP error (DASH + and hlsnative only) + skip_unavailable_fragments: + Skip unavailable fragments (DASH and hlsnative only) """ - def report_retry_fragment(self, fragment_name, count, retries): + def report_retry_fragment(self, err, fragment_name, count, retries): self.to_screen( - '[download] Got server HTTP error. Retrying fragment %s (attempt %d of %s)...' - % (fragment_name, count, self.format_retries(retries))) + '[download] Got server HTTP error: %s. Retrying fragment %s (attempt %d of %s)...' + % (error_to_compat_str(err), fragment_name, count, self.format_retries(retries))) + + def report_skip_fragment(self, fragment_name): + self.to_screen('[download] Skipping fragment %s...' % fragment_name) + + def _prepare_url(self, info_dict, url): + headers = info_dict.get('http_headers') + return sanitized_Request(url, None, headers) if headers else url def _prepare_and_start_frag_download(self, ctx): self._prepare_frag_download(ctx) diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py index 3b7bb3508..4989abce1 100644 --- a/youtube_dl/downloader/hls.py +++ b/youtube_dl/downloader/hls.py @@ -13,6 +13,7 @@ from .fragment import FragmentFD from .external import FFmpegFD from ..compat import ( + compat_urllib_error, compat_urlparse, compat_struct_pack, ) @@ -20,6 +21,7 @@ from ..utils import ( encodeFilename, sanitize_open, parse_m3u8_attributes, + update_url_query, ) @@ -29,7 +31,7 @@ class HlsFD(FragmentFD): FD_NAME = 'hlsnative' @staticmethod - def can_download(manifest): + def can_download(manifest, info_dict): UNSUPPORTED_FEATURES = ( r'#EXT-X-KEY:METHOD=(?!NONE|AES-128)', # encrypted streams [1] r'#EXT-X-BYTERANGE', # playlists composed of byte ranges of media files [2] @@ -51,16 +53,21 @@ class HlsFD(FragmentFD): ) check_results = [not re.search(feature, manifest) for feature in UNSUPPORTED_FEATURES] check_results.append(can_decrypt_frag or '#EXT-X-KEY:METHOD=AES-128' not in manifest) + check_results.append(not info_dict.get('is_live')) return all(check_results) def real_download(self, filename, info_dict): man_url = info_dict['url'] self.to_screen('[%s] Downloading m3u8 manifest' % self.FD_NAME) - manifest = self.ydl.urlopen(man_url).read() + + manifest = self.ydl.urlopen(self._prepare_url(info_dict, man_url)).read() s = manifest.decode('utf-8', 'ignore') - if not self.can_download(s): + if not self.can_download(s, info_dict): + if info_dict.get('extra_param_to_segment_url'): + self.report_error('pycrypto not found. Please install it.') + return False self.report_warning( 'hlsnative has detected features it does not support, ' 'extraction will be delegated to ffmpeg') @@ -82,6 +89,14 @@ class HlsFD(FragmentFD): self._prepare_and_start_frag_download(ctx) + fragment_retries = self.params.get('fragment_retries', 0) + skip_unavailable_fragments = self.params.get('skip_unavailable_fragments', True) + test = self.params.get('test', False) + + extra_query = None + extra_param_to_segment_url = info_dict.get('extra_param_to_segment_url') + if extra_param_to_segment_url: + extra_query = compat_urlparse.parse_qs(extra_param_to_segment_url) i = 0 media_sequence = 0 decrypt_info = {'METHOD': 'NONE'} @@ -94,13 +109,40 @@ class HlsFD(FragmentFD): line if re.match(r'^https?://', line) else compat_urlparse.urljoin(man_url, line)) - frag_filename = '%s-Frag%d' % (ctx['tmpfilename'], i) - success = ctx['dl'].download(frag_filename, {'url': frag_url}) - if not success: + frag_name = 'Frag%d' % i + frag_filename = '%s-%s' % (ctx['tmpfilename'], frag_name) + if extra_query: + frag_url = update_url_query(frag_url, extra_query) + count = 0 + while count <= fragment_retries: + try: + success = ctx['dl'].download(frag_filename, { + 'url': frag_url, + 'http_headers': info_dict.get('http_headers'), + }) + if not success: + return False + down, frag_sanitized = sanitize_open(frag_filename, 'rb') + frag_content = down.read() + down.close() + break + except compat_urllib_error.HTTPError as err: + # Unavailable (possibly temporary) fragments may be served. + # First we try to retry then either skip or abort. + # See https://github.com/rg3/youtube-dl/issues/10165, + # https://github.com/rg3/youtube-dl/issues/10448). + count += 1 + if count <= fragment_retries: + self.report_retry_fragment(err, frag_name, count, fragment_retries) + if count > fragment_retries: + if skip_unavailable_fragments: + i += 1 + media_sequence += 1 + self.report_skip_fragment(frag_name) + continue + self.report_error( + 'giving up after %s fragment retries' % fragment_retries) return False - down, frag_sanitized = sanitize_open(frag_filename, 'rb') - frag_content = down.read() - down.close() if decrypt_info['METHOD'] == 'AES-128': iv = decrypt_info.get('IV') or compat_struct_pack('>8xq', media_sequence) frag_content = AES.new( @@ -108,7 +150,7 @@ class HlsFD(FragmentFD): ctx['dest_stream'].write(frag_content) frags_filenames.append(frag_sanitized) # We only download the first fragment during the test - if self.params.get('test', False): + if test: break i += 1 media_sequence += 1 @@ -116,10 +158,12 @@ class HlsFD(FragmentFD): decrypt_info = parse_m3u8_attributes(line[11:]) if decrypt_info['METHOD'] == 'AES-128': if 'IV' in decrypt_info: - decrypt_info['IV'] = binascii.unhexlify(decrypt_info['IV'][2:]) + decrypt_info['IV'] = binascii.unhexlify(decrypt_info['IV'][2:].zfill(32)) if not re.match(r'^https?://', decrypt_info['URI']): decrypt_info['URI'] = compat_urlparse.urljoin( man_url, decrypt_info['URI']) + if extra_query: + decrypt_info['URI'] = update_url_query(decrypt_info['URI'], extra_query) decrypt_info['KEY'] = self.ydl.urlopen(decrypt_info['URI']).read() elif line.startswith('#EXT-X-MEDIA-SEQUENCE'): media_sequence = int(line[22:]) diff --git a/youtube_dl/downloader/http.py b/youtube_dl/downloader/http.py index f8b69d186..af405b950 100644 --- a/youtube_dl/downloader/http.py +++ b/youtube_dl/downloader/http.py @@ -13,6 +13,9 @@ from ..utils import ( encodeFilename, sanitize_open, sanitized_Request, + write_xattr, + XAttrMetadataError, + XAttrUnavailableError, ) @@ -179,9 +182,8 @@ class HttpFD(FileDownloader): if self.params.get('xattr_set_filesize', False) and data_len is not None: try: - import xattr - xattr.setxattr(tmpfilename, 'user.ytdl.filesize', str(data_len)) - except(OSError, IOError, ImportError) as err: + write_xattr(tmpfilename, 'user.ytdl.filesize', str(data_len).encode('utf-8')) + except (XAttrUnavailableError, XAttrMetadataError) as err: self.report_error('unable to set filesize xattr: %s' % str(err)) try: diff --git a/youtube_dl/downloader/ism.py b/youtube_dl/downloader/ism.py new file mode 100644 index 000000000..93cac5e98 --- /dev/null +++ b/youtube_dl/downloader/ism.py @@ -0,0 +1,271 @@ +from __future__ import unicode_literals + +import os +import time +import struct +import binascii +import io + +from .fragment import FragmentFD +from ..compat import compat_urllib_error +from ..utils import ( + sanitize_open, + encodeFilename, +) + + +u8 = struct.Struct(b'>B') +u88 = struct.Struct(b'>Bx') +u16 = struct.Struct(b'>H') +u1616 = struct.Struct(b'>Hxx') +u32 = struct.Struct(b'>I') +u64 = struct.Struct(b'>Q') + +s88 = struct.Struct(b'>bx') +s16 = struct.Struct(b'>h') +s1616 = struct.Struct(b'>hxx') +s32 = struct.Struct(b'>i') + +unity_matrix = (s32.pack(0x10000) + s32.pack(0) * 3) * 2 + s32.pack(0x40000000) + +TRACK_ENABLED = 0x1 +TRACK_IN_MOVIE = 0x2 +TRACK_IN_PREVIEW = 0x4 + +SELF_CONTAINED = 0x1 + + +def box(box_type, payload): + return u32.pack(8 + len(payload)) + box_type + payload + + +def full_box(box_type, version, flags, payload): + return box(box_type, u8.pack(version) + u32.pack(flags)[1:] + payload) + + +def write_piff_header(stream, params): + track_id = params['track_id'] + fourcc = params['fourcc'] + duration = params['duration'] + timescale = params.get('timescale', 10000000) + language = params.get('language', 'und') + height = params.get('height', 0) + width = params.get('width', 0) + is_audio = width == 0 and height == 0 + creation_time = modification_time = int(time.time()) + + ftyp_payload = b'isml' # major brand + ftyp_payload += u32.pack(1) # minor version + ftyp_payload += b'piff' + b'iso2' # compatible brands + stream.write(box(b'ftyp', ftyp_payload)) # File Type Box + + mvhd_payload = u64.pack(creation_time) + mvhd_payload += u64.pack(modification_time) + mvhd_payload += u32.pack(timescale) + mvhd_payload += u64.pack(duration) + mvhd_payload += s1616.pack(1) # rate + mvhd_payload += s88.pack(1) # volume + mvhd_payload += u16.pack(0) # reserved + mvhd_payload += u32.pack(0) * 2 # reserved + mvhd_payload += unity_matrix + mvhd_payload += u32.pack(0) * 6 # pre defined + mvhd_payload += u32.pack(0xffffffff) # next track id + moov_payload = full_box(b'mvhd', 1, 0, mvhd_payload) # Movie Header Box + + tkhd_payload = u64.pack(creation_time) + tkhd_payload += u64.pack(modification_time) + tkhd_payload += u32.pack(track_id) # track id + tkhd_payload += u32.pack(0) # reserved + tkhd_payload += u64.pack(duration) + tkhd_payload += u32.pack(0) * 2 # reserved + tkhd_payload += s16.pack(0) # layer + tkhd_payload += s16.pack(0) # alternate group + tkhd_payload += s88.pack(1 if is_audio else 0) # volume + tkhd_payload += u16.pack(0) # reserved + tkhd_payload += unity_matrix + tkhd_payload += u1616.pack(width) + tkhd_payload += u1616.pack(height) + trak_payload = full_box(b'tkhd', 1, TRACK_ENABLED | TRACK_IN_MOVIE | TRACK_IN_PREVIEW, tkhd_payload) # Track Header Box + + mdhd_payload = u64.pack(creation_time) + mdhd_payload += u64.pack(modification_time) + mdhd_payload += u32.pack(timescale) + mdhd_payload += u64.pack(duration) + mdhd_payload += u16.pack(((ord(language[0]) - 0x60) << 10) | ((ord(language[1]) - 0x60) << 5) | (ord(language[2]) - 0x60)) + mdhd_payload += u16.pack(0) # pre defined + mdia_payload = full_box(b'mdhd', 1, 0, mdhd_payload) # Media Header Box + + hdlr_payload = u32.pack(0) # pre defined + hdlr_payload += b'soun' if is_audio else b'vide' # handler type + hdlr_payload += u32.pack(0) * 3 # reserved + hdlr_payload += (b'Sound' if is_audio else b'Video') + b'Handler\0' # name + mdia_payload += full_box(b'hdlr', 0, 0, hdlr_payload) # Handler Reference Box + + if is_audio: + smhd_payload = s88.pack(0) # balance + smhd_payload = u16.pack(0) # reserved + media_header_box = full_box(b'smhd', 0, 0, smhd_payload) # Sound Media Header + else: + vmhd_payload = u16.pack(0) # graphics mode + vmhd_payload += u16.pack(0) * 3 # opcolor + media_header_box = full_box(b'vmhd', 0, 1, vmhd_payload) # Video Media Header + minf_payload = media_header_box + + dref_payload = u32.pack(1) # entry count + dref_payload += full_box(b'url ', 0, SELF_CONTAINED, b'') # Data Entry URL Box + dinf_payload = full_box(b'dref', 0, 0, dref_payload) # Data Reference Box + minf_payload += box(b'dinf', dinf_payload) # Data Information Box + + stsd_payload = u32.pack(1) # entry count + + sample_entry_payload = u8.pack(0) * 6 # reserved + sample_entry_payload += u16.pack(1) # data reference index + if is_audio: + sample_entry_payload += u32.pack(0) * 2 # reserved + sample_entry_payload += u16.pack(params.get('channels', 2)) + sample_entry_payload += u16.pack(params.get('bits_per_sample', 16)) + sample_entry_payload += u16.pack(0) # pre defined + sample_entry_payload += u16.pack(0) # reserved + sample_entry_payload += u1616.pack(params['sampling_rate']) + + if fourcc == 'AACL': + sample_entry_box = box(b'mp4a', sample_entry_payload) + else: + sample_entry_payload = sample_entry_payload + sample_entry_payload += u16.pack(0) # pre defined + sample_entry_payload += u16.pack(0) # reserved + sample_entry_payload += u32.pack(0) * 3 # pre defined + sample_entry_payload += u16.pack(width) + sample_entry_payload += u16.pack(height) + sample_entry_payload += u1616.pack(0x48) # horiz resolution 72 dpi + sample_entry_payload += u1616.pack(0x48) # vert resolution 72 dpi + sample_entry_payload += u32.pack(0) # reserved + sample_entry_payload += u16.pack(1) # frame count + sample_entry_payload += u8.pack(0) * 32 # compressor name + sample_entry_payload += u16.pack(0x18) # depth + sample_entry_payload += s16.pack(-1) # pre defined + + codec_private_data = binascii.unhexlify(params['codec_private_data']) + if fourcc in ('H264', 'AVC1'): + sps, pps = codec_private_data.split(u32.pack(1))[1:] + avcc_payload = u8.pack(1) # configuration version + avcc_payload += sps[1:4] # avc profile indication + profile compatibility + avc level indication + avcc_payload += u8.pack(0xfc | (params.get('nal_unit_length_field', 4) - 1)) # complete represenation (1) + reserved (11111) + length size minus one + avcc_payload += u8.pack(1) # reserved (0) + number of sps (0000001) + avcc_payload += u16.pack(len(sps)) + avcc_payload += sps + avcc_payload += u8.pack(1) # number of pps + avcc_payload += u16.pack(len(pps)) + avcc_payload += pps + sample_entry_payload += box(b'avcC', avcc_payload) # AVC Decoder Configuration Record + sample_entry_box = box(b'avc1', sample_entry_payload) # AVC Simple Entry + stsd_payload += sample_entry_box + + stbl_payload = full_box(b'stsd', 0, 0, stsd_payload) # Sample Description Box + + stts_payload = u32.pack(0) # entry count + stbl_payload += full_box(b'stts', 0, 0, stts_payload) # Decoding Time to Sample Box + + stsc_payload = u32.pack(0) # entry count + stbl_payload += full_box(b'stsc', 0, 0, stsc_payload) # Sample To Chunk Box + + stco_payload = u32.pack(0) # entry count + stbl_payload += full_box(b'stco', 0, 0, stco_payload) # Chunk Offset Box + + minf_payload += box(b'stbl', stbl_payload) # Sample Table Box + + mdia_payload += box(b'minf', minf_payload) # Media Information Box + + trak_payload += box(b'mdia', mdia_payload) # Media Box + + moov_payload += box(b'trak', trak_payload) # Track Box + + mehd_payload = u64.pack(duration) + mvex_payload = full_box(b'mehd', 1, 0, mehd_payload) # Movie Extends Header Box + + trex_payload = u32.pack(track_id) # track id + trex_payload += u32.pack(1) # default sample description index + trex_payload += u32.pack(0) # default sample duration + trex_payload += u32.pack(0) # default sample size + trex_payload += u32.pack(0) # default sample flags + mvex_payload += full_box(b'trex', 0, 0, trex_payload) # Track Extends Box + + moov_payload += box(b'mvex', mvex_payload) # Movie Extends Box + stream.write(box(b'moov', moov_payload)) # Movie Box + + +def extract_box_data(data, box_sequence): + data_reader = io.BytesIO(data) + while True: + box_size = u32.unpack(data_reader.read(4))[0] + box_type = data_reader.read(4) + if box_type == box_sequence[0]: + box_data = data_reader.read(box_size - 8) + if len(box_sequence) == 1: + return box_data + return extract_box_data(box_data, box_sequence[1:]) + data_reader.seek(box_size - 8, 1) + + +class IsmFD(FragmentFD): + """ + Download segments in a ISM manifest + """ + + FD_NAME = 'ism' + + def real_download(self, filename, info_dict): + segments = info_dict['fragments'][:1] if self.params.get( + 'test', False) else info_dict['fragments'] + + ctx = { + 'filename': filename, + 'total_frags': len(segments), + } + + self._prepare_and_start_frag_download(ctx) + + segments_filenames = [] + + fragment_retries = self.params.get('fragment_retries', 0) + skip_unavailable_fragments = self.params.get('skip_unavailable_fragments', True) + + track_written = False + for i, segment in enumerate(segments): + segment_url = segment['url'] + segment_name = 'Frag%d' % i + target_filename = '%s-%s' % (ctx['tmpfilename'], segment_name) + count = 0 + while count <= fragment_retries: + try: + success = ctx['dl'].download(target_filename, {'url': segment_url}) + if not success: + return False + down, target_sanitized = sanitize_open(target_filename, 'rb') + down_data = down.read() + if not track_written: + tfhd_data = extract_box_data(down_data, [b'moof', b'traf', b'tfhd']) + info_dict['_download_params']['track_id'] = u32.unpack(tfhd_data[4:8])[0] + write_piff_header(ctx['dest_stream'], info_dict['_download_params']) + track_written = True + ctx['dest_stream'].write(down_data) + down.close() + segments_filenames.append(target_sanitized) + break + except compat_urllib_error.HTTPError as err: + count += 1 + if count <= fragment_retries: + self.report_retry_fragment(err, segment_name, count, fragment_retries) + if count > fragment_retries: + if skip_unavailable_fragments: + self.report_skip_fragment(segment_name) + continue + self.report_error('giving up after %s fragment retries' % fragment_retries) + return False + + self._finish_frag_download(ctx) + + for segment_file in segments_filenames: + os.remove(encodeFilename(segment_file)) + + return True diff --git a/youtube_dl/extractor/abc.py b/youtube_dl/extractor/abc.py index b584277be..0247cabf9 100644 --- a/youtube_dl/extractor/abc.py +++ b/youtube_dl/extractor/abc.py @@ -7,12 +7,13 @@ from ..utils import ( ExtractorError, js_to_json, int_or_none, + parse_iso8601, ) class ABCIE(InfoExtractor): IE_NAME = 'abc.net.au' - _VALID_URL = r'https?://www\.abc\.net\.au/news/(?:[^/]+/){1,2}(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?abc\.net\.au/news/(?:[^/]+/){1,2}(?P\d+)' _TESTS = [{ 'url': 'http://www.abc.net.au/news/2014-11-05/australia-to-staff-ebola-treatment-centre-in-sierra-leone/5868334', @@ -93,3 +94,59 @@ class ABCIE(InfoExtractor): 'description': self._og_search_description(webpage), 'thumbnail': self._og_search_thumbnail(webpage), } + + +class ABCIViewIE(InfoExtractor): + IE_NAME = 'abc.net.au:iview' + _VALID_URL = r'https?://iview\.abc\.net\.au/programs/[^/]+/(?P[^/?#]+)' + + # ABC iview programs are normally available for 14 days only. + _TESTS = [{ + 'url': 'http://iview.abc.net.au/programs/diaries-of-a-broken-mind/ZX9735A001S00', + 'md5': 'cde42d728b3b7c2b32b1b94b4a548afc', + 'info_dict': { + 'id': 'ZX9735A001S00', + 'ext': 'mp4', + 'title': 'Diaries Of A Broken Mind', + 'description': 'md5:7de3903874b7a1be279fe6b68718fc9e', + 'upload_date': '20161010', + 'uploader_id': 'abc2', + 'timestamp': 1476064920, + }, + 'skip': 'Video gone', + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + video_params = self._parse_json(self._search_regex( + r'videoParams\s*=\s*({.+?});', webpage, 'video params'), video_id) + title = video_params.get('title') or video_params['seriesTitle'] + stream = next(s for s in video_params['playlist'] if s.get('type') == 'program') + + formats = self._extract_akamai_formats(stream['hds-unmetered'], video_id) + self._sort_formats(formats) + + subtitles = {} + src_vtt = stream.get('captions', {}).get('src-vtt') + if src_vtt: + subtitles['en'] = [{ + 'url': src_vtt, + 'ext': 'vtt', + }] + + return { + 'id': video_id, + 'title': title, + 'description': self._html_search_meta(['og:description', 'twitter:description'], webpage), + 'thumbnail': self._html_search_meta(['og:image', 'twitter:image:src'], webpage), + 'duration': int_or_none(video_params.get('eventDuration')), + 'timestamp': parse_iso8601(video_params.get('pubDate'), ' '), + 'series': video_params.get('seriesTitle'), + 'series_id': video_params.get('seriesHouseNumber') or video_id[:7], + 'episode_number': int_or_none(self._html_search_meta('episodeNumber', webpage, default=None)), + 'episode': self._html_search_meta('episode_title', webpage, default=None), + 'uploader_id': video_params.get('channel'), + 'formats': formats, + 'subtitles': subtitles, + } diff --git a/youtube_dl/extractor/abcnews.py b/youtube_dl/extractor/abcnews.py index b61a6327c..4f56c4c11 100644 --- a/youtube_dl/extractor/abcnews.py +++ b/youtube_dl/extractor/abcnews.py @@ -12,7 +12,7 @@ from ..compat import compat_urlparse class AbcNewsVideoIE(AMPIE): IE_NAME = 'abcnews:video' - _VALID_URL = 'http://abcnews.go.com/[^/]+/video/(?P[0-9a-z-]+)-(?P\d+)' + _VALID_URL = r'https?://abcnews\.go\.com/[^/]+/video/(?P[0-9a-z-]+)-(?P\d+)' _TESTS = [{ 'url': 'http://abcnews.go.com/ThisWeek/video/week-exclusive-irans-foreign-minister-zarif-20411932', @@ -23,7 +23,7 @@ class AbcNewsVideoIE(AMPIE): 'title': '\'This Week\' Exclusive: Iran\'s Foreign Minister Zarif', 'description': 'George Stephanopoulos goes one-on-one with Iranian Foreign Minister Dr. Javad Zarif.', 'duration': 180, - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', }, 'params': { # m3u8 download @@ -49,7 +49,7 @@ class AbcNewsVideoIE(AMPIE): class AbcNewsIE(InfoExtractor): IE_NAME = 'abcnews' - _VALID_URL = 'https?://abcnews\.go\.com/(?:[^/]+/)+(?P[0-9a-z-]+)/story\?id=(?P\d+)' + _VALID_URL = r'https?://abcnews\.go\.com/(?:[^/]+/)+(?P[0-9a-z-]+)/story\?id=(?P\d+)' _TESTS = [{ 'url': 'http://abcnews.go.com/Blotter/News/dramatic-video-rare-death-job-america/story?id=10498713#.UIhwosWHLjY', @@ -59,7 +59,7 @@ class AbcNewsIE(InfoExtractor): 'display_id': 'dramatic-video-rare-death-job-america', 'title': 'Occupational Hazards', 'description': 'Nightline investigates the dangers that lurk at various jobs.', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'upload_date': '20100428', 'timestamp': 1272412800, }, diff --git a/youtube_dl/extractor/abc7news.py b/youtube_dl/extractor/abcotvs.py similarity index 51% rename from youtube_dl/extractor/abc7news.py rename to youtube_dl/extractor/abcotvs.py index c04949c21..76e98132b 100644 --- a/youtube_dl/extractor/abc7news.py +++ b/youtube_dl/extractor/abcotvs.py @@ -1,13 +1,19 @@ +# coding: utf-8 from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import parse_iso8601 +from ..utils import ( + int_or_none, + parse_iso8601, +) -class Abc7NewsIE(InfoExtractor): - _VALID_URL = r'https?://abc7news\.com(?:/[^/]+/(?P[^/]+))?/(?P\d+)' +class ABCOTVSIE(InfoExtractor): + IE_NAME = 'abcotvs' + IE_DESC = 'ABC Owned Television Stations' + _VALID_URL = r'https?://(?:abc(?:7(?:news|ny|chicago)?|11|13|30)|6abc)\.com(?:/[^/]+/(?P[^/]+))?/(?P\d+)' _TESTS = [ { 'url': 'http://abc7news.com/entertainment/east-bay-museum-celebrates-vintage-synthesizers/472581/', @@ -15,9 +21,9 @@ class Abc7NewsIE(InfoExtractor): 'id': '472581', 'display_id': 'east-bay-museum-celebrates-vintage-synthesizers', 'ext': 'mp4', - 'title': 'East Bay museum celebrates history of synthesized music', + 'title': 'East Bay museum celebrates vintage synthesizers', 'description': 'md5:a4f10fb2f2a02565c1749d4adbab4b10', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'timestamp': 1421123075, 'upload_date': '20150113', 'uploader': 'Jonathan Bloom', @@ -41,7 +47,7 @@ class Abc7NewsIE(InfoExtractor): webpage = self._download_webpage(url, display_id) m3u8 = self._html_search_meta( - 'contentURL', webpage, 'm3u8 url', fatal=True) + 'contentURL', webpage, 'm3u8 url', fatal=True).split('?')[0] formats = self._extract_m3u8_formats(m3u8, display_id, 'mp4') self._sort_formats(formats) @@ -66,3 +72,41 @@ class Abc7NewsIE(InfoExtractor): 'uploader': uploader, 'formats': formats, } + + +class ABCOTVSClipsIE(InfoExtractor): + IE_NAME = 'abcotvs:clips' + _VALID_URL = r'https?://clips\.abcotvs\.com/(?:[^/]+/)*video/(?P\d+)' + _TEST = { + 'url': 'https://clips.abcotvs.com/kabc/video/214814', + 'info_dict': { + 'id': '214814', + 'ext': 'mp4', + 'title': 'SpaceX launch pad explosion destroys rocket, satellite', + 'description': 'md5:9f186e5ad8f490f65409965ee9c7be1b', + 'upload_date': '20160901', + 'timestamp': 1472756695, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + video_data = self._download_json('https://clips.abcotvs.com/vogo/video/getByIds?ids=' + video_id, video_id)['results'][0] + title = video_data['title'] + formats = self._extract_m3u8_formats( + video_data['videoURL'].split('?')[0], video_id, 'mp4') + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': video_data.get('description'), + 'thumbnail': video_data.get('thumbnailURL'), + 'duration': int_or_none(video_data.get('duration')), + 'timestamp': int_or_none(video_data.get('pubDate')), + 'formats': formats, + } diff --git a/youtube_dl/extractor/acast.py b/youtube_dl/extractor/acast.py index 94ce88c83..6dace3051 100644 --- a/youtube_dl/extractor/acast.py +++ b/youtube_dl/extractor/acast.py @@ -8,6 +8,7 @@ from .common import InfoExtractor from ..compat import compat_str from ..utils import ( int_or_none, + parse_iso8601, OnDemandPagedList, ) @@ -15,18 +16,33 @@ from ..utils import ( class ACastIE(InfoExtractor): IE_NAME = 'acast' _VALID_URL = r'https?://(?:www\.)?acast\.com/(?P[^/]+)/(?P[^/#?]+)' - _TEST = { + _TESTS = [{ + # test with one bling 'url': 'https://www.acast.com/condenasttraveler/-where-are-you-taipei-101-taiwan', 'md5': 'ada3de5a1e3a2a381327d749854788bb', 'info_dict': { 'id': '57de3baa-4bb0-487e-9418-2692c1277a34', 'ext': 'mp3', 'title': '"Where Are You?": Taipei 101, Taiwan', - 'timestamp': 1196172000000, + 'timestamp': 1196172000, + 'upload_date': '20071127', 'description': 'md5:a0b4ef3634e63866b542e5b1199a1a0e', 'duration': 211, } - } + }, { + # test with multiple blings + 'url': 'https://www.acast.com/sparpodcast/2.raggarmordet-rosterurdetforflutna', + 'md5': '55c0097badd7095f494c99a172f86501', + 'info_dict': { + 'id': '2a92b283-1a75-4ad8-8396-499c641de0d9', + 'ext': 'mp3', + 'title': '2. Raggarmordet - Röster ur det förflutna', + 'timestamp': 1477346700, + 'upload_date': '20161024', + 'description': 'md5:4f81f6d8cf2e12ee21a321d8bca32db4', + 'duration': 2797, + } + }] def _real_extract(self, url): channel, display_id = re.match(self._VALID_URL, url).groups() @@ -35,11 +51,11 @@ class ACastIE(InfoExtractor): return { 'id': compat_str(cast_data['id']), 'display_id': display_id, - 'url': cast_data['blings'][0]['audio'], + 'url': [b['audio'] for b in cast_data['blings'] if b['type'] == 'BlingAudio'][0], 'title': cast_data['name'], 'description': cast_data.get('description'), 'thumbnail': cast_data.get('image'), - 'timestamp': int_or_none(cast_data.get('publishingDate')), + 'timestamp': parse_iso8601(cast_data.get('publishingDate')), 'duration': int_or_none(cast_data.get('duration')), } diff --git a/youtube_dl/extractor/adobepass.py b/youtube_dl/extractor/adobepass.py new file mode 100644 index 000000000..12eeab271 --- /dev/null +++ b/youtube_dl/extractor/adobepass.py @@ -0,0 +1,1472 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re +import time +import xml.etree.ElementTree as etree + +from .common import InfoExtractor +from ..compat import compat_urlparse +from ..utils import ( + unescapeHTML, + urlencode_postdata, + unified_timestamp, + ExtractorError, +) + + +MSO_INFO = { + 'DTV': { + 'name': 'DIRECTV', + 'username_field': 'username', + 'password_field': 'password', + }, + 'Rogers': { + 'name': 'Rogers', + 'username_field': 'UserName', + 'password_field': 'UserPassword', + }, + 'Comcast_SSO': { + 'name': 'Comcast XFINITY', + 'username_field': 'user', + 'password_field': 'passwd', + }, + 'thr030': { + 'name': '3 Rivers Communications' + }, + 'com140': { + 'name': 'Access Montana' + }, + 'acecommunications': { + 'name': 'AcenTek' + }, + 'acm010': { + 'name': 'Acme Communications' + }, + 'ada020': { + 'name': 'Adams Cable Service' + }, + 'alb020': { + 'name': 'Albany Mutual Telephone' + }, + 'algona': { + 'name': 'Algona Municipal Utilities' + }, + 'allwest': { + 'name': 'All West Communications' + }, + 'all025': { + 'name': 'Allen\'s Communications' + }, + 'spl010': { + 'name': 'Alliance Communications' + }, + 'all070': { + 'name': 'ALLO Communications' + }, + 'alpine': { + 'name': 'Alpine Communications' + }, + 'hun015': { + 'name': 'American Broadband' + }, + 'nwc010': { + 'name': 'American Broadband Missouri' + }, + 'com130-02': { + 'name': 'American Community Networks' + }, + 'com130-01': { + 'name': 'American Warrior Networks' + }, + 'tom020': { + 'name': 'Amherst Telephone/Tomorrow Valley' + }, + 'tvc020': { + 'name': 'Andycable' + }, + 'arkwest': { + 'name': 'Arkwest Communications' + }, + 'art030': { + 'name': 'Arthur Mutual Telephone Company' + }, + 'arvig': { + 'name': 'Arvig' + }, + 'nttcash010': { + 'name': 'Ashland Home Net' + }, + 'astound': { + 'name': 'Astound (now Wave)' + }, + 'dix030': { + 'name': 'ATC Broadband' + }, + 'ara010': { + 'name': 'ATC Communications' + }, + 'she030-02': { + 'name': 'Ayersville Communications' + }, + 'baldwin': { + 'name': 'Baldwin Lightstream' + }, + 'bal040': { + 'name': 'Ballard TV' + }, + 'cit025': { + 'name': 'Bardstown Cable TV' + }, + 'bay030': { + 'name': 'Bay Country Communications' + }, + 'tel095': { + 'name': 'Beaver Creek Cooperative Telephone' + }, + 'bea020': { + 'name': 'Beaver Valley Cable' + }, + 'bee010': { + 'name': 'Bee Line Cable' + }, + 'wir030': { + 'name': 'Beehive Broadband' + }, + 'bra020': { + 'name': 'BELD' + }, + 'bel020': { + 'name': 'Bellevue Municipal Cable' + }, + 'vol040-01': { + 'name': 'Ben Lomand Connect / BLTV' + }, + 'bev010': { + 'name': 'BEVCOMM' + }, + 'big020': { + 'name': 'Big Sandy Broadband' + }, + 'ble020': { + 'name': 'Bledsoe Telephone Cooperative' + }, + 'bvt010': { + 'name': 'Blue Valley Tele-Communications' + }, + 'bra050': { + 'name': 'Brandenburg Telephone Co.' + }, + 'bte010': { + 'name': 'Bristol Tennessee Essential Services' + }, + 'annearundel': { + 'name': 'Broadstripe' + }, + 'btc010': { + 'name': 'BTC Communications' + }, + 'btc040': { + 'name': 'BTC Vision - Nahunta' + }, + 'bul010': { + 'name': 'Bulloch Telephone Cooperative' + }, + 'but010': { + 'name': 'Butler-Bremer Communications' + }, + 'tel160-csp': { + 'name': 'C Spire SNAP' + }, + 'csicable': { + 'name': 'Cable Services Inc.' + }, + 'cableamerica': { + 'name': 'CableAmerica' + }, + 'cab038': { + 'name': 'CableSouth Media 3' + }, + 'weh010-camtel': { + 'name': 'Cam-Tel Company' + }, + 'car030': { + 'name': 'Cameron Communications' + }, + 'canbytel': { + 'name': 'Canby Telcom' + }, + 'crt020': { + 'name': 'CapRock Tv' + }, + 'car050': { + 'name': 'Carnegie Cable' + }, + 'cas': { + 'name': 'CAS Cable' + }, + 'casscomm': { + 'name': 'CASSCOMM' + }, + 'mid180-02': { + 'name': 'Catalina Broadband Solutions' + }, + 'cccomm': { + 'name': 'CC Communications' + }, + 'nttccde010': { + 'name': 'CDE Lightband' + }, + 'cfunet': { + 'name': 'Cedar Falls Utilities' + }, + 'dem010-01': { + 'name': 'Celect-Bloomer Telephone Area' + }, + 'dem010-02': { + 'name': 'Celect-Bruce Telephone Area' + }, + 'dem010-03': { + 'name': 'Celect-Citizens Connected Area' + }, + 'dem010-04': { + 'name': 'Celect-Elmwood/Spring Valley Area' + }, + 'dem010-06': { + 'name': 'Celect-Mosaic Telecom' + }, + 'dem010-05': { + 'name': 'Celect-West WI Telephone Area' + }, + 'net010-02': { + 'name': 'Cellcom/Nsight Telservices' + }, + 'cen100': { + 'name': 'CentraCom' + }, + 'nttccst010': { + 'name': 'Central Scott / CSTV' + }, + 'cha035': { + 'name': 'Chaparral CableVision' + }, + 'cha050': { + 'name': 'Chariton Valley Communication Corporation, Inc.' + }, + 'cha060': { + 'name': 'Chatmoss Cablevision' + }, + 'nttcche010': { + 'name': 'Cherokee Communications' + }, + 'che050': { + 'name': 'Chesapeake Bay Communications' + }, + 'cimtel': { + 'name': 'Cim-Tel Cable, LLC.' + }, + 'cit180': { + 'name': 'Citizens Cablevision - Floyd, VA' + }, + 'cit210': { + 'name': 'Citizens Cablevision, Inc.' + }, + 'cit040': { + 'name': 'Citizens Fiber' + }, + 'cit250': { + 'name': 'Citizens Mutual' + }, + 'war040': { + 'name': 'Citizens Telephone Corporation' + }, + 'wat025': { + 'name': 'City Of Monroe' + }, + 'wadsworth': { + 'name': 'CityLink' + }, + 'nor100': { + 'name': 'CL Tel' + }, + 'cla010': { + 'name': 'Clarence Telephone and Cedar Communications' + }, + 'ser060': { + 'name': 'Clear Choice Communications' + }, + 'tac020': { + 'name': 'Click! Cable TV' + }, + 'war020': { + 'name': 'CLICK1.NET' + }, + 'cml010': { + 'name': 'CML Telephone Cooperative Association' + }, + 'cns': { + 'name': 'CNS' + }, + 'com160': { + 'name': 'Co-Mo Connect' + }, + 'coa020': { + 'name': 'Coast Communications' + }, + 'coa030': { + 'name': 'Coaxial Cable TV' + }, + 'mid055': { + 'name': 'Cobalt TV (Mid-State Community TV)' + }, + 'col070': { + 'name': 'Columbia Power & Water Systems' + }, + 'col080': { + 'name': 'Columbus Telephone' + }, + 'nor105': { + 'name': 'Communications 1 Cablevision, Inc.' + }, + 'com150': { + 'name': 'Community Cable & Broadband' + }, + 'com020': { + 'name': 'Community Communications Company' + }, + 'coy010': { + 'name': 'commZoom' + }, + 'com025': { + 'name': 'Complete Communication Services' + }, + 'cat020': { + 'name': 'Comporium' + }, + 'com071': { + 'name': 'ComSouth Telesys' + }, + 'consolidatedcable': { + 'name': 'Consolidated' + }, + 'conwaycorp': { + 'name': 'Conway Corporation' + }, + 'coo050': { + 'name': 'Coon Valley Telecommunications Inc' + }, + 'coo080': { + 'name': 'Cooperative Telephone Company' + }, + 'cpt010': { + 'name': 'CP-TEL' + }, + 'cra010': { + 'name': 'Craw-Kan Telephone' + }, + 'crestview': { + 'name': 'Crestview Cable Communications' + }, + 'cross': { + 'name': 'Cross TV' + }, + 'cro030': { + 'name': 'Crosslake Communications' + }, + 'ctc040': { + 'name': 'CTC - Brainerd MN' + }, + 'phe030': { + 'name': 'CTV-Beam - East Alabama' + }, + 'cun010': { + 'name': 'Cunningham Telephone & Cable' + }, + 'dpc010': { + 'name': 'D & P Communications' + }, + 'dak030': { + 'name': 'Dakota Central Telecommunications' + }, + 'nttcdel010': { + 'name': 'Delcambre Telephone LLC' + }, + 'tel160-del': { + 'name': 'Delta Telephone Company' + }, + 'sal040': { + 'name': 'DiamondNet' + }, + 'ind060-dc': { + 'name': 'Direct Communications' + }, + 'doy010': { + 'name': 'Doylestown Cable TV' + }, + 'dic010': { + 'name': 'DRN' + }, + 'dtc020': { + 'name': 'DTC' + }, + 'dtc010': { + 'name': 'DTC Cable (Delhi)' + }, + 'dum010': { + 'name': 'Dumont Telephone Company' + }, + 'dun010': { + 'name': 'Dunkerton Telephone Cooperative' + }, + 'cci010': { + 'name': 'Duo County Telecom' + }, + 'eagle': { + 'name': 'Eagle Communications' + }, + 'weh010-east': { + 'name': 'East Arkansas Cable TV' + }, + 'eatel': { + 'name': 'EATEL Video, LLC' + }, + 'ell010': { + 'name': 'ECTA' + }, + 'emerytelcom': { + 'name': 'Emery Telcom Video LLC' + }, + 'nor200': { + 'name': 'Empire Access' + }, + 'endeavor': { + 'name': 'Endeavor Communications' + }, + 'sun045': { + 'name': 'Enhanced Telecommunications Corporation' + }, + 'mid030': { + 'name': 'enTouch' + }, + 'epb020': { + 'name': 'EPB Smartnet' + }, + 'jea010': { + 'name': 'EPlus Broadband' + }, + 'com065': { + 'name': 'ETC' + }, + 'ete010': { + 'name': 'Etex Communications' + }, + 'fbc-tele': { + 'name': 'F&B Communications' + }, + 'fal010': { + 'name': 'Falcon Broadband' + }, + 'fam010': { + 'name': 'FamilyView CableVision' + }, + 'far020': { + 'name': 'Farmers Mutual Telephone Company' + }, + 'fay010': { + 'name': 'Fayetteville Public Utilities' + }, + 'sal060': { + 'name': 'fibrant' + }, + 'fid010': { + 'name': 'Fidelity Communications' + }, + 'for030': { + 'name': 'FJ Communications' + }, + 'fli020': { + 'name': 'Flint River Communications' + }, + 'far030': { + 'name': 'FMT - Jesup' + }, + 'foo010': { + 'name': 'Foothills Communications' + }, + 'for080': { + 'name': 'Forsyth CableNet' + }, + 'fbcomm': { + 'name': 'Frankfort Plant Board' + }, + 'tel160-fra': { + 'name': 'Franklin Telephone Company' + }, + 'nttcftc010': { + 'name': 'FTC' + }, + 'fullchannel': { + 'name': 'Full Channel, Inc.' + }, + 'gar040': { + 'name': 'Gardonville Cooperative Telephone Association' + }, + 'gbt010': { + 'name': 'GBT Communications, Inc.' + }, + 'tec010': { + 'name': 'Genuine Telecom' + }, + 'clr010': { + 'name': 'Giant Communications' + }, + 'gla010': { + 'name': 'Glasgow EPB' + }, + 'gle010': { + 'name': 'Glenwood Telecommunications' + }, + 'gra060': { + 'name': 'GLW Broadband Inc.' + }, + 'goldenwest': { + 'name': 'Golden West Cablevision' + }, + 'vis030': { + 'name': 'Grantsburg Telcom' + }, + 'gpcom': { + 'name': 'Great Plains Communications' + }, + 'gri010': { + 'name': 'Gridley Cable Inc' + }, + 'hbc010': { + 'name': 'H&B Cable Services' + }, + 'hae010': { + 'name': 'Haefele TV Inc.' + }, + 'htc010': { + 'name': 'Halstad Telephone Company' + }, + 'har005': { + 'name': 'Harlan Municipal Utilities' + }, + 'har020': { + 'name': 'Hart Communications' + }, + 'ced010': { + 'name': 'Hartelco TV' + }, + 'hea040': { + 'name': 'Heart of Iowa Communications Cooperative' + }, + 'htc020': { + 'name': 'Hickory Telephone Company' + }, + 'nttchig010': { + 'name': 'Highland Communication Services' + }, + 'hig030': { + 'name': 'Highland Media' + }, + 'spc010': { + 'name': 'Hilliary Communications' + }, + 'hin020': { + 'name': 'Hinton CATV Co.' + }, + 'hometel': { + 'name': 'HomeTel Entertainment, Inc.' + }, + 'hoodcanal': { + 'name': 'Hood Canal Communications' + }, + 'weh010-hope': { + 'name': 'Hope - Prescott Cable TV' + }, + 'horizoncable': { + 'name': 'Horizon Cable TV, Inc.' + }, + 'hor040': { + 'name': 'Horizon Chillicothe Telephone' + }, + 'htc030': { + 'name': 'HTC Communications Co. - IL' + }, + 'htccomm': { + 'name': 'HTC Communications, Inc. - IA' + }, + 'wal005': { + 'name': 'Huxley Communications' + }, + 'imon': { + 'name': 'ImOn Communications' + }, + 'ind040': { + 'name': 'Independence Telecommunications' + }, + 'rrc010': { + 'name': 'Inland Networks' + }, + 'stc020': { + 'name': 'Innovative Cable TV St Croix' + }, + 'car100': { + 'name': 'Innovative Cable TV St Thomas-St John' + }, + 'icc010': { + 'name': 'Inside Connect Cable' + }, + 'int100': { + 'name': 'Integra Telecom' + }, + 'int050': { + 'name': 'Interstate Telecommunications Coop' + }, + 'irv010': { + 'name': 'Irvine Cable' + }, + 'k2c010': { + 'name': 'K2 Communications' + }, + 'kal010': { + 'name': 'Kalida Telephone Company, Inc.' + }, + 'kal030': { + 'name': 'Kalona Cooperative Telephone Company' + }, + 'kmt010': { + 'name': 'KMTelecom' + }, + 'kpu010': { + 'name': 'KPU Telecommunications' + }, + 'kuh010': { + 'name': 'Kuhn Communications, Inc.' + }, + 'lak130': { + 'name': 'Lakeland Communications' + }, + 'lan010': { + 'name': 'Langco' + }, + 'lau020': { + 'name': 'Laurel Highland Total Communications, Inc.' + }, + 'leh010': { + 'name': 'Lehigh Valley Cooperative Telephone' + }, + 'bra010': { + 'name': 'Limestone Cable/Bracken Cable' + }, + 'loc020': { + 'name': 'LISCO' + }, + 'lit020': { + 'name': 'Litestream' + }, + 'tel140': { + 'name': 'LivCom' + }, + 'loc010': { + 'name': 'LocalTel Communications' + }, + 'weh010-longview': { + 'name': 'Longview - Kilgore Cable TV' + }, + 'lon030': { + 'name': 'Lonsdale Video Ventures, LLC' + }, + 'lns010': { + 'name': 'Lost Nation-Elwood Telephone Co.' + }, + 'nttclpc010': { + 'name': 'LPC Connect' + }, + 'lumos': { + 'name': 'Lumos Networks' + }, + 'madison': { + 'name': 'Madison Communications' + }, + 'mad030': { + 'name': 'Madison County Cable Inc.' + }, + 'nttcmah010': { + 'name': 'Mahaska Communication Group' + }, + 'mar010': { + 'name': 'Marne & Elk Horn Telephone Company' + }, + 'mcc040': { + 'name': 'McClure Telephone Co.' + }, + 'mctv': { + 'name': 'MCTV' + }, + 'merrimac': { + 'name': 'Merrimac Communications Ltd.' + }, + 'metronet': { + 'name': 'Metronet' + }, + 'mhtc': { + 'name': 'MHTC' + }, + 'midhudson': { + 'name': 'Mid-Hudson Cable' + }, + 'midrivers': { + 'name': 'Mid-Rivers Communications' + }, + 'mid045': { + 'name': 'Midstate Communications' + }, + 'mil080': { + 'name': 'Milford Communications' + }, + 'min030': { + 'name': 'MINET' + }, + 'nttcmin010': { + 'name': 'Minford TV' + }, + 'san040-02': { + 'name': 'Mitchell Telecom' + }, + 'mlg010': { + 'name': 'MLGC' + }, + 'mon060': { + 'name': 'Mon-Cre TVE' + }, + 'mou110': { + 'name': 'Mountain Telephone' + }, + 'mou050': { + 'name': 'Mountain Village Cable' + }, + 'mtacomm': { + 'name': 'MTA Communications, LLC' + }, + 'mtc010': { + 'name': 'MTC Cable' + }, + 'med040': { + 'name': 'MTC Technologies' + }, + 'man060': { + 'name': 'MTCC' + }, + 'mtc030': { + 'name': 'MTCO Communications' + }, + 'mul050': { + 'name': 'Mulberry Telecommunications' + }, + 'mur010': { + 'name': 'Murray Electric System' + }, + 'musfiber': { + 'name': 'MUS FiberNET' + }, + 'mpw': { + 'name': 'Muscatine Power & Water' + }, + 'nttcsli010': { + 'name': 'myEVTV.com' + }, + 'nor115': { + 'name': 'NCC' + }, + 'nor260': { + 'name': 'NDTC' + }, + 'nctc': { + 'name': 'Nebraska Central Telecom, Inc.' + }, + 'nel020': { + 'name': 'Nelsonville TV Cable' + }, + 'nem010': { + 'name': 'Nemont' + }, + 'new075': { + 'name': 'New Hope Telephone Cooperative' + }, + 'nor240': { + 'name': 'NICP' + }, + 'cic010': { + 'name': 'NineStar Connect' + }, + 'nktelco': { + 'name': 'NKTelco' + }, + 'nortex': { + 'name': 'Nortex Communications' + }, + 'nor140': { + 'name': 'North Central Telephone Cooperative' + }, + 'nor030': { + 'name': 'Northland Communications' + }, + 'nor075': { + 'name': 'Northwest Communications' + }, + 'nor125': { + 'name': 'Norwood Light Broadband' + }, + 'net010': { + 'name': 'Nsight Telservices' + }, + 'dur010': { + 'name': 'Ntec' + }, + 'nts010': { + 'name': 'NTS Communications' + }, + 'new045': { + 'name': 'NU-Telecom' + }, + 'nulink': { + 'name': 'NuLink' + }, + 'jam030': { + 'name': 'NVC' + }, + 'far035': { + 'name': 'OmniTel Communications' + }, + 'onesource': { + 'name': 'OneSource Communications' + }, + 'cit230': { + 'name': 'Opelika Power Services' + }, + 'daltonutilities': { + 'name': 'OptiLink' + }, + 'mid140': { + 'name': 'OPTURA' + }, + 'ote010': { + 'name': 'OTEC Communication Company' + }, + 'cci020': { + 'name': 'Packerland Broadband' + }, + 'pan010': { + 'name': 'Panora Telco/Guthrie Center Communications' + }, + 'otter': { + 'name': 'Park Region Telephone & Otter Tail Telcom' + }, + 'mid050': { + 'name': 'Partner Communications Cooperative' + }, + 'fib010': { + 'name': 'Pathway' + }, + 'paulbunyan': { + 'name': 'Paul Bunyan Communications' + }, + 'pem020': { + 'name': 'Pembroke Telephone Company' + }, + 'mck010': { + 'name': 'Peoples Rural Telephone Cooperative' + }, + 'pul010': { + 'name': 'PES Energize' + }, + 'phi010': { + 'name': 'Philippi Communications System' + }, + 'phonoscope': { + 'name': 'Phonoscope Cable' + }, + 'pin070': { + 'name': 'Pine Belt Communications, Inc.' + }, + 'weh010-pine': { + 'name': 'Pine Bluff Cable TV' + }, + 'pin060': { + 'name': 'Pineland Telephone Cooperative' + }, + 'cam010': { + 'name': 'Pinpoint Communications' + }, + 'pio060': { + 'name': 'Pioneer Broadband' + }, + 'pioncomm': { + 'name': 'Pioneer Communications' + }, + 'pioneer': { + 'name': 'Pioneer DTV' + }, + 'pla020': { + 'name': 'Plant TiftNet, Inc.' + }, + 'par010': { + 'name': 'PLWC' + }, + 'pro035': { + 'name': 'PMT' + }, + 'vik011': { + 'name': 'Polar Cablevision' + }, + 'pottawatomie': { + 'name': 'Pottawatomie Telephone Co.' + }, + 'premiercomm': { + 'name': 'Premier Communications' + }, + 'psc010': { + 'name': 'PSC' + }, + 'pan020': { + 'name': 'PTCI' + }, + 'qco010': { + 'name': 'QCOL' + }, + 'qua010': { + 'name': 'Quality Cablevision' + }, + 'rad010': { + 'name': 'Radcliffe Telephone Company' + }, + 'car040': { + 'name': 'Rainbow Communications' + }, + 'rai030': { + 'name': 'Rainier Connect' + }, + 'ral010': { + 'name': 'Ralls Technologies' + }, + 'rct010': { + 'name': 'RC Technologies' + }, + 'red040': { + 'name': 'Red River Communications' + }, + 'ree010': { + 'name': 'Reedsburg Utility Commission' + }, + 'mol010': { + 'name': 'Reliance Connects- Oregon' + }, + 'res020': { + 'name': 'Reserve Telecommunications' + }, + 'weh010-resort': { + 'name': 'Resort TV Cable' + }, + 'rld010': { + 'name': 'Richland Grant Telephone Cooperative, Inc.' + }, + 'riv030': { + 'name': 'River Valley Telecommunications Coop' + }, + 'rockportcable': { + 'name': 'Rock Port Cablevision' + }, + 'rsf010': { + 'name': 'RS Fiber' + }, + 'rtc': { + 'name': 'RTC Communication Corp' + }, + 'res040': { + 'name': 'RTC-Reservation Telephone Coop.' + }, + 'rte010': { + 'name': 'RTEC Communications' + }, + 'stc010': { + 'name': 'S&T' + }, + 'san020': { + 'name': 'San Bruno Cable TV' + }, + 'san040-01': { + 'name': 'Santel' + }, + 'sav010': { + 'name': 'SCI Broadband-Savage Communications Inc.' + }, + 'sco050': { + 'name': 'Scottsboro Electric Power Board' + }, + 'scr010': { + 'name': 'Scranton Telephone Company' + }, + 'selco': { + 'name': 'SELCO' + }, + 'she010': { + 'name': 'Shentel' + }, + 'she030': { + 'name': 'Sherwood Mutual Telephone Association, Inc.' + }, + 'ind060-ssc': { + 'name': 'Silver Star Communications' + }, + 'sjoberg': { + 'name': 'Sjoberg\'s Inc.' + }, + 'sou025': { + 'name': 'SKT' + }, + 'sky050': { + 'name': 'SkyBest TV' + }, + 'nttcsmi010': { + 'name': 'Smithville Communications' + }, + 'woo010': { + 'name': 'Solarus' + }, + 'sou075': { + 'name': 'South Central Rural Telephone Cooperative' + }, + 'sou065': { + 'name': 'South Holt Cablevision, Inc.' + }, + 'sou035': { + 'name': 'South Slope Cooperative Communications' + }, + 'spa020': { + 'name': 'Spanish Fork Community Network' + }, + 'spe010': { + 'name': 'Spencer Municipal Utilities' + }, + 'spi005': { + 'name': 'Spillway Communications, Inc.' + }, + 'srt010': { + 'name': 'SRT' + }, + 'cccsmc010': { + 'name': 'St. Maarten Cable TV' + }, + 'sta025': { + 'name': 'Star Communications' + }, + 'sco020': { + 'name': 'STE' + }, + 'uin010': { + 'name': 'STRATA Networks' + }, + 'sum010': { + 'name': 'Sumner Cable TV' + }, + 'pie010': { + 'name': 'Surry TV/PCSI TV' + }, + 'swa010': { + 'name': 'Swayzee Communications' + }, + 'sweetwater': { + 'name': 'Sweetwater Cable Television Co' + }, + 'weh010-talequah': { + 'name': 'Tahlequah Cable TV' + }, + 'tct': { + 'name': 'TCT' + }, + 'tel050': { + 'name': 'Tele-Media Company' + }, + 'com050': { + 'name': 'The Community Agency' + }, + 'thr020': { + 'name': 'Three River' + }, + 'cab140': { + 'name': 'Town & Country Technologies' + }, + 'tra010': { + 'name': 'Trans-Video' + }, + 'tre010': { + 'name': 'Trenton TV Cable Company' + }, + 'tcc': { + 'name': 'Tri County Communications Cooperative' + }, + 'tri025': { + 'name': 'TriCounty Telecom' + }, + 'tri110': { + 'name': 'TrioTel Communications, Inc.' + }, + 'tro010': { + 'name': 'Troy Cablevision, Inc.' + }, + 'tsc': { + 'name': 'TSC' + }, + 'cit220': { + 'name': 'Tullahoma Utilities Board' + }, + 'tvc030': { + 'name': 'TV Cable of Rensselaer' + }, + 'tvc015': { + 'name': 'TVC Cable' + }, + 'cab180': { + 'name': 'TVision' + }, + 'twi040': { + 'name': 'Twin Lakes' + }, + 'tvtinc': { + 'name': 'Twin Valley' + }, + 'uis010': { + 'name': 'Union Telephone Company' + }, + 'uni110': { + 'name': 'United Communications - TN' + }, + 'uni120': { + 'name': 'United Services' + }, + 'uss020': { + 'name': 'US Sonet' + }, + 'cab060': { + 'name': 'USA Communications' + }, + 'she005': { + 'name': 'USA Communications/Shellsburg, IA' + }, + 'val040': { + 'name': 'Valley TeleCom Group' + }, + 'val025': { + 'name': 'Valley Telecommunications' + }, + 'val030': { + 'name': 'Valparaiso Broadband' + }, + 'cla050': { + 'name': 'Vast Broadband' + }, + 'sul015': { + 'name': 'Venture Communications Cooperative, Inc.' + }, + 'ver025': { + 'name': 'Vernon Communications Co-op' + }, + 'weh010-vicksburg': { + 'name': 'Vicksburg Video' + }, + 'vis070': { + 'name': 'Vision Communications' + }, + 'volcanotel': { + 'name': 'Volcano Vision, Inc.' + }, + 'vol040-02': { + 'name': 'VolFirst / BLTV' + }, + 'ver070': { + 'name': 'VTel' + }, + 'nttcvtx010': { + 'name': 'VTX1' + }, + 'bci010-02': { + 'name': 'Vyve Broadband' + }, + 'wab020': { + 'name': 'Wabash Mutual Telephone' + }, + 'waitsfield': { + 'name': 'Waitsfield Cable' + }, + 'wal010': { + 'name': 'Walnut Communications' + }, + 'wavebroadband': { + 'name': 'Wave' + }, + 'wav030': { + 'name': 'Waverly Communications Utility' + }, + 'wbi010': { + 'name': 'WBI' + }, + 'web020': { + 'name': 'Webster-Calhoun Cooperative Telephone Association' + }, + 'wes005': { + 'name': 'West Alabama TV Cable' + }, + 'carolinata': { + 'name': 'West Carolina Communications' + }, + 'wct010': { + 'name': 'West Central Telephone Association' + }, + 'wes110': { + 'name': 'West River Cooperative Telephone Company' + }, + 'ani030': { + 'name': 'WesTel Systems' + }, + 'westianet': { + 'name': 'Western Iowa Networks' + }, + 'nttcwhi010': { + 'name': 'Whidbey Telecom' + }, + 'weh010-white': { + 'name': 'White County Cable TV' + }, + 'wes130': { + 'name': 'Wiatel' + }, + 'wik010': { + 'name': 'Wiktel' + }, + 'wil070': { + 'name': 'Wilkes Communications, Inc./RiverStreet Networks' + }, + 'wil015': { + 'name': 'Wilson Communications' + }, + 'win010': { + 'name': 'Windomnet/SMBS' + }, + 'win090': { + 'name': 'Windstream Cable TV' + }, + 'wcta': { + 'name': 'Winnebago Cooperative Telecom Association' + }, + 'wtc010': { + 'name': 'WTC' + }, + 'wil040': { + 'name': 'WTC Communications, Inc.' + }, + 'wya010': { + 'name': 'Wyandotte Cable' + }, + 'hin020-02': { + 'name': 'X-Stream Services' + }, + 'xit010': { + 'name': 'XIT Communications' + }, + 'yel010': { + 'name': 'Yelcot Communications' + }, + 'mid180-01': { + 'name': 'yondoo' + }, + 'cou060': { + 'name': 'Zito Media' + }, +} + + +class AdobePassIE(InfoExtractor): + _SERVICE_PROVIDER_TEMPLATE = 'https://sp.auth.adobe.com/adobe-services/%s' + _USER_AGENT = 'Mozilla/5.0 (X11; Linux i686; rv:47.0) Gecko/20100101 Firefox/47.0' + _MVPD_CACHE = 'ap-mvpd' + + @staticmethod + def _get_mvpd_resource(provider_id, title, guid, rating): + channel = etree.Element('channel') + channel_title = etree.SubElement(channel, 'title') + channel_title.text = provider_id + item = etree.SubElement(channel, 'item') + resource_title = etree.SubElement(item, 'title') + resource_title.text = title + resource_guid = etree.SubElement(item, 'guid') + resource_guid.text = guid + resource_rating = etree.SubElement(item, 'media:rating') + resource_rating.attrib = {'scheme': 'urn:v-chip'} + resource_rating.text = rating + return '' + etree.tostring(channel).decode() + '' + + def _extract_mvpd_auth(self, url, video_id, requestor_id, resource): + def xml_text(xml_str, tag): + return self._search_regex( + '<%s>(.+?)' % (tag, tag), xml_str, tag) + + def is_expired(token, date_ele): + token_expires = unified_timestamp(re.sub(r'[_ ]GMT', '', xml_text(token, date_ele))) + return token_expires and token_expires <= int(time.time()) + + def post_form(form_page_res, note, data={}): + form_page, urlh = form_page_res + post_url = self._html_search_regex(r']+action=(["\'])(?P.+?)\1', form_page, 'post url', group='url') + if not re.match(r'https?://', post_url): + post_url = compat_urlparse.urljoin(urlh.geturl(), post_url) + form_data = self._hidden_inputs(form_page) + form_data.update(data) + return self._download_webpage_handle( + post_url, video_id, note, data=urlencode_postdata(form_data), headers={ + 'Content-Type': 'application/x-www-form-urlencoded', + }) + + def raise_mvpd_required(): + raise ExtractorError( + 'This video is only available for users of participating TV providers. ' + 'Use --ap-mso to specify Adobe Pass Multiple-system operator Identifier ' + 'and --ap-username and --ap-password or --netrc to provide account credentials.', expected=True) + + mvpd_headers = { + 'ap_42': 'anonymous', + 'ap_11': 'Linux i686', + 'ap_z': self._USER_AGENT, + 'User-Agent': self._USER_AGENT, + } + + guid = xml_text(resource, 'guid') if '<' in resource else resource + count = 0 + while count < 2: + requestor_info = self._downloader.cache.load(self._MVPD_CACHE, requestor_id) or {} + authn_token = requestor_info.get('authn_token') + if authn_token and is_expired(authn_token, 'simpleTokenExpires'): + authn_token = None + if not authn_token: + # TODO add support for other TV Providers + mso_id = self._downloader.params.get('ap_mso') + if not mso_id: + raise_mvpd_required() + username, password = self._get_login_info('ap_username', 'ap_password', mso_id) + if not username or not password: + raise_mvpd_required() + mso_info = MSO_INFO[mso_id] + + provider_redirect_page_res = self._download_webpage_handle( + self._SERVICE_PROVIDER_TEMPLATE % 'authenticate/saml', video_id, + 'Downloading Provider Redirect Page', query={ + 'noflash': 'true', + 'mso_id': mso_id, + 'requestor_id': requestor_id, + 'no_iframe': 'false', + 'domain_name': 'adobe.com', + 'redirect_url': url, + }) + + if mso_id == 'Comcast_SSO': + # Comcast page flow varies by video site and whether you + # are on Comcast's network. + provider_redirect_page, urlh = provider_redirect_page_res + # Check for Comcast auto login + if 'automatically signing you in' in provider_redirect_page: + oauth_redirect_url = self._html_search_regex( + r'window\.location\s*=\s*[\'"]([^\'"]+)', + provider_redirect_page, 'oauth redirect') + # Just need to process the request. No useful data comes back + self._download_webpage( + oauth_redirect_url, video_id, 'Confirming auto login') + else: + if '
Resume' in mvpd_confirm_page: + post_form(mvpd_confirm_page_res, 'Confirming Login') + + else: + # Normal, non-Comcast flow + provider_login_page_res = post_form( + provider_redirect_page_res, 'Downloading Provider Login Page') + mvpd_confirm_page_res = post_form(provider_login_page_res, 'Logging in', { + mso_info.get('username_field', 'username'): username, + mso_info.get('password_field', 'password'): password, + }) + if mso_id != 'Rogers': + post_form(mvpd_confirm_page_res, 'Confirming Login') + + session = self._download_webpage( + self._SERVICE_PROVIDER_TEMPLATE % 'session', video_id, + 'Retrieving Session', data=urlencode_postdata({ + '_method': 'GET', + 'requestor_id': requestor_id, + }), headers=mvpd_headers) + if 'playlists/)?(?P[^/]+)/(?P[^/?#]+)/?' _TESTS = [{ @@ -96,7 +94,29 @@ class AdultSwimIE(InfoExtractor): 'params': { # m3u8 download 'skip_download': True, - } + }, + 'expected_warnings': ['Unable to download f4m manifest'], + }, { + 'url': 'http://www.adultswim.com/videos/toonami/friday-october-14th-2016/', + 'info_dict': { + 'id': 'eYiLsKVgQ6qTC6agD67Sig', + 'title': 'Toonami - Friday, October 14th, 2016', + 'description': 'md5:99892c96ffc85e159a428de85c30acde', + }, + 'playlist': [{ + 'md5': '', + 'info_dict': { + 'id': 'eYiLsKVgQ6qTC6agD67Sig', + 'ext': 'mp4', + 'title': 'Toonami - Friday, October 14th, 2016', + 'description': 'md5:99892c96ffc85e159a428de85c30acde', + }, + }], + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'expected_warnings': ['Unable to download f4m manifest'], }] @staticmethod @@ -148,7 +168,10 @@ class AdultSwimIE(InfoExtractor): if bootstrapped_data.get('slugged_video', {}).get('slug') == episode_path: video_info = bootstrapped_data['slugged_video'] if not video_info: - video_info = bootstrapped_data.get('heroMetadata', {}).get('trailer').get('video') + video_info = bootstrapped_data.get( + 'heroMetadata', {}).get('trailer', {}).get('video') + if not video_info: + video_info = bootstrapped_data.get('onlineOriginals', [None])[0] if not video_info: raise ExtractorError('Unable to find video info') @@ -161,71 +184,41 @@ class AdultSwimIE(InfoExtractor): segment_ids = [clip['videoPlaybackID'] for clip in video_info['clips']] elif video_info.get('videoPlaybackID'): segment_ids = [video_info['videoPlaybackID']] + elif video_info.get('id'): + segment_ids = [video_info['id']] else: - raise ExtractorError( - 'This video is only available via cable service provider subscription that' - ' is not currently supported. You may want to use --cookies.' - if video_info.get('auth') is True else 'Unable to find stream or clips', - expected=True) + if video_info.get('auth') is True: + raise ExtractorError( + 'This video is only available via cable service provider subscription that' + ' is not currently supported. You may want to use --cookies.', expected=True) + else: + raise ExtractorError('Unable to find stream or clips') episode_id = video_info['id'] episode_title = video_info['title'] - episode_description = video_info['description'] - episode_duration = video_info.get('duration') + episode_description = video_info.get('description') + episode_duration = int_or_none(video_info.get('duration')) + view_count = int_or_none(video_info.get('views')) entries = [] for part_num, segment_id in enumerate(segment_ids): - segment_url = 'http://www.adultswim.com/videos/api/v0/assets?id=%s&platform=desktop' % segment_id - + segement_info = self._extract_cvp_info( + 'http://www.adultswim.com/videos/api/v0/assets?id=%s&platform=desktop' % segment_id, + segment_id, { + 'secure': { + 'media_src': 'http://androidhls-secure.cdn.turner.com/adultswim/big', + 'tokenizer_src': 'http://www.adultswim.com/astv/mvpd/processors/services/token_ipadAdobe.do', + }, + }) segment_title = '%s - %s' % (show_title, episode_title) if len(segment_ids) > 1: segment_title += ' Part %d' % (part_num + 1) - - idoc = self._download_xml( - segment_url, segment_title, - 'Downloading segment information', 'Unable to download segment information') - - segment_duration = float_or_none( - xpath_text(idoc, './/trt', 'segment duration').strip()) - - formats = [] - file_els = idoc.findall('.//files/file') or idoc.findall('./files/file') - - unique_urls = [] - unique_file_els = [] - for file_el in file_els: - media_url = file_el.text - if not media_url or determine_ext(media_url) == 'f4m': - continue - if file_el.text not in unique_urls: - unique_urls.append(file_el.text) - unique_file_els.append(file_el) - - for file_el in unique_file_els: - bitrate = file_el.attrib.get('bitrate') - ftype = file_el.attrib.get('type') - media_url = file_el.text - if determine_ext(media_url) == 'm3u8': - formats.extend(self._extract_m3u8_formats( - media_url, segment_title, 'mp4', preference=0, - m3u8_id='hls', fatal=False)) - else: - formats.append({ - 'format_id': '%s_%s' % (bitrate, ftype), - 'url': file_el.text.strip(), - # The bitrate may not be a number (for example: 'iphone') - 'tbr': int(bitrate) if bitrate.isdigit() else None, - }) - - self._sort_formats(formats) - - entries.append({ + segement_info.update({ 'id': segment_id, 'title': segment_title, - 'formats': formats, - 'duration': segment_duration, - 'description': episode_description + 'description': episode_description, }) + entries.append(segement_info) return { '_type': 'playlist', @@ -234,5 +227,6 @@ class AdultSwimIE(InfoExtractor): 'entries': entries, 'title': '%s - %s' % (show_title, episode_title), 'description': episode_description, - 'duration': episode_duration + 'duration': episode_duration, + 'view_count': view_count, } diff --git a/youtube_dl/extractor/aenetworks.py b/youtube_dl/extractor/aenetworks.py index 8f53050c9..c97317400 100644 --- a/youtube_dl/extractor/aenetworks.py +++ b/youtube_dl/extractor/aenetworks.py @@ -26,7 +26,7 @@ class AENetworksIE(AENetworksBaseIE): _VALID_URL = r'https?://(?:www\.)?(?P(?:history|aetv|mylifetime)\.com|fyi\.tv)/(?:shows/(?P[^/]+(?:/[^/]+){0,2})|movies/(?P[^/]+)/full-movie)' _TESTS = [{ 'url': 'http://www.history.com/shows/mountain-men/season-1/episode-1', - 'md5': '8ff93eb073449f151d6b90c0ae1ef0c7', + 'md5': 'a97a65f7e823ae10e9244bc5433d5fe6', 'info_dict': { 'id': '22253814', 'ext': 'mp4', @@ -87,7 +87,7 @@ class AENetworksIE(AENetworksBaseIE): self._html_search_meta('aetn:SeriesTitle', webpage)) elif url_parts_len == 2: entries = [] - for episode_item in re.findall(r'(?s)]+class="[^"]*episode-item[^"]*"[^>]*>', webpage): + for episode_item in re.findall(r'(?s)<[^>]+class="[^"]*(?:episode|program)-item[^"]*"[^>]*>', webpage): episode_attributes = extract_attributes(episode_item) episode_url = compat_urlparse.urljoin( url, episode_attributes['data-canonical']) @@ -99,7 +99,7 @@ class AENetworksIE(AENetworksBaseIE): query = { 'mbr': 'true', - 'assetTypes': 'medium_video_s3' + 'assetTypes': 'high_video_s3' } video_id = self._html_search_meta('aetn:VideoID', webpage) media_url = self._search_regex( @@ -109,7 +109,10 @@ class AENetworksIE(AENetworksBaseIE): info = self._parse_theplatform_metadata(theplatform_metadata) if theplatform_metadata.get('AETN$isBehindWall'): requestor_id = self._DOMAIN_TO_REQUESTOR_ID[domain] - resource = '%s%s%s%s' % (requestor_id, theplatform_metadata['title'], theplatform_metadata['AETN$PPL_pplProgramId'], theplatform_metadata['ratings'][0]['rating']) + resource = self._get_mvpd_resource( + requestor_id, theplatform_metadata['title'], + theplatform_metadata.get('AETN$PPL_pplProgramId') or theplatform_metadata.get('AETN$PPL_pplProgramId_OLD'), + theplatform_metadata['ratings'][0]['rating']) query['auth'] = self._extract_mvpd_auth( url, video_id, requestor_id, resource) info.update(self._search_json_ld(webpage, video_id, fatal=False)) @@ -152,7 +155,7 @@ class HistoryTopicIE(AENetworksBaseIE): 'id': 'world-war-i-history', 'title': 'World War I History', }, - 'playlist_mincount': 24, + 'playlist_mincount': 23, }, { 'url': 'http://www.history.com/topics/world-war-i-history/videos', 'only_matching': True, @@ -190,7 +193,8 @@ class HistoryTopicIE(AENetworksBaseIE): return self.theplatform_url_result( release_url, video_id, { 'mbr': 'true', - 'switch': 'hls' + 'switch': 'hls', + 'assetTypes': 'high_video_ak', }) else: webpage = self._download_webpage(url, topic_id) @@ -200,6 +204,7 @@ class HistoryTopicIE(AENetworksBaseIE): entries.append(self.theplatform_url_result( video_attributes['data-release-url'], video_attributes['data-id'], { 'mbr': 'true', - 'switch': 'hls' + 'switch': 'hls', + 'assetTypes': 'high_video_ak', })) return self.playlist_result(entries, topic_id, get_element_by_attribute('class', 'show-title', webpage)) diff --git a/youtube_dl/extractor/afreecatv.py b/youtube_dl/extractor/afreecatv.py index 518c61f67..4f6cdb8a2 100644 --- a/youtube_dl/extractor/afreecatv.py +++ b/youtube_dl/extractor/afreecatv.py @@ -11,19 +11,27 @@ from ..compat import ( from ..utils import ( ExtractorError, int_or_none, + update_url_query, xpath_element, xpath_text, ) class AfreecaTVIE(InfoExtractor): + IE_NAME = 'afreecatv' IE_DESC = 'afreecatv.com' - _VALID_URL = r'''(?x)^ - https?://(?:(live|afbbs|www)\.)?afreeca(?:tv)?\.com(?::\d+)? - (?: - /app/(?:index|read_ucc_bbs)\.cgi| - /player/[Pp]layer\.(?:swf|html)) - \?.*?\bnTitleNo=(?P\d+)''' + _VALID_URL = r'''(?x) + https?:// + (?: + (?:(?:live|afbbs|www)\.)?afreeca(?:tv)?\.com(?::\d+)? + (?: + /app/(?:index|read_ucc_bbs)\.cgi| + /player/[Pp]layer\.(?:swf|html) + )\?.*?\bnTitleNo=| + vod\.afreecatv\.com/PLAYER/STATION/ + ) + (?P\d+) + ''' _TESTS = [{ 'url': 'http://live.afreecatv.com:8079/app/index.cgi?szType=read_ucc_bbs&szBjId=dailyapril&nStationNo=16711924&nBbsNo=18605867&nTitleNo=36164052&szSkin=', 'md5': 'f72c89fe7ecc14c1b5ce506c4996046e', @@ -66,6 +74,9 @@ class AfreecaTVIE(InfoExtractor): }, { 'url': 'http://www.afreecatv.com/player/Player.swf?szType=szBjId=djleegoon&nStationNo=11273158&nBbsNo=13161095&nTitleNo=36327652', 'only_matching': True, + }, { + 'url': 'http://vod.afreecatv.com/PLAYER/STATION/15055030', + 'only_matching': True, }] @staticmethod @@ -83,7 +94,9 @@ class AfreecaTVIE(InfoExtractor): info_url = compat_urlparse.urlunparse(parsed_url._replace( netloc='afbbs.afreecatv.com:8080', path='/api/video/get_video_info.php')) - video_xml = self._download_xml(info_url, video_id) + + video_xml = self._download_xml( + update_url_query(info_url, {'nTitleNo': video_id}), video_id) if xpath_element(video_xml, './track/video/file') is None: raise ExtractorError('Specified AfreecaTV video does not exist', @@ -131,3 +144,94 @@ class AfreecaTVIE(InfoExtractor): expected=True) return info + + +class AfreecaTVGlobalIE(AfreecaTVIE): + IE_NAME = 'afreecatv:global' + _VALID_URL = r'https?://(?:www\.)?afreeca\.tv/(?P\d+)(?:/v/(?P\d+))?' + _TESTS = [{ + 'url': 'http://afreeca.tv/36853014/v/58301', + 'info_dict': { + 'id': '58301', + 'title': 'tryhard top100', + 'uploader_id': '36853014', + 'uploader': 'makgi Hearthstone Live!', + }, + 'playlist_count': 3, + }] + + def _real_extract(self, url): + channel_id, video_id = re.match(self._VALID_URL, url).groups() + video_type = 'video' if video_id else 'live' + query = { + 'pt': 'view', + 'bid': channel_id, + } + if video_id: + query['vno'] = video_id + video_data = self._download_json( + 'http://api.afreeca.tv/%s/view_%s.php' % (video_type, video_type), + video_id or channel_id, query=query)['channel'] + + if video_data.get('result') != 1: + raise ExtractorError('%s said: %s' % (self.IE_NAME, video_data['remsg'])) + + title = video_data['title'] + + info = { + 'thumbnail': video_data.get('thumb'), + 'view_count': int_or_none(video_data.get('vcnt')), + 'age_limit': int_or_none(video_data.get('grade')), + 'uploader_id': channel_id, + 'uploader': video_data.get('cname'), + } + + if video_id: + entries = [] + for i, f in enumerate(video_data.get('flist', [])): + video_key = self.parse_video_key(f.get('key', '')) + f_url = f.get('file') + if not video_key or not f_url: + continue + entries.append({ + 'id': '%s_%s' % (video_id, video_key.get('part', i + 1)), + 'title': title, + 'upload_date': video_key.get('upload_date'), + 'duration': int_or_none(f.get('length')), + 'url': f_url, + 'protocol': 'm3u8_native', + 'ext': 'mp4', + }) + + info.update({ + 'id': video_id, + 'title': title, + 'duration': int_or_none(video_data.get('length')), + }) + if len(entries) > 1: + info['_type'] = 'multi_video' + info['entries'] = entries + elif len(entries) == 1: + i = entries[0].copy() + i.update(info) + info = i + else: + formats = [] + for s in video_data.get('strm', []): + s_url = s.get('purl') + if not s_url: + continue + # TODO: extract rtmp formats + if s.get('stype') == 'HLS': + formats.extend(self._extract_m3u8_formats( + s_url, channel_id, 'mp4', fatal=False)) + self._sort_formats(formats) + + info.update({ + 'id': channel_id, + 'title': self._live_title(title), + 'is_live': True, + 'formats': formats, + }) + + return info diff --git a/youtube_dl/extractor/aftonbladet.py b/youtube_dl/extractor/aftonbladet.py deleted file mode 100644 index 5766b4fe8..000000000 --- a/youtube_dl/extractor/aftonbladet.py +++ /dev/null @@ -1,64 +0,0 @@ -# encoding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..utils import int_or_none - - -class AftonbladetIE(InfoExtractor): - _VALID_URL = r'https?://tv\.aftonbladet\.se/abtv/articles/(?P[0-9]+)' - _TEST = { - 'url': 'http://tv.aftonbladet.se/abtv/articles/36015', - 'info_dict': { - 'id': '36015', - 'ext': 'mp4', - 'title': 'Vulkanutbrott i rymden - nu släpper NASA bilderna', - 'description': 'Jupiters måne mest aktiv av alla himlakroppar', - 'timestamp': 1394142732, - 'upload_date': '20140306', - }, - } - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - # find internal video meta data - meta_url = 'http://aftonbladet-play-metadata.cdn.drvideo.aptoma.no/video/%s.json' - player_config = self._parse_json(self._html_search_regex( - r'data-player-config="([^"]+)"', webpage, 'player config'), video_id) - internal_meta_id = player_config['aptomaVideoId'] - internal_meta_url = meta_url % internal_meta_id - internal_meta_json = self._download_json( - internal_meta_url, video_id, 'Downloading video meta data') - - # find internal video formats - format_url = 'http://aftonbladet-play.videodata.drvideo.aptoma.no/actions/video/?id=%s' - internal_video_id = internal_meta_json['videoId'] - internal_formats_url = format_url % internal_video_id - internal_formats_json = self._download_json( - internal_formats_url, video_id, 'Downloading video formats') - - formats = [] - for fmt in internal_formats_json['formats']['http']['pseudostreaming']['mp4']: - p = fmt['paths'][0] - formats.append({ - 'url': 'http://%s:%d/%s/%s' % (p['address'], p['port'], p['path'], p['filename']), - 'ext': 'mp4', - 'width': int_or_none(fmt.get('width')), - 'height': int_or_none(fmt.get('height')), - 'tbr': int_or_none(fmt.get('bitrate')), - 'protocol': 'http', - }) - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': internal_meta_json['title'], - 'formats': formats, - 'thumbnail': internal_meta_json.get('imageUrl'), - 'description': internal_meta_json.get('shortPreamble'), - 'timestamp': int_or_none(internal_meta_json.get('timePublished')), - 'duration': int_or_none(internal_meta_json.get('duration')), - 'view_count': int_or_none(internal_meta_json.get('views')), - } diff --git a/youtube_dl/extractor/airmozilla.py b/youtube_dl/extractor/airmozilla.py index f8e70f4e5..0e0691879 100644 --- a/youtube_dl/extractor/airmozilla.py +++ b/youtube_dl/extractor/airmozilla.py @@ -20,7 +20,7 @@ class AirMozillaIE(InfoExtractor): 'id': '6x4q2w', 'ext': 'mp4', 'title': 'Privacy Lab - a meetup for privacy minded people in San Francisco', - 'thumbnail': 're:https?://vid\.ly/(?P[0-9a-z-]+)/poster', + 'thumbnail': r're:https?://vid\.ly/(?P[0-9a-z-]+)/poster', 'description': 'Brings together privacy professionals and others interested in privacy at for-profits, non-profits, and NGOs in an effort to contribute to the state of the ecosystem...', 'timestamp': 1422487800, 'upload_date': '20150128', diff --git a/youtube_dl/extractor/aljazeera.py b/youtube_dl/extractor/aljazeera.py index b081695d8..388e578d5 100644 --- a/youtube_dl/extractor/aljazeera.py +++ b/youtube_dl/extractor/aljazeera.py @@ -4,7 +4,7 @@ from .common import InfoExtractor class AlJazeeraIE(InfoExtractor): - _VALID_URL = r'https?://www\.aljazeera\.com/programmes/.*?/(?P[^/]+)\.html' + _VALID_URL = r'https?://(?:www\.)?aljazeera\.com/programmes/.*?/(?P[^/]+)\.html' _TEST = { 'url': 'http://www.aljazeera.com/programmes/the-slum/2014/08/deliverance-201482883754237240.html', diff --git a/youtube_dl/extractor/allocine.py b/youtube_dl/extractor/allocine.py index 190bc2cc8..90f11d39f 100644 --- a/youtube_dl/extractor/allocine.py +++ b/youtube_dl/extractor/allocine.py @@ -1,94 +1,109 @@ -# -*- coding: utf-8 -*- +# coding: utf-8 from __future__ import unicode_literals -import re -import json - from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( + remove_end, qualities, - unescapeHTML, - xpath_element, + url_basename, ) class AllocineIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?allocine\.fr/(?Particle|video|film)/(fichearticle_gen_carticle=|player_gen_cmedia=|fichefilm_gen_cfilm=|video-)(?P[0-9]+)(?:\.html)?' + _VALID_URL = r'https?://(?:www\.)?allocine\.fr/(?:article|video|film)/(?:fichearticle_gen_carticle=|player_gen_cmedia=|fichefilm_gen_cfilm=|video-)(?P[0-9]+)(?:\.html)?' _TESTS = [{ 'url': 'http://www.allocine.fr/article/fichearticle_gen_carticle=18635087.html', 'md5': '0c9fcf59a841f65635fa300ac43d8269', 'info_dict': { 'id': '19546517', + 'display_id': '18635087', 'ext': 'mp4', 'title': 'Astérix - Le Domaine des Dieux Teaser VF', - 'description': 'md5:abcd09ce503c6560512c14ebfdb720d2', - 'thumbnail': 're:http://.*\.jpg', + 'description': 'md5:4a754271d9c6f16c72629a8a993ee884', + 'thumbnail': r're:http://.*\.jpg', }, }, { 'url': 'http://www.allocine.fr/video/player_gen_cmedia=19540403&cfilm=222257.html', 'md5': 'd0cdce5d2b9522ce279fdfec07ff16e0', 'info_dict': { 'id': '19540403', + 'display_id': '19540403', 'ext': 'mp4', 'title': 'Planes 2 Bande-annonce VF', 'description': 'Regardez la bande annonce du film Planes 2 (Planes 2 Bande-annonce VF). Planes 2, un film de Roberts Gannaway', - 'thumbnail': 're:http://.*\.jpg', + 'thumbnail': r're:http://.*\.jpg', }, }, { - 'url': 'http://www.allocine.fr/film/fichefilm_gen_cfilm=181290.html', + 'url': 'http://www.allocine.fr/video/player_gen_cmedia=19544709&cfilm=181290.html', 'md5': '101250fb127ef9ca3d73186ff22a47ce', 'info_dict': { 'id': '19544709', + 'display_id': '19544709', 'ext': 'mp4', 'title': 'Dragons 2 - Bande annonce finale VF', - 'description': 'md5:601d15393ac40f249648ef000720e7e3', - 'thumbnail': 're:http://.*\.jpg', + 'description': 'md5:6cdd2d7c2687d4c6aafe80a35e17267a', + 'thumbnail': r're:http://.*\.jpg', }, }, { 'url': 'http://www.allocine.fr/video/video-19550147/', - 'only_matching': True, + 'md5': '3566c0668c0235e2d224fd8edb389f67', + 'info_dict': { + 'id': '19550147', + 'ext': 'mp4', + 'title': 'Faux Raccord N°123 - Les gaffes de Cliffhanger', + 'description': 'md5:bc734b83ffa2d8a12188d9eb48bb6354', + 'thumbnail': r're:http://.*\.jpg', + }, }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - typ = mobj.group('typ') - display_id = mobj.group('id') + display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - if typ == 'film': - video_id = self._search_regex(r'href="/video/player_gen_cmedia=([0-9]+).+"', webpage, 'video id') - else: - player = self._search_regex(r'data-player=\'([^\']+)\'>', webpage, 'data player', default=None) - if player: - player_data = json.loads(player) - video_id = compat_str(player_data['refMedia']) - else: - model = self._search_regex(r'data-model="([^"]+)">', webpage, 'data model') - model_data = self._parse_json(unescapeHTML(model), display_id) - video_id = compat_str(model_data['id']) - - xml = self._download_xml('http://www.allocine.fr/ws/AcVisiondataV4.ashx?media=%s' % video_id, display_id) - - video = xpath_element(xml, './/AcVisionVideo').attrib + formats = [] quality = qualities(['ld', 'md', 'hd']) - formats = [] - for k, v in video.items(): - if re.match(r'.+_path', k): - format_id = k.split('_')[0] + model = self._html_search_regex( + r'data-model="([^"]+)"', webpage, 'data model', default=None) + if model: + model_data = self._parse_json(model, display_id) + + for video_url in model_data['sources'].values(): + video_id, format_id = url_basename(video_url).split('_')[:2] formats.append({ 'format_id': format_id, 'quality': quality(format_id), - 'url': v, + 'url': video_url, }) + + title = model_data['title'] + else: + video_id = display_id + media_data = self._download_json( + 'http://www.allocine.fr/ws/AcVisiondataV5.ashx?media=%s' % video_id, display_id) + for key, value in media_data['video'].items(): + if not key.endswith('Path'): + continue + + format_id = key[:-len('Path')] + formats.append({ + 'format_id': format_id, + 'quality': quality(format_id), + 'url': value, + }) + + title = remove_end(self._html_search_regex( + r'(?s)(.+?)', webpage, 'title' + ).strip(), ' - AlloCiné') + self._sort_formats(formats) return { 'id': video_id, - 'title': video['videoTitle'], + 'display_id': display_id, + 'title': title, 'thumbnail': self._og_search_thumbnail(webpage), 'formats': formats, 'description': self._og_search_description(webpage), diff --git a/youtube_dl/extractor/alphaporno.py b/youtube_dl/extractor/alphaporno.py index c34719d1f..3a6d99f6b 100644 --- a/youtube_dl/extractor/alphaporno.py +++ b/youtube_dl/extractor/alphaporno.py @@ -19,7 +19,7 @@ class AlphaPornoIE(InfoExtractor): 'display_id': 'sensual-striptease-porn-with-samantha-alexandra', 'ext': 'mp4', 'title': 'Sensual striptease porn with Samantha Alexandra', - 'thumbnail': 're:https?://.*\.jpg$', + 'thumbnail': r're:https?://.*\.jpg$', 'timestamp': 1418694611, 'upload_date': '20141216', 'duration': 387, diff --git a/youtube_dl/extractor/amcnetworks.py b/youtube_dl/extractor/amcnetworks.py new file mode 100644 index 000000000..87c803e94 --- /dev/null +++ b/youtube_dl/extractor/amcnetworks.py @@ -0,0 +1,95 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .theplatform import ThePlatformIE +from ..utils import ( + update_url_query, + parse_age_limit, + int_or_none, +) + + +class AMCNetworksIE(ThePlatformIE): + _VALID_URL = r'https?://(?:www\.)?(?:amc|bbcamerica|ifc|wetv)\.com/(?:movies/|shows/[^/]+/(?:full-episodes/)?[^/]+/episode-\d+(?:-(?:[^/]+/)?|/))(?P[^/?#]+)' + _TESTS = [{ + 'url': 'http://www.ifc.com/shows/maron/season-04/episode-01/step-1', + 'md5': '', + 'info_dict': { + 'id': 's3MX01Nl4vPH', + 'ext': 'mp4', + 'title': 'Maron - Season 4 - Step 1', + 'description': 'In denial about his current situation, Marc is reluctantly convinced by his friends to enter rehab. Starring Marc Maron and Constance Zimmer.', + 'age_limit': 17, + 'upload_date': '20160505', + 'timestamp': 1462468831, + 'uploader': 'AMCN', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'skip': 'Requires TV provider accounts', + }, { + 'url': 'http://www.bbcamerica.com/shows/the-hunt/full-episodes/season-1/episode-01-the-hardest-challenge', + 'only_matching': True, + }, { + 'url': 'http://www.amc.com/shows/preacher/full-episodes/season-01/episode-00/pilot', + 'only_matching': True, + }, { + 'url': 'http://www.wetv.com/shows/million-dollar-matchmaker/season-01/episode-06-the-dumped-dj-and-shallow-hal', + 'only_matching': True, + }, { + 'url': 'http://www.ifc.com/movies/chaos', + 'only_matching': True, + }, { + 'url': 'http://www.bbcamerica.com/shows/doctor-who/full-episodes/the-power-of-the-daleks/episode-01-episode-1-color-version', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + query = { + 'mbr': 'true', + 'manifest': 'm3u', + } + media_url = self._search_regex(r'window\.platformLinkURL\s*=\s*[\'"]([^\'"]+)', webpage, 'media url') + theplatform_metadata = self._download_theplatform_metadata(self._search_regex( + r'https?://link.theplatform.com/s/([^?]+)', media_url, 'theplatform_path'), display_id) + info = self._parse_theplatform_metadata(theplatform_metadata) + video_id = theplatform_metadata['pid'] + title = theplatform_metadata['title'] + rating = theplatform_metadata['ratings'][0]['rating'] + auth_required = self._search_regex(r'window\.authRequired\s*=\s*(true|false);', webpage, 'auth required') + if auth_required == 'true': + requestor_id = self._search_regex(r'window\.requestor_id\s*=\s*[\'"]([^\'"]+)', webpage, 'requestor id') + resource = self._get_mvpd_resource(requestor_id, title, video_id, rating) + query['auth'] = self._extract_mvpd_auth(url, video_id, requestor_id, resource) + media_url = update_url_query(media_url, query) + formats, subtitles = self._extract_theplatform_smil(media_url, video_id) + self._sort_formats(formats) + info.update({ + 'id': video_id, + 'subtitles': subtitles, + 'formats': formats, + 'age_limit': parse_age_limit(parse_age_limit(rating)), + }) + ns_keys = theplatform_metadata.get('$xmlns', {}).keys() + if ns_keys: + ns = list(ns_keys)[0] + series = theplatform_metadata.get(ns + '$show') + season_number = int_or_none(theplatform_metadata.get(ns + '$season')) + episode = theplatform_metadata.get(ns + '$episodeTitle') + episode_number = int_or_none(theplatform_metadata.get(ns + '$episode')) + if season_number: + title = 'Season %d - %s' % (season_number, title) + if series: + title = '%s - %s' % (series, title) + info.update({ + 'title': title, + 'series': series, + 'season_number': season_number, + 'episode': episode, + 'episode_number': episode_number, + }) + return info diff --git a/youtube_dl/extractor/anvato.py b/youtube_dl/extractor/anvato.py index cb29cf111..623f44dce 100644 --- a/youtube_dl/extractor/anvato.py +++ b/youtube_dl/extractor/anvato.py @@ -157,22 +157,16 @@ class AnvatoIE(InfoExtractor): video_data_url, video_id, transform_source=strip_jsonp, data=json.dumps(payload).encode('utf-8')) - def _extract_anvato_videos(self, webpage, video_id): - anvplayer_data = self._parse_json(self._html_search_regex( - r']+data-anvp=\'([^\']+)\'', webpage, - 'Anvato player data'), video_id) - - video_id = anvplayer_data['video'] - access_key = anvplayer_data['accessKey'] - + def _get_anvato_videos(self, access_key, video_id): video_data = self._get_video_json(access_key, video_id) formats = [] for published_url in video_data['published_urls']: video_url = published_url['embed_url'] + media_format = published_url.get('format') ext = determine_ext(video_url) - if ext == 'smil': + if ext == 'smil' or media_format == 'smil': formats.extend(self._extract_smil_formats(video_url, video_id)) continue @@ -183,7 +177,7 @@ class AnvatoIE(InfoExtractor): 'tbr': tbr if tbr != 0 else None, } - if ext == 'm3u8': + if ext == 'm3u8' or media_format in ('m3u8', 'm3u8-variant'): # Not using _extract_m3u8_formats here as individual media # playlists are also included in published_urls. if tbr is None: @@ -194,7 +188,7 @@ class AnvatoIE(InfoExtractor): 'format_id': '-'.join(filter(None, ['hls', compat_str(tbr)])), 'ext': 'mp4', }) - elif ext == 'mp3': + elif ext == 'mp3' or media_format == 'mp3': a_format['vcodec'] = 'none' else: a_format.update({ @@ -218,7 +212,19 @@ class AnvatoIE(InfoExtractor): 'formats': formats, 'title': video_data.get('def_title'), 'description': video_data.get('def_description'), + 'tags': video_data.get('def_tags', '').split(','), 'categories': video_data.get('categories'), 'thumbnail': video_data.get('thumbnail'), + 'timestamp': int_or_none(video_data.get( + 'ts_published') or video_data.get('ts_added')), + 'uploader': video_data.get('mcp_id'), + 'duration': int_or_none(video_data.get('duration')), 'subtitles': subtitles, } + + def _extract_anvato_videos(self, webpage, video_id): + anvplayer_data = self._parse_json(self._html_search_regex( + r']+data-anvp=\'([^\']+)\'', webpage, + 'Anvato player data'), video_id) + return self._get_anvato_videos( + anvplayer_data['accessKey'], anvplayer_data['video']) diff --git a/youtube_dl/extractor/aol.py b/youtube_dl/extractor/aol.py index 42c21bf41..b50f454ee 100644 --- a/youtube_dl/extractor/aol.py +++ b/youtube_dl/extractor/aol.py @@ -12,7 +12,7 @@ from ..utils import ( class AolIE(InfoExtractor): IE_NAME = 'on.aol.com' - _VALID_URL = r'(?:aol-video:|https?://on\.aol\.com/(?:[^/]+/)*(?:[^/?#&]+-)?)(?P[^/?#&]+)' + _VALID_URL = r'(?:aol-video:|https?://(?:(?:www|on)\.)?aol\.com/(?:[^/]+/)*(?:[^/?#&]+-)?)(?P[^/?#&]+)' _TESTS = [{ # video with 5min ID @@ -33,7 +33,7 @@ class AolIE(InfoExtractor): } }, { # video with vidible ID - 'url': 'http://on.aol.com/video/netflix-is-raising-rates-5707d6b8e4b090497b04f706?context=PC:homepage:PL1944:1460189336183', + 'url': 'http://www.aol.com/video/view/netflix-is-raising-rates/5707d6b8e4b090497b04f706/', 'info_dict': { 'id': '5707d6b8e4b090497b04f706', 'ext': 'mp4', @@ -108,26 +108,3 @@ class AolIE(InfoExtractor): 'uploader': video_data.get('videoOwner'), 'formats': formats, } - - -class AolFeaturesIE(InfoExtractor): - IE_NAME = 'features.aol.com' - _VALID_URL = r'https?://features\.aol\.com/video/(?P[^/?#]+)' - - _TESTS = [{ - 'url': 'http://features.aol.com/video/behind-secret-second-careers-late-night-talk-show-hosts', - 'md5': '7db483bb0c09c85e241f84a34238cc75', - 'info_dict': { - 'id': '519507715', - 'ext': 'mp4', - 'title': 'What To Watch - February 17, 2016', - }, - 'add_ie': ['FiveMin'], - }] - - def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - return self.url_result(self._search_regex( - r'', - webpage, 'info json', flags=re.DOTALL)) + _DEFAULT_BITRATES = (48, 150, 320, 496, 864, 2240, 3264) - youtube_id = info.get('youtubeId') + def _real_extract(self, url): + site, display_id, video_id = re.match(self._VALID_URL, url).groups() + + if not video_id: + webpage = self._download_webpage(url, display_id) + video_id = self._search_regex( + (r'src=["\']/embed/(\d+)', r'data-video-content-id=["\'](\d+)'), + webpage, 'video id') + + webpage = self._download_webpage( + 'http://www.%s.com/embed/%s' % (site, video_id), + display_id, 'Downloading video embed page') + embed_vars = self._parse_json( + self._search_regex( + r'(?s)embedVars\s*=\s*({.+?})\s*', webpage, 'embed vars'), + display_id) + + youtube_id = embed_vars.get('youtubeId') if youtube_id: return self.url_result(youtube_id, 'Youtube') - formats = [{ - 'url': media['uri'] + '?' + info['AuthToken'], - 'tbr': media['bitRate'], - 'width': media['width'], - 'height': media['height'], - } for media in info['media'] if media.get('mediaPurpose') == 'play'] + title = embed_vars['contentName'] - if not formats: + formats = [] + bitrates = [] + for f in embed_vars.get('media', []): + if not f.get('uri') or f.get('mediaPurpose') != 'play': + continue + bitrate = int_or_none(f.get('bitRate')) + if bitrate: + bitrates.append(bitrate) formats.append({ - 'url': info['videoUri'] + 'url': f['uri'], + 'format_id': 'http-%d' % bitrate if bitrate else 'http', + 'width': int_or_none(f.get('width')), + 'height': int_or_none(f.get('height')), + 'tbr': bitrate, + 'format': 'mp4', }) - self._sort_formats(formats) + if not bitrates: + # When subscriptionLevel > 0, i.e. plus subscription is required + # media list will be empty. However, hds and hls uris are still + # available. We can grab them assuming bitrates to be default. + bitrates = self._DEFAULT_BITRATES - duration = int_or_none(info.get('videoLengthInSeconds')) - age_limit = parse_age_limit(info.get('audienceRating')) + auth_token = embed_vars.get('AuthToken') + + def construct_manifest_url(base_url, ext): + pieces = [base_url] + pieces.extend([compat_str(b) for b in bitrates]) + pieces.append('_kbps.mp4.%s?%s' % (ext, auth_token)) + return ','.join(pieces) + + if bitrates and auth_token: + hds_url = embed_vars.get('hdsUri') + if hds_url: + formats.extend(self._extract_f4m_formats( + construct_manifest_url(hds_url, 'f4m'), + display_id, f4m_id='hds', fatal=False)) + hls_url = embed_vars.get('hlsUri') + if hls_url: + formats.extend(self._extract_m3u8_formats( + construct_manifest_url(hls_url, 'm3u8'), + display_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) + self._sort_formats(formats) return { 'id': video_id, - 'title': info['contentName'], - 'thumbnail': info['thumbUri'], - 'duration': duration, - 'age_limit': age_limit, + 'display_id': display_id, + 'title': title, + 'thumbnail': embed_vars.get('thumbUri'), + 'duration': int_or_none(embed_vars.get('videoLengthInSeconds')) or None, + 'age_limit': parse_age_limit(embed_vars.get('audienceRating')), + 'tags': embed_vars.get('tags', '').split(','), 'formats': formats, } diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index aeb22be16..5c6e99da1 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -1,4 +1,4 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals import re @@ -179,7 +179,7 @@ class BrightcoveLegacyIE(InfoExtractor): params = {} - playerID = find_param('playerID') + playerID = find_param('playerID') or find_param('playerId') if playerID is None: raise ExtractorError('Cannot find player ID') params['playerID'] = playerID @@ -204,7 +204,7 @@ class BrightcoveLegacyIE(InfoExtractor): # // build Brightcove XML # } m = re.search( - r'''(?x)customBC.\createVideo\( + r'''(?x)customBC\.createVideo\( .*? # skipping width and height ["\'](?P\d+)["\']\s*,\s* # playerID ["\'](?PAQ[^"\']{48})[^"\']*["\']\s*,\s* # playerKey begins with AQ and is 50 characters @@ -232,13 +232,16 @@ class BrightcoveLegacyIE(InfoExtractor): """Return a list of all Brightcove URLs from the webpage """ url_m = re.search( - r']+ + content=([\'"])(?Phttps?://(?:secure|c)\.brightcove.com/(?:(?!\2).)+)\2 + ''', webpage) if url_m: - url = unescapeHTML(url_m.group(1)) + url = unescapeHTML(url_m.group('url')) # Some sites don't add it, we can't download with this url, for example: # http://www.ktvu.com/videos/news/raw-video-caltrain-releases-video-of-man-almost/vCTZdY/ - if 'playerKey' in url or 'videoId' in url: + if 'playerKey' in url or 'videoId' in url or 'idVideo' in url: return [url] matches = re.findall( @@ -259,7 +262,7 @@ class BrightcoveLegacyIE(InfoExtractor): url, smuggled_data = unsmuggle_url(url, {}) # Change the 'videoId' and others field to '@videoPlayer' - url = re.sub(r'(?<=[?&])(videoI(d|D)|bctid)', '%40videoPlayer', url) + url = re.sub(r'(?<=[?&])(videoI(d|D)|idVideo|bctid)', '%40videoPlayer', url) # Change bckey (used by bcove.me urls) to playerKey url = re.sub(r'(?<=[?&])bckey', 'playerKey', url) mobj = re.match(self._VALID_URL, url) @@ -548,7 +551,7 @@ class BrightcoveNewIE(InfoExtractor): container = source.get('container') ext = mimetype2ext(source.get('type')) src = source.get('src') - if ext == 'ism': + if ext == 'ism' or container == 'WVM': continue elif ext == 'm3u8' or container == 'M2TS': if not src: @@ -621,15 +624,21 @@ class BrightcoveNewIE(InfoExtractor): 'url': text_track['src'], }) + is_live = False + duration = float_or_none(json_data.get('duration'), 1000) + if duration and duration < 0: + is_live = True + return { 'id': video_id, - 'title': title, + 'title': self._live_title(title) if is_live else title, 'description': clean_html(json_data.get('description')), 'thumbnail': json_data.get('thumbnail') or json_data.get('poster'), - 'duration': float_or_none(json_data.get('duration'), 1000), + 'duration': duration, 'timestamp': parse_iso8601(json_data.get('published_at')), 'uploader_id': account_id, 'formats': formats, 'subtitles': subtitles, 'tags': json_data.get('tags', []), + 'is_live': is_live, } diff --git a/youtube_dl/extractor/byutv.py b/youtube_dl/extractor/byutv.py index 3aec601f8..8ef089653 100644 --- a/youtube_dl/extractor/byutv.py +++ b/youtube_dl/extractor/byutv.py @@ -1,6 +1,5 @@ from __future__ import unicode_literals -import json import re from .common import InfoExtractor @@ -8,44 +7,87 @@ from ..utils import ExtractorError class BYUtvIE(InfoExtractor): - _VALID_URL = r'^https?://(?:www\.)?byutv.org/watch/[0-9a-f-]+/(?P[^/?#]+)' - _TEST = { + _VALID_URL = r'https?://(?:www\.)?byutv\.org/watch/(?!event/)(?P[0-9a-f-]+)(?:/(?P[^/?#&]+))?' + _TESTS = [{ 'url': 'http://www.byutv.org/watch/6587b9a3-89d2-42a6-a7f7-fd2f81840a7d/studio-c-season-5-episode-5', - 'md5': '05850eb8c749e2ee05ad5a1c34668493', 'info_dict': { - 'id': 'studio-c-season-5-episode-5', + 'id': '6587b9a3-89d2-42a6-a7f7-fd2f81840a7d', + 'display_id': 'studio-c-season-5-episode-5', 'ext': 'mp4', - 'description': 'md5:e07269172baff037f8e8bf9956bc9747', 'title': 'Season 5 Episode 5', - 'thumbnail': 're:^https?://.*\.jpg$', + 'description': 'md5:e07269172baff037f8e8bf9956bc9747', + 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 1486.486, }, 'params': { 'skip_download': True, }, 'add_ie': ['Ooyala'], + }, { + 'url': 'http://www.byutv.org/watch/6587b9a3-89d2-42a6-a7f7-fd2f81840a7d', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + display_id = mobj.group('display_id') or video_id + + webpage = self._download_webpage(url, display_id) + episode_code = self._search_regex( + r'(?s)episode:(.*?\}),\s*\n', webpage, 'episode information') + + ep = self._parse_json( + episode_code, display_id, transform_source=lambda s: + re.sub(r'(\n\s+)([a-zA-Z]+):\s+\'(.*?)\'', r'\1"\2": "\3"', s)) + + if ep['providerType'] != 'Ooyala': + raise ExtractorError('Unsupported provider %s' % ep['provider']) + + return { + '_type': 'url_transparent', + 'ie_key': 'Ooyala', + 'url': 'ooyala:%s' % ep['providerId'], + 'id': video_id, + 'display_id': display_id, + 'title': ep['title'], + 'description': ep.get('description'), + 'thumbnail': ep.get('imageThumbnail'), + } + + +class BYUtvEventIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?byutv\.org/watch/event/(?P[0-9a-f-]+)' + _TEST = { + 'url': 'http://www.byutv.org/watch/event/29941b9b-8bf6-48d2-aebf-7a87add9e34b', + 'info_dict': { + 'id': '29941b9b-8bf6-48d2-aebf-7a87add9e34b', + 'ext': 'mp4', + 'title': 'Toledo vs. BYU (9/30/16)', + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': ['Ooyala'], } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('video_id') + video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - episode_code = self._search_regex( - r'(?s)episode:(.*?\}),\s*\n', webpage, 'episode information') - episode_json = re.sub( - r'(\n\s+)([a-zA-Z]+):\s+\'(.*?)\'', r'\1"\2": "\3"', episode_code) - ep = json.loads(episode_json) - if ep['providerType'] == 'Ooyala': - return { - '_type': 'url_transparent', - 'ie_key': 'Ooyala', - 'url': 'ooyala:%s' % ep['providerId'], - 'id': video_id, - 'title': ep['title'], - 'description': ep.get('description'), - 'thumbnail': ep.get('imageThumbnail'), - } - else: - raise ExtractorError('Unsupported provider %s' % ep['provider']) + ooyala_id = self._search_regex( + r'providerId\s*:\s*(["\'])(?P(?:(?!\1).)+)\1', + webpage, 'ooyala id', group='id') + + title = self._search_regex( + r'class=["\']description["\'][^>]*>\s*

([^<]+)

', webpage, + 'title').strip() + + return { + '_type': 'url_transparent', + 'ie_key': 'Ooyala', + 'url': 'ooyala:%s' % ooyala_id, + 'id': video_id, + 'title': title, + } diff --git a/youtube_dl/extractor/camdemy.py b/youtube_dl/extractor/camdemy.py index 268c34392..8f0c6c545 100644 --- a/youtube_dl/extractor/camdemy.py +++ b/youtube_dl/extractor/camdemy.py @@ -26,7 +26,7 @@ class CamdemyIE(InfoExtractor): 'id': '5181', 'ext': 'mp4', 'title': 'Ch1-1 Introduction, Signals (02-23-2012)', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'creator': 'ss11spring', 'duration': 1591, 'upload_date': '20130114', @@ -41,7 +41,7 @@ class CamdemyIE(InfoExtractor): 'id': '13885', 'ext': 'mp4', 'title': 'EverCam + Camdemy QuickStart', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'description': 'md5:2a9f989c2b153a2342acee579c6e7db6', 'creator': 'evercam', 'duration': 318, @@ -112,7 +112,7 @@ class CamdemyIE(InfoExtractor): class CamdemyFolderIE(InfoExtractor): - _VALID_URL = r'https?://www.camdemy.com/folder/(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?camdemy\.com/folder/(?P\d+)' _TESTS = [{ # links with trailing slash 'url': 'http://www.camdemy.com/folder/450', diff --git a/youtube_dl/extractor/canalplus.py b/youtube_dl/extractor/canalplus.py index 61463f249..b3f76a7b1 100644 --- a/youtube_dl/extractor/canalplus.py +++ b/youtube_dl/extractor/canalplus.py @@ -1,4 +1,4 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals import re @@ -6,11 +6,13 @@ import re from .common import InfoExtractor from ..compat import compat_urllib_parse_urlparse from ..utils import ( + dict_get, ExtractorError, HEADRequest, - unified_strdate, - qualities, int_or_none, + qualities, + remove_end, + unified_strdate, ) @@ -23,6 +25,7 @@ class CanalplusIE(InfoExtractor): (?:(?:www|m)\.)?canalplus\.fr| (?:www\.)?piwiplus\.fr| (?:www\.)?d8\.tv| + (?:www\.)?c8\.fr| (?:www\.)?d17\.tv| (?:www\.)?itele\.fr )/(?:(?:[^/]+/)*(?P[^/?#&]+))?(?:\?.*\bvid=(?P\d+))?| @@ -35,53 +38,53 @@ class CanalplusIE(InfoExtractor): 'canalplus': 'cplus', 'piwiplus': 'teletoon', 'd8': 'd8', + 'c8': 'd8', 'd17': 'd17', 'itele': 'itele', } _TESTS = [{ 'url': 'http://www.canalplus.fr/c-emissions/pid1830-c-zapping.html?vid=1192814', - 'md5': '41f438a4904f7664b91b4ed0dec969dc', 'info_dict': { - 'id': '1192814', + 'id': '1405510', + 'display_id': 'pid1830-c-zapping', 'ext': 'mp4', - 'title': "L'Année du Zapping 2014 - L'Année du Zapping 2014", - 'description': "Toute l'année 2014 dans un Zapping exceptionnel !", - 'upload_date': '20150105', + 'title': 'Zapping - 02/07/2016', + 'description': 'Le meilleur de toutes les chaînes, tous les jours', + 'upload_date': '20160702', }, }, { 'url': 'http://www.piwiplus.fr/videos-piwi/pid1405-le-labyrinthe-boing-super-ranger.html?vid=1108190', 'info_dict': { 'id': '1108190', - 'ext': 'flv', - 'title': 'Le labyrinthe - Boing super ranger', + 'display_id': 'pid1405-le-labyrinthe-boing-super-ranger', + 'ext': 'mp4', + 'title': 'BOING SUPER RANGER - Ep : Le labyrinthe', 'description': 'md5:4cea7a37153be42c1ba2c1d3064376ff', 'upload_date': '20140724', }, 'skip': 'Only works from France', }, { - 'url': 'http://www.d8.tv/d8-docs-mags/pid5198-d8-en-quete-d-actualite.html?vid=1390231', + 'url': 'http://www.c8.fr/c8-divertissement/ms-touche-pas-a-mon-poste/pid6318-videos-integrales.html', + 'md5': '4b47b12b4ee43002626b97fad8fb1de5', 'info_dict': { - 'id': '1390231', + 'id': '1420213', + 'display_id': 'pid6318-videos-integrales', 'ext': 'mp4', - 'title': "Vacances pas chères : prix discount ou grosses dépenses ? - En quête d'actualité", - 'description': 'md5:edb6cf1cb4a1e807b5dd089e1ac8bfc6', - 'upload_date': '20160512', - }, - 'params': { - 'skip_download': True, + 'title': 'TPMP ! Même le matin - Les 35H de Baba - 14/10/2016', + 'description': 'md5:f96736c1b0ffaa96fd5b9e60ad871799', + 'upload_date': '20161014', }, + 'skip': 'Only works from France', }, { - 'url': 'http://www.itele.fr/chroniques/invite-bruce-toussaint/thierry-solere-nicolas-sarkozy-officialisera-sa-candidature-a-la-primaire-quand-il-le-voudra-167224', + 'url': 'http://www.itele.fr/chroniques/invite-michael-darmon/rachida-dati-nicolas-sarkozy-est-le-plus-en-phase-avec-les-inquietudes-des-francais-171510', 'info_dict': { - 'id': '1398334', + 'id': '1420176', + 'display_id': 'rachida-dati-nicolas-sarkozy-est-le-plus-en-phase-avec-les-inquietudes-des-francais-171510', 'ext': 'mp4', - 'title': "L'invité de Bruce Toussaint du 07/06/2016 - ", - 'description': 'md5:40ac7c9ad0feaeb6f605bad986f61324', - 'upload_date': '20160607', - }, - 'params': { - 'skip_download': True, + 'title': 'L\'invité de Michaël Darmon du 14/10/2016 - ', + 'description': 'Chaque matin du lundi au vendredi, Michaël Darmon reçoit un invité politique à 8h25.', + 'upload_date': '20161014', }, }, { 'url': 'http://m.canalplus.fr/?vid=1398231', @@ -93,18 +96,18 @@ class CanalplusIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - video_id = mobj.groupdict().get('id') or mobj.groupdict().get('vid') site_id = self._SITE_ID_MAP[compat_urllib_parse_urlparse(url).netloc.rsplit('.', 2)[-2]] # Beware, some subclasses do not define an id group - display_id = mobj.group('display_id') or video_id + display_id = remove_end(dict_get(mobj.groupdict(), ('display_id', 'id', 'vid')), '.html') - if video_id is None: - webpage = self._download_webpage(url, display_id) - video_id = self._search_regex( - [r']+?videoId=(["\'])(?P\d+)', r'id=["\']canal_video_player(?P\d+)'], - webpage, 'video id', group='id') + webpage = self._download_webpage(url, display_id) + video_id = self._search_regex( + [r']+?videoId=(["\'])(?P\d+)', + r'id=["\']canal_video_player(?P\d+)', + r'data-video=["\'](?P\d+)'], + webpage, 'video id', default=mobj.group('vid'), group='id') info_url = self._VIDEO_INFO_TEMPLATE % (site_id, video_id) video_data = self._download_json(info_url, video_id, 'Downloading video JSON') diff --git a/youtube_dl/extractor/canvas.py b/youtube_dl/extractor/canvas.py index ec6d24d96..544c6657c 100644 --- a/youtube_dl/extractor/canvas.py +++ b/youtube_dl/extractor/canvas.py @@ -1,11 +1,13 @@ from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..utils import float_or_none class CanvasIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?canvas\.be/video/(?:[^/]+/)*(?P[^/?#&]+)' + _VALID_URL = r'https?://(?:www\.)?(?Pcanvas|een)\.be/(?:[^/]+/)*(?P[^/?#&]+)' _TESTS = [{ 'url': 'http://www.canvas.be/video/de-afspraak/najaar-2015/de-afspraak-veilt-voor-de-warmste-week', 'md5': 'ea838375a547ac787d4064d8c7860a6c', @@ -15,7 +17,7 @@ class CanvasIE(InfoExtractor): 'ext': 'mp4', 'title': 'De afspraak veilt voor de Warmste Week', 'description': 'md5:24cb860c320dc2be7358e0e5aa317ba6', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 49.02, } }, { @@ -27,7 +29,7 @@ class CanvasIE(InfoExtractor): 'ext': 'mp4', 'title': 'Pieter 0167', 'description': 'md5:943cd30f48a5d29ba02c3a104dc4ec4e', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 2553.08, 'subtitles': { 'nl': [{ @@ -38,22 +40,42 @@ class CanvasIE(InfoExtractor): 'params': { 'skip_download': True, } + }, { + 'url': 'https://www.een.be/sorry-voor-alles/herbekijk-sorry-voor-alles', + 'info_dict': { + 'id': 'mz-ast-11a587f8-b921-4266-82e2-0bce3e80d07f', + 'display_id': 'herbekijk-sorry-voor-alles', + 'ext': 'mp4', + 'title': 'Herbekijk Sorry voor alles', + 'description': 'md5:8bb2805df8164e5eb95d6a7a29dc0dd3', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 3788.06, + }, + 'params': { + 'skip_download': True, + } + }, { + 'url': 'https://www.canvas.be/check-point/najaar-2016/de-politie-uw-vriend', + 'only_matching': True, }] def _real_extract(self, url): - display_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + site_id, display_id = mobj.group('site_id'), mobj.group('id') webpage = self._download_webpage(url, display_id) - title = self._search_regex( + title = (self._search_regex( r']+class="video__body__header__title"[^>]*>(.+?)', - webpage, 'title', default=None) or self._og_search_title(webpage) + webpage, 'title', default=None) or self._og_search_title( + webpage)).strip() video_id = self._html_search_regex( - r'data-video=(["\'])(?P.+?)\1', webpage, 'video id', group='id') + r'data-video=(["\'])(?P(?:(?!\1).)+)\1', webpage, 'video id', group='id') data = self._download_json( - 'https://mediazone.vrt.be/api/v1/canvas/assets/%s' % video_id, display_id) + 'https://mediazone.vrt.be/api/v1/%s/assets/%s' + % (site_id, video_id), display_id) formats = [] for target in data['targetUrls']: @@ -67,6 +89,9 @@ class CanvasIE(InfoExtractor): elif format_type == 'HDS': formats.extend(self._extract_f4m_formats( format_url, display_id, f4m_id=format_type, fatal=False)) + elif format_type == 'MPEG_DASH': + formats.extend(self._extract_mpd_formats( + format_url, display_id, mpd_id=format_type, fatal=False)) else: formats.append({ 'format_id': format_type, diff --git a/youtube_dl/extractor/carambatv.py b/youtube_dl/extractor/carambatv.py index 5797fb951..9ba909a91 100644 --- a/youtube_dl/extractor/carambatv.py +++ b/youtube_dl/extractor/carambatv.py @@ -9,6 +9,8 @@ from ..utils import ( try_get, ) +from .videomore import VideomoreIE + class CarambaTVIE(InfoExtractor): _VALID_URL = r'(?:carambatv:|https?://video1\.carambatv\.ru/v/)(?P\d+)' @@ -19,7 +21,7 @@ class CarambaTVIE(InfoExtractor): 'id': '191910501', 'ext': 'mp4', 'title': '[BadComedian] - Разборка в Маниле (Абсолютный обзор)', - 'thumbnail': 're:^https?://.*\.jpg', + 'thumbnail': r're:^https?://.*\.jpg', 'duration': 2678.31, }, }, { @@ -62,14 +64,16 @@ class CarambaTVPageIE(InfoExtractor): _VALID_URL = r'https?://carambatv\.ru/(?:[^/]+/)+(?P[^/?#&]+)' _TEST = { 'url': 'http://carambatv.ru/movie/bad-comedian/razborka-v-manile/', - 'md5': '', + 'md5': 'a49fb0ec2ad66503eeb46aac237d3c86', 'info_dict': { - 'id': '191910501', - 'ext': 'mp4', + 'id': '475222', + 'ext': 'flv', 'title': '[BadComedian] - Разборка в Маниле (Абсолютный обзор)', - 'thumbnail': 're:^https?://.*\.jpg$', - 'duration': 2678.31, + 'thumbnail': r're:^https?://.*\.jpg', + # duration reported by videomore is incorrect + 'duration': int, }, + 'add_ie': [VideomoreIE.ie_key()], } def _real_extract(self, url): @@ -77,6 +81,16 @@ class CarambaTVPageIE(InfoExtractor): webpage = self._download_webpage(url, video_id) + videomore_url = VideomoreIE._extract_url(webpage) + if videomore_url: + title = self._og_search_title(webpage) + return { + '_type': 'url_transparent', + 'url': videomore_url, + 'ie_key': VideomoreIE.ie_key(), + 'title': title, + } + video_url = self._og_search_property('video:iframe', webpage, default=None) if not video_url: diff --git a/youtube_dl/extractor/cartoonnetwork.py b/youtube_dl/extractor/cartoonnetwork.py new file mode 100644 index 000000000..086ec90c9 --- /dev/null +++ b/youtube_dl/extractor/cartoonnetwork.py @@ -0,0 +1,42 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .turner import TurnerBaseIE + + +class CartoonNetworkIE(TurnerBaseIE): + _VALID_URL = r'https?://(?:www\.)?cartoonnetwork\.com/video/(?:[^/]+/)+(?P[^/?#]+)-(?:clip|episode)\.html' + _TEST = { + 'url': 'http://www.cartoonnetwork.com/video/teen-titans-go/starfire-the-cat-lady-clip.html', + 'info_dict': { + 'id': '8a250ab04ed07e6c014ef3f1e2f9016c', + 'ext': 'mp4', + 'title': 'Starfire the Cat Lady', + 'description': 'Robin decides to become a cat so that Starfire will finally love him.', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + } + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + id_type, video_id = re.search(r"_cnglobal\.cvp(Video|Title)Id\s*=\s*'([^']+)';", webpage).groups() + query = ('id' if id_type == 'Video' else 'titleId') + '=' + video_id + return self._extract_cvp_info( + 'http://www.cartoonnetwork.com/video-seo-svc/episodeservices/getCvpPlaylist?networkName=CN2&' + query, video_id, { + 'secure': { + 'media_src': 'http://androidhls-secure.cdn.turner.com/toon/big', + 'tokenizer_src': 'http://www.cartoonnetwork.com/cntv/mvpd/processors/services/token_ipadAdobe.do', + }, + }, { + 'url': url, + 'site_name': 'CartoonNetwork', + 'auth_required': self._search_regex( + r'_cnglobal\.cvpFullOrPreviewAuth\s*=\s*(true|false);', + webpage, 'auth required', default='false') == 'true', + }) diff --git a/youtube_dl/extractor/cbc.py b/youtube_dl/extractor/cbc.py index a87e97140..cf678e7f8 100644 --- a/youtube_dl/extractor/cbc.py +++ b/youtube_dl/extractor/cbc.py @@ -9,10 +9,19 @@ from ..utils import ( js_to_json, smuggle_url, try_get, + xpath_text, + xpath_element, + xpath_with_ns, + find_xpath_attr, + parse_iso8601, + parse_age_limit, + int_or_none, + ExtractorError, ) class CBCIE(InfoExtractor): + IE_NAME = 'cbc.ca' _VALID_URL = r'https?://(?:www\.)?cbc\.ca/(?!player/)(?:[^/]+/)+(?P[^/?#]+)' _TESTS = [{ # with mediaId @@ -81,39 +90,53 @@ class CBCIE(InfoExtractor): }, }], 'skip': 'Geo-restricted to Canada', + }, { + # multiple CBC.APP.Caffeine.initInstance(...) + 'url': 'http://www.cbc.ca/news/canada/calgary/dog-indoor-exercise-winter-1.3928238', + 'info_dict': { + 'title': 'Keep Rover active during the deep freeze with doggie pushups and other fun indoor tasks', + 'id': 'dog-indoor-exercise-winter-1.3928238', + }, + 'playlist_mincount': 6, }] @classmethod def suitable(cls, url): return False if CBCPlayerIE.suitable(url) else super(CBCIE, cls).suitable(url) + def _extract_player_init(self, player_init, display_id): + player_info = self._parse_json(player_init, display_id, js_to_json) + media_id = player_info.get('mediaId') + if not media_id: + clip_id = player_info['clipId'] + feed = self._download_json( + 'http://tpfeed.cbc.ca/f/ExhSPC/vms_5akSXx4Ng_Zn?byCustomValue={:mpsReleases}{%s}' % clip_id, + clip_id, fatal=False) + if feed: + media_id = try_get(feed, lambda x: x['entries'][0]['guid'], compat_str) + if not media_id: + media_id = self._download_json( + 'http://feed.theplatform.com/f/h9dtGB/punlNGjMlc1F?fields=id&byContent=byReleases%3DbyId%253D' + clip_id, + clip_id)['entries'][0]['id'].split('/')[-1] + return self.url_result('cbcplayer:%s' % media_id, 'CBCPlayer', media_id) + def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - player_init = self._search_regex( - r'CBC\.APP\.Caffeine\.initInstance\(({.+?})\);', webpage, 'player init', - default=None) - if player_init: - player_info = self._parse_json(player_init, display_id, js_to_json) - media_id = player_info.get('mediaId') - if not media_id: - clip_id = player_info['clipId'] - feed = self._download_json( - 'http://tpfeed.cbc.ca/f/ExhSPC/vms_5akSXx4Ng_Zn?byCustomValue={:mpsReleases}{%s}' % clip_id, - clip_id, fatal=False) - if feed: - media_id = try_get(feed, lambda x: x['entries'][0]['guid'], compat_str) - if not media_id: - media_id = self._download_json( - 'http://feed.theplatform.com/f/h9dtGB/punlNGjMlc1F?fields=id&byContent=byReleases%3DbyId%253D' + clip_id, - clip_id)['entries'][0]['id'].split('/')[-1] - return self.url_result('cbcplayer:%s' % media_id, 'CBCPlayer', media_id) - else: - entries = [self.url_result('cbcplayer:%s' % media_id, 'CBCPlayer', media_id) for media_id in re.findall(r']+src="[^"]+?mediaId=(\d+)"', webpage)] - return self.playlist_result(entries) + entries = [ + self._extract_player_init(player_init, display_id) + for player_init in re.findall(r'CBC\.APP\.Caffeine\.initInstance\(({.+?})\);', webpage)] + entries.extend([ + self.url_result('cbcplayer:%s' % media_id, 'CBCPlayer', media_id) + for media_id in re.findall(r']+src="[^"]+?mediaId=(\d+)"', webpage)]) + return self.playlist_result( + entries, display_id, + self._og_search_title(webpage, fatal=False), + self._og_search_description(webpage)) class CBCPlayerIE(InfoExtractor): + IE_NAME = 'cbc.ca:player' _VALID_URL = r'(?:cbcplayer:|https?://(?:www\.)?cbc\.ca/(?:player/play/|i/caffeine/syndicate/\?mediaId=))(?P\d+)' _TESTS = [{ 'url': 'http://www.cbc.ca/player/play/2683190193', @@ -167,3 +190,166 @@ class CBCPlayerIE(InfoExtractor): }), 'id': video_id, } + + +class CBCWatchBaseIE(InfoExtractor): + _device_id = None + _device_token = None + _API_BASE_URL = 'https://api-cbc.cloud.clearleap.com/cloffice/client/' + _NS_MAP = { + 'media': 'http://search.yahoo.com/mrss/', + 'clearleap': 'http://www.clearleap.com/namespace/clearleap/1.0/', + } + + def _call_api(self, path, video_id): + url = path if path.startswith('http') else self._API_BASE_URL + path + result = self._download_xml(url, video_id, headers={ + 'X-Clearleap-DeviceId': self._device_id, + 'X-Clearleap-DeviceToken': self._device_token, + }) + error_message = xpath_text(result, 'userMessage') or xpath_text(result, 'systemMessage') + if error_message: + raise ExtractorError('%s said: %s' % (self.IE_NAME, error_message)) + return result + + def _real_initialize(self): + if not self._device_id or not self._device_token: + device = self._downloader.cache.load('cbcwatch', 'device') or {} + self._device_id, self._device_token = device.get('id'), device.get('token') + if not self._device_id or not self._device_token: + result = self._download_xml( + self._API_BASE_URL + 'device/register', + None, data=b'web') + self._device_id = xpath_text(result, 'deviceId', fatal=True) + self._device_token = xpath_text(result, 'deviceToken', fatal=True) + self._downloader.cache.store( + 'cbcwatch', 'device', { + 'id': self._device_id, + 'token': self._device_token, + }) + + def _parse_rss_feed(self, rss): + channel = xpath_element(rss, 'channel', fatal=True) + + def _add_ns(path): + return xpath_with_ns(path, self._NS_MAP) + + entries = [] + for item in channel.findall('item'): + guid = xpath_text(item, 'guid', fatal=True) + title = xpath_text(item, 'title', fatal=True) + + media_group = xpath_element(item, _add_ns('media:group'), fatal=True) + content = xpath_element(media_group, _add_ns('media:content'), fatal=True) + content_url = content.attrib['url'] + + thumbnails = [] + for thumbnail in media_group.findall(_add_ns('media:thumbnail')): + thumbnail_url = thumbnail.get('url') + if not thumbnail_url: + continue + thumbnails.append({ + 'id': thumbnail.get('profile'), + 'url': thumbnail_url, + 'width': int_or_none(thumbnail.get('width')), + 'height': int_or_none(thumbnail.get('height')), + }) + + timestamp = None + release_date = find_xpath_attr( + item, _add_ns('media:credit'), 'role', 'releaseDate') + if release_date is not None: + timestamp = parse_iso8601(release_date.text) + + entries.append({ + '_type': 'url_transparent', + 'url': content_url, + 'id': guid, + 'title': title, + 'description': xpath_text(item, 'description'), + 'timestamp': timestamp, + 'duration': int_or_none(content.get('duration')), + 'age_limit': parse_age_limit(xpath_text(item, _add_ns('media:rating'))), + 'episode': xpath_text(item, _add_ns('clearleap:episode')), + 'episode_number': int_or_none(xpath_text(item, _add_ns('clearleap:episodeInSeason'))), + 'series': xpath_text(item, _add_ns('clearleap:series')), + 'season_number': int_or_none(xpath_text(item, _add_ns('clearleap:season'))), + 'thumbnails': thumbnails, + 'ie_key': 'CBCWatchVideo', + }) + + return self.playlist_result( + entries, xpath_text(channel, 'guid'), + xpath_text(channel, 'title'), + xpath_text(channel, 'description')) + + +class CBCWatchVideoIE(CBCWatchBaseIE): + IE_NAME = 'cbc.ca:watch:video' + _VALID_URL = r'https?://api-cbc\.cloud\.clearleap\.com/cloffice/client/web/play/?\?.*?\bcontentId=(?P[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' + + def _real_extract(self, url): + video_id = self._match_id(url) + result = self._call_api(url, video_id) + + m3u8_url = xpath_text(result, 'url', fatal=True) + formats = self._extract_m3u8_formats(re.sub(r'/([^/]+)/[^/?]+\.m3u8', r'/\1/\1.m3u8', m3u8_url), video_id, 'mp4', fatal=False) + if len(formats) < 2: + formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4') + for f in formats: + format_id = f.get('format_id') + if format_id.startswith('AAC'): + f['acodec'] = 'aac' + elif format_id.startswith('AC3'): + f['acodec'] = 'ac-3' + self._sort_formats(formats) + + info = { + 'id': video_id, + 'title': video_id, + 'formats': formats, + } + + rss = xpath_element(result, 'rss') + if rss: + info.update(self._parse_rss_feed(rss)['entries'][0]) + del info['url'] + del info['_type'] + del info['ie_key'] + return info + + +class CBCWatchIE(CBCWatchBaseIE): + IE_NAME = 'cbc.ca:watch' + _VALID_URL = r'https?://watch\.cbc\.ca/(?:[^/]+/)+(?P[0-9a-f-]+)' + _TESTS = [{ + 'url': 'http://watch.cbc.ca/doc-zone/season-6/customer-disservice/38e815a-009e3ab12e4', + 'info_dict': { + 'id': '38e815a-009e3ab12e4', + 'ext': 'mp4', + 'title': 'Customer (Dis)Service', + 'description': 'md5:8bdd6913a0fe03d4b2a17ebe169c7c87', + 'upload_date': '20160219', + 'timestamp': 1455840000, + }, + 'params': { + # m3u8 download + 'skip_download': True, + 'format': 'bestvideo', + }, + 'skip': 'Geo-restricted to Canada', + }, { + 'url': 'http://watch.cbc.ca/arthur/all/1ed4b385-cd84-49cf-95f0-80f004680057', + 'info_dict': { + 'id': '1ed4b385-cd84-49cf-95f0-80f004680057', + 'title': 'Arthur', + 'description': 'Arthur, the sweetest 8-year-old aardvark, and his pals solve all kinds of problems with humour, kindness and teamwork.', + }, + 'playlist_mincount': 30, + 'skip': 'Geo-restricted to Canada', + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + rss = self._call_api('web/browse/' + video_id, video_id) + return self._parse_rss_feed(rss) diff --git a/youtube_dl/extractor/cbs.py b/youtube_dl/extractor/cbs.py index a23173d6f..58f258c54 100644 --- a/youtube_dl/extractor/cbs.py +++ b/youtube_dl/extractor/cbs.py @@ -4,6 +4,9 @@ from .theplatform import ThePlatformFeedIE from ..utils import ( int_or_none, find_xpath_attr, + xpath_element, + xpath_text, + update_url_query, ) @@ -17,19 +20,6 @@ class CBSBaseIE(ThePlatformFeedIE): }] } if closed_caption_e is not None and closed_caption_e.attrib.get('value') else [] - def _extract_video_info(self, filter_query, video_id): - return self._extract_feed_info( - 'dJ5BDC', 'VxxJg8Ymh8sE', filter_query, video_id, lambda entry: { - 'series': entry.get('cbs$SeriesTitle'), - 'season_number': int_or_none(entry.get('cbs$SeasonNumber')), - 'episode': entry.get('cbs$EpisodeTitle'), - 'episode_number': int_or_none(entry.get('cbs$EpisodeNumber')), - }, { - 'StreamPack': { - 'manifest': 'm3u', - } - }) - class CBSIE(CBSBaseIE): _VALID_URL = r'(?:cbs:|https?://(?:www\.)?(?:cbs\.com/shows/[^/]+/video|colbertlateshow\.com/(?:video|podcasts))/)(?P[\w-]+)' @@ -38,7 +28,6 @@ class CBSIE(CBSBaseIE): 'url': 'http://www.cbs.com/shows/garth-brooks/video/_u7W953k6la293J7EPTd9oHkSPs6Xn6_/connect-chat-feat-garth-brooks/', 'info_dict': { 'id': '_u7W953k6la293J7EPTd9oHkSPs6Xn6_', - 'display_id': 'connect-chat-feat-garth-brooks', 'ext': 'mp4', 'title': 'Connect Chat feat. Garth Brooks', 'description': 'Connect with country music singer Garth Brooks, as he chats with fans on Wednesday November 27, 2013. Be sure to tune in to Garth Brooks: Live from Las Vegas, Friday November 29, at 9/8c on CBS!', @@ -47,7 +36,10 @@ class CBSIE(CBSBaseIE): 'upload_date': '20131127', 'uploader': 'CBSI-NEW', }, - 'expected_warnings': ['Failed to download m3u8 information'], + 'params': { + # m3u8 download + 'skip_download': True, + }, '_skip': 'Blocked outside the US', }, { 'url': 'http://colbertlateshow.com/video/8GmB0oY0McANFvp2aEffk9jZZZ2YyXxy/the-colbeard/', @@ -56,8 +48,53 @@ class CBSIE(CBSBaseIE): 'url': 'http://www.colbertlateshow.com/podcasts/dYSwjqPs_X1tvbV_P2FcPWRa_qT6akTC/in-the-bad-room-with-stephen/', 'only_matching': True, }] - TP_RELEASE_URL_TEMPLATE = 'http://link.theplatform.com/s/dJ5BDC/%s?mbr=true' + + def _extract_video_info(self, content_id): + items_data = self._download_xml( + 'http://can.cbs.com/thunder/player/videoPlayerService.php', + content_id, query={'partner': 'cbs', 'contentId': content_id}) + video_data = xpath_element(items_data, './/item') + title = xpath_text(video_data, 'videoTitle', 'title', True) + tp_path = 'dJ5BDC/media/guid/2198311517/%s' % content_id + tp_release_url = 'http://link.theplatform.com/s/' + tp_path + + asset_types = [] + subtitles = {} + formats = [] + for item in items_data.findall('.//item'): + asset_type = xpath_text(item, 'assetType') + if not asset_type or asset_type in asset_types: + continue + asset_types.append(asset_type) + query = { + 'mbr': 'true', + 'assetTypes': asset_type, + } + if asset_type.startswith('HLS') or asset_type in ('OnceURL', 'StreamPack'): + query['formats'] = 'MPEG4,M3U' + elif asset_type in ('RTMP', 'WIFI', '3G'): + query['formats'] = 'MPEG4,FLV' + tp_formats, tp_subtitles = self._extract_theplatform_smil( + update_url_query(tp_release_url, query), content_id, + 'Downloading %s SMIL data' % asset_type) + formats.extend(tp_formats) + subtitles = self._merge_subtitles(subtitles, tp_subtitles) + self._sort_formats(formats) + + info = self._extract_theplatform_metadata(tp_path, content_id) + info.update({ + 'id': content_id, + 'title': title, + 'series': xpath_text(video_data, 'seriesTitle'), + 'season_number': int_or_none(xpath_text(video_data, 'seasonNumber')), + 'episode_number': int_or_none(xpath_text(video_data, 'episodeNumber')), + 'duration': int_or_none(xpath_text(video_data, 'videoLength'), 1000), + 'thumbnail': xpath_text(video_data, 'previewImageURL'), + 'formats': formats, + 'subtitles': subtitles, + }) + return info def _real_extract(self, url): content_id = self._match_id(url) - return self._extract_video_info('byGuid=%s' % content_id, content_id) + return self._extract_video_info(content_id) diff --git a/youtube_dl/extractor/cbsinteractive.py b/youtube_dl/extractor/cbsinteractive.py index 821db20b2..57b18e81d 100644 --- a/youtube_dl/extractor/cbsinteractive.py +++ b/youtube_dl/extractor/cbsinteractive.py @@ -63,7 +63,7 @@ class CBSInteractiveIE(ThePlatformIE): webpage = self._download_webpage(url, display_id) data_json = self._html_search_regex( - r"data-(?:cnet|zdnet)-video(?:-uvp)?-options='([^']+)'", + r"data-(?:cnet|zdnet)-video(?:-uvp(?:js)?)?-options='([^']+)'", webpage, 'data json') data = self._parse_json(data_json, display_id) vdata = data.get('video') or data['videos'][0] diff --git a/youtube_dl/extractor/cbslocal.py b/youtube_dl/extractor/cbslocal.py index 008c5fe32..8d5f11dd1 100644 --- a/youtube_dl/extractor/cbslocal.py +++ b/youtube_dl/extractor/cbslocal.py @@ -4,11 +4,14 @@ from __future__ import unicode_literals from .anvato import AnvatoIE from .sendtonews import SendtoNewsIE from ..compat import compat_urlparse -from ..utils import unified_timestamp +from ..utils import ( + parse_iso8601, + unified_timestamp, +) class CBSLocalIE(AnvatoIE): - _VALID_URL = r'https?://[a-z]+\.cbslocal\.com/\d+/\d+/\d+/(?P[0-9a-z-]+)' + _VALID_URL = r'https?://[a-z]+\.cbslocal\.com/(?:\d+/\d+/\d+|video)/(?P[0-9a-z-]+)' _TESTS = [{ # Anvato backend @@ -22,6 +25,7 @@ class CBSLocalIE(AnvatoIE): 'thumbnail': 're:^https?://.*', 'timestamp': 1463440500, 'upload_date': '20160516', + 'uploader': 'CBS', 'subtitles': { 'en': 'mincount:5', }, @@ -35,23 +39,44 @@ class CBSLocalIE(AnvatoIE): 'Syndication\\Curb.tv', 'Content\\News' ], + 'tags': ['CBS 2 News Evening'], }, }, { # SendtoNews embed 'url': 'http://cleveland.cbslocal.com/2016/05/16/indians-score-season-high-15-runs-in-blowout-win-over-reds-rapid-reaction/', 'info_dict': { 'id': 'GxfCe0Zo7D-175909-5588', - 'ext': 'mp4', - 'title': 'Recap: CLE 15, CIN 6', - 'description': '5/16/16: Indians\' bats explode for 15 runs in a win', - 'upload_date': '20160516', - 'timestamp': 1463433840, - 'duration': 49, }, + 'playlist_count': 9, 'params': { # m3u8 download 'skip_download': True, }, + }, { + 'url': 'http://newyork.cbslocal.com/video/3580809-a-very-blue-anniversary/', + 'info_dict': { + 'id': '3580809', + 'ext': 'mp4', + 'title': 'A Very Blue Anniversary', + 'description': 'CBS2’s Cindy Hsu has more.', + 'thumbnail': 're:^https?://.*', + 'timestamp': 1479962220, + 'upload_date': '20161124', + 'uploader': 'CBS', + 'subtitles': { + 'en': 'mincount:5', + }, + 'categories': [ + 'Stations\\Spoken Word\\WCBSTV', + 'Syndication\\AOL', + 'Syndication\\MSN', + 'Syndication\\NDN', + 'Syndication\\Yahoo', + 'Content\\News', + 'Content\\News\\Local News', + ], + 'tags': ['CBS 2 News Weekends', 'Cindy Hsu', 'Blue Man Group'], + }, }] def _real_extract(self, url): @@ -60,16 +85,18 @@ class CBSLocalIE(AnvatoIE): sendtonews_url = SendtoNewsIE._extract_url(webpage) if sendtonews_url: - info_dict = { - '_type': 'url_transparent', - 'url': compat_urlparse.urljoin(url, sendtonews_url), - } - else: - info_dict = self._extract_anvato_videos(webpage, display_id) + return self.url_result( + compat_urlparse.urljoin(url, sendtonews_url), + ie=SendtoNewsIE.ie_key()) + + info_dict = self._extract_anvato_videos(webpage, display_id) time_str = self._html_search_regex( - r'class="entry-date">([^<]+)<', webpage, 'released date', fatal=False) - timestamp = unified_timestamp(time_str) + r'class="entry-date">([^<]+)<', webpage, 'released date', default=None) + if time_str: + timestamp = unified_timestamp(time_str) + else: + timestamp = parse_iso8601(self._html_search_meta('uploadDate', webpage)) info_dict.update({ 'display_id': display_id, diff --git a/youtube_dl/extractor/cbsnews.py b/youtube_dl/extractor/cbsnews.py index 9328e3e20..17bb9af4f 100644 --- a/youtube_dl/extractor/cbsnews.py +++ b/youtube_dl/extractor/cbsnews.py @@ -1,14 +1,15 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor -from .cbs import CBSBaseIE +from .cbs import CBSIE from ..utils import ( parse_duration, ) -class CBSNewsIE(CBSBaseIE): +class CBSNewsIE(CBSIE): + IE_NAME = 'cbsnews' IE_DESC = 'CBS News' _VALID_URL = r'https?://(?:www\.)?cbsnews\.com/(?:news|videos)/(?P[\da-z_-]+)' @@ -35,9 +36,10 @@ class CBSNewsIE(CBSBaseIE): 'ext': 'mp4', 'title': 'Fort Hood shooting: Army downplays mental illness as cause of attack', 'description': 'md5:4a6983e480542d8b333a947bfc64ddc7', - 'upload_date': '19700101', + 'upload_date': '20140404', + 'timestamp': 1396650660, 'uploader': 'CBSI-NEW', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 205, 'subtitles': { 'en': [{ @@ -63,51 +65,43 @@ class CBSNewsIE(CBSBaseIE): item = video_info['item'] if 'item' in video_info else video_info guid = item['mpxRefId'] - return self._extract_video_info('byGuid=%s' % guid, guid) + return self._extract_video_info(guid) class CBSNewsLiveVideoIE(InfoExtractor): + IE_NAME = 'cbsnews:livevideo' IE_DESC = 'CBS News Live Videos' - _VALID_URL = r'https?://(?:www\.)?cbsnews\.com/live/video/(?P[\da-z_-]+)' + _VALID_URL = r'https?://(?:www\.)?cbsnews\.com/live/video/(?P[^/?#]+)' - _TESTS = [{ + # Live videos get deleted soon. See http://www.cbsnews.com/live/ for the latest examples + _TEST = { 'url': 'http://www.cbsnews.com/live/video/clinton-sanders-prepare-to-face-off-in-nh/', 'info_dict': { 'id': 'clinton-sanders-prepare-to-face-off-in-nh', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Clinton, Sanders Prepare To Face Off In NH', 'duration': 334, }, - 'skip': 'Video gone, redirected to http://www.cbsnews.com/live/', - }, { - 'url': 'http://www.cbsnews.com/live/video/video-shows-intense-paragliding-accident/', - 'info_dict': { - 'id': 'video-shows-intense-paragliding-accident', - 'ext': 'flv', - 'title': 'Video Shows Intense Paragliding Accident', - }, - }] + 'skip': 'Video gone', + } def _real_extract(self, url): - video_id = self._match_id(url) + display_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + video_info = self._download_json( + 'http://feeds.cbsn.cbsnews.com/rundown/story', display_id, query={ + 'device': 'desktop', + 'dvr_slug': display_id, + }) - video_info = self._parse_json(self._html_search_regex( - r'data-story-obj=\'({.+?})\'', webpage, 'video JSON info'), video_id)['story'] - - hdcore_sign = 'hdcore=3.3.1' - f4m_formats = self._extract_f4m_formats(video_info['url'] + '&' + hdcore_sign, video_id) - if f4m_formats: - for entry in f4m_formats: - # URLs without the extra param induce an 404 error - entry.update({'extra_param_to_segment_url': hdcore_sign}) - self._sort_formats(f4m_formats) + formats = self._extract_akamai_formats(video_info['url'], display_id) + self._sort_formats(formats) return { - 'id': video_id, + 'id': display_id, + 'display_id': display_id, 'title': video_info['headline'], 'thumbnail': video_info.get('thumbnail_url_hd') or video_info.get('thumbnail_url_sd'), 'duration': parse_duration(video_info.get('segmentDur')), - 'formats': f4m_formats, + 'formats': formats, } diff --git a/youtube_dl/extractor/cbssports.py b/youtube_dl/extractor/cbssports.py index 78ca44b02..3a62c840b 100644 --- a/youtube_dl/extractor/cbssports.py +++ b/youtube_dl/extractor/cbssports.py @@ -4,7 +4,7 @@ from .cbs import CBSBaseIE class CBSSportsIE(CBSBaseIE): - _VALID_URL = r'https?://www\.cbssports\.com/video/player/[^/]+/(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?cbssports\.com/video/player/[^/]+/(?P\d+)' _TESTS = [{ 'url': 'http://www.cbssports.com/video/player/videos/708337219968/0/ben-simmons-the-next-lebron?-not-so-fast', @@ -23,6 +23,9 @@ class CBSSportsIE(CBSBaseIE): } }] + def _extract_video_info(self, filter_query, video_id): + return self._extract_feed_info('dJ5BDC', 'VxxJg8Ymh8sE', filter_query, video_id) + def _real_extract(self, url): video_id = self._match_id(url) return self._extract_video_info('byId=%s' % video_id, video_id) diff --git a/youtube_dl/extractor/ccc.py b/youtube_dl/extractor/ccc.py index 8f7f09e22..734702144 100644 --- a/youtube_dl/extractor/ccc.py +++ b/youtube_dl/extractor/ccc.py @@ -19,7 +19,7 @@ class CCCIE(InfoExtractor): 'ext': 'mp4', 'title': 'Introduction to Processor Design', 'description': 'md5:df55f6d073d4ceae55aae6f2fd98a0ac', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'upload_date': '20131228', 'timestamp': 1388188800, 'duration': 3710, @@ -32,7 +32,7 @@ class CCCIE(InfoExtractor): def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - event_id = self._search_regex("data-id='(\d+)'", webpage, 'event id') + event_id = self._search_regex(r"data-id='(\d+)'", webpage, 'event id') event_data = self._download_json('https://media.ccc.de/public/events/%s' % event_id, event_id) formats = [] diff --git a/youtube_dl/extractor/ccma.py b/youtube_dl/extractor/ccma.py new file mode 100644 index 000000000..39938c9ac --- /dev/null +++ b/youtube_dl/extractor/ccma.py @@ -0,0 +1,99 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_duration, + parse_iso8601, + clean_html, +) + + +class CCMAIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?ccma\.cat/(?:[^/]+/)*?(?Pvideo|audio)/(?P\d+)' + _TESTS = [{ + 'url': 'http://www.ccma.cat/tv3/alacarta/lespot-de-la-marato-de-tv3/lespot-de-la-marato-de-tv3/video/5630208/', + 'md5': '7296ca43977c8ea4469e719c609b0871', + 'info_dict': { + 'id': '5630208', + 'ext': 'mp4', + 'title': 'L\'espot de La Marató de TV3', + 'description': 'md5:f12987f320e2f6e988e9908e4fe97765', + 'timestamp': 1470918540, + 'upload_date': '20160811', + } + }, { + 'url': 'http://www.ccma.cat/catradio/alacarta/programa/el-consell-de-savis-analitza-el-derbi/audio/943685/', + 'md5': 'fa3e38f269329a278271276330261425', + 'info_dict': { + 'id': '943685', + 'ext': 'mp3', + 'title': 'El Consell de Savis analitza el derbi', + 'description': 'md5:e2a3648145f3241cb9c6b4b624033e53', + 'upload_date': '20171205', + 'timestamp': 1512507300, + } + }] + + def _real_extract(self, url): + media_type, media_id = re.match(self._VALID_URL, url).groups() + media_data = {} + formats = [] + profiles = ['pc'] if media_type == 'audio' else ['mobil', 'pc'] + for i, profile in enumerate(profiles): + md = self._download_json('http://dinamics.ccma.cat/pvideo/media.jsp', media_id, query={ + 'media': media_type, + 'idint': media_id, + 'profile': profile, + }, fatal=False) + if md: + media_data = md + media_url = media_data.get('media', {}).get('url') + if media_url: + formats.append({ + 'format_id': profile, + 'url': media_url, + 'quality': i, + }) + self._sort_formats(formats) + + informacio = media_data['informacio'] + title = informacio['titol'] + durada = informacio.get('durada', {}) + duration = int_or_none(durada.get('milisegons'), 1000) or parse_duration(durada.get('text')) + timestamp = parse_iso8601(informacio.get('data_emissio', {}).get('utc')) + + subtitles = {} + subtitols = media_data.get('subtitols', {}) + if subtitols: + sub_url = subtitols.get('url') + if sub_url: + subtitles.setdefault( + subtitols.get('iso') or subtitols.get('text') or 'ca', []).append({ + 'url': sub_url, + }) + + thumbnails = [] + imatges = media_data.get('imatges', {}) + if imatges: + thumbnail_url = imatges.get('url') + if thumbnail_url: + thumbnails = [{ + 'url': thumbnail_url, + 'width': int_or_none(imatges.get('amplada')), + 'height': int_or_none(imatges.get('alcada')), + }] + + return { + 'id': media_id, + 'title': title, + 'description': clean_html(informacio.get('descripcio')), + 'duration': duration, + 'timestamp': timestamp, + 'thumnails': thumbnails, + 'subtitles': subtitles, + 'formats': formats, + } diff --git a/youtube_dl/extractor/cctv.py b/youtube_dl/extractor/cctv.py new file mode 100644 index 000000000..c76f361c6 --- /dev/null +++ b/youtube_dl/extractor/cctv.py @@ -0,0 +1,191 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + float_or_none, + try_get, + unified_timestamp, +) + + +class CCTVIE(InfoExtractor): + IE_DESC = '央视网' + _VALID_URL = r'https?://(?:(?:[^/]+)\.(?:cntv|cctv)\.(?:com|cn)|(?:www\.)?ncpa-classic\.com)/(?:[^/]+/)*?(?P[^/?#&]+?)(?:/index)?(?:\.s?html|[?#&]|$)' + _TESTS = [{ + # fo.addVariable("videoCenterId","id") + 'url': 'http://sports.cntv.cn/2016/02/12/ARTIaBRxv4rTT1yWf1frW2wi160212.shtml', + 'md5': 'd61ec00a493e09da810bf406a078f691', + 'info_dict': { + 'id': '5ecdbeab623f4973b40ff25f18b174e8', + 'ext': 'mp4', + 'title': '[NBA]二少联手砍下46分 雷霆主场击败鹈鹕(快讯)', + 'description': 'md5:7e14a5328dc5eb3d1cd6afbbe0574e95', + 'duration': 98, + 'uploader': 'songjunjie', + 'timestamp': 1455279956, + 'upload_date': '20160212', + }, + }, { + # var guid = "id" + 'url': 'http://tv.cctv.com/2016/02/05/VIDEUS7apq3lKrHG9Dncm03B160205.shtml', + 'info_dict': { + 'id': 'efc5d49e5b3b4ab2b34f3a502b73d3ae', + 'ext': 'mp4', + 'title': '[赛车]“车王”舒马赫恢复情况成谜(快讯)', + 'description': '2月4日,蒙特泽莫罗透露了关于“车王”舒马赫恢复情况,但情况是否属实遭到了质疑。', + 'duration': 37, + 'uploader': 'shujun', + 'timestamp': 1454677291, + 'upload_date': '20160205', + }, + 'params': { + 'skip_download': True, + }, + }, { + # changePlayer('id') + 'url': 'http://english.cntv.cn/special/four_comprehensives/index.shtml', + 'info_dict': { + 'id': '4bb9bb4db7a6471ba85fdeda5af0381e', + 'ext': 'mp4', + 'title': 'NHnews008 ANNUAL POLITICAL SEASON', + 'description': 'Four Comprehensives', + 'duration': 60, + 'uploader': 'zhangyunlei', + 'timestamp': 1425385521, + 'upload_date': '20150303', + }, + 'params': { + 'skip_download': True, + }, + }, { + # loadvideo('id') + 'url': 'http://cctv.cntv.cn/lm/tvseries_russian/yilugesanghua/index.shtml', + 'info_dict': { + 'id': 'b15f009ff45c43968b9af583fc2e04b2', + 'ext': 'mp4', + 'title': 'Путь,усыпанный космеями Серия 1', + 'description': 'Путь, усыпанный космеями', + 'duration': 2645, + 'uploader': 'renxue', + 'timestamp': 1477479241, + 'upload_date': '20161026', + }, + 'params': { + 'skip_download': True, + }, + }, { + # var initMyAray = 'id' + 'url': 'http://www.ncpa-classic.com/2013/05/22/VIDE1369219508996867.shtml', + 'info_dict': { + 'id': 'a194cfa7f18c426b823d876668325946', + 'ext': 'mp4', + 'title': '小泽征尔音乐塾 音乐梦想无国界', + 'duration': 2173, + 'timestamp': 1369248264, + 'upload_date': '20130522', + }, + 'params': { + 'skip_download': True, + }, + }, { + # var ids = ["id"] + 'url': 'http://www.ncpa-classic.com/clt/more/416/index.shtml', + 'info_dict': { + 'id': 'a8606119a4884588a79d81c02abecc16', + 'ext': 'mp3', + 'title': '来自维也纳的新年贺礼', + 'description': 'md5:f13764ae8dd484e84dd4b39d5bcba2a7', + 'duration': 1578, + 'uploader': 'djy', + 'timestamp': 1482942419, + 'upload_date': '20161228', + }, + 'params': { + 'skip_download': True, + }, + 'expected_warnings': ['Failed to download m3u8 information'], + }, { + 'url': 'http://ent.cntv.cn/2016/01/18/ARTIjprSSJH8DryTVr5Bx8Wb160118.shtml', + 'only_matching': True, + }, { + 'url': 'http://tv.cntv.cn/video/C39296/e0210d949f113ddfb38d31f00a4e5c44', + 'only_matching': True, + }, { + 'url': 'http://english.cntv.cn/2016/09/03/VIDEhnkB5y9AgHyIEVphCEz1160903.shtml', + 'only_matching': True, + }, { + 'url': 'http://tv.cctv.com/2016/09/07/VIDE5C1FnlX5bUywlrjhxXOV160907.shtml', + 'only_matching': True, + }, { + 'url': 'http://tv.cntv.cn/video/C39296/95cfac44cabd3ddc4a9438780a4e5c44', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + video_id = self._search_regex( + [r'var\s+guid\s*=\s*["\']([\da-fA-F]+)', + r'videoCenterId["\']\s*,\s*["\']([\da-fA-F]+)', + r'changePlayer\s*\(\s*["\']([\da-fA-F]+)', + r'load[Vv]ideo\s*\(\s*["\']([\da-fA-F]+)', + r'var\s+initMyAray\s*=\s*["\']([\da-fA-F]+)', + r'var\s+ids\s*=\s*\[["\']([\da-fA-F]+)'], + webpage, 'video id') + + data = self._download_json( + 'http://vdn.apps.cntv.cn/api/getHttpVideoInfo.do', video_id, + query={ + 'pid': video_id, + 'url': url, + 'idl': 32, + 'idlr': 32, + 'modifyed': 'false', + }) + + title = data['title'] + + formats = [] + + video = data.get('video') + if isinstance(video, dict): + for quality, chapters_key in enumerate(('lowChapters', 'chapters')): + video_url = try_get( + video, lambda x: x[chapters_key][0]['url'], compat_str) + if video_url: + formats.append({ + 'url': video_url, + 'format_id': 'http', + 'quality': quality, + 'preference': -1, + }) + + hls_url = try_get(data, lambda x: x['hls_url'], compat_str) + if hls_url: + hls_url = re.sub(r'maxbr=\d+&?', '', hls_url) + formats.extend(self._extract_m3u8_formats( + hls_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + + self._sort_formats(formats) + + uploader = data.get('editer_name') + description = self._html_search_meta( + 'description', webpage, default=None) + timestamp = unified_timestamp(data.get('f_pgmtime')) + duration = float_or_none(try_get(video, lambda x: x['totalLength'])) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'uploader': uploader, + 'timestamp': timestamp, + 'duration': duration, + 'formats': formats, + } diff --git a/youtube_dl/extractor/cda.py b/youtube_dl/extractor/cda.py index 8af318703..ae7af2f0e 100755 --- a/youtube_dl/extractor/cda.py +++ b/youtube_dl/extractor/cda.py @@ -5,14 +5,16 @@ import re from .common import InfoExtractor from ..utils import ( - decode_packed_codes, ExtractorError, - parse_duration + float_or_none, + int_or_none, + parse_duration, ) class CDAIE(InfoExtractor): _VALID_URL = r'https?://(?:(?:www\.)?cda\.pl/video|ebd\.cda\.pl/[0-9]+x[0-9]+)/(?P[0-9a-z]+)' + _BASE_URL = 'http://www.cda.pl/' _TESTS = [{ 'url': 'http://www.cda.pl/video/5749950c', 'md5': '6f844bf51b15f31fae165365707ae970', @@ -21,6 +23,9 @@ class CDAIE(InfoExtractor): 'ext': 'mp4', 'height': 720, 'title': 'Oto dlaczego przed zakrętem należy zwolnić.', + 'description': 'md5:269ccd135d550da90d1662651fcb9772', + 'thumbnail': r're:^https?://.*\.jpg$', + 'average_rating': float, 'duration': 39 } }, { @@ -30,6 +35,11 @@ class CDAIE(InfoExtractor): 'id': '57413289', 'ext': 'mp4', 'title': 'Lądowanie na lotnisku na Maderze', + 'description': 'md5:60d76b71186dcce4e0ba6d4bbdb13e1a', + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': 'crash404', + 'view_count': int, + 'average_rating': float, 'duration': 137 } }, { @@ -39,31 +49,55 @@ class CDAIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage('http://ebd.cda.pl/0x0/' + video_id, video_id) + self._set_cookie('cda.pl', 'cda.player', 'html5') + webpage = self._download_webpage( + self._BASE_URL + '/video/' + video_id, video_id) if 'Ten film jest dostępny dla użytkowników premium' in webpage: raise ExtractorError('This video is only available for premium users.', expected=True) - title = self._html_search_regex(r'(.+?)', webpage, 'title') - formats = [] + uploader = self._search_regex(r'''(?x) + <(span|meta)[^>]+itemprop=(["\'])author\2[^>]*> + (?:<\1[^>]*>[^<]*|(?!)(?:.|\n))*? + <(span|meta)[^>]+itemprop=(["\'])name\4[^>]*>(?P[^<]+) + ''', webpage, 'uploader', default=None, group='uploader') + view_count = self._search_regex( + r'Odsłony:(?:\s| )*([0-9]+)', webpage, + 'view_count', default=None) + average_rating = self._search_regex( + r'<(?:span|meta)[^>]+itemprop=(["\'])ratingValue\1[^>]*>(?P[0-9.]+)', + webpage, 'rating', fatal=False, group='rating_value') + info_dict = { 'id': video_id, - 'title': title, + 'title': self._og_search_title(webpage), + 'description': self._og_search_description(webpage), + 'uploader': uploader, + 'view_count': int_or_none(view_count), + 'average_rating': float_or_none(average_rating), + 'thumbnail': self._og_search_thumbnail(webpage), 'formats': formats, 'duration': None, } def extract_format(page, version): - unpacked = decode_packed_codes(page) - format_url = self._search_regex( - r"(?:file|url)\s*:\s*(\\?[\"'])(?Phttp.+?)\1", unpacked, - '%s url' % version, fatal=False, group='url') - if not format_url: + json_str = self._search_regex( + r'player_data=(\\?["\'])(?P.+?)\1', page, + '%s player_json' % version, fatal=False, group='player_data') + if not json_str: + return + player_data = self._parse_json( + json_str, '%s player_data' % version, fatal=False) + if not player_data: + return + video = player_data.get('video') + if not video or 'file' not in video: + self.report_warning('Unable to extract %s version information' % version) return f = { - 'url': format_url, + 'url': video['file'], } m = re.search( r']+data-quality="(?P[^"]+)"[^>]+href="[^"]+"[^>]+class="[^"]*quality-btn-active[^"]*">(?P[0-9]+)p', @@ -75,9 +109,7 @@ class CDAIE(InfoExtractor): }) info_dict['formats'].append(f) if not info_dict['duration']: - info_dict['duration'] = parse_duration(self._search_regex( - r"duration\s*:\s*(\\?[\"'])(?P.+?)\1", - unpacked, 'duration', fatal=False, group='duration')) + info_dict['duration'] = parse_duration(video.get('duration')) extract_format(webpage, 'default') @@ -85,7 +117,8 @@ class CDAIE(InfoExtractor): r']+data-quality="[^"]+"[^>]+href="([^"]+)"[^>]+class="quality-btn"[^>]*>([0-9]+p)', webpage): webpage = self._download_webpage( - href, video_id, 'Downloading %s version information' % resolution, fatal=False) + self._BASE_URL + href, video_id, + 'Downloading %s version information' % resolution, fatal=False) if not webpage: # Manually report warning because empty page is returned when # invalid version is requested. diff --git a/youtube_dl/extractor/ceskatelevize.py b/youtube_dl/extractor/ceskatelevize.py index 5a58d1777..4f88c31ad 100644 --- a/youtube_dl/extractor/ceskatelevize.py +++ b/youtube_dl/extractor/ceskatelevize.py @@ -1,4 +1,4 @@ -# -*- coding: utf-8 -*- +# coding: utf-8 from __future__ import unicode_literals import re @@ -17,7 +17,7 @@ from ..utils import ( class CeskaTelevizeIE(InfoExtractor): - _VALID_URL = r'https?://www\.ceskatelevize\.cz/(porady|ivysilani)/(?:[^/]+/)*(?P[^/#?]+)/*(?:[#?].*)?$' + _VALID_URL = r'https?://(?:www\.)?ceskatelevize\.cz/(porady|ivysilani)/(?:[^/]+/)*(?P[^/#?]+)/*(?:[#?].*)?$' _TESTS = [{ 'url': 'http://www.ceskatelevize.cz/ivysilani/ivysilani/10441294653-hyde-park-civilizace/214411058091220', 'info_dict': { @@ -25,7 +25,7 @@ class CeskaTelevizeIE(InfoExtractor): 'ext': 'mp4', 'title': 'Hyde Park Civilizace', 'description': 'md5:fe93f6eda372d150759d11644ebbfb4a', - 'thumbnail': 're:^https?://.*\.jpg', + 'thumbnail': r're:^https?://.*\.jpg', 'duration': 3350, }, 'params': { @@ -39,7 +39,7 @@ class CeskaTelevizeIE(InfoExtractor): 'ext': 'mp4', 'title': 'Hyde Park Civilizace: Bonus 01 - En', 'description': 'English Subtittles', - 'thumbnail': 're:^https?://.*\.jpg', + 'thumbnail': r're:^https?://.*\.jpg', 'duration': 81.3, }, 'params': { @@ -52,7 +52,7 @@ class CeskaTelevizeIE(InfoExtractor): 'info_dict': { 'id': 402, 'ext': 'mp4', - 'title': 're:^ČT Sport \d{4}-\d{2}-\d{2} \d{2}:\d{2}$', + 'title': r're:^ČT Sport \d{4}-\d{2}-\d{2} \d{2}:\d{2}$', 'is_live': True, }, 'params': { @@ -80,7 +80,7 @@ class CeskaTelevizeIE(InfoExtractor): 'id': '61924494877068022', 'ext': 'mp4', 'title': 'Queer: Bogotart (Queer)', - 'thumbnail': 're:^https?://.*\.jpg', + 'thumbnail': r're:^https?://.*\.jpg', 'duration': 1558.3, }, }], diff --git a/youtube_dl/extractor/channel9.py b/youtube_dl/extractor/channel9.py index 34d4e6156..865dbcaba 100644 --- a/youtube_dl/extractor/channel9.py +++ b/youtube_dl/extractor/channel9.py @@ -31,7 +31,7 @@ class Channel9IE(InfoExtractor): 'title': 'Developer Kick-Off Session: Stuff We Love', 'description': 'md5:c08d72240b7c87fcecafe2692f80e35f', 'duration': 4576, - 'thumbnail': 're:http://.*\.jpg', + 'thumbnail': r're:http://.*\.jpg', 'session_code': 'KOS002', 'session_day': 'Day 1', 'session_room': 'Arena 1A', @@ -47,7 +47,7 @@ class Channel9IE(InfoExtractor): 'title': 'Self-service BI with Power BI - nuclear testing', 'description': 'md5:d1e6ecaafa7fb52a2cacdf9599829f5b', 'duration': 1540, - 'thumbnail': 're:http://.*\.jpg', + 'thumbnail': r're:http://.*\.jpg', 'authors': ['Mike Wilmot'], }, }, { @@ -59,7 +59,7 @@ class Channel9IE(InfoExtractor): 'title': 'Ranges for the Standard Library', 'description': 'md5:2e6b4917677af3728c5f6d63784c4c5d', 'duration': 5646, - 'thumbnail': 're:http://.*\.jpg', + 'thumbnail': r're:http://.*\.jpg', }, 'params': { 'skip_download': True, diff --git a/youtube_dl/extractor/charlierose.py b/youtube_dl/extractor/charlierose.py new file mode 100644 index 000000000..2d517f231 --- /dev/null +++ b/youtube_dl/extractor/charlierose.py @@ -0,0 +1,51 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import remove_end + + +class CharlieRoseIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?charlierose\.com/video(?:s|/player)/(?P\d+)' + _TESTS = [{ + 'url': 'https://charlierose.com/videos/27996', + 'md5': 'fda41d49e67d4ce7c2411fd2c4702e09', + 'info_dict': { + 'id': '27996', + 'ext': 'mp4', + 'title': 'Remembering Zaha Hadid', + 'thumbnail': r're:^https?://.*\.jpg\?\d+', + 'description': 'We revisit past conversations with Zaha Hadid, in memory of the world renowned Iraqi architect.', + 'subtitles': { + 'en': [{ + 'ext': 'vtt', + }], + }, + }, + }, { + 'url': 'https://charlierose.com/videos/27996', + 'only_matching': True, + }] + + _PLAYER_BASE = 'https://charlierose.com/video/player/%s' + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(self._PLAYER_BASE % video_id, video_id) + + title = remove_end(self._og_search_title(webpage), ' - Charlie Rose') + + info_dict = self._parse_html5_media_entries( + self._PLAYER_BASE % video_id, webpage, video_id, + m3u8_entry_protocol='m3u8_native')[0] + + self._sort_formats(info_dict['formats']) + self._remove_duplicate_formats(info_dict['formats']) + + info_dict.update({ + 'id': video_id, + 'title': title, + 'thumbnail': self._og_search_thumbnail(webpage), + 'description': self._og_search_description(webpage), + }) + + return info_dict diff --git a/youtube_dl/extractor/chaturbate.py b/youtube_dl/extractor/chaturbate.py index b2234549e..8fbc91c1f 100644 --- a/youtube_dl/extractor/chaturbate.py +++ b/youtube_dl/extractor/chaturbate.py @@ -1,5 +1,7 @@ from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..utils import ExtractorError @@ -17,7 +19,8 @@ class ChaturbateIE(InfoExtractor): }, 'params': { 'skip_download': True, - } + }, + 'skip': 'Room is offline', }, { 'url': 'https://en.chaturbate.com/siswet19/', 'only_matching': True, @@ -30,30 +33,35 @@ class ChaturbateIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - m3u8_url = self._search_regex( - r'src=(["\'])(?Phttp.+?\.m3u8.*?)\1', webpage, - 'playlist', default=None, group='url') + m3u8_formats = [(m.group('id').lower(), m.group('url')) for m in re.finditer( + r'hlsSource(?P.+?)\s*=\s*(?P["\'])(?Phttp.+?)(?P=q)', webpage)] - if not m3u8_url: + if not m3u8_formats: error = self._search_regex( [r']+class=(["\'])desc_span\1[^>]*>(?P[^<]+)', r']+id=(["\'])defchat\1[^>]*>\s*

(?P[^<]+)<'], webpage, 'error', group='error', default=None) if not error: - if any(p not in webpage for p in ( + if any(p in webpage for p in ( self._ROOM_OFFLINE, 'offline_tipping', 'tip_offline')): error = self._ROOM_OFFLINE if error: raise ExtractorError(error, expected=True) raise ExtractorError('Unable to find stream URL') - formats = self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4') + formats = [] + for m3u8_id, m3u8_url in m3u8_formats: + formats.extend(self._extract_m3u8_formats( + m3u8_url, video_id, ext='mp4', + # ffmpeg skips segments for fast m3u8 + preference=-10 if m3u8_id == 'fast' else None, + m3u8_id=m3u8_id, fatal=False, live=True)) self._sort_formats(formats) return { 'id': video_id, 'title': self._live_title(video_id), - 'thumbnail': 'https://cdn-s.highwebmedia.com/uHK3McUtGCG3SMFcd4ZJsRv8/roomimage/%s.jpg' % video_id, + 'thumbnail': 'https://roomimg.stream.highwebmedia.com/ri/%s.jpg' % video_id, 'age_limit': self._rta_search(webpage), 'is_live': True, 'formats': formats, diff --git a/youtube_dl/extractor/chirbit.py b/youtube_dl/extractor/chirbit.py index b1eeaf101..4815b34be 100644 --- a/youtube_dl/extractor/chirbit.py +++ b/youtube_dl/extractor/chirbit.py @@ -1,30 +1,35 @@ # coding: utf-8 from __future__ import unicode_literals +import base64 +import re + from .common import InfoExtractor -from ..utils import ( - parse_duration, - int_or_none, -) +from ..utils import parse_duration class ChirbitIE(InfoExtractor): IE_NAME = 'chirbit' _VALID_URL = r'https?://(?:www\.)?chirb\.it/(?:(?:wp|pl)/|fb_chirbit_player\.swf\?key=)?(?P[\da-zA-Z]+)' _TESTS = [{ - 'url': 'http://chirb.it/PrIPv5', - 'md5': '9847b0dad6ac3e074568bf2cfb197de8', + 'url': 'http://chirb.it/be2abG', 'info_dict': { - 'id': 'PrIPv5', + 'id': 'be2abG', 'ext': 'mp3', - 'title': 'Фасадстрой', - 'duration': 52, - 'view_count': int, - 'comment_count': int, + 'title': 'md5:f542ea253f5255240be4da375c6a5d7e', + 'description': 'md5:f24a4e22a71763e32da5fed59e47c770', + 'duration': 306, + 'uploader': 'Gerryaudio', + }, + 'params': { + 'skip_download': True, } }, { 'url': 'https://chirb.it/fb_chirbit_player.swf?key=PrIPv5', 'only_matching': True, + }, { + 'url': 'https://chirb.it/wp/MN58c2', + 'only_matching': True, }] def _real_extract(self, url): @@ -33,38 +38,44 @@ class ChirbitIE(InfoExtractor): webpage = self._download_webpage( 'http://chirb.it/%s' % audio_id, audio_id) - audio_url = self._search_regex( - r'"setFile"\s*,\s*"([^"]+)"', webpage, 'audio url') + data_fd = self._search_regex( + r'data-fd=(["\'])(?P(?:(?!\1).)+)\1', + webpage, 'data fd', group='url') + + # Reverse engineered from https://chirb.it/js/chirbit.player.js (look + # for soundURL) + audio_url = base64.b64decode( + data_fd[::-1].encode('ascii')).decode('utf-8') title = self._search_regex( - r'itemprop="name">([^<]+)', webpage, 'title') - duration = parse_duration(self._html_search_meta( - 'duration', webpage, 'duration', fatal=False)) - view_count = int_or_none(self._search_regex( - r'itemprop="playCount"\s*>(\d+)', webpage, - 'listen count', fatal=False)) - comment_count = int_or_none(self._search_regex( - r'>(\d+) Comments?:', webpage, - 'comment count', fatal=False)) + r'class=["\']chirbit-title["\'][^>]*>([^<]+)', webpage, 'title') + description = self._search_regex( + r'

Description

\s*]*>([^<]+)', + webpage, 'description', default=None) + duration = parse_duration(self._search_regex( + r'class=["\']c-length["\'][^>]*>([^<]+)', + webpage, 'duration', fatal=False)) + uploader = self._search_regex( + r'id=["\']chirbit-username["\'][^>]*>([^<]+)', + webpage, 'uploader', fatal=False) return { 'id': audio_id, 'url': audio_url, 'title': title, + 'description': description, 'duration': duration, - 'view_count': view_count, - 'comment_count': comment_count, + 'uploader': uploader, } class ChirbitProfileIE(InfoExtractor): IE_NAME = 'chirbit:profile' - _VALID_URL = r'https?://(?:www\.)?chirbit.com/(?:rss/)?(?P[^/]+)' + _VALID_URL = r'https?://(?:www\.)?chirbit\.com/(?:rss/)?(?P[^/]+)' _TEST = { 'url': 'http://chirbit.com/ScarletBeauty', 'info_dict': { 'id': 'ScarletBeauty', - 'title': 'Chirbits by ScarletBeauty', }, 'playlist_mincount': 3, } @@ -72,13 +83,10 @@ class ChirbitProfileIE(InfoExtractor): def _real_extract(self, url): profile_id = self._match_id(url) - rss = self._download_xml( - 'http://chirbit.com/rss/%s' % profile_id, profile_id) + webpage = self._download_webpage(url, profile_id) entries = [ - self.url_result(audio_url.text, 'Chirbit') - for audio_url in rss.findall('./channel/item/link')] + self.url_result(self._proto_relative_url('//chirb.it/' + video_id)) + for _, video_id in re.findall(r']+id=([\'"])copy-btn-(?P[0-9a-zA-Z]+)\1', webpage)] - title = rss.find('./channel/title').text - - return self.playlist_result(entries, profile_id, title) + return self.playlist_result(entries, profile_id) diff --git a/youtube_dl/extractor/clipfish.py b/youtube_dl/extractor/clipfish.py index 3a47f6fa4..bb52e0c6f 100644 --- a/youtube_dl/extractor/clipfish.py +++ b/youtube_dl/extractor/clipfish.py @@ -1,3 +1,4 @@ +# coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor @@ -10,15 +11,15 @@ from ..utils import ( class ClipfishIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?clipfish\.de/(?:[^/]+/)+video/(?P[0-9]+)' _TEST = { - 'url': 'http://www.clipfish.de/special/game-trailer/video/3966754/fifa-14-e3-2013-trailer/', - 'md5': '79bc922f3e8a9097b3d68a93780fd475', + 'url': 'http://www.clipfish.de/special/ugly-americans/video/4343170/s01-e01-ugly-americans-date-in-der-hoelle/', + 'md5': '720563e467b86374c194bdead08d207d', 'info_dict': { - 'id': '3966754', + 'id': '4343170', 'ext': 'mp4', - 'title': 'FIFA 14 - E3 2013 Trailer', - 'description': 'Video zu FIFA 14: E3 2013 Trailer', - 'upload_date': '20130611', - 'duration': 82, + 'title': 'S01 E01 - Ugly Americans - Date in der Hölle', + 'description': 'Mark Lilly arbeitet im Sozialdienst der Stadt New York und soll Immigranten bei ihrer Einbürgerung in die USA zur Seite stehen.', + 'upload_date': '20161005', + 'duration': 1291, 'view_count': int, } } @@ -50,10 +51,14 @@ class ClipfishIE(InfoExtractor): 'tbr': int_or_none(video_info.get('bitrate')), }) + descr = video_info.get('descr') + if descr: + descr = descr.strip() + return { 'id': video_id, 'title': video_info['title'], - 'description': video_info.get('descr'), + 'description': descr, 'formats': formats, 'thumbnail': video_info.get('media_content_thumbnail_large') or video_info.get('media_thumbnail'), 'duration': int_or_none(video_info.get('media_length')), diff --git a/youtube_dl/extractor/cliphunter.py b/youtube_dl/extractor/cliphunter.py index 252c2e846..ab651d1c8 100644 --- a/youtube_dl/extractor/cliphunter.py +++ b/youtube_dl/extractor/cliphunter.py @@ -30,7 +30,7 @@ class CliphunterIE(InfoExtractor): 'id': '1012420', 'ext': 'flv', 'title': 'Fun Jynx Maze solo', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'age_limit': 18, }, 'skip': 'Video gone', @@ -41,7 +41,7 @@ class CliphunterIE(InfoExtractor): 'id': '2019449', 'ext': 'mp4', 'title': 'ShesNew - My booty girlfriend, Victoria Paradice\'s pussy filled with jizz', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'age_limit': 18, }, }] diff --git a/youtube_dl/extractor/clipsyndicate.py b/youtube_dl/extractor/clipsyndicate.py index 0b6ad895f..6cdb42f5a 100644 --- a/youtube_dl/extractor/clipsyndicate.py +++ b/youtube_dl/extractor/clipsyndicate.py @@ -18,7 +18,7 @@ class ClipsyndicateIE(InfoExtractor): 'ext': 'mp4', 'title': 'Brick Briscoe', 'duration': 612, - 'thumbnail': 're:^https?://.+\.jpg', + 'thumbnail': r're:^https?://.+\.jpg', }, }, { 'url': 'http://chic.clipsyndicate.com/video/play/5844117/shark_attack', diff --git a/youtube_dl/extractor/clubic.py b/youtube_dl/extractor/clubic.py index 2fba93543..98f9cb596 100644 --- a/youtube_dl/extractor/clubic.py +++ b/youtube_dl/extractor/clubic.py @@ -1,9 +1,6 @@ # coding: utf-8 from __future__ import unicode_literals -import json -import re - from .common import InfoExtractor from ..utils import ( clean_html, @@ -22,7 +19,7 @@ class ClubicIE(InfoExtractor): 'ext': 'mp4', 'title': 'Clubic Week 2.0 : le FBI se lance dans la photo d\u0092identité', 'description': 're:Gueule de bois chez Nokia. Le constructeur a indiqué cette.*', - 'thumbnail': 're:^http://img\.clubic\.com/.*\.jpg$', + 'thumbnail': r're:^http://img\.clubic\.com/.*\.jpg$', } }, { 'url': 'http://www.clubic.com/video/video-clubic-week-2-0-apple-iphone-6s-et-plus-mais-surtout-le-pencil-469792.html', @@ -30,16 +27,14 @@ class ClubicIE(InfoExtractor): }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) player_url = 'http://player.m6web.fr/v1/player/clubic/%s.html' % video_id player_page = self._download_webpage(player_url, video_id) - config_json = self._search_regex( + config = self._parse_json(self._search_regex( r'(?m)M6\.Player\.config\s*=\s*(\{.+?\});$', player_page, - 'configuration') - config = json.loads(config_json) + 'configuration'), video_id) video_info = config['videoInfo'] sources = config['sources'] diff --git a/youtube_dl/extractor/cmt.py b/youtube_dl/extractor/cmt.py index f24568dcc..e701fbeab 100644 --- a/youtube_dl/extractor/cmt.py +++ b/youtube_dl/extractor/cmt.py @@ -1,13 +1,11 @@ from __future__ import unicode_literals from .mtv import MTVIE -from ..utils import ExtractorError class CMTIE(MTVIE): IE_NAME = 'cmt.com' - _VALID_URL = r'https?://www\.cmt\.com/(?:videos|shows)/(?:[^/]+/)*(?P\d+)' - _FEED_URL = 'http://www.cmt.com/sitewide/apps/player/embed/rss/' + _VALID_URL = r'https?://(?:www\.)?cmt\.com/(?:videos|shows|(?:full-)?episodes|video-clips)/(?P[^/]+)' _TESTS = [{ 'url': 'http://www.cmt.com/videos/garth-brooks/989124/the-call-featuring-trisha-yearwood.jhtml#artist=30061', @@ -26,19 +24,31 @@ class CMTIE(MTVIE): 'id': '1504699', 'ext': 'mp4', 'title': 'Still The King Ep. 109 in 3 Minutes', - 'description': 'Relive or catch up with Still The King by watching this recap of season 1, episode 9. New episodes Sundays 9/8c.', + 'description': 'Relive or catch up with Still The King by watching this recap of season 1, episode 9.', 'timestamp': 1469421000.0, 'upload_date': '20160725', }, }, { 'url': 'http://www.cmt.com/shows/party-down-south/party-down-south-ep-407-gone-girl/1738172/playlist/#id=1738172', 'only_matching': True, + }, { + 'url': 'http://www.cmt.com/full-episodes/537qb3/nashville-the-wayfaring-stranger-season-5-ep-501', + 'only_matching': True, + }, { + 'url': 'http://www.cmt.com/video-clips/t9e4ci/nashville-juliette-in-2-minutes', + 'only_matching': True, }] - @classmethod - def _transform_rtmp_url(cls, rtmp_video_url): - if 'error_not_available.swf' in rtmp_video_url: - raise ExtractorError( - '%s said: video is not available' % cls.IE_NAME, expected=True) + def _extract_mgid(self, webpage): + mgid = self._search_regex( + r'MTVN\.VIDEO\.contentUri\s*=\s*([\'"])(?P.+?)\1', + webpage, 'mgid', group='mgid', default=None) + if not mgid: + mgid = self._extract_triforce_mgid(webpage) + return mgid - return super(CMTIE, cls)._transform_rtmp_url(rtmp_video_url) + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + mgid = self._extract_mgid(webpage) + return self.url_result('http://media.mtvnservices.com/embed/%s' % mgid) diff --git a/youtube_dl/extractor/cnn.py b/youtube_dl/extractor/cnn.py index 53489a14e..5fc311f53 100644 --- a/youtube_dl/extractor/cnn.py +++ b/youtube_dl/extractor/cnn.py @@ -3,15 +3,12 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import ( - int_or_none, - parse_duration, - url_basename, -) +from .turner import TurnerBaseIE +from ..utils import url_basename -class CNNIE(InfoExtractor): - _VALID_URL = r'''(?x)https?://(?:(?:edition|www)\.)?cnn\.com/video/(?:data/.+?|\?)/ +class CNNIE(TurnerBaseIE): + _VALID_URL = r'''(?x)https?://(?:(?Pedition|www|money)\.)?cnn\.com/(?:video/(?:data/.+?|\?)/)?videos?/ (?P.+?/(?P[^/]+?)(?:\.(?:[a-z\-]+)|(?=&)))''' _TESTS = [{ @@ -25,6 +22,7 @@ class CNNIE(InfoExtractor): 'duration': 135, 'upload_date': '20130609', }, + 'expected_warnings': ['Failed to download m3u8 information'], }, { 'url': 'http://edition.cnn.com/video/?/video/us/2013/08/21/sot-student-gives-epic-speech.georgia-institute-of-technology&utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+rss%2Fcnn_topstories+%28RSS%3A+Top+Stories%29', 'md5': 'b5cc60c60a3477d185af8f19a2a26f4e', @@ -34,7 +32,8 @@ class CNNIE(InfoExtractor): 'title': "Student's epic speech stuns new freshmen", 'description': "A Georgia Tech student welcomes the incoming freshmen with an epic speech backed by music from \"2001: A Space Odyssey.\"", 'upload_date': '20130821', - } + }, + 'expected_warnings': ['Failed to download m3u8 information'], }, { 'url': 'http://www.cnn.com/video/data/2.0/video/living/2014/12/22/growing-america-nashville-salemtown-board-episode-1.hln.html', 'md5': 'f14d02ebd264df951feb2400e2c25a1b', @@ -44,80 +43,61 @@ class CNNIE(InfoExtractor): 'title': 'Nashville Ep. 1: Hand crafted skateboards', 'description': 'md5:e7223a503315c9f150acac52e76de086', 'upload_date': '20141222', - } + }, + 'expected_warnings': ['Failed to download m3u8 information'], + }, { + 'url': 'http://money.cnn.com/video/news/2016/08/19/netflix-stunning-stats.cnnmoney/index.html', + 'md5': '52a515dc1b0f001cd82e4ceda32be9d1', + 'info_dict': { + 'id': '/video/news/2016/08/19/netflix-stunning-stats.cnnmoney', + 'ext': 'mp4', + 'title': '5 stunning stats about Netflix', + 'description': 'Did you know that Netflix has more than 80 million members? Here are five facts about the online video distributor that you probably didn\'t know.', + 'upload_date': '20160819', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, }, { 'url': 'http://cnn.com/video/?/video/politics/2015/03/27/pkg-arizona-senator-church-attendance-mandatory.ktvk', 'only_matching': True, }, { 'url': 'http://cnn.com/video/?/video/us/2015/04/06/dnt-baker-refuses-anti-gay-order.wkmg', 'only_matching': True, + }, { + 'url': 'http://edition.cnn.com/videos/arts/2016/04/21/olympic-games-cultural-a-z-brazil.cnn', + 'only_matching': True, }] + _CONFIG = { + # http://edition.cnn.com/.element/apps/cvp/3.0/cfg/spider/cnn/expansion/config.xml + 'edition': { + 'data_src': 'http://edition.cnn.com/video/data/3.0/video/%s/index.xml', + 'media_src': 'http://pmd.cdn.turner.com/cnn/big', + }, + # http://money.cnn.com/.element/apps/cvp2/cfg/config.xml + 'money': { + 'data_src': 'http://money.cnn.com/video/data/4.0/video/%s.xml', + 'media_src': 'http://ht3.cdn.turner.com/money/big', + }, + } + + def _extract_timestamp(self, video_data): + # TODO: fix timestamp extraction + return None + def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - path = mobj.group('path') - page_title = mobj.group('title') - info_url = 'http://edition.cnn.com/video/data/3.0/%s/index.xml' % path - info = self._download_xml(info_url, page_title) - - formats = [] - rex = re.compile(r'''(?x) - (?P<width>[0-9]+)x(?P<height>[0-9]+) - (?:_(?P<bitrate>[0-9]+)k)? - ''') - for f in info.findall('files/file'): - video_url = 'http://ht.cdn.turner.com/cnn/big%s' % (f.text.strip()) - fdct = { - 'format_id': f.attrib['bitrate'], - 'url': video_url, - } - - mf = rex.match(f.attrib['bitrate']) - if mf: - fdct['width'] = int(mf.group('width')) - fdct['height'] = int(mf.group('height')) - fdct['tbr'] = int_or_none(mf.group('bitrate')) - else: - mf = rex.search(f.text) - if mf: - fdct['width'] = int(mf.group('width')) - fdct['height'] = int(mf.group('height')) - fdct['tbr'] = int_or_none(mf.group('bitrate')) - else: - mi = re.match(r'ios_(audio|[0-9]+)$', f.attrib['bitrate']) - if mi: - if mi.group(1) == 'audio': - fdct['vcodec'] = 'none' - fdct['ext'] = 'm4a' - else: - fdct['tbr'] = int(mi.group(1)) - - formats.append(fdct) - - self._sort_formats(formats) - - thumbnails = [{ - 'height': int(t.attrib['height']), - 'width': int(t.attrib['width']), - 'url': t.text, - } for t in info.findall('images/image')] - - metas_el = info.find('metas') - upload_date = ( - metas_el.attrib.get('version') if metas_el is not None else None) - - duration_el = info.find('length') - duration = parse_duration(duration_el.text) - - return { - 'id': info.attrib['id'], - 'title': info.find('headline').text, - 'formats': formats, - 'thumbnails': thumbnails, - 'description': info.find('description').text, - 'duration': duration, - 'upload_date': upload_date, - } + sub_domain, path, page_title = re.match(self._VALID_URL, url).groups() + if sub_domain not in ('money', 'edition'): + sub_domain = 'edition' + config = self._CONFIG[sub_domain] + return self._extract_cvp_info( + config['data_src'] % path, page_title, { + 'default': { + 'media_src': config['media_src'], + } + }) class CNNBlogsIE(InfoExtractor): @@ -132,6 +112,7 @@ class CNNBlogsIE(InfoExtractor): 'description': 'Glenn Greenwald responds to comments made this week on Capitol Hill that journalists could be criminal accessories.', 'upload_date': '20140209', }, + 'expected_warnings': ['Failed to download m3u8 information'], 'add_ie': ['CNN'], } @@ -146,7 +127,7 @@ class CNNBlogsIE(InfoExtractor): class CNNArticleIE(InfoExtractor): - _VALID_URL = r'https?://(?:(?:edition|www)\.)?cnn\.com/(?!video/)' + _VALID_URL = r'https?://(?:(?:edition|www)\.)?cnn\.com/(?!videos?/)' _TEST = { 'url': 'http://www.cnn.com/2014/12/21/politics/obama-north-koreas-hack-not-war-but-cyber-vandalism/', 'md5': '689034c2a3d9c6dc4aa72d65a81efd01', @@ -154,9 +135,10 @@ class CNNArticleIE(InfoExtractor): 'id': 'bestoftv/2014/12/21/ip-north-korea-obama.cnn', 'ext': 'mp4', 'title': 'Obama: Cyberattack not an act of war', - 'description': 'md5:51ce6750450603795cad0cdfbd7d05c5', + 'description': 'md5:0a802a40d2376f60e6b04c8d5bcebc4b', 'upload_date': '20141221', }, + 'expected_warnings': ['Failed to download m3u8 information'], 'add_ie': ['CNN'], } diff --git a/youtube_dl/extractor/collegerama.py b/youtube_dl/extractor/collegerama.py index f9e84193d..18c734766 100644 --- a/youtube_dl/extractor/collegerama.py +++ b/youtube_dl/extractor/collegerama.py @@ -21,7 +21,7 @@ class CollegeRamaIE(InfoExtractor): 'ext': 'mp4', 'title': 'Een nieuwe wereld: waarden, bewustzijn en techniek van de mensheid 2.0.', 'description': '', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 7713.088, 'timestamp': 1413309600, 'upload_date': '20141014', diff --git a/youtube_dl/extractor/comcarcoff.py b/youtube_dl/extractor/comcarcoff.py index 747c245c8..588aad0d9 100644 --- a/youtube_dl/extractor/comcarcoff.py +++ b/youtube_dl/extractor/comcarcoff.py @@ -1,4 +1,4 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor diff --git a/youtube_dl/extractor/comedycentral.py b/youtube_dl/extractor/comedycentral.py index 88346dde7..4cac29415 100644 --- a/youtube_dl/extractor/comedycentral.py +++ b/youtube_dl/extractor/comedycentral.py @@ -6,7 +6,7 @@ from .common import InfoExtractor class ComedyCentralIE(MTVServicesInfoExtractor): _VALID_URL = r'''(?x)https?://(?:www\.)?cc\.com/ - (video-clips|episodes|cc-studios|video-collections|full-episodes|shows) + (video-clips|episodes|cc-studios|video-collections|shows(?=/[^/]+/(?!full-episodes))) /(?P<title>.*)''' _FEED_URL = 'http://comedycentral.com/feeds/mrss/' @@ -27,6 +27,32 @@ class ComedyCentralIE(MTVServicesInfoExtractor): }] +class ComedyCentralFullEpisodesIE(MTVServicesInfoExtractor): + _VALID_URL = r'''(?x)https?://(?:www\.)?cc\.com/ + (?:full-episodes|shows(?=/[^/]+/full-episodes)) + /(?P<id>[^?]+)''' + _FEED_URL = 'http://comedycentral.com/feeds/mrss/' + + _TESTS = [{ + 'url': 'http://www.cc.com/full-episodes/pv391a/the-daily-show-with-trevor-noah-november-28--2016---ryan-speedo-green-season-22-ep-22028', + 'info_dict': { + 'description': 'Donald Trump is accused of exploiting his president-elect status for personal gain, Cuban leader Fidel Castro dies, and Ryan Speedo Green discusses "Sing for Your Life."', + 'title': 'November 28, 2016 - Ryan Speedo Green', + }, + 'playlist_count': 4, + }, { + 'url': 'http://www.cc.com/shows/the-daily-show-with-trevor-noah/full-episodes', + 'only_matching': True, + }] + + def _real_extract(self, url): + playlist_id = self._match_id(url) + webpage = self._download_webpage(url, playlist_id) + mgid = self._extract_triforce_mgid(webpage, data_zone='t2_lc_promo1') + videos_info = self._get_videos_info(mgid) + return videos_info + + class ToshIE(MTVServicesInfoExtractor): IE_DESC = 'Tosh.0' _VALID_URL = r'^https?://tosh\.cc\.com/video-(?:clips|collections)/[^/]+/(?P<videotitle>[^/?#]+)' @@ -45,7 +71,7 @@ class ToshIE(MTVServicesInfoExtractor): 'ext': 'mp4', 'title': 'Tosh.0|June 9, 2077|2|211|Twitter Users Share Summer Plans', 'description': 'Tosh asked fans to share their summer plans.', - 'thumbnail': 're:^https?://.*\.jpg', + 'thumbnail': r're:^https?://.*\.jpg', # It's really reported to be published on year 2077 'upload_date': '20770610', 'timestamp': 3390510600, @@ -59,12 +85,6 @@ class ToshIE(MTVServicesInfoExtractor): 'only_matching': True, }] - @classmethod - def _transform_rtmp_url(cls, rtmp_video_url): - new_urls = super(ToshIE, cls)._transform_rtmp_url(rtmp_video_url) - new_urls['rtmp'] = rtmp_video_url.replace('viacomccstrm', 'viacommtvstrm') - return new_urls - class ComedyCentralTVIE(MTVServicesInfoExtractor): _VALID_URL = r'https?://(?:www\.)?comedycentral\.tv/(?:staffeln|shows)/(?P<id>[^/?#&]+)' diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 70909fc1c..2c8ec1417 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -21,6 +21,7 @@ from ..compat import ( compat_os_name, compat_str, compat_urllib_error, + compat_urllib_parse_unquote, compat_urllib_parse_urlencode, compat_urllib_request, compat_urlparse, @@ -29,6 +30,7 @@ from ..downloader.f4m import remove_encrypted_media from ..utils import ( NO_DEFAULT, age_restricted, + base_url, bug_reports_message, clean_html, compiled_regex_type, @@ -57,6 +59,7 @@ from ..utils import ( parse_m3u8_attributes, extract_attributes, parse_codecs, + urljoin, ) @@ -87,6 +90,9 @@ class InfoExtractor(object): Potential fields: * url Mandatory. The URL of the video file + * manifest_url + The URL of the manifest file in case of + fragmented media (DASH, hls, hds) * ext Will be calculated from URL if missing * format A human-readable description of the format ("mp4 container with h264/opus"). @@ -115,6 +121,21 @@ class InfoExtractor(object): download, lower-case. "http", "https", "rtsp", "rtmp", "rtmpe", "m3u8", "m3u8_native" or "http_dash_segments". + * fragment_base_url + Base URL for fragments. Each fragment's path + value (if present) will be relative to + this URL. + * fragments A list of fragments of a fragmented media. + Each fragment entry must contain either an url + or a path. If an url is present it should be + considered by a client. Otherwise both path and + fragment_base_url must be present. Here is + the list of all potential fields: + * "url" - fragment's URL + * "path" - fragment's path relative to + fragment_base_url + * "duration" (optional, int or float) + * "filesize" (optional, int) * preference Order number of this format. If this field is present and not None, the formats get sorted by this field, regardless of all other values. @@ -178,9 +199,10 @@ class InfoExtractor(object): uploader_url: Full URL to a personal webpage of the video uploader. location: Physical location where the video was filmed. subtitles: The available subtitles as a dictionary in the format - {language: subformats}. "subformats" is a list sorted from - lower to higher preference, each element is a dictionary - with the "ext" entry and one of: + {tag: subformats}. "tag" is usually a language code, and + "subformats" is a list sorted from lower to higher + preference, each element is a dictionary with the "ext" + entry and one of: * "data": The subtitles file contents * "url": A URL pointing to the subtitles file "ext" will be calculated from URL if missing @@ -226,7 +248,7 @@ class InfoExtractor(object): chapter_id: Id of the chapter the video belongs to, as a unicode string. The following fields should only be used when the video is an episode of some - series or programme: + series, programme or podcast: series: Title of the series or programme the video episode belongs to. season: Title of the season the video episode belongs to. @@ -662,35 +684,48 @@ class InfoExtractor(object): else: return res - def _get_login_info(self): + def _get_netrc_login_info(self, netrc_machine=None): + username = None + password = None + netrc_machine = netrc_machine or self._NETRC_MACHINE + + if self._downloader.params.get('usenetrc', False): + try: + info = netrc.netrc().authenticators(netrc_machine) + if info is not None: + username = info[0] + password = info[2] + else: + raise netrc.NetrcParseError( + 'No authenticators for %s' % netrc_machine) + except (IOError, netrc.NetrcParseError) as err: + self._downloader.report_warning( + 'parsing .netrc: %s' % error_to_compat_str(err)) + + return username, password + + def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None): """ Get the login info as (username, password) - It will look in the netrc file using the _NETRC_MACHINE value + First look for the manually specified credentials using username_option + and password_option as keys in params dictionary. If no such credentials + available look in the netrc file using the netrc_machine or _NETRC_MACHINE + value. If there's no info available, return (None, None) """ if self._downloader is None: return (None, None) - username = None - password = None downloader_params = self._downloader.params # Attempt to use provided username and password or .netrc data - if downloader_params.get('username') is not None: - username = downloader_params['username'] - password = downloader_params['password'] - elif downloader_params.get('usenetrc', False): - try: - info = netrc.netrc().authenticators(self._NETRC_MACHINE) - if info is not None: - username = info[0] - password = info[2] - else: - raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE) - except (IOError, netrc.NetrcParseError) as err: - self._downloader.report_warning('parsing .netrc: %s' % error_to_compat_str(err)) + if downloader_params.get(username_option) is not None: + username = downloader_params[username_option] + password = downloader_params[password_option] + else: + username, password = self._get_netrc_login_info(netrc_machine) - return (username, password) + return username, password def _get_tfa_info(self, note='two-factor verification code'): """ @@ -816,11 +851,14 @@ class InfoExtractor(object): json_ld = self._search_regex( r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>', html, 'JSON-LD', group='json_ld', **kwargs) + default = kwargs.get('default', NO_DEFAULT) if not json_ld: - return {} - return self._json_ld( - json_ld, video_id, fatal=kwargs.get('fatal', True), - expected_type=expected_type) + return default if default is not NO_DEFAULT else {} + # JSON-LD may be malformed and thus `fatal` should be respected. + # At the same time `default` may be passed that assumes `fatal=False` + # for _search_regex. Let's simulate the same behavior here as well. + fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False + return self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type) def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None): if isinstance(json_ld, compat_str): @@ -846,7 +884,7 @@ class InfoExtractor(object): part_of_season = e.get('partOfSeason') if isinstance(part_of_season, dict) and part_of_season.get('@type') == 'TVSeason': info['season_number'] = int_or_none(part_of_season.get('seasonNumber')) - part_of_series = e.get('partOfSeries') + part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries') if isinstance(part_of_series, dict) and part_of_series.get('@type') == 'TVSeries': info['series'] = unescapeHTML(part_of_series.get('name')) elif item_type == 'Article': @@ -860,7 +898,7 @@ class InfoExtractor(object): 'url': e.get('contentUrl'), 'title': unescapeHTML(e.get('name')), 'description': unescapeHTML(e.get('description')), - 'thumbnail': e.get('thumbnailUrl'), + 'thumbnail': e.get('thumbnailUrl') or e.get('thumbnailURL'), 'duration': parse_duration(e.get('duration')), 'timestamp': unified_timestamp(e.get('uploadDate')), 'filesize': float_or_none(e.get('contentSize')), @@ -875,16 +913,16 @@ class InfoExtractor(object): def _hidden_inputs(html): html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html) hidden_inputs = {} - for input in re.findall(r'(?i)<input([^>]+)>', html): - if not re.search(r'type=(["\'])(?:hidden|submit)\1', input): + for input in re.findall(r'(?i)(<input[^>]+>)', html): + attrs = extract_attributes(input) + if not input: continue - name = re.search(r'(?:name|id)=(["\'])(?P<value>.+?)\1', input) - if not name: + if attrs.get('type') not in ('hidden', 'submit'): continue - value = re.search(r'value=(["\'])(?P<value>.*?)\1', input) - if not value: - continue - hidden_inputs[name.group('value')] = value.group('value') + name = attrs.get('name') or attrs.get('id') + value = attrs.get('value') + if name and value is not None: + hidden_inputs[name] = value return hidden_inputs def _form_hidden_inputs(self, form_id, html): @@ -987,13 +1025,13 @@ class InfoExtractor(object): unique_formats.append(f) formats[:] = unique_formats - def _is_valid_url(self, url, video_id, item='video'): + def _is_valid_url(self, url, video_id, item='video', headers={}): url = self._proto_relative_url(url, scheme='http:') # For now assume non HTTP(S) URLs always valid if not (url.startswith('http://') or url.startswith('https://')): return True try: - self._request_webpage(url, video_id, 'Checking %s URL' % item) + self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers) return True except ExtractorError as e: if isinstance(e.cause, compat_urllib_error.URLError): @@ -1075,6 +1113,13 @@ class InfoExtractor(object): manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'], 'bootstrap info', default=None) + vcodec = None + mime_type = xpath_text( + manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'], + 'base URL', default=None) + if mime_type and mime_type.startswith('audio/'): + vcodec = 'none' + for i, media_el in enumerate(media_nodes): tbr = int_or_none(media_el.attrib.get('bitrate')) width = int_or_none(media_el.attrib.get('width')) @@ -1115,6 +1160,7 @@ class InfoExtractor(object): 'width': f.get('width') or width, 'height': f.get('height') or height, 'format_id': f.get('format_id') if not tbr else format_id, + 'vcodec': vcodec, }) formats.extend(f4m_formats) continue @@ -1126,10 +1172,12 @@ class InfoExtractor(object): formats.append({ 'format_id': format_id, 'url': manifest_url, + 'manifest_url': manifest_url, 'ext': 'flv' if bootstrap_info is not None else None, 'tbr': tbr, 'width': width, 'height': height, + 'vcodec': vcodec, 'preference': preference, }) return formats @@ -1140,7 +1188,7 @@ class InfoExtractor(object): 'url': m3u8_url, 'ext': ext, 'protocol': 'm3u8', - 'preference': preference - 1 if preference else -1, + 'preference': preference - 100 if preference else -100, 'resolution': 'multiple', 'format_note': 'Quality selection URL', } @@ -1150,13 +1198,6 @@ class InfoExtractor(object): m3u8_id=None, note=None, errnote=None, fatal=True, live=False): - formats = [self._m3u8_meta_format(m3u8_url, ext, preference, m3u8_id)] - - format_url = lambda u: ( - u - if re.match(r'^https?://', u) - else compat_urlparse.urljoin(m3u8_url, u)) - res = self._download_webpage_handle( m3u8_url, video_id, note=note or 'Downloading m3u8 information', @@ -1167,6 +1208,13 @@ class InfoExtractor(object): m3u8_doc, urlh = res m3u8_url = urlh.geturl() + formats = [self._m3u8_meta_format(m3u8_url, ext, preference, m3u8_id)] + + format_url = lambda u: ( + u + if re.match(r'^https?://', u) + else compat_urlparse.urljoin(m3u8_url, u)) + # We should try extracting formats only from master playlists [1], i.e. # playlists that describe available qualities. On the other hand media # playlists [2] should be returned as is since they contain just the media @@ -1188,35 +1236,62 @@ class InfoExtractor(object): 'protocol': entry_protocol, 'preference': preference, }] - last_info = None - last_media = None + audio_in_video_stream = {} + last_info = {} + last_media = {} for line in m3u8_doc.splitlines(): if line.startswith('#EXT-X-STREAM-INF:'): last_info = parse_m3u8_attributes(line) elif line.startswith('#EXT-X-MEDIA:'): - last_media = parse_m3u8_attributes(line) + media = parse_m3u8_attributes(line) + media_type = media.get('TYPE') + if media_type in ('VIDEO', 'AUDIO'): + group_id = media.get('GROUP-ID') + media_url = media.get('URI') + if media_url: + format_id = [] + for v in (group_id, media.get('NAME')): + if v: + format_id.append(v) + f = { + 'format_id': '-'.join(format_id), + 'url': format_url(media_url), + 'language': media.get('LANGUAGE'), + 'ext': ext, + 'protocol': entry_protocol, + 'preference': preference, + } + if media_type == 'AUDIO': + f['vcodec'] = 'none' + if group_id and not audio_in_video_stream.get(group_id): + audio_in_video_stream[group_id] = False + formats.append(f) + else: + # When there is no URI in EXT-X-MEDIA let this tag's + # data be used by regular URI lines below + last_media = media + if media_type == 'AUDIO' and group_id: + audio_in_video_stream[group_id] = True elif line.startswith('#') or not line.strip(): continue else: - if last_info is None: - formats.append({'url': format_url(line)}) - continue - tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000) + tbr = int_or_none(last_info.get('AVERAGE-BANDWIDTH') or last_info.get('BANDWIDTH'), scale=1000) format_id = [] if m3u8_id: format_id.append(m3u8_id) - last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') not in ('SUBTITLES', 'CLOSED-CAPTIONS') else None # Despite specification does not mention NAME attribute for # EXT-X-STREAM-INF it still sometimes may be present - stream_name = last_info.get('NAME') or last_media_name + stream_name = last_info.get('NAME') or last_media.get('NAME') # Bandwidth of live streams may differ over time thus making # format_id unpredictable. So it's better to keep provided # format_id intact. if not live: format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats))) + manifest_url = format_url(line.strip()) f = { 'format_id': '-'.join(format_id), - 'url': format_url(line.strip()), + 'url': manifest_url, + 'manifest_url': manifest_url, 'tbr': tbr, 'ext': ext, 'fps': float_or_none(last_info.get('FRAME-RATE')), @@ -1225,9 +1300,10 @@ class InfoExtractor(object): } resolution = last_info.get('RESOLUTION') if resolution: - width_str, height_str = resolution.split('x') - f['width'] = int(width_str) - f['height'] = int(height_str) + mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution) + if mobj: + f['width'] = int(mobj.group('width')) + f['height'] = int(mobj.group('height')) # Unified Streaming Platform mobj = re.search( r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url']) @@ -1239,11 +1315,12 @@ class InfoExtractor(object): 'abr': abr, }) f.update(parse_codecs(last_info.get('CODECS'))) - if last_media is not None: - f['m3u8_media'] = last_media - last_media = None + if audio_in_video_stream.get(last_info.get('AUDIO')) is False: + # TODO: update acodec for for audio only formats with the same GROUP-ID + f['acodec'] = 'none' formats.append(f) last_info = {} + last_media = {} return formats @staticmethod @@ -1487,12 +1564,13 @@ class InfoExtractor(object): if res is False: return [] mpd, urlh = res - mpd_base_url = re.match(r'https?://.+/', urlh.geturl()).group() + mpd_base_url = base_url(urlh.geturl()) return self._parse_mpd_formats( - compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url, formats_dict=formats_dict) + compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url, + formats_dict=formats_dict, mpd_url=mpd_url) - def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}): + def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}, mpd_url=None): """ Parse formats from MPD manifest. References: @@ -1513,52 +1591,60 @@ class InfoExtractor(object): def extract_multisegment_info(element, ms_parent_info): ms_info = ms_parent_info.copy() + + # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some + # common attributes and elements. We will only extract relevant + # for us. + def extract_common(source): + segment_timeline = source.find(_add_ns('SegmentTimeline')) + if segment_timeline is not None: + s_e = segment_timeline.findall(_add_ns('S')) + if s_e: + ms_info['total_number'] = 0 + ms_info['s'] = [] + for s in s_e: + r = int(s.get('r', 0)) + ms_info['total_number'] += 1 + r + ms_info['s'].append({ + 't': int(s.get('t', 0)), + # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60]) + 'd': int(s.attrib['d']), + 'r': r, + }) + start_number = source.get('startNumber') + if start_number: + ms_info['start_number'] = int(start_number) + timescale = source.get('timescale') + if timescale: + ms_info['timescale'] = int(timescale) + segment_duration = source.get('duration') + if segment_duration: + ms_info['segment_duration'] = int(segment_duration) + + def extract_Initialization(source): + initialization = source.find(_add_ns('Initialization')) + if initialization is not None: + ms_info['initialization_url'] = initialization.attrib['sourceURL'] + segment_list = element.find(_add_ns('SegmentList')) if segment_list is not None: + extract_common(segment_list) + extract_Initialization(segment_list) segment_urls_e = segment_list.findall(_add_ns('SegmentURL')) if segment_urls_e: ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e] - initialization = segment_list.find(_add_ns('Initialization')) - if initialization is not None: - ms_info['initialization_url'] = initialization.attrib['sourceURL'] else: segment_template = element.find(_add_ns('SegmentTemplate')) if segment_template is not None: - start_number = segment_template.get('startNumber') - if start_number: - ms_info['start_number'] = int(start_number) - segment_timeline = segment_template.find(_add_ns('SegmentTimeline')) - if segment_timeline is not None: - s_e = segment_timeline.findall(_add_ns('S')) - if s_e: - ms_info['total_number'] = 0 - ms_info['s'] = [] - for s in s_e: - r = int(s.get('r', 0)) - ms_info['total_number'] += 1 + r - ms_info['s'].append({ - 't': int(s.get('t', 0)), - # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60]) - 'd': int(s.attrib['d']), - 'r': r, - }) - else: - timescale = segment_template.get('timescale') - if timescale: - ms_info['timescale'] = int(timescale) - segment_duration = segment_template.get('duration') - if segment_duration: - ms_info['segment_duration'] = int(segment_duration) - media_template = segment_template.get('media') - if media_template: - ms_info['media_template'] = media_template + extract_common(segment_template) + media = segment_template.get('media') + if media: + ms_info['media'] = media initialization = segment_template.get('initialization') if initialization: - ms_info['initialization_url'] = initialization + ms_info['initialization'] = initialization else: - initialization = segment_template.find(_add_ns('Initialization')) - if initialization is not None: - ms_info['initialization_url'] = initialization.attrib['sourceURL'] + extract_Initialization(segment_template) return ms_info mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration')) @@ -1600,74 +1686,127 @@ class InfoExtractor(object): lang = representation_attrib.get('lang') url_el = representation.find(_add_ns('BaseURL')) filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None) + bandwidth = int_or_none(representation_attrib.get('bandwidth')) f = { 'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id, 'url': base_url, + 'manifest_url': mpd_url, 'ext': mimetype2ext(mime_type), 'width': int_or_none(representation_attrib.get('width')), 'height': int_or_none(representation_attrib.get('height')), - 'tbr': int_or_none(representation_attrib.get('bandwidth'), 1000), + 'tbr': int_or_none(bandwidth, 1000), 'asr': int_or_none(representation_attrib.get('audioSamplingRate')), 'fps': int_or_none(representation_attrib.get('frameRate')), - 'vcodec': 'none' if content_type == 'audio' else representation_attrib.get('codecs'), - 'acodec': 'none' if content_type == 'video' else representation_attrib.get('codecs'), 'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None, 'format_note': 'DASH %s' % content_type, 'filesize': filesize, } + f.update(parse_codecs(representation_attrib.get('codecs'))) representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info) - if 'segment_urls' not in representation_ms_info and 'media_template' in representation_ms_info: - if 'total_number' not in representation_ms_info and 'segment_duration': - segment_duration = float(representation_ms_info['segment_duration']) / float(representation_ms_info['timescale']) - representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration)) - media_template = representation_ms_info['media_template'] - media_template = media_template.replace('$RepresentationID$', representation_id) - media_template = re.sub(r'\$(Number|Bandwidth|Time)\$', r'%(\1)d', media_template) - media_template = re.sub(r'\$(Number|Bandwidth|Time)%([^$]+)\$', r'%(\1)\2', media_template) - media_template.replace('$$', '$') + + def prepare_template(template_name, identifiers): + t = representation_ms_info[template_name] + t = t.replace('$RepresentationID$', representation_id) + t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t) + t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t) + t.replace('$$', '$') + return t + + # @initialization is a regular template like @media one + # so it should be handled just the same way (see + # https://github.com/rg3/youtube-dl/issues/11605) + if 'initialization' in representation_ms_info: + initialization_template = prepare_template( + 'initialization', + # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and + # $Time$ shall not be included for @initialization thus + # only $Bandwidth$ remains + ('Bandwidth', )) + representation_ms_info['initialization_url'] = initialization_template % { + 'Bandwidth': bandwidth, + } + + if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info: + + media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time')) # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$ # can't be used at the same time - if '%(Number' in media_template: - representation_ms_info['segment_urls'] = [ - media_template % { + if '%(Number' in media_template and 's' not in representation_ms_info: + segment_duration = None + if 'total_number' not in representation_ms_info and 'segment_duration': + segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale']) + representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration)) + representation_ms_info['fragments'] = [{ + 'url': media_template % { 'Number': segment_number, - 'Bandwidth': representation_attrib.get('bandwidth'), - } - for segment_number in range( - representation_ms_info['start_number'], - representation_ms_info['total_number'] + representation_ms_info['start_number'])] + 'Bandwidth': bandwidth, + }, + 'duration': segment_duration, + } for segment_number in range( + representation_ms_info['start_number'], + representation_ms_info['total_number'] + representation_ms_info['start_number'])] else: - representation_ms_info['segment_urls'] = [] + # $Number*$ or $Time$ in media template with S list available + # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg + # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411 + representation_ms_info['fragments'] = [] segment_time = 0 + segment_d = None + segment_number = representation_ms_info['start_number'] def add_segment_url(): - representation_ms_info['segment_urls'].append( - media_template % { - 'Time': segment_time, - 'Bandwidth': representation_attrib.get('bandwidth'), - } - ) + segment_url = media_template % { + 'Time': segment_time, + 'Bandwidth': bandwidth, + 'Number': segment_number, + } + representation_ms_info['fragments'].append({ + 'url': segment_url, + 'duration': float_or_none(segment_d, representation_ms_info['timescale']), + }) for num, s in enumerate(representation_ms_info['s']): segment_time = s.get('t') or segment_time + segment_d = s['d'] add_segment_url() + segment_number += 1 for r in range(s.get('r', 0)): - segment_time += s['d'] + segment_time += segment_d add_segment_url() - segment_time += s['d'] - if 'segment_urls' in representation_ms_info: + segment_number += 1 + segment_time += segment_d + elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info: + # No media template + # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI + # or any YouTube dashsegments video + fragments = [] + segment_index = 0 + timescale = representation_ms_info['timescale'] + for s in representation_ms_info['s']: + duration = float_or_none(s['d'], timescale) + for r in range(s.get('r', 0) + 1): + fragments.append({ + 'url': representation_ms_info['segment_urls'][segment_index], + 'duration': duration, + }) + segment_index += 1 + representation_ms_info['fragments'] = fragments + # NB: MPD manifest may contain direct URLs to unfragmented media. + # No fragments key is present in this case. + if 'fragments' in representation_ms_info: f.update({ - 'segment_urls': representation_ms_info['segment_urls'], + 'fragments': [], 'protocol': 'http_dash_segments', }) if 'initialization_url' in representation_ms_info: - initialization_url = representation_ms_info['initialization_url'].replace('$RepresentationID$', representation_id) - f.update({ - 'initialization_url': initialization_url, - }) + initialization_url = representation_ms_info['initialization_url'] if not f.get('url'): f['url'] = initialization_url + f['fragments'].append({'url': initialization_url}) + f['fragments'].extend(representation_ms_info['fragments']) + for fragment in f['fragments']: + fragment['url'] = urljoin(base_url, fragment['url']) try: existing_format = next( fo for fo in formats @@ -1682,7 +1821,106 @@ class InfoExtractor(object): self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type) return formats - def _parse_html5_media_entries(self, base_url, webpage): + def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True): + res = self._download_webpage_handle( + ism_url, video_id, + note=note or 'Downloading ISM manifest', + errnote=errnote or 'Failed to download ISM manifest', + fatal=fatal) + if res is False: + return [] + ism, urlh = res + + return self._parse_ism_formats( + compat_etree_fromstring(ism.encode('utf-8')), urlh.geturl(), ism_id) + + def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None): + if ism_doc.get('IsLive') == 'TRUE' or ism_doc.find('Protection') is not None: + return [] + + duration = int(ism_doc.attrib['Duration']) + timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000 + + formats = [] + for stream in ism_doc.findall('StreamIndex'): + stream_type = stream.get('Type') + if stream_type not in ('video', 'audio'): + continue + url_pattern = stream.attrib['Url'] + stream_timescale = int_or_none(stream.get('TimeScale')) or timescale + stream_name = stream.get('Name') + for track in stream.findall('QualityLevel'): + fourcc = track.get('FourCC') + # TODO: add support for WVC1 and WMAP + if fourcc not in ('H264', 'AVC1', 'AACL'): + self.report_warning('%s is not a supported codec' % fourcc) + continue + tbr = int(track.attrib['Bitrate']) // 1000 + width = int_or_none(track.get('MaxWidth')) + height = int_or_none(track.get('MaxHeight')) + sampling_rate = int_or_none(track.get('SamplingRate')) + + track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern) + track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern) + + fragments = [] + fragment_ctx = { + 'time': 0, + } + stream_fragments = stream.findall('c') + for stream_fragment_index, stream_fragment in enumerate(stream_fragments): + fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time'] + fragment_repeat = int_or_none(stream_fragment.get('r')) or 1 + fragment_ctx['duration'] = int_or_none(stream_fragment.get('d')) + if not fragment_ctx['duration']: + try: + next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t']) + except IndexError: + next_fragment_time = duration + fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat + for _ in range(fragment_repeat): + fragments.append({ + 'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern), + 'duration': fragment_ctx['duration'] / stream_timescale, + }) + fragment_ctx['time'] += fragment_ctx['duration'] + + format_id = [] + if ism_id: + format_id.append(ism_id) + if stream_name: + format_id.append(stream_name) + format_id.append(compat_str(tbr)) + + formats.append({ + 'format_id': '-'.join(format_id), + 'url': ism_url, + 'manifest_url': ism_url, + 'ext': 'ismv' if stream_type == 'video' else 'isma', + 'width': width, + 'height': height, + 'tbr': tbr, + 'asr': sampling_rate, + 'vcodec': 'none' if stream_type == 'audio' else fourcc, + 'acodec': 'none' if stream_type == 'video' else fourcc, + 'protocol': 'ism', + 'fragments': fragments, + '_download_params': { + 'duration': duration, + 'timescale': stream_timescale, + 'width': width or 0, + 'height': height or 0, + 'fourcc': fourcc, + 'codec_private_data': track.get('CodecPrivateData'), + 'sampling_rate': sampling_rate, + 'channels': int_or_none(track.get('Channels', 2)), + 'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)), + 'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)), + }, + }) + return formats + + def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None): def absolute_url(video_url): return compat_urlparse.urljoin(base_url, video_url) @@ -1697,8 +1935,32 @@ class InfoExtractor(object): return f return {} + def _media_formats(src, cur_media_type): + full_url = absolute_url(src) + ext = determine_ext(full_url) + if ext == 'm3u8': + is_plain_url = False + formats = self._extract_m3u8_formats( + full_url, video_id, ext='mp4', + entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id) + elif ext == 'mpd': + is_plain_url = False + formats = self._extract_mpd_formats( + full_url, video_id, mpd_id=mpd_id) + else: + is_plain_url = True + formats = [{ + 'url': full_url, + 'vcodec': 'none' if cur_media_type == 'audio' else None, + }] + return is_plain_url, formats + entries = [] - for media_tag, media_type, media_content in re.findall(r'(?s)(<(?P<tag>video|audio)[^>]*>)(.*?)</(?P=tag)>', webpage): + media_tags = [(media_tag, media_type, '') + for media_tag, media_type + in re.findall(r'(?s)(<(video|audio)[^>]*/>)', webpage)] + media_tags.extend(re.findall(r'(?s)(<(?P<tag>video|audio)[^>]*>)(.*?)</(?P=tag)>', webpage)) + for media_tag, media_type, media_content in media_tags: media_info = { 'formats': [], 'subtitles': {}, @@ -1706,10 +1968,8 @@ class InfoExtractor(object): media_attributes = extract_attributes(media_tag) src = media_attributes.get('src') if src: - media_info['formats'].append({ - 'url': absolute_url(src), - 'vcodec': 'none' if media_type == 'audio' else None, - }) + _, formats = _media_formats(src, media_type) + media_info['formats'].extend(formats) media_info['thumbnail'] = media_attributes.get('poster') if media_content: for source_tag in re.findall(r'<source[^>]+>', media_content): @@ -1717,16 +1977,17 @@ class InfoExtractor(object): src = source_attributes.get('src') if not src: continue - f = parse_content_type(source_attributes.get('type')) - f.update({ - 'url': absolute_url(src), - 'vcodec': 'none' if media_type == 'audio' else None, - }) - media_info['formats'].append(f) + is_plain_url, formats = _media_formats(src, media_type) + if is_plain_url: + f = parse_content_type(source_attributes.get('type')) + f.update(formats[0]) + media_info['formats'].append(f) + else: + media_info['formats'].extend(formats) for track_tag in re.findall(r'<track[^>]+>', media_content): track_attributes = extract_attributes(track_tag) kind = track_attributes.get('kind') - if not kind or kind == 'subtitles': + if not kind or kind in ('subtitles', 'captions'): src = track_attributes.get('src') if not src: continue @@ -1734,10 +1995,76 @@ class InfoExtractor(object): media_info['subtitles'].setdefault(lang, []).append({ 'url': absolute_url(src), }) - if media_info['formats']: + if media_info['formats'] or media_info['subtitles']: entries.append(media_info) return entries + def _extract_akamai_formats(self, manifest_url, video_id, hosts={}): + formats = [] + hdcore_sign = 'hdcore=3.7.0' + f4m_url = re.sub(r'(https?://[^/+])/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m') + hds_host = hosts.get('hds') + if hds_host: + f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url) + if 'hdcore=' not in f4m_url: + f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign + f4m_formats = self._extract_f4m_formats( + f4m_url, video_id, f4m_id='hds', fatal=False) + for entry in f4m_formats: + entry.update({'extra_param_to_segment_url': hdcore_sign}) + formats.extend(f4m_formats) + m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8') + hls_host = hosts.get('hls') + if hls_host: + m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url) + formats.extend(self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + return formats + + def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]): + url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url) + url_base = self._search_regex(r'(?:https?|rtmp|rtsp)(://[^?]+)', url, 'format url') + http_base_url = 'http' + url_base + formats = [] + if 'm3u8' not in skip_protocols: + formats.extend(self._extract_m3u8_formats( + http_base_url + '/playlist.m3u8', video_id, 'mp4', + m3u8_entry_protocol, m3u8_id='hls', fatal=False)) + if 'f4m' not in skip_protocols: + formats.extend(self._extract_f4m_formats( + http_base_url + '/manifest.f4m', + video_id, f4m_id='hds', fatal=False)) + if 'dash' not in skip_protocols: + formats.extend(self._extract_mpd_formats( + http_base_url + '/manifest.mpd', + video_id, mpd_id='dash', fatal=False)) + if re.search(r'(?:/smil:|\.smil)', url_base): + if 'smil' not in skip_protocols: + rtmp_formats = self._extract_smil_formats( + http_base_url + '/jwplayer.smil', + video_id, fatal=False) + for rtmp_format in rtmp_formats: + rtsp_format = rtmp_format.copy() + rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path']) + del rtsp_format['play_path'] + del rtsp_format['ext'] + rtsp_format.update({ + 'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'), + 'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'), + 'protocol': 'rtsp', + }) + formats.extend([rtmp_format, rtsp_format]) + else: + for protocol in ('rtmp', 'rtsp'): + if protocol not in skip_protocols: + formats.append({ + 'url': protocol + url_base, + 'format_id': protocol, + 'protocol': protocol, + }) + return formats + def _live_title(self, name): """ Generate the title for a live video """ now = datetime.datetime.now() @@ -1858,6 +2185,12 @@ class InfoExtractor(object): headers['Ytdl-request-proxy'] = geo_verification_proxy return headers + def _generic_id(self, url): + return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0]) + + def _generic_title(self, url): + return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]) + class SearchInfoExtractor(InfoExtractor): """ diff --git a/youtube_dl/extractor/commonprotocols.py b/youtube_dl/extractor/commonprotocols.py index 5d130a170..d98331a4e 100644 --- a/youtube_dl/extractor/commonprotocols.py +++ b/youtube_dl/extractor/commonprotocols.py @@ -1,13 +1,9 @@ from __future__ import unicode_literals -import os - from .common import InfoExtractor from ..compat import ( - compat_urllib_parse_unquote, compat_urlparse, ) -from ..utils import url_basename class RtmpIE(InfoExtractor): @@ -23,8 +19,8 @@ class RtmpIE(InfoExtractor): }] def _real_extract(self, url): - video_id = compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0]) - title = compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]) + video_id = self._generic_id(url) + title = self._generic_title(url) return { 'id': video_id, 'title': title, @@ -34,3 +30,31 @@ class RtmpIE(InfoExtractor): 'format_id': compat_urlparse.urlparse(url).scheme, }], } + + +class MmsIE(InfoExtractor): + IE_DESC = False # Do not list + _VALID_URL = r'(?i)mms://.+' + + _TEST = { + # Direct MMS link + 'url': 'mms://kentro.kaist.ac.kr/200907/MilesReid(0709).wmv', + 'info_dict': { + 'id': 'MilesReid(0709)', + 'ext': 'wmv', + 'title': 'MilesReid(0709)', + }, + 'params': { + 'skip_download': True, # rtsp downloads, requiring mplayer or mpv + }, + } + + def _real_extract(self, url): + video_id = self._generic_id(url) + title = self._generic_title(url) + + return { + 'id': video_id, + 'title': title, + 'url': url, + } diff --git a/youtube_dl/extractor/condenast.py b/youtube_dl/extractor/condenast.py index 15fabbb1c..8d8f60598 100644 --- a/youtube_dl/extractor/condenast.py +++ b/youtube_dl/extractor/condenast.py @@ -143,7 +143,8 @@ class CondeNastIE(InfoExtractor): }) self._sort_formats(formats) - info = self._search_json_ld(webpage, video_id) if url_type != 'embed' else {} + info = self._search_json_ld( + webpage, video_id, fatal=False) if url_type != 'embed' else {} info.update({ 'id': video_id, 'formats': formats, diff --git a/youtube_dl/extractor/coub.py b/youtube_dl/extractor/coub.py index a901b8d22..5fa1f006b 100644 --- a/youtube_dl/extractor/coub.py +++ b/youtube_dl/extractor/coub.py @@ -20,7 +20,7 @@ class CoubIE(InfoExtractor): 'id': '5u5n1', 'ext': 'mp4', 'title': 'The Matrix Moonwalk', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 4.6, 'timestamp': 1428527772, 'upload_date': '20150408', diff --git a/youtube_dl/extractor/crackle.py b/youtube_dl/extractor/crackle.py index 79238cce7..377fb45e9 100644 --- a/youtube_dl/extractor/crackle.py +++ b/youtube_dl/extractor/crackle.py @@ -1,19 +1,29 @@ # coding: utf-8 -from __future__ import unicode_literals +from __future__ import unicode_literals, division from .common import InfoExtractor from ..utils import int_or_none class CrackleIE(InfoExtractor): - _VALID_URL = r'(?:crackle:|https?://(?:www\.)?crackle\.com/(?:playlist/\d+/|(?:[^/]+/)+))(?P<id>\d+)' + _VALID_URL = r'(?:crackle:|https?://(?:(?:www|m)\.)?crackle\.com/(?:playlist/\d+/|(?:[^/]+/)+))(?P<id>\d+)' _TEST = { - 'url': 'http://www.crackle.com/the-art-of-more/2496419', + 'url': 'http://www.crackle.com/comedians-in-cars-getting-coffee/2498934', 'info_dict': { - 'id': '2496419', + 'id': '2498934', 'ext': 'mp4', - 'title': 'Heavy Lies the Head', - 'description': 'md5:bb56aa0708fe7b9a4861535f15c3abca', + 'title': 'Everybody Respects A Bloody Nose', + 'description': 'Jerry is kaffeeklatsching in L.A. with funnyman J.B. Smoove (Saturday Night Live, Real Husbands of Hollywood). They’re headed for brew at 10 Speed Coffee in a 1964 Studebaker Avanti.', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 906, + 'series': 'Comedians In Cars Getting Coffee', + 'season_number': 8, + 'episode_number': 4, + 'subtitles': { + 'en-US': [{ + 'ext': 'ttml', + }] + }, }, 'params': { # m3u8 download @@ -21,10 +31,30 @@ class CrackleIE(InfoExtractor): } } - # extracted from http://legacyweb-us.crackle.com/flash/QueryReferrer.ashx - _SUBTITLE_SERVER = 'http://web-us-az.crackle.com' - _UPLYNK_OWNER_ID = 'e8773f7770a44dbd886eee4fca16a66b' - _THUMBNAIL_TEMPLATE = 'http://images-us-am.crackle.com/%stnl_1920x1080.jpg?ts=20140107233116?c=635333335057637614' + _THUMBNAIL_RES = [ + (120, 90), + (208, 156), + (220, 124), + (220, 220), + (240, 180), + (250, 141), + (315, 236), + (320, 180), + (360, 203), + (400, 300), + (421, 316), + (460, 330), + (460, 460), + (462, 260), + (480, 270), + (587, 330), + (640, 480), + (700, 330), + (700, 394), + (854, 480), + (1024, 1024), + (1920, 1080), + ] # extracted from http://legacyweb-us.crackle.com/flash/ReferrerRedirect.ashx _MEDIA_FILE_SLOTS = { @@ -48,19 +78,32 @@ class CrackleIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) + + config_doc = self._download_xml( + 'http://legacyweb-us.crackle.com/flash/QueryReferrer.ashx?site=16', + video_id, 'Downloading config') + item = self._download_xml( 'http://legacyweb-us.crackle.com/app/revamp/vidwallcache.aspx?flags=-1&fm=%s' % video_id, - video_id).find('i') + video_id, headers=self.geo_verification_headers()).find('i') title = item.attrib['t'] - thumbnail = None subtitles = {} formats = self._extract_m3u8_formats( - 'http://content.uplynk.com/ext/%s/%s.m3u8' % (self._UPLYNK_OWNER_ID, video_id), + 'http://content.uplynk.com/ext/%s/%s.m3u8' % (config_doc.attrib['strUplynkOwnerId'], video_id), video_id, 'mp4', m3u8_id='hls', fatal=None) + thumbnails = [] path = item.attrib.get('p') if path: - thumbnail = self._THUMBNAIL_TEMPLATE % path + for width, height in self._THUMBNAIL_RES: + res = '%dx%d' % (width, height) + thumbnails.append({ + 'id': res, + 'url': 'http://images-us-am.crackle.com/%stnl_%s.jpg' % (path, res), + 'width': width, + 'height': height, + 'resolution': res, + }) http_base_url = 'http://ahttp.crackle.com/' + path for mfs_path, mfs_info in self._MEDIA_FILE_SLOTS.items(): formats.append({ @@ -75,21 +118,22 @@ class CrackleIE(InfoExtractor): if locale and v: if locale not in subtitles: subtitles[locale] = [] - subtitles[locale] = [{ - 'url': '%s/%s%s_%s.xml' % (self._SUBTITLE_SERVER, path, locale, v), - 'ext': 'ttml', - }] + for url_ext, ext in (('vtt', 'vtt'), ('xml', 'tt')): + subtitles.setdefault(locale, []).append({ + 'url': '%s/%s%s_%s.%s' % (config_doc.attrib['strSubtitleServer'], path, locale, v, url_ext), + 'ext': ext, + }) self._sort_formats(formats, ('width', 'height', 'tbr', 'format_id')) return { 'id': video_id, 'title': title, 'description': item.attrib.get('d'), - 'duration': int(item.attrib.get('r'), 16) if item.attrib.get('r') else None, + 'duration': int(item.attrib.get('r'), 16) / 1000 if item.attrib.get('r') else None, 'series': item.attrib.get('sn'), 'season_number': int_or_none(item.attrib.get('se')), 'episode_number': int_or_none(item.attrib.get('ep')), - 'thumbnail': thumbnail, + 'thumbnails': thumbnails, 'subtitles': subtitles, 'formats': formats, } diff --git a/youtube_dl/extractor/criterion.py b/youtube_dl/extractor/criterion.py index dedb810a0..f7815b905 100644 --- a/youtube_dl/extractor/criterion.py +++ b/youtube_dl/extractor/criterion.py @@ -1,13 +1,11 @@ -# -*- coding: utf-8 -*- +# coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor class CriterionIE(InfoExtractor): - _VALID_URL = r'https?://www\.criterion\.com/films/(?P<id>[0-9]+)-.+' + _VALID_URL = r'https?://(?:www\.)?criterion\.com/films/(?P<id>[0-9]+)-.+' _TEST = { 'url': 'http://www.criterion.com/films/184-le-samourai', 'md5': 'bc51beba55685509883a9a7830919ec3', @@ -16,20 +14,20 @@ class CriterionIE(InfoExtractor): 'ext': 'mp4', 'title': 'Le Samouraï', 'description': 'md5:a2b4b116326558149bef81f76dcbb93f', + 'thumbnail': r're:^https?://.*\.jpg$', } } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) final_url = self._search_regex( - r'so.addVariable\("videoURL", "(.+?)"\)\;', webpage, 'video url') + r'so\.addVariable\("videoURL", "(.+?)"\)\;', webpage, 'video url') title = self._og_search_title(webpage) description = self._html_search_meta('description', webpage) thumbnail = self._search_regex( - r'so.addVariable\("thumbnailURL", "(.+?)"\)\;', + r'so\.addVariable\("thumbnailURL", "(.+?)"\)\;', webpage, 'thumbnail url') return { diff --git a/youtube_dl/extractor/crooksandliars.py b/youtube_dl/extractor/crooksandliars.py index 443eb7691..7fb782db7 100644 --- a/youtube_dl/extractor/crooksandliars.py +++ b/youtube_dl/extractor/crooksandliars.py @@ -16,7 +16,7 @@ class CrooksAndLiarsIE(InfoExtractor): 'ext': 'mp4', 'title': 'Fox & Friends Says Protecting Atheists From Discrimination Is Anti-Christian!', 'description': 'md5:e1a46ad1650e3a5ec7196d432799127f', - 'thumbnail': 're:^https?://.*\.jpg', + 'thumbnail': r're:^https?://.*\.jpg', 'timestamp': 1428207000, 'upload_date': '20150405', 'uploader': 'Heather', diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index 90a64303d..109d1c5a8 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -1,4 +1,4 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals import re @@ -34,22 +34,58 @@ from ..aes import ( class CrunchyrollBaseIE(InfoExtractor): + _LOGIN_URL = 'https://www.crunchyroll.com/login' + _LOGIN_FORM = 'login_form' _NETRC_MACHINE = 'crunchyroll' def _login(self): (username, password) = self._get_login_info() if username is None: return - self.report_login() - login_url = 'https://www.crunchyroll.com/?a=formhandler' - data = urlencode_postdata({ - 'formname': 'RpcApiUser_Login', - 'name': username, - 'password': password, + + login_page = self._download_webpage( + self._LOGIN_URL, None, 'Downloading login page') + + def is_logged(webpage): + return '<title>Redirecting' in webpage + + # Already logged in + if is_logged(login_page): + return + + login_form_str = self._search_regex( + r'(?P<form><form[^>]+?id=(["\'])%s\2[^>]*>)' % self._LOGIN_FORM, + login_page, 'login form', group='form') + + post_url = extract_attributes(login_form_str).get('action') + if not post_url: + post_url = self._LOGIN_URL + elif not post_url.startswith('http'): + post_url = compat_urlparse.urljoin(self._LOGIN_URL, post_url) + + login_form = self._form_hidden_inputs(self._LOGIN_FORM, login_page) + + login_form.update({ + 'login_form[name]': username, + 'login_form[password]': password, }) - login_request = sanitized_Request(login_url, data) - login_request.add_header('Content-Type', 'application/x-www-form-urlencoded') - self._download_webpage(login_request, None, False, 'Wrong login info') + + response = self._download_webpage( + post_url, None, 'Logging in', 'Wrong login info', + data=urlencode_postdata(login_form), + headers={'Content-Type': 'application/x-www-form-urlencoded'}) + + # Successful login + if is_logged(response): + return + + error = self._html_search_regex( + '(?s)<ul[^>]+class=["\']messages["\'][^>]*>(.+?)</ul>', + response, 'error message', default=None) + if error: + raise ExtractorError('Unable to login: %s' % error, expected=True) + + raise ExtractorError('Unable to log in') def _real_initialize(self): self._login() @@ -106,7 +142,7 @@ class CrunchyrollIE(CrunchyrollBaseIE): 'ext': 'flv', 'title': 'Culture Japan Episode 1 – Rebuilding Japan after the 3.11', 'description': 'md5:2fbc01f90b87e8e9137296f37b461c12', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'uploader': 'Danny Choo Network', 'upload_date': '20120213', }, @@ -114,6 +150,41 @@ class CrunchyrollIE(CrunchyrollBaseIE): # rtmp 'skip_download': True, }, + 'skip': 'Video gone', + }, { + 'url': 'http://www.crunchyroll.com/rezero-starting-life-in-another-world-/episode-5-the-morning-of-our-promise-is-still-distant-702409', + 'info_dict': { + 'id': '702409', + 'ext': 'mp4', + 'title': 'Re:ZERO -Starting Life in Another World- Episode 5 – The Morning of Our Promise Is Still Distant', + 'description': 'md5:97664de1ab24bbf77a9c01918cb7dca9', + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': 'TV TOKYO', + 'upload_date': '20160508', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + 'url': 'http://www.crunchyroll.com/konosuba-gods-blessing-on-this-wonderful-world/episode-1-give-me-deliverance-from-this-judicial-injustice-727589', + 'info_dict': { + 'id': '727589', + 'ext': 'mp4', + 'title': "KONOSUBA -God's blessing on this wonderful world! 2 Episode 1 – Give Me Deliverance from this Judicial Injustice!", + 'description': 'md5:cbcf05e528124b0f3a0a419fc805ea7d', + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': 'Kadokawa Pictures Inc.', + 'upload_date': '20170118', + 'series': "KONOSUBA -God's blessing on this wonderful world!", + 'season_number': 2, + 'episode': 'Give Me Deliverance from this Judicial Injustice!', + 'episode_number': 1, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, }, { 'url': 'http://www.crunchyroll.fr/girl-friend-beta/episode-11-goodbye-la-mode-661697', 'only_matching': True, @@ -184,8 +255,7 @@ class CrunchyrollIE(CrunchyrollBaseIE): output += 'WrapStyle: %s\n' % sub_root.attrib['wrap_style'] output += 'PlayResX: %s\n' % sub_root.attrib['play_res_x'] output += 'PlayResY: %s\n' % sub_root.attrib['play_res_y'] - output += """ScaledBorderAndShadow: yes - + output += """ [V4+ Styles] Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding """ @@ -336,9 +406,18 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text if video_encode_id in video_encode_ids: continue video_encode_ids.append(video_encode_id) + + video_file = xpath_text(stream_info, './file') + if not video_file: + continue + if video_file.startswith('http'): + formats.extend(self._extract_m3u8_formats( + video_file, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + continue + video_url = xpath_text(stream_info, './host') - video_play_path = xpath_text(stream_info, './file') - if not video_url or not video_play_path: + if not video_url: continue metadata = stream_info.find('./metadata') format_info = { @@ -353,7 +432,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text parsed_video_url = compat_urlparse.urlparse(video_url) direct_video_url = compat_urlparse.urlunparse(parsed_video_url._replace( netloc='v.lvlt.crcdn.net', - path='%s/%s' % (remove_end(parsed_video_url.path, '/'), video_play_path.split(':')[-1]))) + path='%s/%s' % (remove_end(parsed_video_url.path, '/'), video_file.split(':')[-1]))) if self._is_valid_url(direct_video_url, video_id, video_format): format_info.update({ 'url': direct_video_url, @@ -363,7 +442,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text format_info.update({ 'url': video_url, - 'play_path': video_play_path, + 'play_path': video_file, 'ext': 'flv', }) formats.append(format_info) @@ -378,6 +457,18 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text subtitles = self.extract_subtitles(video_id, webpage) + # webpage provide more accurate data than series_title from XML + series = self._html_search_regex( + r'id=["\']showmedia_about_episode_num[^>]+>\s*<a[^>]+>([^<]+)', + webpage, 'series', default=xpath_text(metadata, 'series_title')) + + episode = xpath_text(metadata, 'episode_title') + episode_number = int_or_none(xpath_text(metadata, 'episode_number')) + + season_number = int_or_none(self._search_regex( + r'(?s)<h4[^>]+id=["\']showmedia_about_episode_num[^>]+>.+?</h4>\s*<h4>\s*Season (\d+)', + webpage, 'season number', default=None)) + return { 'id': video_id, 'title': video_title, @@ -385,9 +476,10 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text 'thumbnail': xpath_text(metadata, 'episode_image_url'), 'uploader': video_uploader, 'upload_date': video_upload_date, - 'series': xpath_text(metadata, 'series_title'), - 'episode': xpath_text(metadata, 'episode_title'), - 'episode_number': int_or_none(xpath_text(metadata, 'episode_number')), + 'series': series, + 'season_number': season_number, + 'episode': episode, + 'episode_number': episode_number, 'subtitles': subtitles, 'formats': formats, } diff --git a/youtube_dl/extractor/cspan.py b/youtube_dl/extractor/cspan.py index 7e5d4f227..d4576160b 100644 --- a/youtube_dl/extractor/cspan.py +++ b/youtube_dl/extractor/cspan.py @@ -12,6 +12,7 @@ from ..utils import ( ExtractorError, ) from .senateisvp import SenateISVPIE +from .ustream import UstreamIE class CSpanIE(InfoExtractor): @@ -22,14 +23,13 @@ class CSpanIE(InfoExtractor): 'md5': '94b29a4f131ff03d23471dd6f60b6a1d', 'info_dict': { 'id': '315139', - 'ext': 'mp4', 'title': 'Attorney General Eric Holder on Voting Rights Act Decision', - 'description': 'Attorney General Eric Holder speaks to reporters following the Supreme Court decision in [Shelby County v. Holder], in which the court ruled that the preclearance provisions of the Voting Rights Act could not be enforced.', }, + 'playlist_mincount': 2, 'skip': 'Regularly fails on travis, for unknown reasons', }, { 'url': 'http://www.c-span.org/video/?c4486943/cspan-international-health-care-models', - 'md5': '8e5fbfabe6ad0f89f3012a7943c1287b', + # md5 is unstable 'info_dict': { 'id': 'c4486943', 'ext': 'mp4', @@ -38,14 +38,11 @@ class CSpanIE(InfoExtractor): } }, { 'url': 'http://www.c-span.org/video/?318608-1/gm-ignition-switch-recall', - 'md5': '2ae5051559169baadba13fc35345ae74', 'info_dict': { 'id': '342759', - 'ext': 'mp4', 'title': 'General Motors Ignition Switch Recall', - 'duration': 14848, - 'description': 'md5:118081aedd24bf1d3b68b3803344e7f3' }, + 'playlist_mincount': 6, }, { # Video from senate.gov 'url': 'http://www.c-span.org/video/?104517-1/immigration-reforms-needed-protect-skilled-american-workers', @@ -57,12 +54,30 @@ class CSpanIE(InfoExtractor): 'params': { 'skip_download': True, # m3u8 downloads } + }, { + # Ustream embedded video + 'url': 'https://www.c-span.org/video/?114917-1/armed-services', + 'info_dict': { + 'id': '58428542', + 'ext': 'flv', + 'title': 'USHR07 Armed Services Committee', + 'description': 'hsas00-2118-20150204-1000et-07\n\n\nUSHR07 Armed Services Committee', + 'timestamp': 1423060374, + 'upload_date': '20150204', + 'uploader': 'HouseCommittee', + 'uploader_id': '12987475', + }, }] def _real_extract(self, url): video_id = self._match_id(url) video_type = None webpage = self._download_webpage(url, video_id) + + ustream_url = UstreamIE._extract_url(webpage) + if ustream_url: + return self.url_result(ustream_url, UstreamIE.ie_key()) + # We first look for clipid, because clipprog always appears before patterns = [r'id=\'clip(%s)\'\s*value=\'([0-9]+)\'' % t for t in ('id', 'prog')] results = list(filter(None, (re.search(p, webpage) for p in patterns))) diff --git a/youtube_dl/extractor/ctsnews.py b/youtube_dl/extractor/ctsnews.py index 1622fc844..d565335cf 100644 --- a/youtube_dl/extractor/ctsnews.py +++ b/youtube_dl/extractor/ctsnews.py @@ -1,13 +1,12 @@ -# -*- coding: utf-8 -*- +# coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import parse_iso8601, ExtractorError +from ..utils import unified_timestamp class CtsNewsIE(InfoExtractor): IE_DESC = '華視新聞' - # https connection failed (Connection reset) _VALID_URL = r'https?://news\.cts\.com\.tw/[a-z]+/[a-z]+/\d+/(?P<id>\d+)\.html' _TESTS = [{ 'url': 'http://news.cts.com.tw/cts/international/201501/201501291578109.html', @@ -16,7 +15,7 @@ class CtsNewsIE(InfoExtractor): 'id': '201501291578109', 'ext': 'mp4', 'title': '以色列.真主黨交火 3人死亡', - 'description': 'md5:95e9b295c898b7ff294f09d450178d7d', + 'description': '以色列和黎巴嫩真主黨,爆發五年最嚴重衝突,雙方砲轟交火,兩名以軍死亡,還有一名西班牙籍的聯合國維和人...', 'timestamp': 1422528540, 'upload_date': '20150129', } @@ -28,61 +27,55 @@ class CtsNewsIE(InfoExtractor): 'id': '201309031304098', 'ext': 'mp4', 'title': '韓國31歲童顏男 貌如十多歲小孩', - 'description': 'md5:f183feeba3752b683827aab71adad584', - 'thumbnail': 're:^https?://.*\.jpg$', + 'description': '越有年紀的人,越希望看起來年輕一點,而南韓卻有一位31歲的男子,看起來像是11、12歲的小孩,身...', + 'thumbnail': r're:^https?://.*\.jpg$', 'timestamp': 1378205880, 'upload_date': '20130903', } }, { # With Youtube embedded video 'url': 'http://news.cts.com.tw/cts/money/201501/201501291578003.html', - 'md5': '1d842c771dc94c8c3bca5af2cc1db9c5', - 'add_ie': ['Youtube'], + 'md5': 'e4726b2ccd70ba2c319865e28f0a91d1', 'info_dict': { 'id': 'OVbfO7d0_hQ', 'ext': 'mp4', 'title': 'iPhone6熱銷 蘋果財報亮眼', 'description': 'md5:f395d4f485487bb0f992ed2c4b07aa7d', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'upload_date': '20150128', 'uploader_id': 'TBSCTS', 'uploader': '中華電視公司', - } + }, + 'add_ie': ['Youtube'], }] def _real_extract(self, url): news_id = self._match_id(url) page = self._download_webpage(url, news_id) - if self._search_regex(r'(CTSPlayer2)', page, 'CTSPlayer2 identifier', default=None): - feed_url = self._html_search_regex( - r'(http://news\.cts\.com\.tw/action/mp4feed\.php\?news_id=\d+)', - page, 'feed url') - video_url = self._download_webpage( - feed_url, news_id, note='Fetching feed') + news_id = self._hidden_inputs(page).get('get_id') + + if news_id: + mp4_feed = self._download_json( + 'http://news.cts.com.tw/action/test_mp4feed.php', + news_id, note='Fetching feed', query={'news_id': news_id}) + video_url = mp4_feed['source_url'] else: self.to_screen('Not CTSPlayer video, trying Youtube...') youtube_url = self._search_regex( - r'src="(//www\.youtube\.com/embed/[^"]+)"', page, 'youtube url', - default=None) - if not youtube_url: - raise ExtractorError('The news includes no videos!', expected=True) + r'src="(//www\.youtube\.com/embed/[^"]+)"', page, 'youtube url') - return { - '_type': 'url', - 'url': youtube_url, - 'ie_key': 'Youtube', - } + return self.url_result(youtube_url, ie='Youtube') description = self._html_search_meta('description', page) - title = self._html_search_meta('title', page) + title = self._html_search_meta('title', page, fatal=True) thumbnail = self._html_search_meta('image', page) datetime_str = self._html_search_regex( - r'(\d{4}/\d{2}/\d{2} \d{2}:\d{2})', page, 'date and time') - # Transform into ISO 8601 format with timezone info - datetime_str = datetime_str.replace('/', '-') + ':00+0800' - timestamp = parse_iso8601(datetime_str, delimiter=' ') + r'(\d{4}/\d{2}/\d{2} \d{2}:\d{2})', page, 'date and time', fatal=False) + timestamp = None + if datetime_str: + timestamp = unified_timestamp(datetime_str) - 8 * 3600 return { 'id': news_id, diff --git a/youtube_dl/extractor/ctv.py b/youtube_dl/extractor/ctv.py deleted file mode 100644 index 5807fbac9..000000000 --- a/youtube_dl/extractor/ctv.py +++ /dev/null @@ -1,30 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor - - -class CTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?ctv\.ca/video/player\?vid=(?P<id>[0-9.]+)' - _TESTS = [{ - 'url': 'http://www.ctv.ca/video/player?vid=706966', - 'md5': 'ff2ebbeae0aa2dcc32a830c3fd69b7b0', - 'info_dict': { - 'id': '706966', - 'ext': 'mp4', - 'title': 'Larry Day and Richard Jutras on the TIFF red carpet of \'Stonewall\'', - 'description': 'etalk catches up with Larry Day and Richard Jutras on the TIFF red carpet of "Stonewall”.', - 'upload_date': '20150919', - 'timestamp': 1442624700, - }, - 'expected_warnings': ['HTTP Error 404'], - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - return { - '_type': 'url_transparent', - 'id': video_id, - 'url': '9c9media:ctv_web:%s' % video_id, - 'ie_key': 'NineCNineMedia', - } diff --git a/youtube_dl/extractor/ctvnews.py b/youtube_dl/extractor/ctvnews.py index 1023b6130..55a127b76 100644 --- a/youtube_dl/extractor/ctvnews.py +++ b/youtube_dl/extractor/ctvnews.py @@ -8,7 +8,7 @@ from ..utils import orderedSet class CTVNewsIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?ctvnews\.ca/(?:video\?(?:clip|playlist|bin)Id=|.*?)(?P<id>[0-9.]+)' + _VALID_URL = r'https?://(?:.+?\.)?ctvnews\.ca/(?:video\?(?:clip|playlist|bin)Id=|.*?)(?P<id>[0-9.]+)' _TESTS = [{ 'url': 'http://www.ctvnews.ca/video?clipId=901995', 'md5': '10deb320dc0ccb8d01d34d12fc2ea672', @@ -40,6 +40,9 @@ class CTVNewsIE(InfoExtractor): }, { 'url': 'http://www.ctvnews.ca/canadiens-send-p-k-subban-to-nashville-in-blockbuster-trade-1.2967231', 'only_matching': True, + }, { + 'url': 'http://vancouverisland.ctvnews.ca/video?clipId=761241', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/cultureunplugged.py b/youtube_dl/extractor/cultureunplugged.py index 9c764fe68..bcdf27323 100644 --- a/youtube_dl/extractor/cultureunplugged.py +++ b/youtube_dl/extractor/cultureunplugged.py @@ -1,9 +1,13 @@ from __future__ import unicode_literals import re +import time from .common import InfoExtractor -from ..utils import int_or_none +from ..utils import ( + int_or_none, + HEADRequest, +) class CultureUnpluggedIE(InfoExtractor): @@ -17,7 +21,7 @@ class CultureUnpluggedIE(InfoExtractor): 'ext': 'mp4', 'title': 'The Next, Best West', 'description': 'md5:0423cd00833dea1519cf014e9d0903b1', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'creator': 'Coldstream Creative', 'duration': 2203, 'view_count': int, @@ -32,6 +36,9 @@ class CultureUnpluggedIE(InfoExtractor): video_id = mobj.group('id') display_id = mobj.group('display_id') or video_id + # request setClientTimezone.php to get PHPSESSID cookie which is need to get valid json data in the next request + self._request_webpage(HEADRequest( + 'http://www.cultureunplugged.com/setClientTimezone.php?timeOffset=%d' % -(time.timezone / 3600)), display_id) movie_data = self._download_json( 'http://www.cultureunplugged.com/movie-data/cu-%s.json' % video_id, display_id) diff --git a/youtube_dl/extractor/curiositystream.py b/youtube_dl/extractor/curiositystream.py new file mode 100644 index 000000000..e3c99468c --- /dev/null +++ b/youtube_dl/extractor/curiositystream.py @@ -0,0 +1,120 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + urlencode_postdata, + compat_str, + ExtractorError, +) + + +class CuriosityStreamBaseIE(InfoExtractor): + _NETRC_MACHINE = 'curiositystream' + _auth_token = None + _API_BASE_URL = 'https://api.curiositystream.com/v1/' + + def _handle_errors(self, result): + error = result.get('error', {}).get('message') + if error: + if isinstance(error, dict): + error = ', '.join(error.values()) + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, error), expected=True) + + def _call_api(self, path, video_id): + headers = {} + if self._auth_token: + headers['X-Auth-Token'] = self._auth_token + result = self._download_json( + self._API_BASE_URL + path, video_id, headers=headers) + self._handle_errors(result) + return result['data'] + + def _real_initialize(self): + (email, password) = self._get_login_info() + if email is None: + return + result = self._download_json( + self._API_BASE_URL + 'login', None, data=urlencode_postdata({ + 'email': email, + 'password': password, + })) + self._handle_errors(result) + self._auth_token = result['message']['auth_token'] + + def _extract_media_info(self, media): + video_id = compat_str(media['id']) + limelight_media_id = media['limelight_media_id'] + title = media['title'] + + subtitles = {} + for closed_caption in media.get('closed_captions', []): + sub_url = closed_caption.get('file') + if not sub_url: + continue + lang = closed_caption.get('code') or closed_caption.get('language') or 'en' + subtitles.setdefault(lang, []).append({ + 'url': sub_url, + }) + + return { + '_type': 'url_transparent', + 'id': video_id, + 'url': 'limelight:media:' + limelight_media_id, + 'title': title, + 'description': media.get('description'), + 'thumbnail': media.get('image_large') or media.get('image_medium') or media.get('image_small'), + 'duration': int_or_none(media.get('duration')), + 'tags': media.get('tags'), + 'subtitles': subtitles, + 'ie_key': 'LimelightMedia', + } + + +class CuriosityStreamIE(CuriosityStreamBaseIE): + IE_NAME = 'curiositystream' + _VALID_URL = r'https?://app\.curiositystream\.com/video/(?P<id>\d+)' + _TEST = { + 'url': 'https://app.curiositystream.com/video/2', + 'md5': 'a0074c190e6cddaf86900b28d3e9ee7a', + 'info_dict': { + 'id': '2', + 'ext': 'mp4', + 'title': 'How Did You Develop The Internet?', + 'description': 'Vint Cerf, Google\'s Chief Internet Evangelist, describes how he and Bob Kahn created the internet.', + 'timestamp': 1448388615, + 'upload_date': '20151124', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + media = self._call_api('media/' + video_id, video_id) + return self._extract_media_info(media) + + +class CuriosityStreamCollectionIE(CuriosityStreamBaseIE): + IE_NAME = 'curiositystream:collection' + _VALID_URL = r'https?://app\.curiositystream\.com/collection/(?P<id>\d+)' + _TEST = { + 'url': 'https://app.curiositystream.com/collection/2', + 'info_dict': { + 'id': '2', + 'title': 'Curious Minds: The Internet', + 'description': 'How is the internet shaping our lives in the 21st Century?', + }, + 'playlist_mincount': 17, + } + + def _real_extract(self, url): + collection_id = self._match_id(url) + collection = self._call_api( + 'collections/' + collection_id, collection_id) + entries = [] + for media in collection.get('media', []): + entries.append(self._extract_media_info(media)) + return self.playlist_result( + entries, collection_id, + collection.get('title'), collection.get('description')) diff --git a/youtube_dl/extractor/cwtv.py b/youtube_dl/extractor/cwtv.py index c66c359cf..1ab9333b2 100644 --- a/youtube_dl/extractor/cwtv.py +++ b/youtube_dl/extractor/cwtv.py @@ -28,7 +28,8 @@ class CWTVIE(InfoExtractor): 'params': { # m3u8 download 'skip_download': True, - } + }, + 'skip': 'redirect to http://cwtv.com/shows/arrow/', }, { 'url': 'http://www.cwseed.com/shows/whose-line-is-it-anyway/jeff-davis-4/?play=24282b12-ead2-42f2-95ad-26770c2c6088', 'info_dict': { @@ -44,10 +45,6 @@ class CWTVIE(InfoExtractor): 'upload_date': '20151006', 'timestamp': 1444107300, }, - 'params': { - # m3u8 download - 'skip_download': True, - } }, { 'url': 'http://cwtv.com/thecw/chroniclesofcisco/?play=8adebe35-f447-465f-ab52-e863506ff6d6', 'only_matching': True, @@ -61,11 +58,30 @@ class CWTVIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - video_data = self._download_json( - 'http://metaframe.digitalsmiths.tv/v2/CWtv/assets/%s/partner/132?format=json' % video_id, video_id) - - formats = self._extract_m3u8_formats( - video_data['videos']['variantplaylist']['uri'], video_id, 'mp4') + video_data = None + formats = [] + for partner in (154, 213): + vdata = self._download_json( + 'http://metaframe.digitalsmiths.tv/v2/CWtv/assets/%s/partner/%d?format=json' % (video_id, partner), video_id, fatal=False) + if not vdata: + continue + video_data = vdata + for quality, quality_data in vdata.get('videos', {}).items(): + quality_url = quality_data.get('uri') + if not quality_url: + continue + if quality == 'variantplaylist': + formats.extend(self._extract_m3u8_formats( + quality_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) + else: + tbr = int_or_none(quality_data.get('bitrate')) + format_id = 'http' + ('-%d' % tbr if tbr else '') + if self._is_valid_url(quality_url, video_id, format_id): + formats.append({ + 'format_id': format_id, + 'url': quality_url, + 'tbr': tbr, + }) self._sort_formats(formats) thumbnails = [{ diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 496883d15..31bf5faf6 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -58,7 +58,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor): 'ext': 'mp4', 'title': 'Steam Machine Models, Pricing Listed on Steam Store - IGN News', 'description': 'Several come bundled with the Steam Controller.', - 'thumbnail': 're:^https?:.*\.(?:jpg|png)$', + 'thumbnail': r're:^https?:.*\.(?:jpg|png)$', 'duration': 74, 'timestamp': 1425657362, 'upload_date': '20150306', @@ -94,7 +94,8 @@ class DailymotionIE(DailymotionBaseInfoExtractor): 'title': 'Leanna Decker - Cyber Girl Of The Year Desires Nude [Playboy Plus]', 'uploader': 'HotWaves1012', 'age_limit': 18, - } + }, + 'skip': 'video gone', }, # geo-restricted, player v5 { @@ -144,7 +145,8 @@ class DailymotionIE(DailymotionBaseInfoExtractor): player_v5 = self._search_regex( [r'buildPlayer\(({.+?})\);\n', # See https://github.com/rg3/youtube-dl/issues/7826 r'playerV5\s*=\s*dmp\.create\([^,]+?,\s*({.+?})\);', - r'buildPlayer\(({.+?})\);'], + r'buildPlayer\(({.+?})\);', + r'var\s+config\s*=\s*({.+?});'], webpage, 'player v5', default=None) if player_v5: player = self._parse_json(player_v5, video_id) @@ -394,7 +396,7 @@ class DailymotionUserIE(DailymotionPlaylistIE): class DailymotionCloudIE(DailymotionBaseInfoExtractor): - _VALID_URL_PREFIX = r'http://api\.dmcloud\.net/(?:player/)?embed/' + _VALID_URL_PREFIX = r'https?://api\.dmcloud\.net/(?:player/)?embed/' _VALID_URL = r'%s[^/]+/(?P<id>[^/?]+)' % _VALID_URL_PREFIX _VALID_EMBED_URL = r'%s[^/]+/[^\'"]+' % _VALID_URL_PREFIX diff --git a/youtube_dl/extractor/daum.py b/youtube_dl/extractor/daum.py index b5c310ccb..76f021892 100644 --- a/youtube_dl/extractor/daum.py +++ b/youtube_dl/extractor/daum.py @@ -1,4 +1,4 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals @@ -32,7 +32,7 @@ class DaumIE(InfoExtractor): 'title': '마크 헌트 vs 안토니오 실바', 'description': 'Mark Hunt vs Antonio Silva', 'upload_date': '20131217', - 'thumbnail': 're:^https?://.*\.(?:jpg|png)', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', 'duration': 2117, 'view_count': int, 'comment_count': int, @@ -45,7 +45,7 @@ class DaumIE(InfoExtractor): 'title': '1297회, \'아빠 아들로 태어나길 잘 했어\' 민수, 감동의 눈물[아빠 어디가] 20150118', 'description': 'md5:79794514261164ff27e36a21ad229fc5', 'upload_date': '20150604', - 'thumbnail': 're:^https?://.*\.(?:jpg|png)', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', 'duration': 154, 'view_count': int, 'comment_count': int, @@ -61,7 +61,7 @@ class DaumIE(InfoExtractor): 'title': '01-Korean War ( Trouble on the horizon )', 'description': '\nKorean War 01\nTrouble on the horizon\n전쟁의 먹구름', 'upload_date': '20080223', - 'thumbnail': 're:^https?://.*\.(?:jpg|png)', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', 'duration': 249, 'view_count': int, 'comment_count': int, @@ -139,7 +139,7 @@ class DaumClipIE(InfoExtractor): 'title': 'DOTA 2GETHER 시즌2 6회 - 2부', 'description': 'DOTA 2GETHER 시즌2 6회 - 2부', 'upload_date': '20130831', - 'thumbnail': 're:^https?://.*\.(?:jpg|png)', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', 'duration': 3868, 'view_count': int, }, diff --git a/youtube_dl/extractor/dbtv.py b/youtube_dl/extractor/dbtv.py index caff8842e..f232f0dc5 100644 --- a/youtube_dl/extractor/dbtv.py +++ b/youtube_dl/extractor/dbtv.py @@ -17,7 +17,7 @@ class DBTVIE(InfoExtractor): 'ext': 'mp4', 'title': 'Skulle teste ut fornøyelsespark, men kollegaen var bare opptatt av bikinikroppen', 'description': 'md5:1504a54606c4dde3e4e61fc97aa857e0', - 'thumbnail': 're:https?://.*\.jpg', + 'thumbnail': r're:https?://.*\.jpg', 'timestamp': 1404039863, 'upload_date': '20140629', 'duration': 69.544, @@ -38,6 +38,12 @@ class DBTVIE(InfoExtractor): 'only_matching': True, }] + @staticmethod + def _extract_urls(webpage): + return [url for _, url in re.findall( + r'<iframe[^>]+src=(["\'])((?:https?:)?//(?:www\.)?dbtv\.no/(?:lazy)?player/\d+.*?)\1', + webpage)] + def _real_extract(self, url): video_id, display_id = re.match(self._VALID_URL, url).groups() diff --git a/youtube_dl/extractor/dctp.py b/youtube_dl/extractor/dctp.py index 9099f5046..00fbbff2f 100644 --- a/youtube_dl/extractor/dctp.py +++ b/youtube_dl/extractor/dctp.py @@ -1,61 +1,54 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import compat_str +from ..utils import unified_strdate class DctpTvIE(InfoExtractor): - _VALID_URL = r'https?://www.dctp.tv/(#/)?filme/(?P<id>.+?)/$' + _VALID_URL = r'https?://(?:www\.)?dctp\.tv/(#/)?filme/(?P<id>.+?)/$' _TEST = { 'url': 'http://www.dctp.tv/filme/videoinstallation-fuer-eine-kaufhausfassade/', + 'md5': '174dd4a8a6225cf5655952f969cfbe24', 'info_dict': { - 'id': '1324', + 'id': '95eaa4f33dad413aa17b4ee613cccc6c', 'display_id': 'videoinstallation-fuer-eine-kaufhausfassade', - 'ext': 'flv', - 'title': 'Videoinstallation für eine Kaufhausfassade' + 'ext': 'mp4', + 'title': 'Videoinstallation für eine Kaufhausfassade', + 'description': 'Kurzfilm', + 'upload_date': '20110407', + 'thumbnail': r're:^https?://.*\.jpg$', }, - 'params': { - # rtmp download - 'skip_download': True, - } } def _real_extract(self, url): video_id = self._match_id(url) - base_url = 'http://dctp-ivms2-restapi.s3.amazonaws.com/' - version_json = self._download_json( - base_url + 'version.json', - video_id, note='Determining file version') - version = version_json['version_name'] - info_json = self._download_json( - '{0}{1}/restapi/slugs/{2}.json'.format(base_url, version, video_id), - video_id, note='Fetching object ID') - object_id = compat_str(info_json['object_id']) - meta_json = self._download_json( - '{0}{1}/restapi/media/{2}.json'.format(base_url, version, object_id), - video_id, note='Downloading metadata') - uuid = meta_json['uuid'] - title = meta_json['title'] - wide = meta_json['is_wide'] - if wide: - ratio = '16x9' - else: - ratio = '4x3' - play_path = 'mp4:{0}_dctp_0500_{1}.m4v'.format(uuid, ratio) + webpage = self._download_webpage(url, video_id) + + object_id = self._html_search_meta('DC.identifier', webpage) servers_json = self._download_json( - 'http://www.dctp.tv/streaming_servers/', + 'http://www.dctp.tv/elastic_streaming_client/get_streaming_server/', video_id, note='Downloading server list') - url = servers_json[0]['endpoint'] + server = servers_json[0]['server'] + m3u8_path = self._search_regex( + r'\'([^\'"]+/playlist\.m3u8)"', webpage, 'm3u8 path') + formats = self._extract_m3u8_formats( + 'http://%s%s' % (server, m3u8_path), video_id, ext='mp4', + entry_protocol='m3u8_native') + + title = self._og_search_title(webpage) + description = self._html_search_meta('DC.description', webpage) + upload_date = unified_strdate( + self._html_search_meta('DC.date.created', webpage)) + thumbnail = self._og_search_thumbnail(webpage) return { 'id': object_id, 'title': title, - 'format': 'rtmp', - 'url': url, - 'play_path': play_path, - 'rtmp_real_time': True, - 'ext': 'flv', - 'display_id': video_id + 'formats': formats, + 'display_id': video_id, + 'description': description, + 'upload_date': upload_date, + 'thumbnail': thumbnail, } diff --git a/youtube_dl/extractor/deezer.py b/youtube_dl/extractor/deezer.py index 7a07f3267..ec87b94db 100644 --- a/youtube_dl/extractor/deezer.py +++ b/youtube_dl/extractor/deezer.py @@ -19,7 +19,7 @@ class DeezerPlaylistIE(InfoExtractor): 'id': '176747451', 'title': 'Best!', 'uploader': 'Anonymous', - 'thumbnail': 're:^https?://cdn-images.deezer.com/images/cover/.*\.jpg$', + 'thumbnail': r're:^https?://cdn-images.deezer.com/images/cover/.*\.jpg$', }, 'playlist_count': 30, 'skip': 'Only available in .de', diff --git a/youtube_dl/extractor/democracynow.py b/youtube_dl/extractor/democracynow.py index 65a98d789..bdfe638b4 100644 --- a/youtube_dl/extractor/democracynow.py +++ b/youtube_dl/extractor/democracynow.py @@ -13,7 +13,7 @@ from ..utils import ( class DemocracynowIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?democracynow.org/(?P<id>[^\?]*)' + _VALID_URL = r'https?://(?:www\.)?democracynow\.org/(?P<id>[^\?]*)' IE_NAME = 'democracynow' _TESTS = [{ 'url': 'http://www.democracynow.org/shows/2015/7/3', diff --git a/youtube_dl/extractor/dhm.py b/youtube_dl/extractor/dhm.py index 44e0c5d4d..aee72a6ed 100644 --- a/youtube_dl/extractor/dhm.py +++ b/youtube_dl/extractor/dhm.py @@ -17,7 +17,7 @@ class DHMIE(InfoExtractor): 'title': 'MARSHALL PLAN AT WORK IN WESTERN GERMANY, THE', 'description': 'md5:1fabd480c153f97b07add61c44407c82', 'duration': 660, - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', }, }, { 'url': 'http://www.dhm.de/filmarchiv/02-mapping-the-wall/peter-g/rolle-1/', @@ -26,7 +26,7 @@ class DHMIE(InfoExtractor): 'id': 'rolle-1', 'ext': 'flv', 'title': 'ROLLE 1', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', }, }] diff --git a/youtube_dl/extractor/digiteka.py b/youtube_dl/extractor/digiteka.py index 7bb79ffda..3dfde0d8c 100644 --- a/youtube_dl/extractor/digiteka.py +++ b/youtube_dl/extractor/digiteka.py @@ -36,7 +36,7 @@ class DigitekaIE(InfoExtractor): 'id': 's8uk0r', 'ext': 'mp4', 'title': 'Loi sur la fin de vie: le texte prévoit un renforcement des directives anticipées', - 'thumbnail': 're:^https?://.*\.jpg', + 'thumbnail': r're:^https?://.*\.jpg', 'duration': 74, 'upload_date': '20150317', 'timestamp': 1426604939, @@ -50,7 +50,7 @@ class DigitekaIE(InfoExtractor): 'id': 'xvpfp8', 'ext': 'mp4', 'title': 'Two - C\'est La Vie (clip)', - 'thumbnail': 're:^https?://.*\.jpg', + 'thumbnail': r're:^https?://.*\.jpg', 'duration': 233, 'upload_date': '20150224', 'timestamp': 1424760500, diff --git a/youtube_dl/extractor/discoverygo.py b/youtube_dl/extractor/discoverygo.py new file mode 100644 index 000000000..2042493a8 --- /dev/null +++ b/youtube_dl/extractor/discoverygo.py @@ -0,0 +1,115 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + extract_attributes, + int_or_none, + parse_age_limit, + ExtractorError, +) + + +class DiscoveryGoIE(InfoExtractor): + _VALID_URL = r'''(?x)https?://(?:www\.)?(?: + discovery| + investigationdiscovery| + discoverylife| + animalplanet| + ahctv| + destinationamerica| + sciencechannel| + tlc| + velocitychannel + )go\.com/(?:[^/]+/)*(?P<id>[^/?#&]+)''' + _TEST = { + 'url': 'https://www.discoverygo.com/love-at-first-kiss/kiss-first-ask-questions-later/', + 'info_dict': { + 'id': '57a33c536b66d1cd0345eeb1', + 'ext': 'mp4', + 'title': 'Kiss First, Ask Questions Later!', + 'description': 'md5:fe923ba34050eae468bffae10831cb22', + 'duration': 2579, + 'series': 'Love at First Kiss', + 'season_number': 1, + 'episode_number': 1, + 'age_limit': 14, + }, + } + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + container = extract_attributes( + self._search_regex( + r'(<div[^>]+class=["\']video-player-container[^>]+>)', + webpage, 'video container')) + + video = self._parse_json( + container.get('data-video') or container.get('data-json'), + display_id) + + title = video['name'] + + stream = video.get('stream') + if not stream: + if video.get('authenticated') is True: + raise ExtractorError( + 'This video is only available via cable service provider subscription that' + ' is not currently supported. You may want to use --cookies.', expected=True) + else: + raise ExtractorError('Unable to find stream') + STREAM_URL_SUFFIX = 'streamUrl' + formats = [] + for stream_kind in ('', 'hds'): + suffix = STREAM_URL_SUFFIX.capitalize() if stream_kind else STREAM_URL_SUFFIX + stream_url = stream.get('%s%s' % (stream_kind, suffix)) + if not stream_url: + continue + if stream_kind == '': + formats.extend(self._extract_m3u8_formats( + stream_url, display_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + elif stream_kind == 'hds': + formats.extend(self._extract_f4m_formats( + stream_url, display_id, f4m_id=stream_kind, fatal=False)) + self._sort_formats(formats) + + video_id = video.get('id') or display_id + description = video.get('description', {}).get('detailed') + duration = int_or_none(video.get('duration')) + + series = video.get('show', {}).get('name') + season_number = int_or_none(video.get('season', {}).get('number')) + episode_number = int_or_none(video.get('episodeNumber')) + + tags = video.get('tags') + age_limit = parse_age_limit(video.get('parental', {}).get('rating')) + + subtitles = {} + captions = stream.get('captions') + if isinstance(captions, list): + for caption in captions: + subtitle_url = caption.get('fileUrl') + if (not subtitle_url or not isinstance(subtitle_url, compat_str) or + not subtitle_url.startswith('http')): + continue + lang = caption.get('fileLang', 'en') + subtitles.setdefault(lang, []).append({'url': subtitle_url}) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': description, + 'duration': duration, + 'series': series, + 'season_number': season_number, + 'episode_number': episode_number, + 'tags': tags, + 'age_limit': age_limit, + 'formats': formats, + 'subtitles': subtitles, + } diff --git a/youtube_dl/extractor/disney.py b/youtube_dl/extractor/disney.py new file mode 100644 index 000000000..396873c6d --- /dev/null +++ b/youtube_dl/extractor/disney.py @@ -0,0 +1,115 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + unified_strdate, + compat_str, + determine_ext, +) + + +class DisneyIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?://(?P<domain>(?:[^/]+\.)?(?:disney\.[a-z]{2,3}(?:\.[a-z]{2})?|disney(?:(?:me|latino)\.com|turkiye\.com\.tr)|starwars\.com))/(?:embed/|(?:[^/]+/)+[\w-]+-)(?P<id>[a-z0-9]{24})''' + _TESTS = [{ + 'url': 'http://video.disney.com/watch/moana-trailer-545ed1857afee5a0ec239977', + 'info_dict': { + 'id': '545ed1857afee5a0ec239977', + 'ext': 'mp4', + 'title': 'Moana - Trailer', + 'description': 'A fun adventure for the entire Family! Bring home Moana on Digital HD Feb 21 & Blu-ray March 7', + 'upload_date': '20170112', + }, + 'params': { + # m3u8 download + 'skip_download': True, + } + }, { + 'url': 'http://videos.disneylatino.com/ver/spider-man-de-regreso-a-casa-primer-adelanto-543a33a1850bdcfcca13bae2', + 'only_matching': True, + }, { + 'url': 'http://video.en.disneyme.com/watch/future-worm/robo-carp-2001-544b66002aa7353cdd3f5114', + 'only_matching': True, + }, { + 'url': 'http://video.disneyturkiye.com.tr/izle/7c-7-cuceler/kimin-sesi-zaten-5456f3d015f6b36c8afdd0e2', + 'only_matching': True, + }, { + 'url': 'http://disneyjunior.disney.com/embed/546a4798ddba3d1612e4005d', + 'only_matching': True, + }, { + 'url': 'http://www.starwars.com/embed/54690d1e6c42e5f09a0fb097', + 'only_matching': True, + }] + + def _real_extract(self, url): + domain, video_id = re.match(self._VALID_URL, url).groups() + webpage = self._download_webpage( + 'http://%s/embed/%s' % (domain, video_id), video_id) + video_data = self._parse_json(self._search_regex( + r'Disney\.EmbedVideo=({.+});', webpage, 'embed data'), video_id)['video'] + + for external in video_data.get('externals', []): + if external.get('source') == 'vevo': + return self.url_result('vevo:' + external['data_id'], 'Vevo') + + title = video_data['title'] + + formats = [] + for flavor in video_data.get('flavors', []): + flavor_format = flavor.get('format') + flavor_url = flavor.get('url') + if not flavor_url or not re.match(r'https?://', flavor_url): + continue + tbr = int_or_none(flavor.get('bitrate')) + if tbr == 99999: + formats.extend(self._extract_m3u8_formats( + flavor_url, video_id, 'mp4', m3u8_id=flavor_format, fatal=False)) + continue + format_id = [] + if flavor_format: + format_id.append(flavor_format) + if tbr: + format_id.append(compat_str(tbr)) + ext = determine_ext(flavor_url) + if flavor_format == 'applehttp' or ext == 'm3u8': + ext = 'mp4' + width = int_or_none(flavor.get('width')) + height = int_or_none(flavor.get('height')) + formats.append({ + 'format_id': '-'.join(format_id), + 'url': flavor_url, + 'width': width, + 'height': height, + 'tbr': tbr, + 'ext': ext, + 'vcodec': 'none' if (width == 0 and height == 0) else None, + }) + self._sort_formats(formats) + + subtitles = {} + for caption in video_data.get('captions', []): + caption_url = caption.get('url') + caption_format = caption.get('format') + if not caption_url or caption_format.startswith('unknown'): + continue + subtitles.setdefault(caption.get('language', 'en'), []).append({ + 'url': caption_url, + 'ext': { + 'webvtt': 'vtt', + }.get(caption_format, caption_format), + }) + + return { + 'id': video_id, + 'title': title, + 'description': video_data.get('description') or video_data.get('short_desc'), + 'thumbnail': video_data.get('thumb') or video_data.get('thumb_secure'), + 'duration': int_or_none(video_data.get('duration_sec')), + 'upload_date': unified_strdate(video_data.get('publish_date')), + 'formats': formats, + 'subtitles': subtitles, + } diff --git a/youtube_dl/extractor/dotsub.py b/youtube_dl/extractor/dotsub.py index e9ca236d4..1f75352ca 100644 --- a/youtube_dl/extractor/dotsub.py +++ b/youtube_dl/extractor/dotsub.py @@ -9,22 +9,39 @@ from ..utils import ( class DotsubIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?dotsub\.com/view/(?P<id>[^/]+)' - _TEST = { - 'url': 'http://dotsub.com/view/aed3b8b2-1889-4df5-ae63-ad85f5572f27', - 'md5': '0914d4d69605090f623b7ac329fea66e', + _TESTS = [{ + 'url': 'https://dotsub.com/view/9c63db2a-fa95-4838-8e6e-13deafe47f09', + 'md5': '21c7ff600f545358134fea762a6d42b6', 'info_dict': { - 'id': 'aed3b8b2-1889-4df5-ae63-ad85f5572f27', + 'id': '9c63db2a-fa95-4838-8e6e-13deafe47f09', 'ext': 'flv', - 'title': 'Pyramids of Waste (2010), AKA The Lightbulb Conspiracy - Planned obsolescence documentary', - 'description': 'md5:699a0f7f50aeec6042cb3b1db2d0d074', - 'thumbnail': 're:^https?://dotsub.com/media/aed3b8b2-1889-4df5-ae63-ad85f5572f27/p', - 'duration': 3169, - 'uploader': '4v4l0n42', - 'timestamp': 1292248482.625, - 'upload_date': '20101213', + 'title': 'MOTIVATION - "It\'s Possible" Best Inspirational Video Ever', + 'description': 'md5:41af1e273edbbdfe4e216a78b9d34ac6', + 'thumbnail': 're:^https?://dotsub.com/media/9c63db2a-fa95-4838-8e6e-13deafe47f09/p', + 'duration': 198, + 'uploader': 'liuxt', + 'timestamp': 1385778501.104, + 'upload_date': '20131130', 'view_count': int, } - } + }, { + 'url': 'https://dotsub.com/view/747bcf58-bd59-45b7-8c8c-ac312d084ee6', + 'md5': '2bb4a83896434d5c26be868c609429a3', + 'info_dict': { + 'id': '168006778', + 'ext': 'mp4', + 'title': 'Apartments and flats in Raipur the white symphony', + 'description': 'md5:784d0639e6b7d1bc29530878508e38fe', + 'thumbnail': 're:^https?://dotsub.com/media/747bcf58-bd59-45b7-8c8c-ac312d084ee6/p', + 'duration': 290, + 'timestamp': 1476767794.2809999, + 'upload_date': '20160525', + 'uploader': 'parthivi001', + 'uploader_id': 'user52596202', + 'view_count': int, + }, + 'add_ie': ['Vimeo'], + }] def _real_extract(self, url): video_id = self._match_id(url) @@ -37,12 +54,23 @@ class DotsubIE(InfoExtractor): webpage = self._download_webpage(url, video_id) video_url = self._search_regex( [r'<source[^>]+src="([^"]+)"', r'"file"\s*:\s*\'([^\']+)'], - webpage, 'video url') + webpage, 'video url', default=None) + info_dict = { + 'id': video_id, + 'url': video_url, + 'ext': 'flv', + } - return { - 'id': video_id, - 'url': video_url, - 'ext': 'flv', + if not video_url: + setup_data = self._parse_json(self._html_search_regex( + r'(?s)data-setup=([\'"])(?P<content>(?!\1).+?)\1', + webpage, 'setup data', group='content'), video_id) + info_dict = { + '_type': 'url_transparent', + 'url': setup_data['src'], + } + + info_dict.update({ 'title': info['title'], 'description': info.get('description'), 'thumbnail': info.get('screenshotURI'), @@ -50,4 +78,6 @@ class DotsubIE(InfoExtractor): 'uploader': info.get('user'), 'timestamp': float_or_none(info.get('dateCreated'), 1000), 'view_count': int_or_none(info.get('numberOfViews')), - } + }) + + return info_dict diff --git a/youtube_dl/extractor/douyutv.py b/youtube_dl/extractor/douyutv.py index ce6962755..911594413 100644 --- a/youtube_dl/extractor/douyutv.py +++ b/youtube_dl/extractor/douyutv.py @@ -3,14 +3,22 @@ from __future__ import unicode_literals import hashlib import time +import uuid + from .common import InfoExtractor -from ..utils import (ExtractorError, unescapeHTML) -from ..compat import (compat_str, compat_basestring) +from ..compat import ( + compat_str, + compat_urllib_parse_urlencode, +) +from ..utils import ( + ExtractorError, + unescapeHTML, +) class DouyuTVIE(InfoExtractor): IE_DESC = '斗鱼' - _VALID_URL = r'https?://(?:www\.)?douyu(?:tv)?\.com/(?P<id>[A-Za-z0-9]+)' + _VALID_URL = r'https?://(?:www\.)?douyu(?:tv)?\.com/(?:[^/]+/)*(?P<id>[A-Za-z0-9]+)' _TESTS = [{ 'url': 'http://www.douyutv.com/iseven', 'info_dict': { @@ -18,10 +26,9 @@ class DouyuTVIE(InfoExtractor): 'display_id': 'iseven', 'ext': 'flv', 'title': 're:^清晨醒脑!T-ara根本停不下来! [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', - 'description': 're:.*m7show@163\.com.*', - 'thumbnail': 're:^https?://.*\.jpg$', + 'description': r're:.*m7show@163\.com.*', + 'thumbnail': r're:^https?://.*\.jpg$', 'uploader': '7师傅', - 'uploader_id': '431925', 'is_live': True, }, 'params': { @@ -35,9 +42,8 @@ class DouyuTVIE(InfoExtractor): 'ext': 'flv', 'title': 're:^小漠从零单排记!——CSOL2躲猫猫 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', 'description': 'md5:746a2f7a253966a06755a912f0acc0d2', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'uploader': 'douyu小漠', - 'uploader_id': '3769985', 'is_live': True, }, 'params': { @@ -51,10 +57,9 @@ class DouyuTVIE(InfoExtractor): 'display_id': '17732', 'ext': 'flv', 'title': 're:^清晨醒脑!T-ara根本停不下来! [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', - 'description': 're:.*m7show@163\.com.*', - 'thumbnail': 're:^https?://.*\.jpg$', + 'description': r're:.*m7show@163\.com.*', + 'thumbnail': r're:^https?://.*\.jpg$', 'uploader': '7师傅', - 'uploader_id': '431925', 'is_live': True, }, 'params': { @@ -63,8 +68,16 @@ class DouyuTVIE(InfoExtractor): }, { 'url': 'http://www.douyu.com/xiaocang', 'only_matching': True, + }, { + # \"room_id\" + 'url': 'http://www.douyu.com/t/lpl', + 'only_matching': True, }] + # Decompile core.swf in webpage by ffdec "Search SWFs in memory". core.swf + # is encrypted originally, but ffdec can dump memory to get the decrypted one. + _API_KEY = 'A12Svb&%1UUmf@hC' + def _real_extract(self, url): video_id = self._match_id(url) @@ -73,76 +86,58 @@ class DouyuTVIE(InfoExtractor): else: page = self._download_webpage(url, video_id) room_id = self._html_search_regex( - r'"room_id"\s*:\s*(\d+),', page, 'room id') + r'"room_id\\?"\s*:\s*(\d+),', page, 'room id') - config = None - # Douyu API sometimes returns error "Unable to load the requested class: eticket_redis_cache" - # Retry with different parameters - same parameters cause same errors - for i in range(5): - prefix = 'room/%s?aid=android&client_sys=android&time=%d' % ( - room_id, int(time.time())) - auth = hashlib.md5((prefix + '1231').encode('ascii')).hexdigest() + room = self._download_json( + 'http://m.douyu.com/html5/live?roomId=%s' % room_id, video_id, + note='Downloading room info')['data'] - config_page = self._download_webpage( - 'http://www.douyutv.com/api/v1/%s&auth=%s' % (prefix, auth), - video_id) - try: - config = self._parse_json(config_page, video_id, fatal=False) - except ExtractorError: - # Wait some time before retrying to get a different time() value - self._sleep(1, video_id, msg_template='%(video_id)s: Error occurs. ' - 'Waiting for %(timeout)s seconds before retrying') - continue - else: - break - if config is None: - raise ExtractorError('Unable to fetch API result') - - data = config['data'] - - error_code = config.get('error', 0) - if error_code is not 0: - error_desc = 'Server reported error %i' % error_code - if isinstance(data, (compat_str, compat_basestring)): - error_desc += ': ' + data - raise ExtractorError(error_desc, expected=True) - - show_status = data.get('show_status') # 1 = live, 2 = offline - if show_status == '2': + if room.get('show_status') == '2': + raise ExtractorError('Live stream is offline', expected=True) + + tt = compat_str(int(time.time() / 60)) + did = uuid.uuid4().hex.upper() + + sign_content = ''.join((room_id, did, self._API_KEY, tt)) + sign = hashlib.md5((sign_content).encode('utf-8')).hexdigest() + + flv_data = compat_urllib_parse_urlencode({ + 'cdn': 'ws', + 'rate': '0', + 'tt': tt, + 'did': did, + 'sign': sign, + }) + + video_info = self._download_json( + 'http://www.douyu.com/lapi/live/getPlay/%s' % room_id, video_id, + data=flv_data, note='Downloading video info', + headers={'Content-Type': 'application/x-www-form-urlencoded'}) + + error_code = video_info.get('error', 0) + if error_code is not 0: raise ExtractorError( - 'Live stream is offline', expected=True) + '%s reported error %i' % (self.IE_NAME, error_code), + expected=True) - base_url = data['rtmp_url'] - live_path = data['rtmp_live'] + base_url = video_info['data']['rtmp_url'] + live_path = video_info['data']['rtmp_live'] - title = self._live_title(unescapeHTML(data['room_name'])) - description = data.get('show_details') - thumbnail = data.get('room_src') + video_url = '%s/%s' % (base_url, live_path) - uploader = data.get('nickname') - uploader_id = data.get('owner_uid') - - multi_formats = data.get('rtmp_multi_bitrate') - if not isinstance(multi_formats, dict): - multi_formats = {} - multi_formats['live'] = live_path - - formats = [{ - 'url': '%s/%s' % (base_url, format_path), - 'format_id': format_id, - 'preference': 1 if format_id == 'live' else 0, - } for format_id, format_path in multi_formats.items()] - self._sort_formats(formats) + title = self._live_title(unescapeHTML(room['room_name'])) + description = room.get('notice') + thumbnail = room.get('room_src') + uploader = room.get('nickname') return { 'id': room_id, 'display_id': video_id, + 'url': video_url, 'title': title, 'description': description, 'thumbnail': thumbnail, 'uploader': uploader, - 'uploader_id': uploader_id, - 'formats': formats, 'is_live': True, } diff --git a/youtube_dl/extractor/dplay.py b/youtube_dl/extractor/dplay.py index 5790553f3..32028bc3b 100644 --- a/youtube_dl/extractor/dplay.py +++ b/youtube_dl/extractor/dplay.py @@ -8,6 +8,7 @@ import time from .common import InfoExtractor from ..compat import compat_urlparse from ..utils import ( + USER_AGENTS, int_or_none, update_url_query, ) @@ -102,10 +103,16 @@ class DPlayIE(InfoExtractor): manifest_url, video_id, ext='mp4', entry_protocol='m3u8_native', m3u8_id=protocol, fatal=False) # Sometimes final URLs inside m3u8 are unsigned, let's fix this - # ourselves + # ourselves. Also fragments' URLs are only served signed for + # Safari user agent. query = compat_urlparse.parse_qs(compat_urlparse.urlparse(manifest_url).query) for m3u8_format in m3u8_formats: - m3u8_format['url'] = update_url_query(m3u8_format['url'], query) + m3u8_format.update({ + 'url': update_url_query(m3u8_format['url'], query), + 'http_headers': { + 'User-Agent': USER_AGENTS['Safari'], + }, + }) formats.extend(m3u8_formats) elif protocol == 'hds': formats.extend(self._extract_f4m_formats( diff --git a/youtube_dl/extractor/dramafever.py b/youtube_dl/extractor/dramafever.py index 3b6529f4b..bcd9fe2a0 100644 --- a/youtube_dl/extractor/dramafever.py +++ b/youtube_dl/extractor/dramafever.py @@ -1,4 +1,4 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals import itertools @@ -66,7 +66,7 @@ class DramaFeverBaseIE(AMPIE): class DramaFeverIE(DramaFeverBaseIE): IE_NAME = 'dramafever' - _VALID_URL = r'https?://(?:www\.)?dramafever\.com/drama/(?P<id>[0-9]+/[0-9]+)(?:/|$)' + _VALID_URL = r'https?://(?:www\.)?dramafever\.com/(?:[^/]+/)?drama/(?P<id>[0-9]+/[0-9]+)(?:/|$)' _TESTS = [{ 'url': 'http://www.dramafever.com/drama/4512/1/Cooking_with_Shin/', 'info_dict': { @@ -76,7 +76,7 @@ class DramaFeverIE(DramaFeverBaseIE): 'description': 'md5:a8eec7942e1664a6896fcd5e1287bfd0', 'episode': 'Episode 1', 'episode_number': 1, - 'thumbnail': 're:^https?://.*\.jpg', + 'thumbnail': r're:^https?://.*\.jpg', 'timestamp': 1404336058, 'upload_date': '20140702', 'duration': 343, @@ -94,7 +94,7 @@ class DramaFeverIE(DramaFeverBaseIE): 'description': 'md5:3ff2ee8fedaef86e076791c909cf2e91', 'episode': 'Mnet Asian Music Awards 2015 - Part 3', 'episode_number': 4, - 'thumbnail': 're:^https?://.*\.jpg', + 'thumbnail': r're:^https?://.*\.jpg', 'timestamp': 1450213200, 'upload_date': '20151215', 'duration': 5602, @@ -103,6 +103,9 @@ class DramaFeverIE(DramaFeverBaseIE): # m3u8 download 'skip_download': True, }, + }, { + 'url': 'https://www.dramafever.com/zh-cn/drama/4972/15/Doctor_Romantic/', + 'only_matching': True, }] def _real_extract(self, url): @@ -148,7 +151,7 @@ class DramaFeverIE(DramaFeverBaseIE): class DramaFeverSeriesIE(DramaFeverBaseIE): IE_NAME = 'dramafever:series' - _VALID_URL = r'https?://(?:www\.)?dramafever\.com/drama/(?P<id>[0-9]+)(?:/(?:(?!\d+(?:/|$)).+)?)?$' + _VALID_URL = r'https?://(?:www\.)?dramafever\.com/(?:[^/]+/)?drama/(?P<id>[0-9]+)(?:/(?:(?!\d+(?:/|$)).+)?)?$' _TESTS = [{ 'url': 'http://www.dramafever.com/drama/4512/Cooking_with_Shin/', 'info_dict': { diff --git a/youtube_dl/extractor/drbonanza.py b/youtube_dl/extractor/drbonanza.py index 01271f8f0..79ec212c8 100644 --- a/youtube_dl/extractor/drbonanza.py +++ b/youtube_dl/extractor/drbonanza.py @@ -20,7 +20,7 @@ class DRBonanzaIE(InfoExtractor): 'ext': 'mp4', 'title': 'Talkshowet - Leonard Cohen', 'description': 'md5:8f34194fb30cd8c8a30ad8b27b70c0ca', - 'thumbnail': 're:^https?://.*\.(?:gif|jpg)$', + 'thumbnail': r're:^https?://.*\.(?:gif|jpg)$', 'timestamp': 1295537932, 'upload_date': '20110120', 'duration': 3664, @@ -36,7 +36,7 @@ class DRBonanzaIE(InfoExtractor): 'ext': 'mp3', 'title': 'EM fodbold 1992 Danmark - Tyskland finale Transmission', 'description': 'md5:501e5a195749480552e214fbbed16c4e', - 'thumbnail': 're:^https?://.*\.(?:gif|jpg)$', + 'thumbnail': r're:^https?://.*\.(?:gif|jpg)$', 'timestamp': 1223274900, 'upload_date': '20081006', 'duration': 7369, diff --git a/youtube_dl/extractor/dreisat.py b/youtube_dl/extractor/dreisat.py index 908c9e514..f138025d5 100644 --- a/youtube_dl/extractor/dreisat.py +++ b/youtube_dl/extractor/dreisat.py @@ -2,10 +2,19 @@ from __future__ import unicode_literals import re -from .zdf import ZDFIE +from .common import InfoExtractor +from ..utils import ( + int_or_none, + unified_strdate, + xpath_text, + determine_ext, + qualities, + float_or_none, + ExtractorError, +) -class DreiSatIE(ZDFIE): +class DreiSatIE(InfoExtractor): IE_NAME = '3sat' _VALID_URL = r'(?:https?://)?(?:www\.)?3sat\.de/mediathek/(?:index\.php|mediathek\.php)?\?(?:(?:mode|display)=[^&]+&)*obj=(?P<id>[0-9]+)$' _TESTS = [ @@ -31,6 +40,163 @@ class DreiSatIE(ZDFIE): }, ] + def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None): + param_groups = {} + for param_group in smil.findall(self._xpath_ns('./head/paramGroup', namespace)): + group_id = param_group.attrib.get(self._xpath_ns('id', 'http://www.w3.org/XML/1998/namespace')) + params = {} + for param in param_group: + params[param.get('name')] = param.get('value') + param_groups[group_id] = params + + formats = [] + for video in smil.findall(self._xpath_ns('.//video', namespace)): + src = video.get('src') + if not src: + continue + bitrate = float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000) + group_id = video.get('paramGroup') + param_group = param_groups[group_id] + for proto in param_group['protocols'].split(','): + formats.append({ + 'url': '%s://%s' % (proto, param_group['host']), + 'app': param_group['app'], + 'play_path': src, + 'ext': 'flv', + 'format_id': '%s-%d' % (proto, bitrate), + 'tbr': bitrate, + }) + self._sort_formats(formats) + return formats + + def extract_from_xml_url(self, video_id, xml_url): + doc = self._download_xml( + xml_url, video_id, + note='Downloading video info', + errnote='Failed to download video info') + + status_code = doc.find('./status/statuscode') + if status_code is not None and status_code.text != 'ok': + code = status_code.text + if code == 'notVisibleAnymore': + message = 'Video %s is not available' % video_id + else: + message = '%s returned error: %s' % (self.IE_NAME, code) + raise ExtractorError(message, expected=True) + + title = doc.find('.//information/title').text + description = xpath_text(doc, './/information/detail', 'description') + duration = int_or_none(xpath_text(doc, './/details/lengthSec', 'duration')) + uploader = xpath_text(doc, './/details/originChannelTitle', 'uploader') + uploader_id = xpath_text(doc, './/details/originChannelId', 'uploader id') + upload_date = unified_strdate(xpath_text(doc, './/details/airtime', 'upload date')) + + def xml_to_thumbnails(fnode): + thumbnails = [] + for node in fnode: + thumbnail_url = node.text + if not thumbnail_url: + continue + thumbnail = { + 'url': thumbnail_url, + } + if 'key' in node.attrib: + m = re.match('^([0-9]+)x([0-9]+)$', node.attrib['key']) + if m: + thumbnail['width'] = int(m.group(1)) + thumbnail['height'] = int(m.group(2)) + thumbnails.append(thumbnail) + return thumbnails + + thumbnails = xml_to_thumbnails(doc.findall('.//teaserimages/teaserimage')) + + format_nodes = doc.findall('.//formitaeten/formitaet') + quality = qualities(['veryhigh', 'high', 'med', 'low']) + + def get_quality(elem): + return quality(xpath_text(elem, 'quality')) + format_nodes.sort(key=get_quality) + format_ids = [] + formats = [] + for fnode in format_nodes: + video_url = fnode.find('url').text + is_available = 'http://www.metafilegenerator' not in video_url + if not is_available: + continue + format_id = fnode.attrib['basetype'] + quality = xpath_text(fnode, './quality', 'quality') + format_m = re.match(r'''(?x) + (?P<vcodec>[^_]+)_(?P<acodec>[^_]+)_(?P<container>[^_]+)_ + (?P<proto>[^_]+)_(?P<index>[^_]+)_(?P<indexproto>[^_]+) + ''', format_id) + + ext = determine_ext(video_url, None) or format_m.group('container') + if ext not in ('smil', 'f4m', 'm3u8'): + format_id = format_id + '-' + quality + if format_id in format_ids: + continue + + if ext == 'meta': + continue + elif ext == 'smil': + formats.extend(self._extract_smil_formats( + video_url, video_id, fatal=False)) + elif ext == 'm3u8': + # the certificates are misconfigured (see + # https://github.com/rg3/youtube-dl/issues/8665) + if video_url.startswith('https://'): + continue + formats.extend(self._extract_m3u8_formats( + video_url, video_id, 'mp4', m3u8_id=format_id, fatal=False)) + elif ext == 'f4m': + formats.extend(self._extract_f4m_formats( + video_url, video_id, f4m_id=format_id, fatal=False)) + else: + proto = format_m.group('proto').lower() + + abr = int_or_none(xpath_text(fnode, './audioBitrate', 'abr'), 1000) + vbr = int_or_none(xpath_text(fnode, './videoBitrate', 'vbr'), 1000) + + width = int_or_none(xpath_text(fnode, './width', 'width')) + height = int_or_none(xpath_text(fnode, './height', 'height')) + + filesize = int_or_none(xpath_text(fnode, './filesize', 'filesize')) + + format_note = '' + if not format_note: + format_note = None + + formats.append({ + 'format_id': format_id, + 'url': video_url, + 'ext': ext, + 'acodec': format_m.group('acodec'), + 'vcodec': format_m.group('vcodec'), + 'abr': abr, + 'vbr': vbr, + 'width': width, + 'height': height, + 'filesize': filesize, + 'format_note': format_note, + 'protocol': proto, + '_available': is_available, + }) + format_ids.append(format_id) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'duration': duration, + 'thumbnails': thumbnails, + 'uploader': uploader, + 'uploader_id': uploader_id, + 'upload_date': upload_date, + 'formats': formats, + } + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') diff --git a/youtube_dl/extractor/drtuber.py b/youtube_dl/extractor/drtuber.py index 639f9182c..1eca82b3b 100644 --- a/youtube_dl/extractor/drtuber.py +++ b/youtube_dl/extractor/drtuber.py @@ -3,12 +3,15 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import str_to_int +from ..utils import ( + NO_DEFAULT, + str_to_int, +) class DrTuberIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?drtuber\.com/video/(?P<id>\d+)/(?P<display_id>[\w-]+)' - _TEST = { + _VALID_URL = r'https?://(?:www\.)?drtuber\.com/(?:video|embed)/(?P<id>\d+)(?:/(?P<display_id>[\w-]+))?' + _TESTS = [{ 'url': 'http://www.drtuber.com/video/1740434/hot-perky-blonde-naked-golf', 'md5': '93e680cf2536ad0dfb7e74d94a89facd', 'info_dict': { @@ -17,44 +20,57 @@ class DrTuberIE(InfoExtractor): 'ext': 'mp4', 'title': 'hot perky blonde naked golf', 'like_count': int, - 'dislike_count': int, 'comment_count': int, 'categories': ['Babe', 'Blonde', 'Erotic', 'Outdoor', 'Softcore', 'Solo'], - 'thumbnail': 're:https?://.*\.jpg$', + 'thumbnail': r're:https?://.*\.jpg$', 'age_limit': 18, } - } + }, { + 'url': 'http://www.drtuber.com/embed/489939', + 'only_matching': True, + }] + + @staticmethod + def _extract_urls(webpage): + return re.findall( + r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//(?:www\.)?drtuber\.com/embed/\d+)', + webpage) def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') - display_id = mobj.group('display_id') + display_id = mobj.group('display_id') or video_id - webpage = self._download_webpage(url, display_id) + webpage = self._download_webpage( + 'http://www.drtuber.com/video/%s' % video_id, display_id) video_url = self._html_search_regex( r'<source src="([^"]+)"', webpage, 'video URL') title = self._html_search_regex( - [r'<p[^>]+class="title_substrate">([^<]+)</p>', r'<title>([^<]+) - \d+'], + (r'class="title_watch"[^>]*><(?:p|h\d+)[^>]*>([^<]+)<', + r'<p[^>]+class="title_substrate">([^<]+)</p>', + r'<title>([^<]+) - \d+'), webpage, 'title') thumbnail = self._html_search_regex( r'poster="([^"]+)"', webpage, 'thumbnail', fatal=False) - def extract_count(id_, name): + def extract_count(id_, name, default=NO_DEFAULT): return str_to_int(self._html_search_regex( r'<span[^>]+(?:class|id)="%s"[^>]*>([\d,\.]+)</span>' % id_, - webpage, '%s count' % name, fatal=False)) + webpage, '%s count' % name, default=default, fatal=False)) like_count = extract_count('rate_likes', 'like') - dislike_count = extract_count('rate_dislikes', 'dislike') + dislike_count = extract_count('rate_dislikes', 'dislike', default=None) comment_count = extract_count('comments_count', 'comment') cats_str = self._search_regex( - r'<div[^>]+class="categories_list">(.+?)</div>', webpage, 'categories', fatal=False) - categories = [] if not cats_str else re.findall(r'<a title="([^"]+)"', cats_str) + r'<div[^>]+class="categories_list">(.+?)</div>', + webpage, 'categories', fatal=False) + categories = [] if not cats_str else re.findall( + r'<a title="([^"]+)"', cats_str) return { 'id': video_id, diff --git a/youtube_dl/extractor/drtv.py b/youtube_dl/extractor/drtv.py index 2d74ff855..e966d7483 100644 --- a/youtube_dl/extractor/drtv.py +++ b/youtube_dl/extractor/drtv.py @@ -4,26 +4,46 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( ExtractorError, + int_or_none, + float_or_none, + mimetype2ext, parse_iso8601, + remove_end, + update_url_query, ) class DRTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?dr\.dk/tv/se/(?:[^/]+/)*(?P<id>[\da-z-]+)(?:[/#?]|$)' - - _TEST = { - 'url': 'https://www.dr.dk/tv/se/boern/ultra/panisk-paske/panisk-paske-5', - 'md5': 'dc515a9ab50577fa14cc4e4b0265168f', + _VALID_URL = r'https?://(?:www\.)?dr\.dk/(?:tv/se|nyheder|radio/ondemand)/(?:[^/]+/)*(?P<id>[\da-z-]+)(?:[/#?]|$)' + IE_NAME = 'drtv' + _TESTS = [{ + 'url': 'https://www.dr.dk/tv/se/boern/ultra/klassen-ultra/klassen-darlig-taber-10', + 'md5': '25e659cccc9a2ed956110a299fdf5983', 'info_dict': { - 'id': 'panisk-paske-5', + 'id': 'klassen-darlig-taber-10', 'ext': 'mp4', - 'title': 'Panisk Påske (5)', - 'description': 'md5:ca14173c5ab24cd26b0fcc074dff391c', - 'timestamp': 1426984612, - 'upload_date': '20150322', - 'duration': 1455, + 'title': 'Klassen - Dårlig taber (10)', + 'description': 'md5:815fe1b7fa656ed80580f31e8b3c79aa', + 'timestamp': 1471991907, + 'upload_date': '20160823', + 'duration': 606.84, }, - } + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://www.dr.dk/nyheder/indland/live-christianias-rydning-af-pusher-street-er-i-gang', + 'md5': '2c37175c718155930f939ef59952474a', + 'info_dict': { + 'id': 'christiania-pusher-street-ryddes-drdkrjpo', + 'ext': 'mp4', + 'title': 'LIVE Christianias rydning af Pusher Street er i gang', + 'description': '- Det er det fedeste, der er sket i 20 år, fortæller christianit til DR Nyheder.', + 'timestamp': 1472800279, + 'upload_date': '20160902', + 'duration': 131.4, + }, + }] def _real_extract(self, url): video_id = self._match_id(url) @@ -35,7 +55,8 @@ class DRTVIE(InfoExtractor): 'Video %s is not available' % video_id, expected=True) video_id = self._search_regex( - r'data-(?:material-identifier|episode-slug)="([^"]+)"', + (r'data-(?:material-identifier|episode-slug)="([^"]+)"', + r'data-resource="[^>"]+mu/programcard/expanded/([^"]+)"'), webpage, 'video id') programcard = self._download_json( @@ -43,9 +64,12 @@ class DRTVIE(InfoExtractor): video_id, 'Downloading video JSON') data = programcard['Data'][0] - title = data['Title'] - description = data['Description'] - timestamp = parse_iso8601(data['CreatedTime']) + title = remove_end(self._og_search_title( + webpage, default=None), ' | TV | DR') or data['Title'] + description = self._og_search_description( + webpage, default=None) or data.get('Description') + + timestamp = parse_iso8601(data.get('CreatedTime')) thumbnail = None duration = None @@ -56,28 +80,35 @@ class DRTVIE(InfoExtractor): subtitles = {} for asset in data['Assets']: - if asset['Kind'] == 'Image': - thumbnail = asset['Uri'] - elif asset['Kind'] == 'VideoResource': - duration = asset['DurationInMilliseconds'] / 1000.0 - restricted_to_denmark = asset['RestrictedToDenmark'] - spoken_subtitles = asset['Target'] == 'SpokenSubtitles' - for link in asset['Links']: - uri = link['Uri'] - target = link['Target'] - format_id = target + kind = asset.get('Kind') + if kind == 'Image': + thumbnail = asset.get('Uri') + elif kind in ('VideoResource', 'AudioResource'): + duration = float_or_none(asset.get('DurationInMilliseconds'), 1000) + restricted_to_denmark = asset.get('RestrictedToDenmark') + spoken_subtitles = asset.get('Target') == 'SpokenSubtitles' + for link in asset.get('Links', []): + uri = link.get('Uri') + if not uri: + continue + target = link.get('Target') + format_id = target or '' preference = None if spoken_subtitles: preference = -1 format_id += '-spoken-subtitles' if target == 'HDS': - formats.extend(self._extract_f4m_formats( + f4m_formats = self._extract_f4m_formats( uri + '?hdcore=3.3.0&plugin=aasp-3.3.0.99.43', - video_id, preference, f4m_id=format_id)) + video_id, preference, f4m_id=format_id) + if kind == 'AudioResource': + for f in f4m_formats: + f['vcodec'] = 'none' + formats.extend(f4m_formats) elif target == 'HLS': formats.extend(self._extract_m3u8_formats( - uri, video_id, 'mp4', preference=preference, - m3u8_id=format_id)) + uri, video_id, 'mp4', entry_protocol='m3u8_native', + preference=preference, m3u8_id=format_id)) else: bitrate = link.get('Bitrate') if bitrate: @@ -85,8 +116,9 @@ class DRTVIE(InfoExtractor): formats.append({ 'url': uri, 'format_id': format_id, - 'tbr': bitrate, + 'tbr': int_or_none(bitrate), 'ext': link.get('FileFormat'), + 'vcodec': 'none' if kind == 'AudioResource' else None, }) subtitles_list = asset.get('SubtitlesList') if isinstance(subtitles_list, list): @@ -94,12 +126,18 @@ class DRTVIE(InfoExtractor): 'Danish': 'da', } for subs in subtitles_list: - lang = subs['Language'] - subtitles[LANGS.get(lang, lang)] = [{'url': subs['Uri'], 'ext': 'vtt'}] + if not subs.get('Uri'): + continue + lang = subs.get('Language') or 'da' + subtitles.setdefault(LANGS.get(lang, lang), []).append({ + 'url': subs['Uri'], + 'ext': mimetype2ext(subs.get('MimeType')) or 'vtt' + }) if not formats and restricted_to_denmark: - raise ExtractorError( - 'Unfortunately, DR is not allowed to show this program outside Denmark.', expected=True) + self.raise_geo_restricted( + 'Unfortunately, DR is not allowed to show this program outside Denmark.', + expected=True) self._sort_formats(formats) @@ -113,3 +151,58 @@ class DRTVIE(InfoExtractor): 'formats': formats, 'subtitles': subtitles, } + + +class DRTVLiveIE(InfoExtractor): + IE_NAME = 'drtv:live' + _VALID_URL = r'https?://(?:www\.)?dr\.dk/(?:tv|TV)/live/(?P<id>[\da-z-]+)' + _TEST = { + 'url': 'https://www.dr.dk/tv/live/dr1', + 'info_dict': { + 'id': 'dr1', + 'ext': 'mp4', + 'title': 're:^DR1 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + } + + def _real_extract(self, url): + channel_id = self._match_id(url) + channel_data = self._download_json( + 'https://www.dr.dk/mu-online/api/1.0/channel/' + channel_id, + channel_id) + title = self._live_title(channel_data['Title']) + + formats = [] + for streaming_server in channel_data.get('StreamingServers', []): + server = streaming_server.get('Server') + if not server: + continue + link_type = streaming_server.get('LinkType') + for quality in streaming_server.get('Qualities', []): + for stream in quality.get('Streams', []): + stream_path = stream.get('Stream') + if not stream_path: + continue + stream_url = update_url_query( + '%s/%s' % (server, stream_path), {'b': ''}) + if link_type == 'HLS': + formats.extend(self._extract_m3u8_formats( + stream_url, channel_id, 'mp4', + m3u8_id=link_type, fatal=False, live=True)) + elif link_type == 'HDS': + formats.extend(self._extract_f4m_formats(update_url_query( + '%s/%s' % (server, stream_path), {'hdcore': '3.7.0'}), + channel_id, f4m_id=link_type, fatal=False)) + self._sort_formats(formats) + + return { + 'id': channel_id, + 'title': title, + 'thumbnail': channel_data.get('PrimaryImageUri'), + 'formats': formats, + 'is_live': True, + } diff --git a/youtube_dl/extractor/dumpert.py b/youtube_dl/extractor/dumpert.py index e5aadcd25..c9fc9b5a9 100644 --- a/youtube_dl/extractor/dumpert.py +++ b/youtube_dl/extractor/dumpert.py @@ -21,7 +21,7 @@ class DumpertIE(InfoExtractor): 'ext': 'mp4', 'title': 'Ik heb nieuws voor je', 'description': 'Niet schrikken hoor', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', } }, { 'url': 'http://www.dumpert.nl/embed/6675421/dc440fe7/', diff --git a/youtube_dl/extractor/eagleplatform.py b/youtube_dl/extractor/eagleplatform.py index 12d28d3b9..76d39adac 100644 --- a/youtube_dl/extractor/eagleplatform.py +++ b/youtube_dl/extractor/eagleplatform.py @@ -4,11 +4,13 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_HTTPError +from ..compat import ( + compat_HTTPError, + compat_str, +) from ..utils import ( ExtractorError, int_or_none, - url_basename, ) @@ -29,7 +31,7 @@ class EaglePlatformIE(InfoExtractor): 'ext': 'mp4', 'title': 'Навальный вышел на свободу', 'description': 'md5:d97861ac9ae77377f3f20eaf9d04b4f5', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 87, 'view_count': int, 'age_limit': 0, @@ -43,7 +45,7 @@ class EaglePlatformIE(InfoExtractor): 'id': '12820', 'ext': 'mp4', 'title': "'O Sole Mio", - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 216, 'view_count': int, }, @@ -52,11 +54,24 @@ class EaglePlatformIE(InfoExtractor): @staticmethod def _extract_url(webpage): + # Regular iframe embedding mobj = re.search( r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//.+?\.media\.eagleplatform\.com/index/player\?.+?)\1', webpage) if mobj is not None: return mobj.group('url') + # Basic usage embedding (see http://dultonmedia.github.io/eplayer/) + mobj = re.search( + r'''(?xs) + <script[^>]+ + src=(?P<q1>["\'])(?:https?:)?//(?P<host>.+?\.media\.eagleplatform\.com)/player/player\.js(?P=q1) + .+? + <div[^>]+ + class=(?P<q2>["\'])eagleplayer(?P=q2)[^>]+ + data-id=["\'](?P<id>\d+) + ''', webpage) + if mobj is not None: + return 'eagleplatform:%(host)s:%(id)s' % mobj.groupdict() @staticmethod def _handle_error(response): @@ -64,7 +79,7 @@ class EaglePlatformIE(InfoExtractor): if status != 200: raise ExtractorError(' '.join(response['errors']), expected=True) - def _download_json(self, url_or_request, video_id, note='Downloading JSON metadata'): + def _download_json(self, url_or_request, video_id, note='Downloading JSON metadata', *args, **kwargs): try: response = super(EaglePlatformIE, self)._download_json(url_or_request, video_id, note) except ExtractorError as ee: @@ -103,29 +118,38 @@ class EaglePlatformIE(InfoExtractor): m3u8_url = self._get_video_url(secure_m3u8, video_id, 'Downloading m3u8 JSON') m3u8_formats = self._extract_m3u8_formats( - m3u8_url, video_id, - 'mp4', entry_protocol='m3u8_native', m3u8_id='hls') + m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False) formats.extend(m3u8_formats) - mp4_url = self._get_video_url( + m3u8_formats_dict = {} + for f in m3u8_formats: + if f.get('height') is not None: + m3u8_formats_dict[f['height']] = f + + mp4_data = self._download_json( # Secure mp4 URL is constructed according to Player.prototype.mp4 from # http://lentaru.media.eagleplatform.com/player/player.js - re.sub(r'm3u8|hlsvod|hls|f4m', 'mp4', secure_m3u8), - video_id, 'Downloading mp4 JSON') - mp4_url_basename = url_basename(mp4_url) - for m3u8_format in m3u8_formats: - mobj = re.search('/([^/]+)/index\.m3u8', m3u8_format['url']) - if mobj: - http_format = m3u8_format.copy() - video_url = mp4_url.replace(mp4_url_basename, mobj.group(1)) - if not self._is_valid_url(video_url, video_id): + re.sub(r'm3u8|hlsvod|hls|f4m', 'mp4s', secure_m3u8), + video_id, 'Downloading mp4 JSON', fatal=False) + if mp4_data: + for format_id, format_url in mp4_data.get('data', {}).items(): + if not isinstance(format_url, compat_str): continue - http_format.update({ - 'url': video_url, - 'format_id': m3u8_format['format_id'].replace('hls', 'http'), - 'protocol': 'http', - }) - formats.append(http_format) + height = int_or_none(format_id) + if height is not None and m3u8_formats_dict.get(height): + f = m3u8_formats_dict[height].copy() + f.update({ + 'format_id': f['format_id'].replace('hls', 'http'), + 'protocol': 'http', + }) + else: + f = { + 'format_id': 'http-%s' % format_id, + 'height': int_or_none(format_id), + } + f['url'] = format_url + formats.append(f) self._sort_formats(formats) diff --git a/youtube_dl/extractor/egghead.py b/youtube_dl/extractor/egghead.py new file mode 100644 index 000000000..db921465e --- /dev/null +++ b/youtube_dl/extractor/egghead.py @@ -0,0 +1,39 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class EggheadCourseIE(InfoExtractor): + IE_DESC = 'egghead.io course' + IE_NAME = 'egghead:course' + _VALID_URL = r'https://egghead\.io/courses/(?P<id>[a-zA-Z_0-9-]+)' + _TEST = { + 'url': 'https://egghead.io/courses/professor-frisby-introduces-composable-functional-javascript', + 'playlist_count': 29, + 'info_dict': { + 'id': 'professor-frisby-introduces-composable-functional-javascript', + 'title': 'Professor Frisby Introduces Composable Functional JavaScript', + 'description': 're:(?s)^This course teaches the ubiquitous.*You\'ll start composing functionality before you know it.$', + }, + } + + def _real_extract(self, url): + playlist_id = self._match_id(url) + webpage = self._download_webpage(url, playlist_id) + + title = self._html_search_regex(r'<h1 class="title">([^<]+)</h1>', webpage, 'title') + ul = self._search_regex(r'(?s)<ul class="series-lessons-list">(.*?)</ul>', webpage, 'session list') + + found = re.findall(r'(?s)<a class="[^"]*"\s*href="([^"]+)">\s*<li class="item', ul) + entries = [self.url_result(m) for m in found] + + return { + '_type': 'playlist', + 'id': playlist_id, + 'title': title, + 'description': self._og_search_description(webpage), + 'entries': entries, + } diff --git a/youtube_dl/extractor/einthusan.py b/youtube_dl/extractor/einthusan.py index f7339702c..6ca07a13d 100644 --- a/youtube_dl/extractor/einthusan.py +++ b/youtube_dl/extractor/einthusan.py @@ -14,24 +14,24 @@ class EinthusanIE(InfoExtractor): _TESTS = [ { 'url': 'http://www.einthusan.com/movies/watch.php?id=2447', - 'md5': 'af244f4458cd667205e513d75da5b8b1', + 'md5': 'd71379996ff5b7f217eca034c34e3461', 'info_dict': { 'id': '2447', 'ext': 'mp4', 'title': 'Ek Villain', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'description': 'md5:9d29fc91a7abadd4591fb862fa560d93', } }, { 'url': 'http://www.einthusan.com/movies/watch.php?id=1671', - 'md5': 'ef63c7a803e22315880ed182c10d1c5c', + 'md5': 'b16a6fd3c67c06eb7c79c8a8615f4213', 'info_dict': { 'id': '1671', 'ext': 'mp4', 'title': 'Soodhu Kavvuum', - 'thumbnail': 're:^https?://.*\.jpg$', - 'description': 'md5:05d8a0c0281a4240d86d76e14f2f4d51', + 'thumbnail': r're:^https?://.*\.jpg$', + 'description': 'md5:b40f2bf7320b4f9414f3780817b2af8c', } }, ] @@ -50,9 +50,11 @@ class EinthusanIE(InfoExtractor): video_id = self._search_regex( r'data-movieid=["\'](\d+)', webpage, 'video id', default=video_id) - video_url = self._download_webpage( + m3u8_url = self._download_webpage( 'http://cdn.einthusan.com/geturl/%s/hd/London,Washington,Toronto,Dallas,San,Sydney/' - % video_id, video_id) + % video_id, video_id, headers={'Referer': url}) + formats = self._extract_m3u8_formats( + m3u8_url, video_id, ext='mp4', entry_protocol='m3u8_native') description = self._html_search_meta('description', webpage) thumbnail = self._html_search_regex( @@ -64,7 +66,7 @@ class EinthusanIE(InfoExtractor): return { 'id': video_id, 'title': title, - 'url': video_url, + 'formats': formats, 'thumbnail': thumbnail, 'description': description, } diff --git a/youtube_dl/extractor/eitb.py b/youtube_dl/extractor/eitb.py index 713cb7b32..ee5ead18b 100644 --- a/youtube_dl/extractor/eitb.py +++ b/youtube_dl/extractor/eitb.py @@ -1,4 +1,4 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor diff --git a/youtube_dl/extractor/elpais.py b/youtube_dl/extractor/elpais.py index 8c725a4e6..99e00cf3c 100644 --- a/youtube_dl/extractor/elpais.py +++ b/youtube_dl/extractor/elpais.py @@ -2,7 +2,7 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import unified_strdate +from ..utils import strip_jsonp, unified_strdate class ElPaisIE(InfoExtractor): @@ -29,6 +29,16 @@ class ElPaisIE(InfoExtractor): 'description': 'Que sí, que las cápsulas son cómodas. Pero si le pides algo más a la vida, quizá deberías aprender a usar bien la cafetera italiana. No tienes más que ver este vídeo y seguir sus siete normas básicas.', 'upload_date': '20160303', } + }, { + 'url': 'http://elpais.com/elpais/2017/01/26/ciencia/1485456786_417876.html', + 'md5': '9c79923a118a067e1a45789e1e0b0f9c', + 'info_dict': { + 'id': '1485456786_417876', + 'ext': 'mp4', + 'title': 'Hallado un barco de la antigua Roma que naufragó en Baleares hace 1.800 años', + 'description': 'La nave portaba cientos de ánforas y se hundió cerca de la isla de Cabrera por razones desconocidas', + 'upload_date': '20170127', + }, }] def _real_extract(self, url): @@ -37,8 +47,15 @@ class ElPaisIE(InfoExtractor): prefix = self._html_search_regex( r'var\s+url_cache\s*=\s*"([^"]+)";', webpage, 'URL prefix') - video_suffix = self._search_regex( - r"(?:URLMediaFile|urlVideo_\d+)\s*=\s*url_cache\s*\+\s*'([^']+)'", webpage, 'video URL') + id_multimedia = self._search_regex( + r"id_multimedia\s*=\s*'([^']+)'", webpage, 'ID multimedia', default=None) + if id_multimedia: + url_info = self._download_json( + 'http://elpais.com/vdpep/1/?pepid=' + id_multimedia, video_id, transform_source=strip_jsonp) + video_suffix = url_info['mp4'] + else: + video_suffix = self._search_regex( + r"(?:URLMediaFile|urlVideo_\d+)\s*=\s*url_cache\s*\+\s*'([^']+)'", webpage, 'video URL') video_url = prefix + video_suffix thumbnail_suffix = self._search_regex( r"(?:URLMediaStill|urlFotogramaFijo_\d+)\s*=\s*url_cache\s*\+\s*'([^']+)'", diff --git a/youtube_dl/extractor/embedly.py b/youtube_dl/extractor/embedly.py index 1cdb11e34..a5820b21e 100644 --- a/youtube_dl/extractor/embedly.py +++ b/youtube_dl/extractor/embedly.py @@ -1,4 +1,4 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor diff --git a/youtube_dl/extractor/engadget.py b/youtube_dl/extractor/engadget.py index a39e9010d..65635c18b 100644 --- a/youtube_dl/extractor/engadget.py +++ b/youtube_dl/extractor/engadget.py @@ -4,7 +4,7 @@ from .common import InfoExtractor class EngadgetIE(InfoExtractor): - _VALID_URL = r'https?://www.engadget.com/video/(?P<id>[^/?#]+)' + _VALID_URL = r'https?://(?:www\.)?engadget\.com/video/(?P<id>[^/?#]+)' _TESTS = [{ # video with 5min ID diff --git a/youtube_dl/extractor/eroprofile.py b/youtube_dl/extractor/eroprofile.py index 297f8a6f5..c08643a17 100644 --- a/youtube_dl/extractor/eroprofile.py +++ b/youtube_dl/extractor/eroprofile.py @@ -22,7 +22,7 @@ class EroProfileIE(InfoExtractor): 'display_id': 'sexy-babe-softcore', 'ext': 'm4v', 'title': 'sexy babe softcore', - 'thumbnail': 're:https?://.*\.jpg', + 'thumbnail': r're:https?://.*\.jpg', 'age_limit': 18, } }, { @@ -32,7 +32,7 @@ class EroProfileIE(InfoExtractor): 'id': '1133519', 'ext': 'm4v', 'title': 'Try It On Pee_cut_2.wmv - 4shared.com - file sharing - download movie file', - 'thumbnail': 're:https?://.*\.jpg', + 'thumbnail': r're:https?://.*\.jpg', 'age_limit': 18, }, 'skip': 'Requires login', diff --git a/youtube_dl/extractor/escapist.py b/youtube_dl/extractor/escapist.py index a3d7bbbcb..4d8a3c134 100644 --- a/youtube_dl/extractor/escapist.py +++ b/youtube_dl/extractor/escapist.py @@ -45,7 +45,7 @@ class EscapistIE(InfoExtractor): 'ext': 'mp4', 'description': "Baldur's Gate: Original, Modded or Enhanced Edition? I'll break down what you can expect from the new Baldur's Gate: Enhanced Edition.", 'title': "Breaking Down Baldur's Gate", - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 264, 'uploader': 'The Escapist', } @@ -57,7 +57,7 @@ class EscapistIE(InfoExtractor): 'ext': 'mp4', 'description': 'This week, Zero Punctuation reviews Evolve.', 'title': 'Evolve - One vs Multiplayer', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 304, 'uploader': 'The Escapist', } diff --git a/youtube_dl/extractor/espn.py b/youtube_dl/extractor/espn.py index 66c08bec4..8795e0ddf 100644 --- a/youtube_dl/extractor/espn.py +++ b/youtube_dl/extractor/espn.py @@ -1,38 +1,117 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import remove_end +from ..compat import compat_str +from ..utils import ( + determine_ext, + int_or_none, + unified_timestamp, +) class ESPNIE(InfoExtractor): - _VALID_URL = r'https?://espn\.go\.com/(?:[^/]+/)*(?P<id>[^/]+)' + _VALID_URL = r'https?://(?:espn\.go|(?:www\.)?espn)\.com/video/clip(?:\?.*?\bid=|/_/id/)(?P<id>\d+)' _TESTS = [{ 'url': 'http://espn.go.com/video/clip?id=10365079', - 'md5': '60e5d097a523e767d06479335d1bdc58', 'info_dict': { - 'id': 'FkYWtmazr6Ed8xmvILvKLWjd4QvYZpzG', + 'id': '10365079', 'ext': 'mp4', 'title': '30 for 30 Shorts: Judging Jewell', - 'description': None, + 'description': 'md5:39370c2e016cb4ecf498ffe75bef7f0f', + 'timestamp': 1390936111, + 'upload_date': '20140128', }, 'params': { 'skip_download': True, }, - 'add_ie': ['OoyalaExternal'], }, { # intl video, from http://www.espnfc.us/video/mls-highlights/150/video/2743663/must-see-moments-best-of-the-mls-season 'url': 'http://espn.go.com/video/clip?id=2743663', - 'md5': 'f4ac89b59afc7e2d7dbb049523df6768', 'info_dict': { - 'id': '50NDFkeTqRHB0nXBOK-RGdSG5YQPuxHg', + 'id': '2743663', 'ext': 'mp4', 'title': 'Must-See Moments: Best of the MLS season', + 'description': 'md5:4c2d7232beaea572632bec41004f0aeb', + 'timestamp': 1449446454, + 'upload_date': '20151207', }, 'params': { 'skip_download': True, }, - 'add_ie': ['OoyalaExternal'], + 'expected_warnings': ['Unable to download f4m manifest'], }, { + 'url': 'http://www.espn.com/video/clip?id=10365079', + 'only_matching': True, + }, { + 'url': 'http://www.espn.com/video/clip/_/id/17989860', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + clip = self._download_json( + 'http://api-app.espn.com/v1/video/clips/%s' % video_id, + video_id)['videos'][0] + + title = clip['headline'] + + format_urls = set() + formats = [] + + def traverse_source(source, base_source_id=None): + for source_id, source in source.items(): + if isinstance(source, compat_str): + extract_source(source, base_source_id) + elif isinstance(source, dict): + traverse_source( + source, + '%s-%s' % (base_source_id, source_id) + if base_source_id else source_id) + + def extract_source(source_url, source_id=None): + if source_url in format_urls: + return + format_urls.add(source_url) + ext = determine_ext(source_url) + if ext == 'smil': + formats.extend(self._extract_smil_formats( + source_url, video_id, fatal=False)) + elif ext == 'f4m': + formats.extend(self._extract_f4m_formats( + source_url, video_id, f4m_id=source_id, fatal=False)) + elif ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + source_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id=source_id, fatal=False)) + else: + formats.append({ + 'url': source_url, + 'format_id': source_id, + }) + + traverse_source(clip['links']['source']) + self._sort_formats(formats) + + description = clip.get('caption') or clip.get('description') + thumbnail = clip.get('thumbnail') + duration = int_or_none(clip.get('duration')) + timestamp = unified_timestamp(clip.get('originalPublishDate')) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'timestamp': timestamp, + 'duration': duration, + 'formats': formats, + } + + +class ESPNArticleIE(InfoExtractor): + _VALID_URL = r'https?://(?:espn\.go|(?:www\.)?espn)\.com/(?:[^/]+/)*(?P<id>[^/]+)' + _TESTS = [{ 'url': 'https://espn.go.com/video/iframe/twitter/?cms=espn&id=10365079', 'only_matching': True, }, { @@ -49,6 +128,10 @@ class ESPNIE(InfoExtractor): 'only_matching': True, }] + @classmethod + def suitable(cls, url): + return False if ESPNIE.suitable(url) else super(ESPNArticleIE, cls).suitable(url) + def _real_extract(self, url): video_id = self._match_id(url) @@ -58,23 +141,5 @@ class ESPNIE(InfoExtractor): r'class=(["\']).*?video-play-button.*?\1[^>]+data-id=["\'](?P<id>\d+)', webpage, 'video id', group='id') - cms = 'espn' - if 'data-source="intl"' in webpage: - cms = 'intl' - player_url = 'https://espn.go.com/video/iframe/twitter/?id=%s&cms=%s' % (video_id, cms) - player = self._download_webpage( - player_url, video_id) - - pcode = self._search_regex( - r'["\']pcode=([^"\']+)["\']', player, 'pcode') - - title = remove_end( - self._og_search_title(webpage), - '- ESPN Video').strip() - - return { - '_type': 'url_transparent', - 'url': 'ooyalaexternal:%s:%s:%s' % (cms, video_id, pcode), - 'ie_key': 'OoyalaExternal', - 'title': title, - } + return self.url_result( + 'http://espn.go.com/video/clip?id=%s' % video_id, ESPNIE.ie_key()) diff --git a/youtube_dl/extractor/esri.py b/youtube_dl/extractor/esri.py index d4205d7fb..e9dcaeb1d 100644 --- a/youtube_dl/extractor/esri.py +++ b/youtube_dl/extractor/esri.py @@ -22,7 +22,7 @@ class EsriVideoIE(InfoExtractor): 'ext': 'mp4', 'title': 'ArcGIS Online - Developing Applications', 'description': 'Jeremy Bartley demonstrates how to develop applications with ArcGIS Online.', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 185, 'upload_date': '20120419', } diff --git a/youtube_dl/extractor/europa.py b/youtube_dl/extractor/europa.py index adc43919e..1efc0b2ec 100644 --- a/youtube_dl/extractor/europa.py +++ b/youtube_dl/extractor/europa.py @@ -23,7 +23,7 @@ class EuropaIE(InfoExtractor): 'ext': 'mp4', 'title': 'TRADE - Wikileaks on TTIP', 'description': 'NEW LIVE EC Midday press briefing of 11/08/2015', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'upload_date': '20150811', 'duration': 34, 'view_count': int, diff --git a/youtube_dl/extractor/exfm.py b/youtube_dl/extractor/exfm.py deleted file mode 100644 index 09ed4f2b5..000000000 --- a/youtube_dl/extractor/exfm.py +++ /dev/null @@ -1,58 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor - - -class ExfmIE(InfoExtractor): - IE_NAME = 'exfm' - IE_DESC = 'ex.fm' - _VALID_URL = r'https?://(?:www\.)?ex\.fm/song/(?P<id>[^/]+)' - _SOUNDCLOUD_URL = r'http://(?:www\.)?api\.soundcloud\.com/tracks/([^/]+)/stream' - _TESTS = [ - { - 'url': 'http://ex.fm/song/eh359', - 'md5': 'e45513df5631e6d760970b14cc0c11e7', - 'info_dict': { - 'id': '44216187', - 'ext': 'mp3', - 'title': 'Test House "Love Is Not Enough" (Extended Mix) DeadJournalist Exclusive', - 'uploader': 'deadjournalist', - 'upload_date': '20120424', - 'description': 'Test House \"Love Is Not Enough\" (Extended Mix) DeadJournalist Exclusive', - }, - 'note': 'Soundcloud song', - 'skip': 'The site is down too often', - }, - { - 'url': 'http://ex.fm/song/wddt8', - 'md5': '966bd70741ac5b8570d8e45bfaed3643', - 'info_dict': { - 'id': 'wddt8', - 'ext': 'mp3', - 'title': 'Safe and Sound', - 'uploader': 'Capital Cities', - }, - 'skip': 'The site is down too often', - }, - ] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - song_id = mobj.group('id') - info_url = 'http://ex.fm/api/v3/song/%s' % song_id - info = self._download_json(info_url, song_id)['song'] - song_url = info['url'] - if re.match(self._SOUNDCLOUD_URL, song_url) is not None: - self.to_screen('Soundcloud song detected') - return self.url_result(song_url.replace('/stream', ''), 'Soundcloud') - return { - 'id': song_id, - 'url': song_url, - 'ext': 'mp3', - 'title': info['title'], - 'thumbnail': info['image']['large'], - 'uploader': info['artist'], - 'view_count': info['loved_count'], - } diff --git a/youtube_dl/extractor/expotv.py b/youtube_dl/extractor/expotv.py index 1585a03bb..95a897782 100644 --- a/youtube_dl/extractor/expotv.py +++ b/youtube_dl/extractor/expotv.py @@ -1,7 +1,5 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..utils import ( int_or_none, @@ -10,25 +8,24 @@ from ..utils import ( class ExpoTVIE(InfoExtractor): - _VALID_URL = r'https?://www\.expotv\.com/videos/[^?#]*/(?P<id>[0-9]+)($|[?#])' + _VALID_URL = r'https?://(?:www\.)?expotv\.com/videos/[^?#]*/(?P<id>[0-9]+)($|[?#])' _TEST = { - 'url': 'http://www.expotv.com/videos/reviews/1/24/LinneCardscom/17561', - 'md5': '2985e6d7a392b2f7a05e0ca350fe41d0', + 'url': 'http://www.expotv.com/videos/reviews/3/40/NYX-Butter-lipstick/667916', + 'md5': 'fe1d728c3a813ff78f595bc8b7a707a8', 'info_dict': { - 'id': '17561', + 'id': '667916', 'ext': 'mp4', - 'upload_date': '20060212', - 'title': 'My Favorite Online Scrapbook Store', + 'title': 'NYX Butter Lipstick Little Susie', + 'description': 'Goes on like butter, but looks better!', + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': 'Stephanie S.', + 'upload_date': '20150520', 'view_count': int, - 'description': 'You\'ll find most everything you need at this virtual store front.', - 'uploader': 'Anna T.', - 'thumbnail': 're:^https?://.*\.jpg$', } } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) player_key = self._search_regex( @@ -66,7 +63,7 @@ class ExpoTVIE(InfoExtractor): fatal=False) upload_date = unified_strdate(self._search_regex( r'<h5>Reviewed on ([0-9/.]+)</h5>', webpage, 'upload date', - fatal=False)) + fatal=False), day_first=False) return { 'id': video_id, diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 17d12e315..b97f67b97 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1,12 +1,18 @@ # flake8: noqa from __future__ import unicode_literals -from .abc import ABCIE -from .abc7news import Abc7NewsIE +from .abc import ( + ABCIE, + ABCIViewIE, +) from .abcnews import ( AbcNewsIE, AbcNewsVideoIE, ) +from .abcotvs import ( + ABCOTVSIE, + ABCOTVSClipsIE, +) from .academicearth import AcademicEarthCourseIE from .acast import ( ACastIE, @@ -24,18 +30,18 @@ from .aenetworks import ( AENetworksIE, HistoryTopicIE, ) -from .afreecatv import AfreecaTVIE -from .aftonbladet import AftonbladetIE +from .afreecatv import ( + AfreecaTVIE, + AfreecaTVGlobalIE, +) from .airmozilla import AirMozillaIE from .aljazeera import AlJazeeraIE from .alphaporno import AlphaPornoIE +from .amcnetworks import AMCNetworksIE from .animeondemand import AnimeOnDemandIE from .anitube import AnitubeIE from .anysex import AnySexIE -from .aol import ( - AolIE, - AolFeaturesIE, -) +from .aol import AolIE from .allocine import AllocineIE from .aparat import AparatIE from .appleconnect import AppleConnectIE @@ -60,6 +66,7 @@ from .arte import ( ArteTVDDCIE, ArteTVMagazineIE, ArteTVEmbedIE, + TheOperaPlatformIE, ArteTVPlaylistIE, ) from .atresplayer import AtresPlayerIE @@ -67,6 +74,16 @@ from .atttechchannel import ATTTechChannelIE from .audimedia import AudiMediaIE from .audioboom import AudioBoomIE from .audiomack import AudiomackIE, AudiomackAlbumIE +from .awaan import ( + AWAANIE, + AWAANVideoIE, + AWAANLiveIE, + AWAANSeasonIE, +) +from .azmedien import ( + AZMedienIE, + AZMedienPlaylistIE, +) from .azubu import AzubuIE, AzubuLiveIE from .baidu import BaiduVideoIE from .bambuser import BambuserIE, BambuserChannelIE @@ -78,13 +95,18 @@ from .bbc import ( BBCCoUkPlaylistIE, BBCIE, ) +from .beampro import BeamProLiveIE from .beeg import BeegIE from .behindkink import BehindKinkIE -from .beatportpro import BeatportProIE +from .bellmedia import BellMediaIE +from .beatport import BeatportIE from .bet import BetIE from .bigflix import BigflixIE from .bild import BildIE -from .bilibili import BiliBiliIE +from .bilibili import ( + BiliBiliIE, + BiliBiliBangumiIE, +) from .biobiochiletv import BioBioChileTVIE from .biqle import BIQLEIE from .bleacherreport import ( @@ -103,7 +125,10 @@ from .brightcove import ( BrightcoveNewIE, ) from .buzzfeed import BuzzFeedIE -from .byutv import BYUtvIE +from .byutv import ( + BYUtvIE, + BYUtvEventIE, +) from .c56 import C56IE from .camdemy import ( CamdemyIE, @@ -117,9 +142,12 @@ from .carambatv import ( CarambaTVIE, CarambaTVPageIE, ) +from .cartoonnetwork import CartoonNetworkIE from .cbc import ( CBCIE, CBCPlayerIE, + CBCWatchVideoIE, + CBCWatchIE, ) from .cbs import CBSIE from .cbslocal import CBSLocalIE @@ -130,9 +158,12 @@ from .cbsnews import ( ) from .cbssports import CBSSportsIE from .ccc import CCCIE +from .ccma import CCMAIE +from .cctv import CCTVIE from .cda import CDAIE from .ceskatelevize import CeskaTelevizeIE from .channel9 import Channel9IE +from .charlierose import CharlieRoseIE from .chaturbate import ChaturbateIE from .chilloutzone import ChilloutzoneIE from .chirbit import ( @@ -158,6 +189,7 @@ from .cnn import ( from .coub import CoubIE from .collegerama import CollegeRamaIE from .comedycentral import ( + ComedyCentralFullEpisodesIE, ComedyCentralIE, ComedyCentralShortnameIE, ComedyCentralTVIE, @@ -165,7 +197,10 @@ from .comedycentral import ( ) from .comcarcoff import ComCarCoffIE from .commonmistakes import CommonMistakesIE, UnicodeBOMIE -from .commonprotocols import RtmpIE +from .commonprotocols import ( + MmsIE, + RtmpIE, +) from .condenast import CondeNastIE from .cracked import CrackedIE from .crackle import CrackleIE @@ -177,9 +212,12 @@ from .crunchyroll import ( ) from .cspan import CSpanIE from .ctsnews import CtsNewsIE -from .ctv import CTVIE from .ctvnews import CTVNewsIE from .cultureunplugged import CultureUnpluggedIE +from .curiositystream import ( + CuriosityStreamIE, + CuriosityStreamCollectionIE, +) from .cwtv import CWTVIE from .dailymail import DailyMailIE from .dailymotion import ( @@ -195,12 +233,6 @@ from .daum import ( DaumUserIE, ) from .dbtv import DBTVIE -from .dcn import ( - DCNIE, - DCNVideoIE, - DCNLiveIE, - DCNSeasonIE, -) from .dctp import DctpTvIE from .deezer import DeezerPlaylistIE from .democracynow import DemocracynowIE @@ -216,11 +248,16 @@ from .dramafever import ( from .dreisat import DreiSatIE from .drbonanza import DRBonanzaIE from .drtuber import DrTuberIE -from .drtv import DRTVIE +from .drtv import ( + DRTVIE, + DRTVLiveIE, +) from .dvtv import DVTVIE from .dumpert import DumpertIE from .defense import DefenseGouvFrIE from .discovery import DiscoveryIE +from .discoverygo import DiscoveryGoIE +from .disney import DisneyIE from .dispeak import DigitallySpeakingIE from .dropbox import DropboxIE from .dw import ( @@ -230,6 +267,7 @@ from .dw import ( from .eagleplatform import EaglePlatformIE from .ebaumsworld import EbaumsWorldIE from .echomsk import EchoMskIE +from .egghead import EggheadCourseIE from .ehow import EHowIE from .eighttracks import EightTracksIE from .einthusan import EinthusanIE @@ -244,18 +282,30 @@ from .engadget import EngadgetIE from .eporner import EpornerIE from .eroprofile import EroProfileIE from .escapist import EscapistIE -from .espn import ESPNIE +from .espn import ( + ESPNIE, + ESPNArticleIE, +) from .esri import EsriVideoIE from .europa import EuropaIE from .everyonesmixtape import EveryonesMixtapeIE -from .exfm import ExfmIE from .expotv import ExpoTVIE from .extremetube import ExtremeTubeIE from .eyedotv import EyedoTVIE -from .facebook import FacebookIE +from .facebook import ( + FacebookIE, + FacebookPluginsVideoIE, +) from .faz import FazIE -from .fc2 import FC2IE +from .fc2 import ( + FC2IE, + FC2EmbedIE, +) from .fczenit import FczenitIE +from .filmon import ( + FilmOnIE, + FilmOnChannelIE, +) from .firstpost import FirstpostIE from .firsttv import FirstTVIE from .fivemin import FiveMinIE @@ -268,13 +318,15 @@ from .footyroom import FootyRoomIE from .formula1 import Formula1IE from .fourtube import FourTubeIE from .fox import FOXIE +from .fox9 import FOX9IE from .foxgay import FoxgayIE -from .foxnews import FoxNewsIE -from .foxsports import FoxSportsIE -from .franceculture import ( - FranceCultureIE, - FranceCultureEmissionIE, +from .foxnews import ( + FoxNewsIE, + FoxNewsArticleIE, + FoxNewsInsiderIE, ) +from .foxsports import FoxSportsIE +from .franceculture import FranceCultureIE from .franceinter import FranceInterIE from .francetv import ( PluzzIE, @@ -285,10 +337,10 @@ from .francetv import ( ) from .freesound import FreesoundIE from .freespeech import FreespeechIE -from .freevideo import FreeVideoIE from .funimation import FunimationIE from .funnyordie import FunnyOrDieIE from .fusion import FusionIE +from .fxnetworks import FXNetworksIE from .gameinformer import GameInformerIE from .gameone import ( GameOneIE, @@ -308,10 +360,10 @@ from .globo import ( GloboIE, GloboArticleIE, ) +from .go import GoIE from .go90 import Go90IE from .godtube import GodTubeIE from .godtv import GodTVIE -from .goldenmoustache import GoldenMoustacheIE from .golem import GolemIE from .googledrive import GoogleDriveIE from .googleplus import GooglePlusIE @@ -320,14 +372,22 @@ from .goshgay import GoshgayIE from .gputechconf import GPUTechConfIE from .groupon import GrouponIE from .hark import HarkIE -from .hbo import HBOIE +from .hbo import ( + HBOIE, + HBOEpisodeIE, +) from .hearthisat import HearThisAtIE from .heise import HeiseIE from .hellporno import HellPornoIE from .helsinki import HelsinkiIE from .hentaistigma import HentaiStigmaIE +from .hgtv import ( + HGTVIE, + HGTVComShowIE, +) from .historicfilms import HistoricFilmsIE from .hitbox import HitboxIE, HitboxLiveIE +from .hitrecord import HitRecordIE from .hornbunny import HornBunnyIE from .hotnewhiphop import HotNewHipHopIE from .hotstar import HotStarIE @@ -337,6 +397,7 @@ from .hrti import ( HRTiIE, HRTiPlaylistIE, ) +from .huajiao import HuajiaoIE from .huffpost import HuffPostIE from .hypem import HypemIE from .iconosquare import IconosquareIE @@ -354,6 +415,7 @@ from .imgur import ( ImgurAlbumIE, ) from .ina import InaIE +from .inc import IncIE from .indavideo import ( IndavideoIE, IndavideoEmbedIE, @@ -364,12 +426,18 @@ from .internetvideoarchive import InternetVideoArchiveIE from .iprima import IPrimaIE from .iqiyi import IqiyiIE from .ir90tv import Ir90TvIE +from .itv import ITVIE from .ivi import ( IviIE, IviCompilationIE ) from .ivideon import IvideonIE +from .iwara import IwaraIE from .izlesene import IzleseneIE +from .jamendo import ( + JamendoIE, + JamendoAlbumIE, +) from .jeuxvideo import JeuxVideoIE from .jove import JoveIE from .jwplatform import JWPlatformIE @@ -381,6 +449,7 @@ from .kankan import KankanIE from .karaoketv import KaraoketvIE from .karrierevideos import KarriereVideosIE from .keezmovies import KeezMoviesIE +from .ketnet import KetnetIE from .khanacademy import KhanAcademyIE from .kickstarter import KickStarterIE from .keek import KeekIE @@ -398,13 +467,18 @@ from .kuwo import ( KuwoMvIE, ) from .la7 import LA7IE -from .laola1tv import Laola1TvIE +from .laola1tv import ( + Laola1TvEmbedIE, + Laola1TvIE, +) +from .lci import LCIIE from .lcp import ( LcpPlayIE, LcpIE, ) from .learnr import LearnrIE from .lecture2go import Lecture2GoIE +from .lego import LEGOIE from .lemonde import LemondeIE from .leeco import ( LeIE, @@ -442,13 +516,20 @@ from .macgamestore import MacGameStoreIE from .mailru import MailRuIE from .makerschannel import MakersChannelIE from .makertv import MakerTVIE +from .mangomolo import ( + MangomoloVideoIE, + MangomoloLiveIE, +) from .matchtv import MatchTVIE from .mdr import MDRIE +from .meipai import MeipaiIE +from .melonvod import MelonVODIE from .meta import METAIE from .metacafe import MetacafeIE from .metacritic import MetacriticIE from .mgoon import MgoonIE from .mgtv import MGTVIE +from .miaopai import MiaoPaiIE from .microsoftvirtualacademy import ( MicrosoftVirtualAcademyIE, MicrosoftVirtualAcademyCourseIE, @@ -477,11 +558,14 @@ from .motherless import MotherlessIE from .motorsport import MotorsportIE from .movieclips import MovieClipsIE from .moviezine import MoviezineIE +from .movingimage import MovingImageIE from .msn import MSNIE from .mtv import ( MTVIE, + MTVVideoIE, MTVServicesEmbeddedIE, MTVDEIE, + MTV81IE, ) from .muenchentv import MuenchenTVIE from .musicplayon import MusicPlayOnIE @@ -502,6 +586,7 @@ from .nbc import ( CSNNEIE, NBCIE, NBCNewsIE, + NBCOlympicsIE, NBCSportsIE, NBCSportsVPlayerIE, ) @@ -530,9 +615,11 @@ from .nextmedia import ( NextMediaIE, NextMediaActionNewsIE, AppleDailyIE, + NextTVIE, ) from .nfb import NFBIE from .nfl import NFLIE +from .nhk import NhkVodIE from .nhl import ( NHLVideocenterIE, NHLNewsIE, @@ -542,12 +629,17 @@ from .nhl import ( from .nick import ( NickIE, NickDeIE, + NickNightIE, ) from .niconico import NiconicoIE, NiconicoPlaylistIE -from .ninecninemedia import NineCNineMediaIE +from .ninecninemedia import ( + NineCNineMediaStackIE, + NineCNineMediaIE, +) from .ninegag import NineGagIE from .ninenow import NineNowIE from .nintendo import NintendoIE +from .nobelprize import NobelPrizeIE from .noco import NocoIE from .normalboots import NormalbootsIE from .nosvideo import NosVideoIE @@ -570,13 +662,14 @@ from .nowtv import ( ) from .noz import NozIE from .npo import ( + AndereTijdenIE, NPOIE, NPOLiveIE, NPORadioIE, NPORadioFragmentIE, SchoolTVIE, VPROIE, - WNLIE + WNLIE, ) from .npr import NprIE from .nrk import ( @@ -584,6 +677,9 @@ from .nrk import ( NRKPlaylistIE, NRKSkoleIE, NRKTVIE, + NRKTVDirekteIE, + NRKTVEpisodesIE, + NRKTVSeriesIE, ) from .ntvde import NTVDeIE from .ntvru import NTVRuIE @@ -592,9 +688,11 @@ from .nytimes import ( NYTimesArticleIE, ) from .nuvid import NuvidIE +from .nzz import NZZIE from .odatv import OdaTVIE from .odnoklassniki import OdnoklassnikiIE from .oktoberfesttv import OktoberfestTVIE +from .ondemandkorea import OnDemandKoreaIE from .onet import ( OnetIE, OnetChannelIE, @@ -612,6 +710,7 @@ from .orf import ( ORFFM4IE, ORFIPTVIE, ) +from .pandatv import PandaTVIE from .pandoratv import PandoraTVIE from .parliamentliveuk import ParliamentLiveUKIE from .patreon import PatreonIE @@ -624,9 +723,9 @@ from .periscope import ( from .philharmoniedeparis import PhilharmonieDeParisIE from .phoenix import PhoenixIE from .photobucket import PhotobucketIE +from .piksel import PikselIE from .pinkbike import PinkbikeIE from .pladform import PladformIE -from .played import PlayedIE from .playfm import PlayFMIE from .plays import PlaysTVIE from .playtvak import PlaytvakIE @@ -638,8 +737,13 @@ from .pluralsight import ( ) from .podomatic import PodomaticIE from .pokemon import PokemonIE -from .polskieradio import PolskieRadioIE +from .polskieradio import ( + PolskieRadioIE, + PolskieRadioCategoryIE, +) from .porn91 import Porn91IE +from .porncom import PornComIE +from .pornflip import PornFlipIE from .pornhd import PornHdIE from .pornhub import ( PornHubIE, @@ -682,6 +786,10 @@ from .rbmaradio import RBMARadioIE from .rds import RDSIE from .redtube import RedTubeIE from .regiotv import RegioTVIE +from .rentv import ( + RENTVIE, + RENTVArticleIE, +) from .restudy import RestudyIE from .reuters import ReutersIE from .reverbnation import ReverbNationIE @@ -691,11 +799,13 @@ from .revision3 import ( ) from .rice import RICEIE from .ringtv import RingTVIE +from .rmcdecouverte import RMCDecouverteIE from .ro220 import Ro220IE from .rockstargames import RockstarGamesIE from .roosterteeth import RoosterTeethIE from .rottentomatoes import RottenTomatoesIE from .roxwel import RoxwelIE +from .rozhlas import RozhlasIE from .rtbf import RTBFIE from .rte import RteIE, RteRadioIE from .rtlnl import RtlNlIE @@ -728,16 +838,17 @@ from .sbs import SBSIE from .scivee import SciVeeIE from .screencast import ScreencastIE from .screencastomatic import ScreencastOMaticIE -from .screenjunkies import ScreenJunkiesIE -from .screenwavemedia import ScreenwaveMediaIE, TeamFourIE from .seeker import SeekerIE from .senateisvp import SenateISVPIE from .sendtonews import SendtoNewsIE from .servingsys import ServingSysIE from .sexu import SexuIE from .shahid import ShahidIE -from .shared import SharedIE -from .sharesix import ShareSixIE +from .shared import ( + SharedIE, + VivoIE, +) +from .showroomlive import ShowRoomLiveIE from .sina import SinaIE from .sixplay import SixPlayIE from .skynewsarabia import ( @@ -755,6 +866,7 @@ from .smotri import ( ) from .snotr import SnotrIE from .sohu import SohuIE +from .sonyliv import SonyLIVIE from .soundcloud import ( SoundcloudIE, SoundcloudSetIE, @@ -780,10 +892,7 @@ from .spiegeltv import SpiegeltvIE from .spike import SpikeIE from .stitcher import StitcherIE from .sport5 import Sport5IE -from .sportbox import ( - SportBoxIE, - SportBoxEmbedIE, -) +from .sportbox import SportBoxEmbedIE from .sportdeutschland import SportDeutschlandIE from .sportschau import SportschauIE from .srgssr import ( @@ -791,7 +900,6 @@ from .srgssr import ( SRGSSRPlayIE, ) from .srmediathek import SRMediathekIE -from .ssa import SSAIE from .stanfordoc import StanfordOpenClassroomIE from .steam import SteamIE from .streamable import StreamableIE @@ -810,8 +918,8 @@ from .tagesschau import ( TagesschauPlayerIE, TagesschauIE, ) -from .tapely import TapelyIE from .tass import TassIE +from .tbs import TBSIE from .tdslifeway import TDSLifewayIE from .teachertube import ( TeacherTubeIE, @@ -819,6 +927,7 @@ from .teachertube import ( ) from .teachingchannel import TeachingChannelIE from .teamcoco import TeamcocoIE +from .teamfourstar import TeamFourStarIE from .techtalks import TechTalksIE from .ted import TEDIE from .tele13 import Tele13IE @@ -826,10 +935,12 @@ from .telebruxelles import TeleBruxellesIE from .telecinco import TelecincoIE from .telegraaf import TelegraafIE from .telemb import TeleMBIE +from .telequebec import TeleQuebecIE from .teletask import TeleTaskIE from .telewebion import TelewebionIE from .testurl import TestURLIE from .tf1 import TF1IE +from .tfo import TFOIE from .theintercept import TheInterceptIE from .theplatform import ( ThePlatformIE, @@ -838,8 +949,10 @@ from .theplatform import ( from .thescene import TheSceneIE from .thesixtyone import TheSixtyOneIE from .thestar import TheStarIE +from .theweatherchannel import TheWeatherChannelIE from .thisamericanlife import ThisAmericanLifeIE from .thisav import ThisAVIE +from .thisoldhouse import ThisOldHouseIE from .threeqsdn import ThreeQSDNIE from .tinypic import TinyPicIE from .tlc import TlcDeIE @@ -854,16 +967,12 @@ from .tnaflix import ( MovieFapIE, ) from .toggle import ToggleIE -from .thvideo import ( - THVideoIE, - THVideoPlaylistIE -) +from .tonline import TOnlineIE from .toutv import TouTvIE from .toypics import ToypicsUserIE, ToypicsIE from .traileraddict import TrailerAddictIE from .trilulilu import TriluliluIE -from .trollvids import TrollvidsIE -from .trutube import TruTubeIE +from .trutv import TruTVIE from .tube8 import Tube8IE from .tubitv import TubiTvIE from .tudou import ( @@ -887,18 +996,27 @@ from .tv2 import ( ) from .tv3 import TV3IE from .tv4 import TV4IE +from .tva import TVAIE +from .tvanouvelles import ( + TVANouvellesIE, + TVANouvellesArticleIE, +) from .tvc import ( TVCIE, TVCArticleIE, ) from .tvigle import TvigleIE from .tvland import TVLandIE +from .tvnoe import TVNoeIE from .tvp import ( TVPEmbedIE, TVPIE, TVPSeriesIE, ) -from .tvplay import TVPlayIE +from .tvplay import ( + TVPlayIE, + ViafreeIE, +) from .tweakers import TweakersIE from .twentyfourvideo import TwentyFourVideoIE from .twentymin import TwentyMinutenIE @@ -911,7 +1029,10 @@ from .twitch import ( TwitchChapterIE, TwitchVodIE, TwitchProfileIE, + TwitchAllVideosIE, + TwitchUploadsIE, TwitchPastBroadcastsIE, + TwitchHighlightsIE, TwitchStreamIE, TwitchClipsIE, ) @@ -925,10 +1046,17 @@ from .udemy import ( UdemyCourseIE ) from .udn import UDNEmbedIE +from .uktvplay import UKTVPlayIE from .digiteka import DigitekaIE from .unistra import UnistraIE +from .uol import UOLIE +from .uplynk import ( + UplynkIE, + UplynkPreplayIE, +) from .urort import UrortIE from .urplay import URPlayIE +from .usanetwork import USANetworkIE from .usatoday import USATodayIE from .ustream import UstreamIE, UstreamChannelIE from .ustudio import ( @@ -955,8 +1083,10 @@ from .vice import ( ViceIE, ViceShowIE, ) +from .viceland import VicelandIE from .vidbit import VidbitIE from .viddler import ViddlerIE +from .videa import VideaIE from .videodetective import VideoDetectiveIE from .videofyme import VideofyMeIE from .videomega import VideoMegaIE @@ -966,7 +1096,6 @@ from .videomore import ( VideomoreSeasonIE, ) from .videopremium import VideoPremiumIE -from .videott import VideoTtIE from .vidio import VidioIE from .vidme import ( VidmeIE, @@ -1001,12 +1130,20 @@ from .viki import ( VikiIE, VikiChannelIE, ) +from .viu import ( + ViuIE, + ViuPlaylistIE, + ViuOTTIE, +) from .vk import ( VKIE, VKUserVideosIE, VKWallPostIE, ) -from .vlive import VLiveIE +from .vlive import ( + VLiveIE, + VLiveChannelIE +) from .vodlocker import VodlockerIE from .vodplatform import VODPlatformIE from .voicerepublic import VoiceRepublicIE @@ -1015,6 +1152,9 @@ from .vporn import VpornIE from .vrt import VRTIE from .vube import VubeIE from .vuclip import VuClipIE +from .vvvvid import VVVVIDIE +from .vyborymos import VyboryMosIE +from .vzaar import VzaarIE from .walla import WallaIE from .washingtonpost import ( WashingtonPostIE, @@ -1026,6 +1166,10 @@ from .wdr import ( WDRIE, WDRMobileIE, ) +from .webcaster import ( + WebcasterIE, + WebcasterFeedIE, +) from .webofstories import ( WebOfStoriesIE, WebOfStoriesPlaylistIE, @@ -1101,8 +1245,4 @@ from .youtube import ( ) from .zapiks import ZapiksIE from .zdf import ZDFIE, ZDFChannelIE -from .zingmp3 import ( - ZingMp3SongIE, - ZingMp3AlbumIE, -) -from .zippcast import ZippCastIE +from .zingmp3 import ZingMp3IE diff --git a/youtube_dl/extractor/extremetube.py b/youtube_dl/extractor/extremetube.py index 3403581fd..445f9438d 100644 --- a/youtube_dl/extractor/extremetube.py +++ b/youtube_dl/extractor/extremetube.py @@ -1,20 +1,14 @@ from __future__ import unicode_literals -import re - -from .common import InfoExtractor -from ..utils import ( - int_or_none, - sanitized_Request, - str_to_int, -) +from ..utils import str_to_int +from .keezmovies import KeezMoviesIE -class ExtremeTubeIE(InfoExtractor): +class ExtremeTubeIE(KeezMoviesIE): _VALID_URL = r'https?://(?:www\.)?extremetube\.com/(?:[^/]+/)?video/(?P<id>[^/#?&]+)' _TESTS = [{ 'url': 'http://www.extremetube.com/video/music-video-14-british-euro-brit-european-cumshots-swallow-652431', - 'md5': '344d0c6d50e2f16b06e49ca011d8ac69', + 'md5': '1fb9228f5e3332ec8c057d6ac36f33e0', 'info_dict': { 'id': 'music-video-14-british-euro-brit-european-cumshots-swallow-652431', 'ext': 'mp4', @@ -35,58 +29,22 @@ class ExtremeTubeIE(InfoExtractor): }] def _real_extract(self, url): - video_id = self._match_id(url) + webpage, info = self._extract_info(url) - req = sanitized_Request(url) - req.add_header('Cookie', 'age_verified=1') - webpage = self._download_webpage(req, video_id) + if not info['title']: + info['title'] = self._search_regex( + r'<h1[^>]+title="([^"]+)"[^>]*>', webpage, 'title') - video_title = self._html_search_regex( - r'<h1 [^>]*?title="([^"]+)"[^>]*>', webpage, 'title') uploader = self._html_search_regex( r'Uploaded by:\s*</strong>\s*(.+?)\s*</div>', webpage, 'uploader', fatal=False) - view_count = str_to_int(self._html_search_regex( + view_count = str_to_int(self._search_regex( r'Views:\s*</strong>\s*<span>([\d,\.]+)</span>', webpage, 'view count', fatal=False)) - flash_vars = self._parse_json( - self._search_regex( - r'var\s+flashvars\s*=\s*({.+?});', webpage, 'flash vars'), - video_id) - - formats = [] - for quality_key, video_url in flash_vars.items(): - height = int_or_none(self._search_regex( - r'quality_(\d+)[pP]$', quality_key, 'height', default=None)) - if not height: - continue - f = { - 'url': video_url, - } - mobj = re.search( - r'/(?P<height>\d{3,4})[pP]_(?P<bitrate>\d+)[kK]_\d+', video_url) - if mobj: - height = int(mobj.group('height')) - bitrate = int(mobj.group('bitrate')) - f.update({ - 'format_id': '%dp-%dk' % (height, bitrate), - 'height': height, - 'tbr': bitrate, - }) - else: - f.update({ - 'format_id': '%dp' % height, - 'height': height, - }) - formats.append(f) - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': video_title, - 'formats': formats, + info.update({ 'uploader': uploader, 'view_count': view_count, - 'age_limit': 18, - } + }) + + return info diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 0fb781a73..b325c8200 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -1,6 +1,5 @@ from __future__ import unicode_literals -import json import re import socket @@ -13,13 +12,16 @@ from ..compat import ( compat_urllib_parse_unquote_plus, ) from ..utils import ( + clean_html, error_to_compat_str, ExtractorError, + get_element_by_id, + int_or_none, + js_to_json, limit_length, sanitized_Request, + try_get, urlencode_postdata, - get_element_by_id, - clean_html, ) @@ -27,7 +29,7 @@ class FacebookIE(InfoExtractor): _VALID_URL = r'''(?x) (?: https?:// - (?:[\w-]+\.)?facebook\.com/ + (?:[\w-]+\.)?(?:facebook\.com|facebookcorewwwi\.onion)/ (?:[^#]*?\#!/)? (?: (?: @@ -62,6 +64,8 @@ class FacebookIE(InfoExtractor): 'ext': 'mp4', 'title': 're:Did you know Kei Nishikori is the first Asian man to ever reach a Grand Slam', 'uploader': 'Tennis on Facebook', + 'upload_date': '20140908', + 'timestamp': 1410199200, } }, { 'note': 'Video without discernible title', @@ -69,8 +73,10 @@ class FacebookIE(InfoExtractor): 'info_dict': { 'id': '274175099429670', 'ext': 'mp4', - 'title': 'Facebook video #274175099429670', + 'title': 'Asif Nawab Butt posted a video to his Timeline.', 'uploader': 'Asif Nawab Butt', + 'upload_date': '20140506', + 'timestamp': 1399398998, }, 'expected_warnings': [ 'title' @@ -78,12 +84,14 @@ class FacebookIE(InfoExtractor): }, { 'note': 'Video with DASH manifest', 'url': 'https://www.facebook.com/video.php?v=957955867617029', - 'md5': '54706e4db4f5ad58fbad82dde1f1213f', + 'md5': 'b2c28d528273b323abe5c6ab59f0f030', 'info_dict': { 'id': '957955867617029', 'ext': 'mp4', 'title': 'When you post epic content on instagram.com/433 8 million followers, this is ...', 'uploader': 'Demy de Zeeuw', + 'upload_date': '20160110', + 'timestamp': 1452431627, }, }, { 'url': 'https://www.facebook.com/maxlayn/posts/10153807558977570', @@ -93,7 +101,8 @@ class FacebookIE(InfoExtractor): 'ext': 'mp4', 'title': '"What are you doing running in the snow?"', 'uploader': 'FailArmy', - } + }, + 'skip': 'Video gone', }, { 'url': 'https://m.facebook.com/story.php?story_fbid=1035862816472149&id=116132035111903', 'md5': '1deb90b6ac27f7efcf6d747c8a27f5e3', @@ -103,6 +112,7 @@ class FacebookIE(InfoExtractor): 'title': 'What the Flock Is Going On In New Zealand Credit: ViralHog', 'uploader': 'S. Saint', }, + 'skip': 'Video gone', }, { 'note': 'swf params escaped', 'url': 'https://www.facebook.com/barackobama/posts/10153664894881749', @@ -112,6 +122,18 @@ class FacebookIE(InfoExtractor): 'ext': 'mp4', 'title': 'Facebook video #10153664894881749', }, + }, { + # have 1080P, but only up to 720p in swf params + 'url': 'https://www.facebook.com/cnn/videos/10155529876156509/', + 'md5': '0d9813160b146b3bc8744e006027fcc6', + 'info_dict': { + 'id': '10155529876156509', + 'ext': 'mp4', + 'title': 'Holocaust survivor becomes US citizen', + 'timestamp': 1477818095, + 'upload_date': '20161030', + 'uploader': 'CNN', + }, }, { 'url': 'https://www.facebook.com/video.php?v=10204634152394104', 'only_matching': True, @@ -130,6 +152,9 @@ class FacebookIE(InfoExtractor): }, { 'url': 'https://zh-hk.facebook.com/peoplespower/videos/1135894589806027/', 'only_matching': True, + }, { + 'url': 'https://www.facebookcorewwwi.onion/video.php?v=274175099429670', + 'only_matching': True, }] @staticmethod @@ -220,42 +245,30 @@ class FacebookIE(InfoExtractor): video_data = None - BEFORE = '{swf.addParam(param[0], param[1]);});' - AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});' - PATTERN = re.escape(BEFORE) + '(?:\n|\\\\n)(.*?)' + re.escape(AFTER) + def extract_video_data(instances): + for item in instances: + if item[1][0] == 'VideoConfig': + video_item = item[2][0] + if video_item.get('video_id') == video_id: + return video_item['videoData'] - for m in re.findall(PATTERN, webpage): - swf_params = m.replace('\\\\', '\\').replace('\\"', '"') - data = dict(json.loads(swf_params)) - params_raw = compat_urllib_parse_unquote(data['params']) - video_data_candidate = json.loads(params_raw)['video_data'] - for _, f in video_data_candidate.items(): - if not f: - continue - if isinstance(f, dict): - f = [f] - if not isinstance(f, list): - continue - if f[0].get('video_id') == video_id: - video_data = video_data_candidate - break - if video_data: - break + server_js_data = self._parse_json(self._search_regex( + r'handleServerJS\(({.+})(?:\);|,")', webpage, + 'server js data', default='{}'), video_id, fatal=False) - def video_data_list2dict(video_data): - ret = {} - for item in video_data: - format_id = item['stream_type'] - ret.setdefault(format_id, []).append(item) - return ret + if server_js_data: + video_data = extract_video_data(server_js_data.get('instances', [])) if not video_data: - server_js_data = self._parse_json(self._search_regex( - r'handleServerJS\(({.+})\);', webpage, 'server js data', default='{}'), video_id) - for item in server_js_data.get('instances', []): - if item[1][0] == 'VideoConfig': - video_data = video_data_list2dict(item[2][0]['videoData']) - break + server_js_data = self._parse_json( + self._search_regex( + r'bigPipe\.onPageletArrive\(({.+?})\)\s*;\s*}\s*\)\s*,\s*["\']onPageletArrive\s+stream_pagelet', + webpage, 'js data', default='{}'), + video_id, transform_source=js_to_json, fatal=False) + if server_js_data: + video_data = extract_video_data(try_get( + server_js_data, lambda x: x['jsmods']['instances'], + list) or []) if not video_data: if not fatal_if_no_video: @@ -265,11 +278,14 @@ class FacebookIE(InfoExtractor): raise ExtractorError( 'The video is not available, Facebook said: "%s"' % m_msg.group(1), expected=True) + elif '>You must log in to continue' in webpage: + self.raise_login_required() else: raise ExtractorError('Cannot parse data') formats = [] - for format_id, f in video_data.items(): + for f in video_data: + format_id = f['stream_type'] if f and isinstance(f, dict): f = [f] if not f or not isinstance(f, list): @@ -302,16 +318,26 @@ class FacebookIE(InfoExtractor): video_title = self._html_search_regex( r'(?s)<span class="fbPhotosPhotoCaption".*?id="fbPhotoPageCaption"><span class="hasCaption">(.*?)</span>', webpage, 'alternative title', default=None) - video_title = limit_length(video_title, 80) if not video_title: + video_title = self._html_search_meta( + 'description', webpage, 'title') + if video_title: + video_title = limit_length(video_title, 80) + else: video_title = 'Facebook video #%s' % video_id - uploader = clean_html(get_element_by_id('fbPhotoPageAuthorName', webpage)) + uploader = clean_html(get_element_by_id( + 'fbPhotoPageAuthorName', webpage)) or self._search_regex( + r'ownerName\s*:\s*"([^"]+)"', webpage, 'uploader', fatal=False) + timestamp = int_or_none(self._search_regex( + r'<abbr[^>]+data-utime=["\'](\d+)', webpage, + 'timestamp', default=None)) info_dict = { 'id': video_id, 'title': video_title, 'formats': formats, 'uploader': uploader, + 'timestamp': timestamp, } return webpage, info_dict @@ -340,3 +366,32 @@ class FacebookIE(InfoExtractor): self._VIDEO_PAGE_TEMPLATE % video_id, video_id, fatal_if_no_video=True) return info_dict + + +class FacebookPluginsVideoIE(InfoExtractor): + _VALID_URL = r'https?://(?:[\w-]+\.)?facebook\.com/plugins/video\.php\?.*?\bhref=(?P<id>https.+)' + + _TESTS = [{ + 'url': 'https://www.facebook.com/plugins/video.php?href=https%3A%2F%2Fwww.facebook.com%2Fgov.sg%2Fvideos%2F10154383743583686%2F&show_text=0&width=560', + 'md5': '5954e92cdfe51fe5782ae9bda7058a07', + 'info_dict': { + 'id': '10154383743583686', + 'ext': 'mp4', + 'title': 'What to do during the haze?', + 'uploader': 'Gov.sg', + 'upload_date': '20160826', + 'timestamp': 1472184808, + }, + 'add_ie': [FacebookIE.ie_key()], + }, { + 'url': 'https://www.facebook.com/plugins/video.php?href=https%3A%2F%2Fwww.facebook.com%2Fvideo.php%3Fv%3D10204634152394104', + 'only_matching': True, + }, { + 'url': 'https://www.facebook.com/plugins/video.php?href=https://www.facebook.com/gov.sg/videos/10154383743583686/&show_text=0&width=560', + 'only_matching': True, + }] + + def _real_extract(self, url): + return self.url_result( + compat_urllib_parse_unquote(self._match_id(url)), + FacebookIE.ie_key()) diff --git a/youtube_dl/extractor/faz.py b/youtube_dl/extractor/faz.py index fd535457d..4bc8fc512 100644 --- a/youtube_dl/extractor/faz.py +++ b/youtube_dl/extractor/faz.py @@ -1,4 +1,4 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor diff --git a/youtube_dl/extractor/fc2.py b/youtube_dl/extractor/fc2.py index c7d69ff1f..448647d72 100644 --- a/youtube_dl/extractor/fc2.py +++ b/youtube_dl/extractor/fc2.py @@ -1,10 +1,12 @@ -#! -*- coding: utf-8 -*- +# coding: utf-8 from __future__ import unicode_literals import hashlib +import re from .common import InfoExtractor from ..compat import ( + compat_parse_qs, compat_urllib_request, compat_urlparse, ) @@ -16,7 +18,7 @@ from ..utils import ( class FC2IE(InfoExtractor): - _VALID_URL = r'^https?://video\.fc2\.com/(?:[^/]+/)*content/(?P<id>[^/]+)' + _VALID_URL = r'^(?:https?://video\.fc2\.com/(?:[^/]+/)*content/|fc2:)(?P<id>[^/]+)' IE_NAME = 'fc2' _NETRC_MACHINE = 'fc2' _TESTS = [{ @@ -75,12 +77,17 @@ class FC2IE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) self._login() - webpage = self._download_webpage(url, video_id) - self._downloader.cookiejar.clear_session_cookies() # must clear - self._login() + webpage = None + if not url.startswith('fc2:'): + webpage = self._download_webpage(url, video_id) + self._downloader.cookiejar.clear_session_cookies() # must clear + self._login() - title = self._og_search_title(webpage) - thumbnail = self._og_search_thumbnail(webpage) + title = 'FC2 video %s' % video_id + thumbnail = None + if webpage is not None: + title = self._og_search_title(webpage) + thumbnail = self._og_search_thumbnail(webpage) refer = url.replace('/content/', '/a/content/') if '/a/content/' not in url else url mimi = hashlib.md5((video_id + '_gGddgPfeaf_gzyr').encode('utf-8')).hexdigest() @@ -113,3 +120,41 @@ class FC2IE(InfoExtractor): 'ext': 'flv', 'thumbnail': thumbnail, } + + +class FC2EmbedIE(InfoExtractor): + _VALID_URL = r'https?://video\.fc2\.com/flv2\.swf\?(?P<query>.+)' + IE_NAME = 'fc2:embed' + + _TEST = { + 'url': 'http://video.fc2.com/flv2.swf?t=201404182936758512407645&i=20130316kwishtfitaknmcgd76kjd864hso93htfjcnaogz629mcgfs6rbfk0hsycma7shkf85937cbchfygd74&i=201403223kCqB3Ez&d=2625&sj=11&lang=ja&rel=1&from=11&cmt=1&tk=TlRBM09EQTNNekU9&tl=プリズン・ブレイク%20S1-01%20マイケル%20【吹替】', + 'md5': 'b8aae5334cb691bdb1193a88a6ab5d5a', + 'info_dict': { + 'id': '201403223kCqB3Ez', + 'ext': 'flv', + 'title': 'プリズン・ブレイク S1-01 マイケル 【吹替】', + 'thumbnail': r're:^https?://.*\.jpg$', + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + query = compat_parse_qs(mobj.group('query')) + + video_id = query['i'][-1] + title = query.get('tl', ['FC2 video %s' % video_id])[0] + + sj = query.get('sj', [None])[0] + thumbnail = None + if sj: + # See thumbnailImagePath() in ServerConst.as of flv2.swf + thumbnail = 'http://video%s-thumbnail.fc2.com/up/pic/%s.jpg' % ( + sj, '/'.join((video_id[:6], video_id[6:8], video_id[-2], video_id[-1], video_id))) + + return { + '_type': 'url_transparent', + 'ie_key': FC2IE.ie_key(), + 'url': 'fc2:%s' % video_id, + 'title': title, + 'thumbnail': thumbnail, + } diff --git a/youtube_dl/extractor/filmon.py b/youtube_dl/extractor/filmon.py new file mode 100644 index 000000000..f775fe0ba --- /dev/null +++ b/youtube_dl/extractor/filmon.py @@ -0,0 +1,178 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import ( + compat_str, + compat_HTTPError, +) +from ..utils import ( + qualities, + strip_or_none, + int_or_none, + ExtractorError, +) + + +class FilmOnIE(InfoExtractor): + IE_NAME = 'filmon' + _VALID_URL = r'(?:https?://(?:www\.)?filmon\.com/vod/view/|filmon:)(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://www.filmon.com/vod/view/24869-0-plan-9-from-outer-space', + 'info_dict': { + 'id': '24869', + 'ext': 'mp4', + 'title': 'Plan 9 From Outer Space', + 'description': 'Dead human, zombies and vampires', + }, + }, { + 'url': 'https://www.filmon.com/vod/view/2825-1-popeye-series-1', + 'info_dict': { + 'id': '2825', + 'title': 'Popeye Series 1', + 'description': 'The original series of Popeye.', + }, + 'playlist_mincount': 8, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + try: + response = self._download_json( + 'https://www.filmon.com/api/vod/movie?id=%s' % video_id, + video_id)['response'] + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError): + errmsg = self._parse_json(e.cause.read().decode(), video_id)['reason'] + raise ExtractorError('%s said: %s' % (self.IE_NAME, errmsg), expected=True) + raise + + title = response['title'] + description = strip_or_none(response.get('description')) + + if response.get('type_id') == 1: + entries = [self.url_result('filmon:' + episode_id) for episode_id in response.get('episodes', [])] + return self.playlist_result(entries, video_id, title, description) + + QUALITY = qualities(('low', 'high')) + formats = [] + for format_id, stream in response.get('streams', {}).items(): + stream_url = stream.get('url') + if not stream_url: + continue + formats.append({ + 'format_id': format_id, + 'url': stream_url, + 'ext': 'mp4', + 'quality': QUALITY(stream.get('quality')), + 'protocol': 'm3u8_native', + }) + self._sort_formats(formats) + + thumbnails = [] + poster = response.get('poster', {}) + thumbs = poster.get('thumbs', {}) + thumbs['poster'] = poster + for thumb_id, thumb in thumbs.items(): + thumb_url = thumb.get('url') + if not thumb_url: + continue + thumbnails.append({ + 'id': thumb_id, + 'url': thumb_url, + 'width': int_or_none(thumb.get('width')), + 'height': int_or_none(thumb.get('height')), + }) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'description': description, + 'thumbnails': thumbnails, + } + + +class FilmOnChannelIE(InfoExtractor): + IE_NAME = 'filmon:channel' + _VALID_URL = r'https?://(?:www\.)?filmon\.com/(?:tv|channel)/(?P<id>[a-z0-9-]+)' + _TESTS = [{ + # VOD + 'url': 'http://www.filmon.com/tv/sports-haters', + 'info_dict': { + 'id': '4190', + 'ext': 'mp4', + 'title': 'Sports Haters', + 'description': 'md5:dabcb4c1d9cfc77085612f1a85f8275d', + }, + }, { + # LIVE + 'url': 'https://www.filmon.com/channel/filmon-sports', + 'only_matching': True, + }, { + 'url': 'https://www.filmon.com/tv/2894', + 'only_matching': True, + }] + + _THUMBNAIL_RES = [ + ('logo', 56, 28), + ('big_logo', 106, 106), + ('extra_big_logo', 300, 300), + ] + + def _real_extract(self, url): + channel_id = self._match_id(url) + + try: + channel_data = self._download_json( + 'http://www.filmon.com/api-v2/channel/' + channel_id, channel_id)['data'] + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError): + errmsg = self._parse_json(e.cause.read().decode(), channel_id)['message'] + raise ExtractorError('%s said: %s' % (self.IE_NAME, errmsg), expected=True) + raise + + channel_id = compat_str(channel_data['id']) + is_live = not channel_data.get('is_vod') and not channel_data.get('is_vox') + title = channel_data['title'] + + QUALITY = qualities(('low', 'high')) + formats = [] + for stream in channel_data.get('streams', []): + stream_url = stream.get('url') + if not stream_url: + continue + if not is_live: + formats.extend(self._extract_wowza_formats( + stream_url, channel_id, skip_protocols=['dash', 'rtmp', 'rtsp'])) + continue + quality = stream.get('quality') + formats.append({ + 'format_id': quality, + # this is an m3u8 stream, but we are deliberately not using _extract_m3u8_formats + # because it doesn't have bitrate variants anyway + 'url': stream_url, + 'ext': 'mp4', + 'quality': QUALITY(quality), + }) + self._sort_formats(formats) + + thumbnails = [] + for name, width, height in self._THUMBNAIL_RES: + thumbnails.append({ + 'id': name, + 'url': 'http://static.filmon.com/assets/channels/%s/%s.png' % (channel_id, name), + 'width': width, + 'height': height, + }) + + return { + 'id': channel_id, + 'display_id': channel_data.get('alias'), + 'title': self._live_title(title) if is_live else title, + 'description': channel_data.get('description'), + 'thumbnails': thumbnails, + 'formats': formats, + 'is_live': is_live, + } diff --git a/youtube_dl/extractor/firsttv.py b/youtube_dl/extractor/firsttv.py index 88bca1007..081c71842 100644 --- a/youtube_dl/extractor/firsttv.py +++ b/youtube_dl/extractor/firsttv.py @@ -1,46 +1,43 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import compat_xpath +from ..compat import ( + compat_str, + compat_urlparse, +) from ..utils import ( int_or_none, qualities, unified_strdate, - xpath_attr, - xpath_element, - xpath_text, - xpath_with_ns, ) class FirstTVIE(InfoExtractor): IE_NAME = '1tv' IE_DESC = 'Первый канал' - _VALID_URL = r'https?://(?:www\.)?1tv\.ru/(?:[^/]+/)+p?(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?1tv\.ru/(?:[^/]+/)+(?P<id>[^/?#]+)' _TESTS = [{ - # single format via video_materials.json API - 'url': 'http://www.1tv.ru/prj/inprivate/vypusk/35930', - 'md5': '82a2777648acae812d58b3f5bd42882b', + # single format + 'url': 'http://www.1tv.ru/shows/naedine-so-vsemi/vypuski/gost-lyudmila-senchina-naedine-so-vsemi-vypusk-ot-12-02-2015', + 'md5': 'a1b6b60d530ebcf8daacf4565762bbaf', 'info_dict': { - 'id': '35930', + 'id': '40049', 'ext': 'mp4', - 'title': 'Гость Людмила Сенчина. Наедине со всеми. Выпуск от 12.02.2015', - 'description': 'md5:357933adeede13b202c7c21f91b871b2', - 'thumbnail': 're:^https?://.*\.(?:jpg|JPG)$', + 'title': 'Гость Людмила Сенчина. Наедине со всеми. Выпуск от 12.02.2015', + 'thumbnail': r're:^https?://.*\.(?:jpg|JPG)$', 'upload_date': '20150212', 'duration': 2694, }, }, { - # multiple formats via video_materials.json API - 'url': 'http://www.1tv.ru/video_archive/projects/dobroeutro/p113641', + # multiple formats + 'url': 'http://www.1tv.ru/shows/dobroe-utro/pro-zdorove/vesennyaya-allergiya-dobroe-utro-fragment-vypuska-ot-07042016', 'info_dict': { - 'id': '113641', + 'id': '364746', 'ext': 'mp4', - 'title': 'Весенняя аллергия. Доброе утро. Фрагмент выпуска от 07.04.2016', - 'description': 'md5:8dcebb3dded0ff20fade39087fd1fee2', - 'thumbnail': 're:^https?://.*\.(?:jpg|JPG)$', + 'title': 'Весенняя аллергия. Доброе утро. Фрагмент выпуска от 07.04.2016', + 'thumbnail': r're:^https?://.*\.(?:jpg|JPG)$', 'upload_date': '20160407', 'duration': 179, 'formats': 'mincount:3', @@ -49,86 +46,108 @@ class FirstTVIE(InfoExtractor): 'skip_download': True, }, }, { - # single format only available via ONE_ONLINE_VIDEOS.archive_single_xml API - 'url': 'http://www.1tv.ru/video_archive/series/f7552/p47038', - 'md5': '519d306c5b5669761fd8906c39dbee23', + 'url': 'http://www.1tv.ru/news/issue/2016-12-01/14:00', 'info_dict': { - 'id': '47038', - 'ext': 'mp4', - 'title': '"Побег". Второй сезон. 3 серия', - 'description': 'md5:3abf8f6b9bce88201c33e9a3d794a00b', - 'thumbnail': 're:^https?://.*\.(?:jpg|JPG)$', - 'upload_date': '20120516', - 'duration': 3080, + 'id': '14:00', + 'title': 'Выпуск новостей в 14:00 1 декабря 2016 года. Новости. Первый канал', + 'description': 'md5:2e921b948f8c1ff93901da78ebdb1dfd', }, + 'playlist_count': 13, }, { - 'url': 'http://www.1tv.ru/videoarchive/9967', + 'url': 'http://www.1tv.ru/shows/tochvtoch-supersezon/vystupleniya/evgeniy-dyatlov-vladimir-vysockiy-koni-priveredlivye-toch-v-toch-supersezon-fragment-vypuska-ot-06-11-2016', 'only_matching': True, }] def _real_extract(self, url): - video_id = self._match_id(url) + display_id = self._match_id(url) - # Videos with multiple formats only available via this API - video = self._download_json( - 'http://www.1tv.ru/video_materials.json?legacy_id=%s' % video_id, - video_id, fatal=False) + webpage = self._download_webpage(url, display_id) + playlist_url = compat_urlparse.urljoin(url, self._search_regex( + r'data-playlist-url=(["\'])(?P<url>(?:(?!\1).)+)\1', + webpage, 'playlist url', group='url')) - description, thumbnail, upload_date, duration = [None] * 4 + parsed_url = compat_urlparse.urlparse(playlist_url) + qs = compat_urlparse.parse_qs(parsed_url.query) + item_ids = qs.get('videos_ids[]') or qs.get('news_ids[]') - if video: - item = video[0] - title = item['title'] - quality = qualities(('ld', 'sd', 'hd', )) - formats = [{ - 'url': f['src'], - 'format_id': f.get('name'), - 'quality': quality(f.get('name')), - } for f in item['mbr'] if f.get('src')] - thumbnail = item.get('poster') + items = self._download_json(playlist_url, display_id) + + if item_ids: + items = [ + item for item in items + if item.get('uid') and compat_str(item['uid']) in item_ids] else: - # Some videos are not available via video_materials.json - video = self._download_xml( - 'http://www.1tv.ru/owa/win/ONE_ONLINE_VIDEOS.archive_single_xml?pid=%s' % video_id, - video_id) + items = [items[0]] - NS_MAP = { - 'media': 'http://search.yahoo.com/mrss/', - } + entries = [] + QUALITIES = ('ld', 'sd', 'hd', ) - item = xpath_element(video, './channel/item', fatal=True) - title = xpath_text(item, './title', fatal=True) - formats = [{ - 'url': content.attrib['url'], - } for content in item.findall( - compat_xpath(xpath_with_ns('./media:content', NS_MAP))) if content.attrib.get('url')] - thumbnail = xpath_attr( - item, xpath_with_ns('./media:thumbnail', NS_MAP), 'url') + for item in items: + title = item['title'] + quality = qualities(QUALITIES) + formats = [] + path = None + for f in item.get('mbr', []): + src = f.get('src') + if not src or not isinstance(src, compat_str): + continue + tbr = int_or_none(self._search_regex( + r'_(\d{3,})\.mp4', src, 'tbr', default=None)) + if not path: + path = self._search_regex( + r'//[^/]+/(.+?)_\d+\.mp4', src, + 'm3u8 path', default=None) + formats.append({ + 'url': src, + 'format_id': f.get('name'), + 'tbr': tbr, + 'source_preference': quality(f.get('name')), + }) + # m3u8 URL format is reverse engineered from [1] (search for + # master.m3u8). dashEdges (that is currently balancer-vod.1tv.ru) + # is taken from [2]. + # 1. http://static.1tv.ru/player/eump1tv-current/eump-1tv.all.min.js?rnd=9097422834:formatted + # 2. http://static.1tv.ru/player/eump1tv-config/config-main.js?rnd=9097422834 + if not path and len(formats) == 1: + path = self._search_regex( + r'//[^/]+/(.+?$)', formats[0]['url'], + 'm3u8 path', default=None) + if path: + if len(formats) == 1: + m3u8_path = ',' + else: + tbrs = [compat_str(t) for t in sorted(f['tbr'] for f in formats)] + m3u8_path = '_,%s,%s' % (','.join(tbrs), '.mp4') + formats.extend(self._extract_m3u8_formats( + 'http://balancer-vod.1tv.ru/%s%s.urlset/master.m3u8' + % (path, m3u8_path), + display_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) + self._sort_formats(formats) - self._sort_formats(formats) - - webpage = self._download_webpage(url, video_id, 'Downloading page', fatal=False) - if webpage: - title = self._html_search_regex( - (r'<div class="tv_translation">\s*<h1><a href="[^"]+">([^<]*)</a>', - r"'title'\s*:\s*'([^']+)'"), - webpage, 'title', default=None) or title - description = self._html_search_regex( - r'<div class="descr">\s*<div> </div>\s*<p>([^<]*)</p></div>', - webpage, 'description', default=None) or self._html_search_meta( - 'description', webpage, 'description') - thumbnail = thumbnail or self._og_search_thumbnail(webpage) - duration = int_or_none(self._html_search_meta( + thumbnail = item.get('poster') or self._og_search_thumbnail(webpage) + duration = int_or_none(item.get('duration') or self._html_search_meta( 'video:duration', webpage, 'video duration', fatal=False)) upload_date = unified_strdate(self._html_search_meta( - 'ya:ovs:upload_date', webpage, 'upload date', fatal=False)) + 'ya:ovs:upload_date', webpage, 'upload date', default=None)) - return { - 'id': video_id, - 'thumbnail': thumbnail, - 'title': title, - 'description': description, - 'upload_date': upload_date, - 'duration': int_or_none(duration), - 'formats': formats - } + entries.append({ + 'id': compat_str(item.get('id') or item['uid']), + 'thumbnail': thumbnail, + 'title': title, + 'upload_date': upload_date, + 'duration': int_or_none(duration), + 'formats': formats + }) + + title = self._html_search_regex( + (r'<div class="tv_translation">\s*<h1><a href="[^"]+">([^<]*)</a>', + r"'title'\s*:\s*'([^']+)'"), + webpage, 'title', default=None) or self._og_search_title( + webpage, default=None) + description = self._html_search_regex( + r'<div class="descr">\s*<div> </div>\s*<p>([^<]*)</p></div>', + webpage, 'description', default=None) or self._html_search_meta( + 'description', webpage, 'description', default=None) + + return self.playlist_result(entries, display_id, title, description) diff --git a/youtube_dl/extractor/fivetv.py b/youtube_dl/extractor/fivetv.py index 13fbc4da2..15736c9fe 100644 --- a/youtube_dl/extractor/fivetv.py +++ b/youtube_dl/extractor/fivetv.py @@ -25,7 +25,7 @@ class FiveTVIE(InfoExtractor): 'ext': 'mp4', 'title': 'Россияне выбрали имя для общенациональной платежной системы', 'description': 'md5:a8aa13e2b7ad36789e9f77a74b6de660', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 180, }, }, { @@ -35,7 +35,7 @@ class FiveTVIE(InfoExtractor): 'ext': 'mp4', 'title': '3D принтер', 'description': 'md5:d76c736d29ef7ec5c0cf7d7c65ffcb41', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 180, }, }, { @@ -44,7 +44,7 @@ class FiveTVIE(InfoExtractor): 'id': 'glavnoe', 'ext': 'mp4', 'title': 'Итоги недели с 8 по 14 июня 2015 года', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', }, }, { 'url': 'http://www.5-tv.ru/glavnoe/broadcasts/508645/', diff --git a/youtube_dl/extractor/fktv.py b/youtube_dl/extractor/fktv.py index a3a291599..2958452f4 100644 --- a/youtube_dl/extractor/fktv.py +++ b/youtube_dl/extractor/fktv.py @@ -19,7 +19,7 @@ class FKTVIE(InfoExtractor): 'id': '1', 'ext': 'mp4', 'title': 'Folge 1 vom 10. April 2007', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', }, } diff --git a/youtube_dl/extractor/flipagram.py b/youtube_dl/extractor/flipagram.py index acb6133ff..b7be40f1b 100644 --- a/youtube_dl/extractor/flipagram.py +++ b/youtube_dl/extractor/flipagram.py @@ -48,7 +48,7 @@ class FlipagramIE(InfoExtractor): flipagram = video_data['flipagram'] video = flipagram['video'] - json_ld = self._search_json_ld(webpage, video_id, default=False) + json_ld = self._search_json_ld(webpage, video_id, default={}) title = json_ld.get('title') or flipagram['captionText'] description = json_ld.get('description') or flipagram.get('captionText') @@ -81,7 +81,7 @@ class FlipagramIE(InfoExtractor): 'filesize': int_or_none(cover.get('size')), } for cover in flipagram.get('covers', []) if cover.get('url')] - # Note that this only retrieves comments that are initally loaded. + # Note that this only retrieves comments that are initially loaded. # For videos with large amounts of comments, most won't be retrieved. comments = [] for comment in video_data.get('comments', {}).get(video_id, {}).get('items', []): diff --git a/youtube_dl/extractor/folketinget.py b/youtube_dl/extractor/folketinget.py index 75399fa7d..b3df93f28 100644 --- a/youtube_dl/extractor/folketinget.py +++ b/youtube_dl/extractor/folketinget.py @@ -1,4 +1,4 @@ -# -*- coding: utf-8 -*- +# coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor diff --git a/youtube_dl/extractor/footyroom.py b/youtube_dl/extractor/footyroom.py index d2503ae2e..118325b6d 100644 --- a/youtube_dl/extractor/footyroom.py +++ b/youtube_dl/extractor/footyroom.py @@ -2,25 +2,27 @@ from __future__ import unicode_literals from .common import InfoExtractor +from .streamable import StreamableIE class FootyRoomIE(InfoExtractor): - _VALID_URL = r'https?://footyroom\.com/(?P<id>[^/]+)' + _VALID_URL = r'https?://footyroom\.com/matches/(?P<id>\d+)' _TESTS = [{ - 'url': 'http://footyroom.com/schalke-04-0-2-real-madrid-2015-02/', + 'url': 'http://footyroom.com/matches/79922154/hull-city-vs-chelsea/review', 'info_dict': { - 'id': 'schalke-04-0-2-real-madrid-2015-02', - 'title': 'Schalke 04 0 – 2 Real Madrid', + 'id': '79922154', + 'title': 'VIDEO Hull City 0 - 2 Chelsea', }, - 'playlist_count': 3, - 'skip': 'Video for this match is not available', + 'playlist_count': 2, + 'add_ie': [StreamableIE.ie_key()], }, { - 'url': 'http://footyroom.com/georgia-0-2-germany-2015-03/', + 'url': 'http://footyroom.com/matches/75817984/georgia-vs-germany/review', 'info_dict': { - 'id': 'georgia-0-2-germany-2015-03', - 'title': 'Georgia 0 – 2 Germany', + 'id': '75817984', + 'title': 'VIDEO Georgia 0 - 2 Germany', }, 'playlist_count': 1, + 'add_ie': ['Playwire'] }] def _real_extract(self, url): @@ -28,9 +30,8 @@ class FootyRoomIE(InfoExtractor): webpage = self._download_webpage(url, playlist_id) - playlist = self._parse_json( - self._search_regex( - r'VideoSelector\.load\((\[.+?\])\);', webpage, 'video selector'), + playlist = self._parse_json(self._search_regex( + r'DataStore\.media\s*=\s*([^;]+)', webpage, 'media data'), playlist_id) playlist_title = self._og_search_title(webpage) @@ -40,11 +41,16 @@ class FootyRoomIE(InfoExtractor): payload = video.get('payload') if not payload: continue - playwire_url = self._search_regex( + playwire_url = self._html_search_regex( r'data-config="([^"]+)"', payload, 'playwire url', default=None) if playwire_url: entries.append(self.url_result(self._proto_relative_url( playwire_url, 'http:'), 'Playwire')) + streamable_url = StreamableIE._extract_url(payload) + if streamable_url: + entries.append(self.url_result( + streamable_url, StreamableIE.ie_key())) + return self.playlist_result(entries, playlist_id, playlist_title) diff --git a/youtube_dl/extractor/formula1.py b/youtube_dl/extractor/formula1.py index 322c41e5a..fecfc28ae 100644 --- a/youtube_dl/extractor/formula1.py +++ b/youtube_dl/extractor/formula1.py @@ -5,17 +5,24 @@ from .common import InfoExtractor class Formula1IE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?formula1\.com/content/fom-website/en/video/\d{4}/\d{1,2}/(?P<id>.+?)\.html' - _TEST = { + _VALID_URL = r'https?://(?:www\.)?formula1\.com/(?:content/fom-website/)?en/video/\d{4}/\d{1,2}/(?P<id>.+?)\.html' + _TESTS = [{ 'url': 'http://www.formula1.com/content/fom-website/en/video/2016/5/Race_highlights_-_Spain_2016.html', 'md5': '8c79e54be72078b26b89e0e111c0502b', 'info_dict': { 'id': 'JvYXJpMzE6pArfHWm5ARp5AiUmD-gibV', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Race highlights - Spain 2016', }, + 'params': { + # m3u8 download + 'skip_download': True, + }, 'add_ie': ['Ooyala'], - } + }, { + 'url': 'http://www.formula1.com/en/video/2016/5/Race_highlights_-_Spain_2016.html', + 'only_matching': True, + }] def _real_extract(self, url): display_id = self._match_id(url) diff --git a/youtube_dl/extractor/fourtube.py b/youtube_dl/extractor/fourtube.py index fc4a5a0fb..9776c8422 100644 --- a/youtube_dl/extractor/fourtube.py +++ b/youtube_dl/extractor/fourtube.py @@ -43,14 +43,14 @@ class FourTubeIE(InfoExtractor): 'uploadDate', webpage)) thumbnail = self._html_search_meta('thumbnailUrl', webpage) uploader_id = self._html_search_regex( - r'<a class="img-avatar" href="[^"]+/channels/([^/"]+)" title="Go to [^"]+ page">', + r'<a class="item-to-subscribe" href="[^"]+/channels/([^/"]+)" title="Go to [^"]+ page">', webpage, 'uploader id', fatal=False) uploader = self._html_search_regex( - r'<a class="img-avatar" href="[^"]+/channels/[^/"]+" title="Go to ([^"]+) page">', + r'<a class="item-to-subscribe" href="[^"]+/channels/[^/"]+" title="Go to ([^"]+) page">', webpage, 'uploader', fatal=False) categories_html = self._search_regex( - r'(?s)><i class="icon icon-tag"></i>\s*Categories / Tags\s*.*?<ul class="list">(.*?)</ul>', + r'(?s)><i class="icon icon-tag"></i>\s*Categories / Tags\s*.*?<ul class="[^"]*?list[^"]*?">(.*?)</ul>', webpage, 'categories', fatal=False) categories = None if categories_html: @@ -59,10 +59,10 @@ class FourTubeIE(InfoExtractor): r'(?s)<li><a.*?>(.*?)</a>', categories_html)] view_count = str_to_int(self._search_regex( - r'<meta itemprop="interactionCount" content="UserPlays:([0-9,]+)">', + r'<meta[^>]+itemprop="interactionCount"[^>]+content="UserPlays:([0-9,]+)">', webpage, 'view count', fatal=False)) like_count = str_to_int(self._search_regex( - r'<meta itemprop="interactionCount" content="UserLikes:([0-9,]+)">', + r'<meta[^>]+itemprop="interactionCount"[^>]+content="UserLikes:([0-9,]+)">', webpage, 'like count', fatal=False)) duration = parse_duration(self._html_search_meta('duration', webpage)) diff --git a/youtube_dl/extractor/fox.py b/youtube_dl/extractor/fox.py index 95c1abf94..9f2e5d065 100644 --- a/youtube_dl/extractor/fox.py +++ b/youtube_dl/extractor/fox.py @@ -1,11 +1,14 @@ # coding: utf-8 from __future__ import unicode_literals -from .common import InfoExtractor -from ..utils import smuggle_url +from .adobepass import AdobePassIE +from ..utils import ( + smuggle_url, + update_url_query, +) -class FOXIE(InfoExtractor): +class FOXIE(AdobePassIE): _VALID_URL = r'https?://(?:www\.)?fox\.com/watch/(?P<id>[0-9]+)' _TEST = { 'url': 'http://www.fox.com/watch/255180355939/7684182528', @@ -27,13 +30,26 @@ class FOXIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - release_url = self._parse_json(self._search_regex( - r'"fox_pdk_player"\s*:\s*({[^}]+?})', webpage, 'fox_pdk_player'), - video_id)['release_url'] + '&switch=http' + settings = self._parse_json(self._search_regex( + r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);', + webpage, 'drupal settings'), video_id) + fox_pdk_player = settings['fox_pdk_player'] + release_url = fox_pdk_player['release_url'] + query = { + 'mbr': 'true', + 'switch': 'http' + } + if fox_pdk_player.get('access') == 'locked': + ap_p = settings['foxAdobePassProvider'] + rating = ap_p.get('videoRating') + if rating == 'n/a': + rating = None + resource = self._get_mvpd_resource('fbc-fox', None, ap_p['videoGUID'], rating) + query['auth'] = self._extract_mvpd_auth(url, video_id, 'fbc-fox', resource) return { '_type': 'url_transparent', 'ie_key': 'ThePlatform', - 'url': smuggle_url(release_url, {'force_smil_url': True}), + 'url': smuggle_url(update_url_query(release_url, query), {'force_smil_url': True}), 'id': video_id, } diff --git a/youtube_dl/extractor/fox9.py b/youtube_dl/extractor/fox9.py new file mode 100644 index 000000000..56d9975d0 --- /dev/null +++ b/youtube_dl/extractor/fox9.py @@ -0,0 +1,43 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .anvato import AnvatoIE +from ..utils import js_to_json + + +class FOX9IE(AnvatoIE): + _VALID_URL = r'https?://(?:www\.)?fox9\.com/(?:[^/]+/)+(?P<id>\d+)-story' + _TESTS = [{ + 'url': 'http://www.fox9.com/news/215123287-story', + 'md5': 'd6e1b2572c3bab8a849c9103615dd243', + 'info_dict': { + 'id': '314473', + 'ext': 'mp4', + 'title': 'Bear climbs tree in downtown Duluth', + 'description': 'md5:6a36bfb5073a411758a752455408ac90', + 'duration': 51, + 'timestamp': 1478123580, + 'upload_date': '20161102', + 'uploader': 'EPFOX', + 'categories': ['News', 'Sports'], + 'tags': ['news', 'video'], + }, + }, { + 'url': 'http://www.fox9.com/news/investigators/214070684-story', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + video_id = self._parse_json( + self._search_regex( + r'AnvatoPlaylist\s*\(\s*(\[.+?\])\s*\)\s*;', + webpage, 'anvato playlist'), + video_id, transform_source=js_to_json)[0]['video'] + + return self._get_anvato_videos( + 'anvato_epfox_app_web_prod_b3373168e12f423f41504f207000188daf88251b', + video_id) diff --git a/youtube_dl/extractor/foxgay.py b/youtube_dl/extractor/foxgay.py index 70c1a815d..e887ae488 100644 --- a/youtube_dl/extractor/foxgay.py +++ b/youtube_dl/extractor/foxgay.py @@ -1,20 +1,26 @@ from __future__ import unicode_literals +import itertools + from .common import InfoExtractor +from ..utils import ( + get_element_by_id, + remove_end, +) class FoxgayIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?foxgay\.com/videos/(?:\S+-)?(?P<id>\d+)\.shtml' _TEST = { 'url': 'http://foxgay.com/videos/fuck-turkish-style-2582.shtml', - 'md5': '80d72beab5d04e1655a56ad37afe6841', + 'md5': '344558ccfea74d33b7adbce22e577f54', 'info_dict': { 'id': '2582', 'ext': 'mp4', - 'title': 'md5:6122f7ae0fc6b21ebdf59c5e083ce25a', - 'description': 'md5:5e51dc4405f1fd315f7927daed2ce5cf', + 'title': 'Fuck Turkish-style', + 'description': 'md5:6ae2d9486921891efe89231ace13ffdf', 'age_limit': 18, - 'thumbnail': 're:https?://.*\.jpg$', + 'thumbnail': r're:https?://.*\.jpg$', }, } @@ -22,27 +28,35 @@ class FoxgayIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - title = self._html_search_regex( - r'<title>(?P<title>.*?)', - webpage, 'title', fatal=False) - description = self._html_search_regex( - r'

(?P.*?)

', - webpage, 'description', fatal=False) + title = remove_end(self._html_search_regex( + r'([^<]+)', webpage, 'title'), ' - Foxgay.com') + description = get_element_by_id('inf_tit', webpage) + # The default user-agent with foxgay cookies leads to pages without videos + self._downloader.cookiejar.clear('.foxgay.com') # Find the URL for the iFrame which contains the actual video. + iframe_url = self._html_search_regex( + r']+src=([\'"])(?P[^\'"]+)\1', webpage, + 'video frame', group='url') iframe = self._download_webpage( - self._html_search_regex(r'iframe src="(?P.*?)"', webpage, 'video frame'), - video_id) - video_url = self._html_search_regex( - r"v_path = '(?Phttp://.*?)'", iframe, 'url') - thumb_url = self._html_search_regex( - r"t_path = '(?Phttp://.*?)'", iframe, 'thumbnail', fatal=False) + iframe_url, video_id, headers={'User-Agent': 'curl/7.50.1'}, + note='Downloading video frame') + video_data = self._parse_json(self._search_regex( + r'video_data\s*=\s*([^;]+);', iframe, 'video data'), video_id) + + formats = [{ + 'url': source, + 'height': resolution, + } for source, resolution in zip( + video_data['sources'], video_data.get('resolutions', itertools.repeat(None)))] + + self._sort_formats(formats) return { 'id': video_id, 'title': title, - 'url': video_url, + 'formats': formats, 'description': description, - 'thumbnail': thumb_url, + 'thumbnail': video_data.get('act_vid', {}).get('thumb'), 'age_limit': 18, } diff --git a/youtube_dl/extractor/foxnews.py b/youtube_dl/extractor/foxnews.py index b04da2415..dc0662f74 100644 --- a/youtube_dl/extractor/foxnews.py +++ b/youtube_dl/extractor/foxnews.py @@ -3,11 +3,13 @@ from __future__ import unicode_literals import re from .amp import AMPIE +from .common import InfoExtractor class FoxNewsIE(AMPIE): + IE_NAME = 'foxnews' IE_DESC = 'Fox News and Fox Business Video' - _VALID_URL = r'https?://(?Pvideo\.fox(?:news|business)\.com)/v/(?:video-embed\.html\?video_id=)?(?P\d+)' + _VALID_URL = r'https?://(?Pvideo\.(?:insider\.)?fox(?:news|business)\.com)/v/(?:video-embed\.html\?video_id=)?(?P\d+)' _TESTS = [ { 'url': 'http://video.foxnews.com/v/3937480/frozen-in-time/#sp=show-clips', @@ -20,7 +22,7 @@ class FoxNewsIE(AMPIE): 'duration': 265, 'timestamp': 1304411491, 'upload_date': '20110503', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', }, }, { @@ -34,7 +36,7 @@ class FoxNewsIE(AMPIE): 'duration': 292, 'timestamp': 1417662047, 'upload_date': '20141204', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', }, 'params': { # m3u8 download @@ -49,6 +51,11 @@ class FoxNewsIE(AMPIE): 'url': 'http://video.foxbusiness.com/v/4442309889001', 'only_matching': True, }, + { + # From http://insider.foxnews.com/2016/08/25/univ-wisconsin-student-group-pushing-silence-certain-words + 'url': 'http://video.insider.foxnews.com/v/video-embed.html?video_id=5099377331001&autoplay=true&share_url=http://insider.foxnews.com/2016/08/25/univ-wisconsin-student-group-pushing-silence-certain-words&share_title=Student%20Group:%20Saying%20%27Politically%20Correct,%27%20%27Trash%27%20and%20%27Lame%27%20Is%20Offensive&share=true', + 'only_matching': True, + }, ] def _real_extract(self, url): @@ -58,3 +65,76 @@ class FoxNewsIE(AMPIE): 'http://%s/v/feed/video/%s.js?template=fox' % (host, video_id)) info['id'] = video_id return info + + +class FoxNewsArticleIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?foxnews\.com/(?!v)([^/]+/)+(?P[a-z-]+)' + IE_NAME = 'foxnews:article' + + _TEST = { + 'url': 'http://www.foxnews.com/politics/2016/09/08/buzz-about-bud-clinton-camp-denies-claims-wore-earpiece-at-forum.html', + 'md5': '62aa5a781b308fdee212ebb6f33ae7ef', + 'info_dict': { + 'id': '5116295019001', + 'ext': 'mp4', + 'title': 'Trump and Clinton asked to defend positions on Iraq War', + 'description': 'Veterans react on \'The Kelly File\'', + 'timestamp': 1473299755, + 'upload_date': '20160908', + }, + } + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + video_id = self._html_search_regex( + r'data-video-id=([\'"])(?P[^\'"]+)\1', + webpage, 'video ID', group='id') + return self.url_result( + 'http://video.foxnews.com/v/' + video_id, + FoxNewsIE.ie_key()) + + +class FoxNewsInsiderIE(InfoExtractor): + _VALID_URL = r'https?://insider\.foxnews\.com/([^/]+/)+(?P[a-z-]+)' + IE_NAME = 'foxnews:insider' + + _TEST = { + 'url': 'http://insider.foxnews.com/2016/08/25/univ-wisconsin-student-group-pushing-silence-certain-words', + 'md5': 'a10c755e582d28120c62749b4feb4c0c', + 'info_dict': { + 'id': '5099377331001', + 'display_id': 'univ-wisconsin-student-group-pushing-silence-certain-words', + 'ext': 'mp4', + 'title': 'Student Group: Saying \'Politically Correct,\' \'Trash\' and \'Lame\' Is Offensive', + 'description': 'Is campus censorship getting out of control?', + 'timestamp': 1472168725, + 'upload_date': '20160825', + 'thumbnail': r're:^https?://.*\.jpg$', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'add_ie': [FoxNewsIE.ie_key()], + } + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + embed_url = self._html_search_meta('embedUrl', webpage, 'embed URL') + + title = self._og_search_title(webpage) + description = self._og_search_description(webpage) + + return { + '_type': 'url_transparent', + 'ie_key': FoxNewsIE.ie_key(), + 'url': embed_url, + 'display_id': display_id, + 'title': title, + 'description': description, + } diff --git a/youtube_dl/extractor/franceculture.py b/youtube_dl/extractor/franceculture.py index e2ca96283..b98da692c 100644 --- a/youtube_dl/extractor/franceculture.py +++ b/youtube_dl/extractor/franceculture.py @@ -2,104 +2,56 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import ( - compat_urlparse, -) from ..utils import ( determine_ext, - int_or_none, - ExtractorError, + unified_strdate, ) class FranceCultureIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?franceculture\.fr/player/reecouter\?play=(?P[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?franceculture\.fr/emissions/(?:[^/]+/)*(?P[^/?#&]+)' _TEST = { - 'url': 'http://www.franceculture.fr/player/reecouter?play=4795174', + 'url': 'http://www.franceculture.fr/emissions/carnet-nomade/rendez-vous-au-pays-des-geeks', 'info_dict': { - 'id': '4795174', + 'id': 'rendez-vous-au-pays-des-geeks', + 'display_id': 'rendez-vous-au-pays-des-geeks', 'ext': 'mp3', 'title': 'Rendez-vous au pays des geeks', - 'alt_title': 'Carnet nomade | 13-14', - 'vcodec': 'none', + 'thumbnail': r're:^https?://.*\.jpg$', 'upload_date': '20140301', - 'thumbnail': r're:^http://static\.franceculture\.fr/.*/images/player/Carnet-nomade\.jpg$', - 'description': 'startswith:Avec :Jean-Baptiste Péretié pour son documentaire sur Arte "La revanche', - 'timestamp': 1393700400, + 'vcodec': 'none', } } - def _extract_from_player(self, url, video_id): - webpage = self._download_webpage(url, video_id) + def _real_extract(self, url): + display_id = self._match_id(url) - video_path = self._search_regex( - r']+class="[^"]*?title-zone-diffusion[^"]*?"[^>]*>.*?]+data-asset-source="([^"]+)"', + webpage, 'video path') + + title = self._og_search_title(webpage) + + upload_date = unified_strdate(self._search_regex( + '(?s)]+class="date"[^>]*>.*?]+class="inner"[^>]*>([^<]+)<', webpage, 'upload date', fatal=False)) thumbnail = self._search_regex( - r'\s+]+itemtype="https://schema.org/ImageObject"[^>]*>.*?]+data-dejavu-src="([^"]+)"', webpage, 'thumbnail', fatal=False) - - display_id = self._search_regex( - r'emission-(.*?)', webpage, 'display_id') - - title = self._html_search_regex( - r'(.*?)', webpage, 'title') - alt_title = self._html_search_regex( - r'(.*?)', - webpage, 'alt_title', fatal=False) - description = self._html_search_regex( - r'(.*?)', - webpage, 'description', fatal=False) - uploader = self._html_search_regex( r'(?s)
(.*?)', webpage, 'uploader', default=None) vcodec = 'none' if determine_ext(video_url.lower()) == 'mp3' else None return { - 'id': video_id, + 'id': display_id, + 'display_id': display_id, 'url': video_url, + 'title': title, + 'thumbnail': thumbnail, 'vcodec': vcodec, 'uploader': uploader, - 'timestamp': timestamp, - 'title': title, - 'alt_title': alt_title, - 'thumbnail': thumbnail, - 'description': description, - 'display_id': display_id, + 'upload_date': upload_date, } - - def _real_extract(self, url): - video_id = self._match_id(url) - return self._extract_from_player(url, video_id) - - -class FranceCultureEmissionIE(FranceCultureIE): - _VALID_URL = r'https?://(?:www\.)?franceculture\.fr/emission-(?P[^?#]+)' - _TEST = { - 'url': 'http://www.franceculture.fr/emission-les-carnets-de-la-creation-jean-gabriel-periot-cineaste-2015-10-13', - 'info_dict': { - 'title': 'Jean-Gabriel Périot, cinéaste', - 'alt_title': 'Les Carnets de la création', - 'id': '5093239', - 'display_id': 'les-carnets-de-la-creation-jean-gabriel-periot-cineaste-2015-10-13', - 'ext': 'mp3', - 'timestamp': 1444762500, - 'upload_date': '20151013', - 'description': 'startswith:Aujourd\'hui dans "Les carnets de la création", le cinéaste', - }, - } - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - video_path = self._html_search_regex( - r'[0-9]+)', video_path, 'new_id', group='id') - video_url = compat_urlparse.urljoin(url, video_path) - return self._extract_from_player(video_url, new_id) diff --git a/youtube_dl/extractor/franceinter.py b/youtube_dl/extractor/franceinter.py index 2369f868d..707b9e00d 100644 --- a/youtube_dl/extractor/franceinter.py +++ b/youtube_dl/extractor/franceinter.py @@ -2,21 +2,21 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import int_or_none +from ..utils import month_by_name class FranceInterIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?franceinter\.fr/player/reecouter\?play=(?P[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?franceinter\.fr/emissions/(?P[^?#]+)' + _TEST = { - 'url': 'http://www.franceinter.fr/player/reecouter?play=793962', - 'md5': '4764932e466e6f6c79c317d2e74f6884', + 'url': 'https://www.franceinter.fr/emissions/affaires-sensibles/affaires-sensibles-07-septembre-2016', + 'md5': '9e54d7bdb6fdc02a841007f8a975c094', 'info_dict': { - 'id': '793962', + 'id': 'affaires-sensibles/affaires-sensibles-07-septembre-2016', 'ext': 'mp3', - 'title': 'L’Histoire dans les jeux vidéo', - 'description': 'md5:7e93ddb4451e7530022792240a3049c7', - 'timestamp': 1387369800, - 'upload_date': '20131218', + 'title': 'Affaire Cahuzac : le contentieux du compte en Suisse', + 'description': 'md5:401969c5d318c061f86bda1fa359292b', + 'upload_date': '20160907', }, } @@ -25,23 +25,30 @@ class FranceInterIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - path = self._search_regex( - r']+class=["\']page-diffusion["\'][^>]*>.*?]+data-url=(["\'])(?P(?:(?!\1).)+)\1', + webpage, 'video url', group='url') - title = self._html_search_regex( - r'(.+?)', webpage, 'title') - description = self._html_search_regex( - r'(.*?)', - webpage, 'description', fatal=False) - timestamp = int_or_none(self._search_regex( - r'data-date="(\d+)"', webpage, 'upload date', fatal=False)) + title = self._og_search_title(webpage) + description = self._og_search_description(webpage) + + upload_date_str = self._search_regex( + r'class=["\']cover-emission-period["\'][^>]*>[^<]+\s+(\d{1,2}\s+[^\s]+\s+\d{4})<', + webpage, 'upload date', fatal=False) + if upload_date_str: + upload_date_list = upload_date_str.split() + upload_date_list.reverse() + upload_date_list[1] = '%02d' % (month_by_name(upload_date_list[1], lang='fr') or 0) + upload_date_list[2] = '%02d' % int(upload_date_list[2]) + upload_date = ''.join(upload_date_list) + else: + upload_date = None return { 'id': video_id, 'title': title, 'description': description, - 'timestamp': timestamp, + 'upload_date': upload_date, 'formats': [{ 'url': video_url, 'vcodec': 'none', diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index 7653975e3..48d43ae58 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -1,4 +1,4 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals @@ -131,7 +131,7 @@ class PluzzIE(FranceTVBaseInfoExtractor): class FranceTvInfoIE(FranceTVBaseInfoExtractor): IE_NAME = 'francetvinfo.fr' - _VALID_URL = r'https?://(?:www|mobile|france3-regions)\.francetvinfo\.fr/.*/(?P.+)\.html' + _VALID_URL = r'https?://(?:www|mobile|france3-regions)\.francetvinfo\.fr/(?:[^/]+/)*(?P<title>[^/?#&.]+)' _TESTS = [{ 'url': 'http://www.francetvinfo.fr/replay-jt/france-3/soir-3/jt-grand-soir-3-lundi-26-aout-2013_393427.html', @@ -168,7 +168,7 @@ class FranceTvInfoIE(FranceTVBaseInfoExtractor): 'id': 'NI_173343', 'ext': 'mp4', 'title': 'Les entreprises familiales : le secret de la réussite', - 'thumbnail': 're:^https?://.*\.jpe?g$', + 'thumbnail': r're:^https?://.*\.jpe?g$', 'timestamp': 1433273139, 'upload_date': '20150602', }, @@ -184,7 +184,7 @@ class FranceTvInfoIE(FranceTVBaseInfoExtractor): 'ext': 'mp4', 'title': 'Olivier Monthus, réalisateur de "Bretagne, le choix de l’Armor"', 'description': 'md5:a3264114c9d29aeca11ced113c37b16c', - 'thumbnail': 're:^https?://.*\.jpe?g$', + 'thumbnail': r're:^https?://.*\.jpe?g$', 'timestamp': 1458300695, 'upload_date': '20160318', }, @@ -206,6 +206,9 @@ class FranceTvInfoIE(FranceTVBaseInfoExtractor): 'uploader_id': 'x2q2ez', }, 'add_ie': ['Dailymotion'], + }, { + 'url': 'http://france3-regions.francetvinfo.fr/limousin/emissions/jt-1213-limousin', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/freesound.py b/youtube_dl/extractor/freesound.py index 5ff62af2a..138b6bc58 100644 --- a/youtube_dl/extractor/freesound.py +++ b/youtube_dl/extractor/freesound.py @@ -3,10 +3,16 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..utils import ( + float_or_none, + get_element_by_class, + get_element_by_id, + unified_strdate, +) class FreesoundIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?freesound\.org/people/([^/]+)/sounds/(?P<id>[^/]+)' + _VALID_URL = r'https?://(?:www\.)?freesound\.org/people/[^/]+/sounds/(?P<id>[^/]+)' _TEST = { 'url': 'http://www.freesound.org/people/miklovan/sounds/194503/', 'md5': '12280ceb42c81f19a515c745eae07650', @@ -14,26 +20,60 @@ class FreesoundIE(InfoExtractor): 'id': '194503', 'ext': 'mp3', 'title': 'gulls in the city.wav', - 'uploader': 'miklovan', 'description': 'the sounds of seagulls in the city', + 'duration': 130.233, + 'uploader': 'miklovan', + 'upload_date': '20130715', + 'tags': list, } } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - music_id = mobj.group('id') - webpage = self._download_webpage(url, music_id) - title = self._html_search_regex( - r'<div id="single_sample_header">.*?<a href="#">(.+?)</a>', - webpage, 'music title', flags=re.DOTALL) + audio_id = self._match_id(url) + + webpage = self._download_webpage(url, audio_id) + + audio_url = self._og_search_property('audio', webpage, 'song url') + title = self._og_search_property('audio:title', webpage, 'song title') + description = self._html_search_regex( - r'<div id="sound_description">(.*?)</div>', webpage, 'description', - fatal=False, flags=re.DOTALL) + r'(?s)id=["\']sound_description["\'][^>]*>(.+?)</div>', + webpage, 'description', fatal=False) + + duration = float_or_none( + get_element_by_class('duration', webpage), scale=1000) + + upload_date = unified_strdate(get_element_by_id('sound_date', webpage)) + uploader = self._og_search_property( + 'audio:artist', webpage, 'uploader', fatal=False) + + channels = self._html_search_regex( + r'Channels</dt><dd>(.+?)</dd>', webpage, + 'channels info', fatal=False) + + tags_str = get_element_by_class('tags', webpage) + tags = re.findall(r'<a[^>]+>([^<]+)', tags_str) if tags_str else None + + audio_urls = [audio_url] + + LQ_FORMAT = '-lq.mp3' + if LQ_FORMAT in audio_url: + audio_urls.append(audio_url.replace(LQ_FORMAT, '-hq.mp3')) + + formats = [{ + 'url': format_url, + 'format_note': channels, + 'quality': quality, + } for quality, format_url in enumerate(audio_urls)] + self._sort_formats(formats) return { - 'id': music_id, + 'id': audio_id, 'title': title, - 'url': self._og_search_property('audio', webpage, 'music url'), - 'uploader': self._og_search_property('audio:artist', webpage, 'music uploader'), 'description': description, + 'duration': duration, + 'uploader': uploader, + 'upload_date': upload_date, + 'tags': tags, + 'formats': formats, } diff --git a/youtube_dl/extractor/freespeech.py b/youtube_dl/extractor/freespeech.py index 1477708bb..0a70ca763 100644 --- a/youtube_dl/extractor/freespeech.py +++ b/youtube_dl/extractor/freespeech.py @@ -8,7 +8,7 @@ from .common import InfoExtractor class FreespeechIE(InfoExtractor): IE_NAME = 'freespeech.org' - _VALID_URL = r'https://www\.freespeech\.org/video/(?P<title>.+)' + _VALID_URL = r'https?://(?:www\.)?freespeech\.org/video/(?P<title>.+)' _TEST = { 'add_ie': ['Youtube'], 'url': 'https://www.freespeech.org/video/obama-romney-campaign-colorado-ahead-debate-0', diff --git a/youtube_dl/extractor/freevideo.py b/youtube_dl/extractor/freevideo.py deleted file mode 100644 index cd8423a6f..000000000 --- a/youtube_dl/extractor/freevideo.py +++ /dev/null @@ -1,38 +0,0 @@ -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..utils import ExtractorError - - -class FreeVideoIE(InfoExtractor): - _VALID_URL = r'^https?://www.freevideo.cz/vase-videa/(?P<id>[^.]+)\.html(?:$|[?#])' - - _TEST = { - 'url': 'http://www.freevideo.cz/vase-videa/vysukany-zadecek-22033.html', - 'info_dict': { - 'id': 'vysukany-zadecek-22033', - 'ext': 'mp4', - 'title': 'vysukany-zadecek-22033', - 'age_limit': 18, - }, - 'skip': 'Blocked outside .cz', - } - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage, handle = self._download_webpage_handle(url, video_id) - if '//www.czechav.com/' in handle.geturl(): - raise ExtractorError( - 'Access to freevideo is blocked from your location', - expected=True) - - video_url = self._search_regex( - r'\s+url: "(http://[a-z0-9-]+.cdn.freevideo.cz/stream/.*?/video.mp4)"', - webpage, 'video URL') - - return { - 'id': video_id, - 'url': video_url, - 'title': video_id, - 'age_limit': 18, - } diff --git a/youtube_dl/extractor/funimation.py b/youtube_dl/extractor/funimation.py index 0ad0d9b6a..eba00cd5a 100644 --- a/youtube_dl/extractor/funimation.py +++ b/youtube_dl/extractor/funimation.py @@ -29,7 +29,7 @@ class FunimationIE(InfoExtractor): 'ext': 'mp4', 'title': 'Air - 1 - Breeze', 'description': 'md5:1769f43cd5fc130ace8fd87232207892', - 'thumbnail': 're:https?://.*\.jpg', + 'thumbnail': r're:https?://.*\.jpg', }, 'skip': 'Access without user interaction is forbidden by CloudFlare, and video removed', }, { @@ -40,7 +40,7 @@ class FunimationIE(InfoExtractor): 'ext': 'mp4', 'title': '.hack//SIGN - 1 - Role Play', 'description': 'md5:b602bdc15eef4c9bbb201bb6e6a4a2dd', - 'thumbnail': 're:https?://.*\.jpg', + 'thumbnail': r're:https?://.*\.jpg', }, 'skip': 'Access without user interaction is forbidden by CloudFlare', }, { @@ -51,7 +51,7 @@ class FunimationIE(InfoExtractor): 'ext': 'mp4', 'title': 'Attack on Titan: Junior High - Broadcast Dub Preview', 'description': 'md5:f8ec49c0aff702a7832cd81b8a44f803', - 'thumbnail': 're:https?://.*\.(?:jpg|png)', + 'thumbnail': r're:https?://.*\.(?:jpg|png)', }, 'skip': 'Access without user interaction is forbidden by CloudFlare', }] diff --git a/youtube_dl/extractor/funnyordie.py b/youtube_dl/extractor/funnyordie.py index 8c5ffc9e8..81c0ce9a3 100644 --- a/youtube_dl/extractor/funnyordie.py +++ b/youtube_dl/extractor/funnyordie.py @@ -17,7 +17,7 @@ class FunnyOrDieIE(InfoExtractor): 'ext': 'mp4', 'title': 'Heart-Shaped Box: Literal Video Version', 'description': 'md5:ea09a01bc9a1c46d9ab696c01747c338', - 'thumbnail': 're:^http:.*\.jpg$', + 'thumbnail': r're:^http:.*\.jpg$', }, }, { 'url': 'http://www.funnyordie.com/embed/e402820827', @@ -26,7 +26,10 @@ class FunnyOrDieIE(InfoExtractor): 'ext': 'mp4', 'title': 'Please Use This Song (Jon Lajoie)', 'description': 'Please use this to sell something. www.jonlajoie.com', - 'thumbnail': 're:^http:.*\.jpg$', + 'thumbnail': r're:^http:.*\.jpg$', + }, + 'params': { + 'skip_download': True, }, }, { 'url': 'http://www.funnyordie.com/articles/ebf5e34fc8/10-hours-of-walking-in-nyc-as-a-man', @@ -51,19 +54,45 @@ class FunnyOrDieIE(InfoExtractor): formats = [] - formats.extend(self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) + m3u8_formats = self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False) + source_formats = list(filter( + lambda f: f.get('vcodec') != 'none' and f.get('resolution') != 'multiple', + m3u8_formats)) - bitrates = [int(bitrate) for bitrate in re.findall(r'[,/]v(\d+)[,/]', m3u8_url)] + bitrates = [int(bitrate) for bitrate in re.findall(r'[,/]v(\d+)(?=[,/])', m3u8_url)] bitrates.sort() - for bitrate in bitrates: - for link in links: - formats.append({ - 'url': self._proto_relative_url('%s%d.%s' % (link[0], bitrate, link[1])), - 'format_id': '%s-%d' % (link[1], bitrate), - 'vbr': bitrate, - }) + if source_formats: + self._sort_formats(source_formats) + + for bitrate, f in zip(bitrates, source_formats or [{}] * len(bitrates)): + for path, ext in links: + ff = f.copy() + if ff: + if ext != 'mp4': + ff = dict( + [(k, v) for k, v in ff.items() + if k in ('height', 'width', 'format_id')]) + ff.update({ + 'format_id': ff['format_id'].replace('hls', ext), + 'ext': ext, + 'protocol': 'http', + }) + else: + ff.update({ + 'format_id': '%s-%d' % (ext, bitrate), + 'vbr': bitrate, + }) + ff['url'] = self._proto_relative_url( + '%s%d.%s' % (path, bitrate, ext)) + formats.append(ff) + self._check_formats(formats, video_id) + + formats.extend(m3u8_formats) + self._sort_formats( + formats, field_preference=('height', 'width', 'tbr', 'format_id')) subtitles = {} for src, src_lang in re.findall(r'<track kind="captions" src="([^"]+)" srclang="([^"]+)"', webpage): diff --git a/youtube_dl/extractor/fusion.py b/youtube_dl/extractor/fusion.py index b4ab4cbb7..ede729b52 100644 --- a/youtube_dl/extractor/fusion.py +++ b/youtube_dl/extractor/fusion.py @@ -29,7 +29,7 @@ class FusionIE(InfoExtractor): webpage = self._download_webpage(url, display_id) ooyala_code = self._search_regex( - r'data-video-id=(["\'])(?P<code>.+?)\1', + r'data-ooyala-id=(["\'])(?P<code>(?:(?!\1).)+)\1', webpage, 'ooyala code', group='code') return OoyalaIE._build_url_result(ooyala_code) diff --git a/youtube_dl/extractor/fxnetworks.py b/youtube_dl/extractor/fxnetworks.py new file mode 100644 index 000000000..629897317 --- /dev/null +++ b/youtube_dl/extractor/fxnetworks.py @@ -0,0 +1,70 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .adobepass import AdobePassIE +from ..utils import ( + update_url_query, + extract_attributes, + parse_age_limit, + smuggle_url, +) + + +class FXNetworksIE(AdobePassIE): + _VALID_URL = r'https?://(?:www\.)?(?:fxnetworks|simpsonsworld)\.com/video/(?P<id>\d+)' + _TESTS = [{ + 'url': 'http://www.fxnetworks.com/video/719841347694', + 'md5': '1447d4722e42ebca19e5232ab93abb22', + 'info_dict': { + 'id': '719841347694', + 'ext': 'mp4', + 'title': 'Vanpage', + 'description': 'F*ck settling down. You\'re the Worst returns for an all new season August 31st on FXX.', + 'age_limit': 14, + 'uploader': 'NEWA-FNG-FX', + 'upload_date': '20160706', + 'timestamp': 1467844741, + }, + 'add_ie': ['ThePlatform'], + }, { + 'url': 'http://www.simpsonsworld.com/video/716094019682', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + if 'The content you are trying to access is not available in your region.' in webpage: + self.raise_geo_restricted() + video_data = extract_attributes(self._search_regex( + r'(<a.+?rel="http://link\.theplatform\.com/s/.+?</a>)', webpage, 'video data')) + player_type = self._search_regex(r'playerType\s*=\s*[\'"]([^\'"]+)', webpage, 'player type', default=None) + release_url = video_data['rel'] + title = video_data['data-title'] + rating = video_data.get('data-rating') + query = { + 'mbr': 'true', + } + if player_type == 'movies': + query.update({ + 'manifest': 'm3u', + }) + else: + query.update({ + 'switch': 'http', + }) + if video_data.get('data-req-auth') == '1': + resource = self._get_mvpd_resource( + video_data['data-channel'], title, + video_data.get('data-guid'), rating) + query['auth'] = self._extract_mvpd_auth(url, video_id, 'fx', resource) + + return { + '_type': 'url_transparent', + 'id': video_id, + 'title': title, + 'url': smuggle_url(update_url_query(release_url, query), {'force_smil_url': True}), + 'thumbnail': video_data.get('data-large-thumb'), + 'age_limit': parse_age_limit(rating), + 'ie_key': 'ThePlatform', + } diff --git a/youtube_dl/extractor/gamersyde.py b/youtube_dl/extractor/gamersyde.py index d545e01bb..a218a6944 100644 --- a/youtube_dl/extractor/gamersyde.py +++ b/youtube_dl/extractor/gamersyde.py @@ -20,7 +20,7 @@ class GamersydeIE(InfoExtractor): 'ext': 'mp4', 'duration': 372, 'title': 'Bloodborne - Birth of a hero', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', } } diff --git a/youtube_dl/extractor/gamespot.py b/youtube_dl/extractor/gamespot.py index 4e859e09a..682c49e79 100644 --- a/youtube_dl/extractor/gamespot.py +++ b/youtube_dl/extractor/gamespot.py @@ -63,7 +63,7 @@ class GameSpotIE(OnceIE): streams, ('progressive_hd', 'progressive_high', 'progressive_low')) if progressive_url and manifest_url: qualities_basename = self._search_regex( - '/([^/]+)\.csmil/', + r'/([^/]+)\.csmil/', manifest_url, 'qualities basename', default=None) if qualities_basename: QUALITIES_RE = r'((,\d+)+,?)' diff --git a/youtube_dl/extractor/gamestar.py b/youtube_dl/extractor/gamestar.py index 69058a583..e607d6ab8 100644 --- a/youtube_dl/extractor/gamestar.py +++ b/youtube_dl/extractor/gamestar.py @@ -1,19 +1,15 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..utils import ( int_or_none, - parse_duration, - str_to_int, - unified_strdate, + remove_end, ) class GameStarIE(InfoExtractor): - _VALID_URL = r'https?://www\.gamestar\.de/videos/.*,(?P<id>[0-9]+)\.html' + _VALID_URL = r'https?://(?:www\.)?gamestar\.de/videos/.*,(?P<id>[0-9]+)\.html' _TEST = { 'url': 'http://www.gamestar.de/videos/trailer,3/hobbit-3-die-schlacht-der-fuenf-heere,76110.html', 'md5': '96974ecbb7fd8d0d20fca5a00810cea7', @@ -21,8 +17,9 @@ class GameStarIE(InfoExtractor): 'id': '76110', 'ext': 'mp4', 'title': 'Hobbit 3: Die Schlacht der Fünf Heere - Teaser-Trailer zum dritten Teil', - 'description': 'Der Teaser-Trailer zu Hobbit 3: Die Schlacht der Fünf Heere zeigt einige Szenen aus dem dritten Teil der Saga und kündigt den vollständigen Trailer an.', - 'thumbnail': 'http://images.gamestar.de/images/idgwpgsgp/bdb/2494525/600x.jpg', + 'description': 'Der Teaser-Trailer zu Hobbit 3: Die Schlacht der Fünf Heere zeigt einige Szenen aus dem dritten Teil der Saga und kündigt den...', + 'thumbnail': r're:^https?://.*\.jpg$', + 'timestamp': 1406542020, 'upload_date': '20140728', 'duration': 17 } @@ -32,41 +29,27 @@ class GameStarIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - og_title = self._og_search_title(webpage) - title = re.sub(r'\s*- Video (bei|-) GameStar\.de$', '', og_title) - url = 'http://gamestar.de/_misc/videos/portal/getVideoUrl.cfm?premium=0&videoId=' + video_id - description = self._og_search_description(webpage).strip() - - thumbnail = self._proto_relative_url( - self._og_search_thumbnail(webpage), scheme='http:') - - upload_date = unified_strdate(self._html_search_regex( - r'<span style="float:left;font-size:11px;">Datum: ([0-9]+\.[0-9]+\.[0-9]+)  ', - webpage, 'upload_date', fatal=False)) - - duration = parse_duration(self._html_search_regex( - r'  Länge: ([0-9]+:[0-9]+)</span>', webpage, 'duration', - fatal=False)) - - view_count = str_to_int(self._html_search_regex( - r'  Zuschauer: ([0-9\.]+)  ', webpage, - 'view_count', fatal=False)) + # TODO: there are multiple ld+json objects in the webpage, + # while _search_json_ld finds only the first one + json_ld = self._parse_json(self._search_regex( + r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>[^<]+VideoObject[^<]+)</script>', + webpage, 'JSON-LD', group='json_ld'), video_id) + info_dict = self._json_ld(json_ld, video_id) + info_dict['title'] = remove_end(info_dict['title'], ' - GameStar') + view_count = json_ld.get('interactionCount') comment_count = int_or_none(self._html_search_regex( - r'>Kommentieren \(([0-9]+)\)</a>', webpage, 'comment_count', + r'([0-9]+) Kommentare</span>', webpage, 'comment_count', fatal=False)) - return { + info_dict.update({ 'id': video_id, - 'title': title, 'url': url, 'ext': 'mp4', - 'thumbnail': thumbnail, - 'description': description, - 'upload_date': upload_date, - 'duration': duration, 'view_count': view_count, 'comment_count': comment_count - } + }) + + return info_dict diff --git a/youtube_dl/extractor/gazeta.py b/youtube_dl/extractor/gazeta.py index 18ef5c252..57c67a451 100644 --- a/youtube_dl/extractor/gazeta.py +++ b/youtube_dl/extractor/gazeta.py @@ -16,7 +16,7 @@ class GazetaIE(InfoExtractor): 'ext': 'mp4', 'title': '«70–80 процентов гражданских в Донецке на грани голода»', 'description': 'md5:38617526050bd17b234728e7f9620a71', - 'thumbnail': 're:^https?://.*\.jpg', + 'thumbnail': r're:^https?://.*\.jpg', }, 'skip': 'video not found', }, { diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index e89a03760..a23486620 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -1,4 +1,4 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals @@ -27,7 +27,6 @@ from ..utils import ( unified_strdate, unsmuggle_url, UnsupportedError, - url_basename, xpath_text, ) from .brightcove import ( @@ -48,6 +47,8 @@ from .svt import SVTIE from .pornhub import PornHubIE from .xhamster import XHamsterEmbedIE from .tnaflix import TNAFlixNetworkEmbedIE +from .drtuber import DrTuberIE +from .redtube import RedTubeIE from .vimeo import VimeoIE from .dailymotion import ( DailymotionIE, @@ -55,10 +56,10 @@ from .dailymotion import ( ) from .onionstudios import OnionStudiosIE from .viewlift import ViewLiftEmbedIE -from .screenwavemedia import ScreenwaveMediaIE from .mtv import MTVServicesEmbeddedIE from .pladform import PladformIE from .videomore import VideomoreIE +from .webcaster import WebcasterFeedIE from .googledrive import GoogleDriveIE from .jwplatform import JWPlatformIE from .digiteka import DigitekaIE @@ -72,6 +73,14 @@ from .kaltura import KalturaIE from .eagleplatform import EaglePlatformIE from .facebook import FacebookIE from .soundcloud import SoundcloudIE +from .tunein import TuneInBaseIE +from .vbox7 import Vbox7IE +from .dbtv import DBTVIE +from .piksel import PikselIE +from .videa import VideaIE +from .twentymin import TwentyMinutenIE +from .ustream import UstreamIE +from .openload import OpenloadIE class GenericIE(InfoExtractor): @@ -102,7 +111,8 @@ class GenericIE(InfoExtractor): }, 'expected_warnings': [ 'URL could be a direct video link, returning it as such.' - ] + ], + 'skip': 'URL invalid', }, # Direct download with broken HEAD { @@ -232,7 +242,7 @@ class GenericIE(InfoExtractor): 'ext': 'mp4', 'title': 'Tikibad ontruimd wegens brand', 'description': 'md5:05ca046ff47b931f9b04855015e163a4', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 33, }, 'params': { @@ -266,7 +276,8 @@ class GenericIE(InfoExtractor): 'params': { # m3u8 downloads 'skip_download': True, - } + }, + 'skip': 'video gone', }, # m3u8 served with Content-Type: text/plain { @@ -281,7 +292,8 @@ class GenericIE(InfoExtractor): 'params': { # m3u8 downloads 'skip_download': True, - } + }, + 'skip': 'video gone', }, # google redirect { @@ -291,7 +303,7 @@ class GenericIE(InfoExtractor): 'ext': 'mp4', 'upload_date': '20130224', 'uploader_id': 'TheVerge', - 'description': 're:^Chris Ziegler takes a look at the\.*', + 'description': r're:^Chris Ziegler takes a look at the\.*', 'uploader': 'The Verge', 'title': 'First Firefox OS phones side-by-side', }, @@ -337,10 +349,10 @@ class GenericIE(InfoExtractor): }, 'skip': 'There is a limit of 200 free downloads / month for the test song', }, - # embedded brightcove video - # it also tests brightcove videos that need to set the 'Referer' in the - # http requests { + # embedded brightcove video + # it also tests brightcove videos that need to set the 'Referer' + # in the http requests 'add_ie': ['BrightcoveLegacy'], 'url': 'http://www.bfmtv.com/video/bfmbusiness/cours-bourse/cours-bourse-l-analyse-technique-154522/', 'info_dict': { @@ -354,6 +366,24 @@ class GenericIE(InfoExtractor): 'skip_download': True, }, }, + { + # embedded with itemprop embedURL and video id spelled as `idVideo` + 'add_id': ['BrightcoveLegacy'], + 'url': 'http://bfmbusiness.bfmtv.com/mediaplayer/chroniques/olivier-delamarche/', + 'info_dict': { + 'id': '5255628253001', + 'ext': 'mp4', + 'title': 'md5:37c519b1128915607601e75a87995fc0', + 'description': 'md5:37f7f888b434bb8f8cc8dbd4f7a4cf26', + 'uploader': 'BFM BUSINESS', + 'uploader_id': '876450612001', + 'timestamp': 1482255315, + 'upload_date': '20161220', + }, + 'params': { + 'skip_download': True, + }, + }, { # https://github.com/rg3/youtube-dl/issues/2253 'url': 'http://bcove.me/i6nfkrc3', @@ -366,6 +396,7 @@ class GenericIE(InfoExtractor): 'description': 'Mississauga resident David Farmer is still out of power as a result of the ice storm a month ago. To keep the house warm, Farmer cuts wood from his property for a wood burning stove downstairs.', }, 'add_ie': ['BrightcoveLegacy'], + 'skip': 'video gone', }, { 'url': 'http://www.championat.com/video/football/v/87/87499.html', @@ -394,6 +425,26 @@ class GenericIE(InfoExtractor): 'skip_download': True, # m3u8 download }, }, + { + # Brightcove with alternative playerID key + 'url': 'http://www.nature.com/nmeth/journal/v9/n7/fig_tab/nmeth.2062_SV1.html', + 'info_dict': { + 'id': 'nmeth.2062_SV1', + 'title': 'Simultaneous multiview imaging of the Drosophila syncytial blastoderm : Quantitative high-speed imaging of entire developing embryos with simultaneous multiview light-sheet microscopy : Nature Methods : Nature Research', + }, + 'playlist': [{ + 'info_dict': { + 'id': '2228375078001', + 'ext': 'mp4', + 'title': 'nmeth.2062-sv1', + 'description': 'nmeth.2062-sv1', + 'timestamp': 1363357591, + 'upload_date': '20130315', + 'uploader': 'Nature Publishing Group', + 'uploader_id': '1964492299001', + }, + }], + }, # ooyala video { 'url': 'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219', @@ -419,6 +470,7 @@ class GenericIE(InfoExtractor): 'params': { 'skip_download': True, }, + 'skip': 'movie expired', }, # embed.ly video { @@ -446,6 +498,8 @@ class GenericIE(InfoExtractor): 'title': 'Between Two Ferns with Zach Galifianakis: President Barack Obama', 'description': 'Episode 18: President Barack Obama sits down with Zach Galifianakis for his most memorable interview yet.', }, + # HEAD requests lead to endless 301, while GET is OK + 'expected_warnings': ['301'], }, # RUTV embed { @@ -508,7 +562,7 @@ class GenericIE(InfoExtractor): 'id': 'f4dafcad-ff21-423d-89b5-146cfd89fa1e', 'ext': 'mp4', 'title': 'Ужастики, русский трейлер (2015)', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 153, } }, @@ -520,6 +574,9 @@ class GenericIE(InfoExtractor): 'title': '[NSFL] [FM15] which pumiscer was this ( vid ) ( alfa as fuck srx )', }, 'playlist_mincount': 7, + # This forum does not allow <iframe> syntaxes anymore + # Now HTML tags are displayed as-is + 'skip': 'No videos on this page', }, # Embedded TED video { @@ -533,17 +590,6 @@ class GenericIE(InfoExtractor): 'description': 'md5:8145d19d320ff3e52f28401f4c4283b9', } }, - # Embedded Ustream video - { - 'url': 'http://www.american.edu/spa/pti/nsa-privacy-janus-2014.cfm', - 'md5': '27b99cdb639c9b12a79bca876a073417', - 'info_dict': { - 'id': '45734260', - 'ext': 'flv', - 'uploader': 'AU SPA: The NSA and Privacy', - 'title': 'NSA and Privacy Forum Debate featuring General Hayden and Barton Gellman' - } - }, # nowvideo embed hidden behind percent encoding { 'url': 'http://www.waoanime.tv/the-super-dimension-fortress-macross-episode-1/', @@ -568,7 +614,8 @@ class GenericIE(InfoExtractor): }, 'params': { 'skip_download': 'Requires rtmpdump' - } + }, + 'skip': 'video gone', }, # francetv embed { @@ -724,7 +771,7 @@ class GenericIE(InfoExtractor): 'duration': 48, 'timestamp': 1401537900, 'upload_date': '20140531', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', }, }, # Wistia embed @@ -794,6 +841,21 @@ class GenericIE(InfoExtractor): }, 'playlist_mincount': 7, }, + # TuneIn station embed + { + 'url': 'http://radiocnrv.com/promouvoir-radio-cnrv/', + 'info_dict': { + 'id': '204146', + 'ext': 'mp3', + 'title': 'CNRV', + 'location': 'Paris, France', + 'is_live': True, + }, + 'params': { + # Live stream + 'skip_download': True, + }, + }, # Livestream embed { 'url': 'http://www.esa.int/Our_Activities/Space_Science/Rosetta/Philae_comet_touch-down_webcast', @@ -958,6 +1020,20 @@ class GenericIE(InfoExtractor): 'skip_download': True, } }, + { + # Kaltura embedded, some fileExt broken (#11480) + 'url': 'http://www.cornell.edu/video/nima-arkani-hamed-standard-models-of-particle-physics', + 'info_dict': { + 'id': '1_sgtvehim', + 'ext': 'mp4', + 'title': 'Our "Standard Models" of particle physics and cosmology', + 'description': 'md5:67ea74807b8c4fea92a6f38d6d323861', + 'timestamp': 1321158993, + 'upload_date': '20111113', + 'uploader_id': 'kps1', + }, + 'add_ie': ['Kaltura'], + }, # Eagle.Platform embed (generic URL) { 'url': 'http://lenta.ru/news/2015/03/06/navalny/', @@ -967,7 +1043,7 @@ class GenericIE(InfoExtractor): 'ext': 'mp4', 'title': 'Навальный вышел на свободу', 'description': 'md5:d97861ac9ae77377f3f20eaf9d04b4f5', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 87, 'view_count': int, 'age_limit': 0, @@ -981,7 +1057,7 @@ class GenericIE(InfoExtractor): 'id': '12820', 'ext': 'mp4', 'title': "'O Sole Mio", - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 216, 'view_count': int, }, @@ -994,7 +1070,7 @@ class GenericIE(InfoExtractor): 'ext': 'mp4', 'title': 'Тайны перевала Дятлова • 1 серия 2 часть', 'description': 'Документальный сериал-расследование одной из самых жутких тайн ХХ века', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 694, 'age_limit': 0, }, @@ -1006,7 +1082,7 @@ class GenericIE(InfoExtractor): 'id': '3519514', 'ext': 'mp4', 'title': 'Joe Dirt 2 Beautiful Loser Teaser Trailer', - 'thumbnail': 're:^https?://.*\.png$', + 'thumbnail': r're:^https?://.*\.png$', 'duration': 45.115, }, }, @@ -1089,7 +1165,7 @@ class GenericIE(InfoExtractor): 'id': '300346', 'ext': 'mp4', 'title': '中一中男師變性 全校師生力挺', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', }, 'params': { # m3u8 download @@ -1135,7 +1211,7 @@ class GenericIE(InfoExtractor): 'ext': 'mp4', 'title': 'Sauvons les abeilles ! - Le débat', 'description': 'md5:d9082128b1c5277987825d684939ca26', - 'thumbnail': 're:^https?://.*\.jpe?g$', + 'thumbnail': r're:^https?://.*\.jpe?g$', 'timestamp': 1434970506, 'upload_date': '20150622', 'uploader': 'Public Sénat', @@ -1149,7 +1225,7 @@ class GenericIE(InfoExtractor): 'id': '2855', 'ext': 'mp4', 'title': 'Don’t Understand Bitcoin? This Man Will Mumble An Explanation At You', - 'thumbnail': 're:^https?://.*\.jpe?g$', + 'thumbnail': r're:^https?://.*\.jpe?g$', 'uploader': 'ClickHole', 'uploader_id': 'clickhole', } @@ -1175,16 +1251,6 @@ class GenericIE(InfoExtractor): 'duration': 248.667, }, }, - # ScreenwaveMedia embed - { - 'url': 'http://www.thecinemasnob.com/the-cinema-snob/a-nightmare-on-elm-street-2-freddys-revenge1', - 'md5': '24ace5baba0d35d55c6810b51f34e9e0', - 'info_dict': { - 'id': 'cinemasnob-55d26273809dd', - 'ext': 'mp4', - 'title': 'cinemasnob', - }, - }, # BrightcoveInPageEmbed embed { 'url': 'http://www.geekandsundry.com/tabletop-bonus-wils-final-thoughts-on-dread/', @@ -1196,20 +1262,6 @@ class GenericIE(InfoExtractor): 'duration': 51690, }, }, - # JWPlayer with M3U8 - { - 'url': 'http://ren.tv/novosti/2015-09-25/sluchaynyy-prohozhiy-poymal-avtougonshchika-v-murmanske-video', - 'info_dict': { - 'id': 'playlist', - 'ext': 'mp4', - 'title': 'Случайный прохожий поймал автоугонщика в Мурманске. ВИДЕО | РЕН ТВ', - 'uploader': 'ren.tv', - }, - 'params': { - # m3u8 downloads - 'skip_download': True, - } - }, # Brightcove embed, with no valid 'renditions' but valid 'IOSRenditions' # This video can't be played in browsers if Flash disabled and UA set to iPhone, which is actually a false alarm { @@ -1356,6 +1408,11 @@ class GenericIE(InfoExtractor): }, 'add_ie': ['Vimeo'], }, + { + # generic vimeo embed that requires original URL passed as Referer + 'url': 'http://racing4everyone.eu/2016/07/30/formula-1-2016-round12-germany/', + 'only_matching': True, + }, { 'url': 'https://support.arkena.com/display/PLAY/Ways+to+embed+your+video', 'md5': 'b96f2f71b359a8ecd05ce4e1daa72365', @@ -1373,6 +1430,50 @@ class GenericIE(InfoExtractor): }, 'add_ie': [ArkenaIE.ie_key()], }, + { + 'url': 'http://nova.bg/news/view/2016/08/16/156543/%D0%BD%D0%B0-%D0%BA%D0%BE%D1%81%D1%8A%D0%BC-%D0%BE%D1%82-%D0%B2%D0%B7%D1%80%D0%B8%D0%B2-%D0%BE%D1%82%D1%86%D0%B5%D0%BF%D0%B8%D1%85%D0%B0-%D1%86%D1%8F%D0%BB-%D0%BA%D0%B2%D0%B0%D1%80%D1%82%D0%B0%D0%BB-%D0%B7%D0%B0%D1%80%D0%B0%D0%B4%D0%B8-%D0%B8%D0%B7%D1%82%D0%B8%D1%87%D0%B0%D0%BD%D0%B5-%D0%BD%D0%B0-%D0%B3%D0%B0%D0%B7-%D0%B2-%D0%BF%D0%BB%D0%BE%D0%B2%D0%B4%D0%B8%D0%B2/', + 'info_dict': { + 'id': '1c7141f46c', + 'ext': 'mp4', + 'title': 'НА КОСЪМ ОТ ВЗРИВ: Изтичане на газ на бензиностанция в Пловдив', + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': [Vbox7IE.ie_key()], + }, + { + # DBTV embeds + 'url': 'http://www.dagbladet.no/2016/02/23/nyheter/nordlys/ski/troms/ver/43254897/', + 'info_dict': { + 'id': '43254897', + 'title': 'Etter ett års planlegging, klaffet endelig alt: - Jeg måtte ta en liten dans', + }, + 'playlist_mincount': 3, + }, + { + # Videa embeds + 'url': 'http://forum.dvdtalk.com/movie-talk/623756-deleted-magic-star-wars-ot-deleted-alt-scenes-docu-style.html', + 'info_dict': { + 'id': '623756-deleted-magic-star-wars-ot-deleted-alt-scenes-docu-style', + 'title': 'Deleted Magic - Star Wars: OT Deleted / Alt. Scenes Docu. Style - DVD Talk Forum', + }, + 'playlist_mincount': 2, + }, + { + # 20 minuten embed + 'url': 'http://www.20min.ch/schweiz/news/story/So-kommen-Sie-bei-Eis-und-Schnee-sicher-an-27032552', + 'info_dict': { + 'id': '523629', + 'ext': 'mp4', + 'title': 'So kommen Sie bei Eis und Schnee sicher an', + 'description': 'md5:117c212f64b25e3d95747e5276863f7d', + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': [TwentyMinutenIE.ie_key()], + } # { # # TODO: find another test # # http://schema.org/VideoObject @@ -1510,7 +1611,7 @@ class GenericIE(InfoExtractor): force_videoid = smuggled_data['force_videoid'] video_id = force_videoid else: - video_id = compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0]) + video_id = self._generic_id(url) self.to_screen('%s: Requesting header' % video_id) @@ -1539,7 +1640,7 @@ class GenericIE(InfoExtractor): info_dict = { 'id': video_id, - 'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]), + 'title': self._generic_title(url), 'upload_date': unified_strdate(head_response.headers.get('Last-Modified')) } @@ -1610,6 +1711,10 @@ class GenericIE(InfoExtractor): doc = compat_etree_fromstring(webpage.encode('utf-8')) if doc.tag == 'rss': return self._extract_rss(url, video_id, doc) + elif doc.tag == 'SmoothStreamingMedia': + info_dict['formats'] = self._parse_ism_formats(doc, url) + self._sort_formats(info_dict['formats']) + return info_dict elif re.match(r'^(?:{[^}]+})?smil$', doc.tag): smil = self._parse_smil(doc, url, video_id) self._sort_formats(smil['formats']) @@ -1618,7 +1723,9 @@ class GenericIE(InfoExtractor): return self.playlist_result(self._parse_xspf(doc, video_id), video_id) elif re.match(r'(?i)^(?:{[^}]+})?MPD$', doc.tag): info_dict['formats'] = self._parse_mpd_formats( - doc, video_id, mpd_base_url=url.rpartition('/')[0]) + doc, video_id, + mpd_base_url=full_response.geturl().rpartition('/')[0], + mpd_url=url) self._sort_formats(info_dict['formats']) return info_dict elif re.match(r'^{http://ns\.adobe\.com/f4m/[12]\.0}manifest$', doc.tag): @@ -1713,9 +1820,9 @@ class GenericIE(InfoExtractor): if matches: return _playlist_from_matches(matches, ie='RtlNl') - vimeo_url = VimeoIE._extract_vimeo_url(url, webpage) - if vimeo_url is not None: - return self.url_result(vimeo_url) + vimeo_urls = VimeoIE._extract_urls(url, webpage) + if vimeo_urls: + return _playlist_from_matches(vimeo_urls, ie=VimeoIE.ie_key()) vid_me_embed_url = self._search_regex( r'src=[\'"](https?://vid\.me/[^\'"]+)[\'"]', @@ -1858,7 +1965,14 @@ class GenericIE(InfoExtractor): re.search(r'SBN\.VideoLinkset\.ooyala\([\'"](?P<ec>.{32})[\'"]\)', webpage) or re.search(r'data-ooyala-video-id\s*=\s*[\'"](?P<ec>.{32})[\'"]', webpage)) if mobj is not None: - return OoyalaIE._build_url_result(smuggle_url(mobj.group('ec'), {'domain': url})) + embed_token = self._search_regex( + r'embedToken[\'"]?\s*:\s*[\'"]([^\'"]+)', + webpage, 'ooyala embed token', default=None) + return OoyalaIE._build_url_result(smuggle_url( + mobj.group('ec'), { + 'domain': url, + 'embed_token': embed_token, + })) # Look for multiple Ooyala embeds on SBN network websites mobj = re.search(r'SBN\.VideoLinkset\.entryGroup\((\[.*?\])', webpage) @@ -1951,11 +2065,6 @@ class GenericIE(InfoExtractor): if sportbox_urls: return _playlist_from_matches(sportbox_urls, ie='SportBoxEmbed') - # Look for embedded PornHub player - pornhub_url = PornHubIE._extract_url(webpage) - if pornhub_url: - return self.url_result(pornhub_url, 'PornHub') - # Look for embedded XHamster player xhamster_urls = XHamsterEmbedIE._extract_urls(webpage) if xhamster_urls: @@ -1966,6 +2075,21 @@ class GenericIE(InfoExtractor): if tnaflix_urls: return _playlist_from_matches(tnaflix_urls, ie=TNAFlixNetworkEmbedIE.ie_key()) + # Look for embedded PornHub player + pornhub_urls = PornHubIE._extract_urls(webpage) + if pornhub_urls: + return _playlist_from_matches(pornhub_urls, ie=PornHubIE.ie_key()) + + # Look for embedded DrTuber player + drtuber_urls = DrTuberIE._extract_urls(webpage) + if drtuber_urls: + return _playlist_from_matches(drtuber_urls, ie=DrTuberIE.ie_key()) + + # Look for embedded RedTube player + redtube_urls = RedTubeIE._extract_urls(webpage) + if redtube_urls: + return _playlist_from_matches(redtube_urls, ie=RedTubeIE.ie_key()) + # Look for embedded Tvigle player mobj = re.search( r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//cloud\.tvigle\.ru/video/.+?)\1', webpage) @@ -1979,10 +2103,9 @@ class GenericIE(InfoExtractor): return self.url_result(mobj.group('url'), 'TED') # Look for embedded Ustream videos - mobj = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>http://www\.ustream\.tv/embed/.+?)\1', webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'Ustream') + ustream_url = UstreamIE._extract_url(webpage) + if ustream_url: + return self.url_result(ustream_url, UstreamIE.ie_key()) # Look for embedded arte.tv player mobj = re.search( @@ -2013,6 +2136,11 @@ class GenericIE(InfoExtractor): if soundcloud_urls: return _playlist_from_matches(soundcloud_urls, getter=unescapeHTML, ie=SoundcloudIE.ie_key()) + # Look for tunein player + tunein_urls = TuneInBaseIE._extract_urls(webpage) + if tunein_urls: + return _playlist_from_matches(tunein_urls) + # Look for embedded mtvservices player mtvservices_url = MTVServicesEmbeddedIE._extract_url(webpage) if mtvservices_url: @@ -2098,6 +2226,11 @@ class GenericIE(InfoExtractor): if videomore_url: return self.url_result(videomore_url) + # Look for Webcaster embeds + webcaster_url = WebcasterFeedIE._extract_url(self, webpage) + if webcaster_url: + return self.url_result(webcaster_url, ie=WebcasterFeedIE.ie_key()) + # Look for Playwire embeds mobj = re.search( r'<script[^>]+data-config=(["\'])(?P<url>(?:https?:)?//config\.playwire\.com/.+?)\1', webpage) @@ -2164,11 +2297,6 @@ class GenericIE(InfoExtractor): if jwplatform_url: return self.url_result(jwplatform_url, 'JWPlatform') - # Look for ScreenwaveMedia embeds - mobj = re.search(ScreenwaveMediaIE.EMBED_PATTERN, webpage) - if mobj is not None: - return self.url_result(unescapeHTML(mobj.group('url')), 'ScreenwaveMedia') - # Look for Digiteka embeds digiteka_url = DigitekaIE._extract_url(webpage) if digiteka_url: @@ -2179,6 +2307,11 @@ class GenericIE(InfoExtractor): if arkena_url: return self.url_result(arkena_url, ArkenaIE.ie_key()) + # Look for Piksel embeds + piksel_url = PikselIE._extract_url(webpage) + if piksel_url: + return self.url_result(piksel_url, PikselIE.ie_key()) + # Look for Limelight embeds mobj = re.search(r'LimelightPlayer\.doLoad(Media|Channel|ChannelList)\(["\'](?P<id>[a-z0-9]{32})', webpage) if mobj: @@ -2190,6 +2323,16 @@ class GenericIE(InfoExtractor): return self.url_result('limelight:%s:%s' % ( lm[mobj.group(1)], mobj.group(2)), 'Limelight%s' % mobj.group(1), mobj.group(2)) + mobj = re.search( + r'''(?sx) + <object[^>]+class=(["\'])LimelightEmbeddedPlayerFlash\1[^>]*>.*? + <param[^>]+ + name=(["\'])flashVars\2[^>]+ + value=(["\'])(?:(?!\3).)*mediaId=(?P<id>[a-z0-9]{32}) + ''', webpage) + if mobj: + return self.url_result('limelight:media:%s' % mobj.group('id')) + # Look for AdobeTVVideo embeds mobj = re.search( r'<iframe[^>]+src=[\'"]((?:https?:)?//video\.tv\.adobe\.com/v/\d+[^"]+)[\'"]', @@ -2209,11 +2352,40 @@ class GenericIE(InfoExtractor): # Look for VODPlatform embeds mobj = re.search( - r'<iframe[^>]+src=[\'"]((?:https?:)?//(?:www\.)?vod-platform\.net/embed/[^/?#]+)', + r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?vod-platform\.net/[eE]mbed/.+?)\1', webpage) if mobj is not None: return self.url_result( - self._proto_relative_url(unescapeHTML(mobj.group(1))), 'VODPlatform') + self._proto_relative_url(unescapeHTML(mobj.group('url'))), 'VODPlatform') + + # Look for Mangomolo embeds + mobj = re.search( + r'''(?x)<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?admin\.mangomolo\.com/analytics/index\.php/customers/embed/ + (?: + video\?.*?\bid=(?P<video_id>\d+)| + index\?.*?\bchannelid=(?P<channel_id>(?:[A-Za-z0-9+/=]|%2B|%2F|%3D)+) + ).+?)\1''', webpage) + if mobj is not None: + info = { + '_type': 'url_transparent', + 'url': self._proto_relative_url(unescapeHTML(mobj.group('url'))), + 'title': video_title, + 'description': video_description, + 'thumbnail': video_thumbnail, + 'uploader': video_uploader, + } + video_id = mobj.group('video_id') + if video_id: + info.update({ + 'ie_key': 'MangomoloVideo', + 'id': video_id, + }) + else: + info.update({ + 'ie_key': 'MangomoloLive', + 'id': mobj.group('channel_id'), + }) + return info # Look for Instagram embeds instagram_embed_url = InstagramIE._extract_embed_url(webpage) @@ -2239,10 +2411,37 @@ class GenericIE(InfoExtractor): 'uploader': video_uploader, } + # Look for VBOX7 embeds + vbox7_url = Vbox7IE._extract_url(webpage) + if vbox7_url: + return self.url_result(vbox7_url, Vbox7IE.ie_key()) + + # Look for DBTV embeds + dbtv_urls = DBTVIE._extract_urls(webpage) + if dbtv_urls: + return _playlist_from_matches(dbtv_urls, ie=DBTVIE.ie_key()) + + # Look for Videa embeds + videa_urls = VideaIE._extract_urls(webpage) + if videa_urls: + return _playlist_from_matches(videa_urls, ie=VideaIE.ie_key()) + + # Look for 20 minuten embeds + twentymin_urls = TwentyMinutenIE._extract_urls(webpage) + if twentymin_urls: + return _playlist_from_matches( + twentymin_urls, ie=TwentyMinutenIE.ie_key()) + + # Look for Openload embeds + openload_urls = OpenloadIE._extract_urls(webpage) + if openload_urls: + return _playlist_from_matches( + openload_urls, ie=OpenloadIE.ie_key()) + # Looking for http://schema.org/VideoObject json_ld = self._search_json_ld( - webpage, video_id, default=None, expected_type='VideoObject') - if json_ld and json_ld.get('url'): + webpage, video_id, default={}, expected_type='VideoObject') + if json_ld.get('url'): info_dict.update({ 'title': video_title or info_dict['title'], 'description': video_description, @@ -2252,12 +2451,23 @@ class GenericIE(InfoExtractor): info_dict.update(json_ld) return info_dict + # Look for HTML5 media + entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls') + if entries: + for entry in entries: + entry.update({ + 'id': video_id, + 'title': video_title, + }) + self._sort_formats(entry['formats']) + return self.playlist_result(entries) + def check_video(vurl): if YoutubeIE.suitable(vurl): return True vpath = compat_urlparse.urlparse(vurl).path vext = determine_ext(vpath) - return '.' in vpath and vext not in ('swf', 'png', 'jpg', 'srt', 'sbv', 'sub', 'vtt', 'ttml') + return '.' in vpath and vext not in ('swf', 'png', 'jpg', 'srt', 'sbv', 'sub', 'vtt', 'ttml', 'js') def filter_video(urls): return list(filter(check_video, urls)) @@ -2307,9 +2517,6 @@ class GenericIE(InfoExtractor): # We only look in og:video if the MIME type is a video, don't try if it's a Flash player: if m_video_type is not None: found = filter_video(re.findall(r'<meta.*?property="og:video".*?content="(.*?)"', webpage)) - if not found: - # HTML5 video - found = re.findall(r'(?s)<(?:video|audio)[^<]*(?:>.*?<source[^>]*)?\s+src=["\'](.*?)["\']', webpage) if not found: REDIRECT_REGEX = r'[0-9]{,2};\s*(?:URL|url)=\'?([^\'"]+)' found = re.search( @@ -2376,6 +2583,21 @@ class GenericIE(InfoExtractor): entry_info_dict['formats'] = self._extract_mpd_formats(video_url, video_id) elif ext == 'f4m': entry_info_dict['formats'] = self._extract_f4m_formats(video_url, video_id) + elif re.search(r'(?i)\.(?:ism|smil)/manifest', video_url) and video_url != url: + # Just matching .ism/manifest is not enough to be reliably sure + # whether it's actually an ISM manifest or some other streaming + # manifest since there are various streaming URL formats + # possible (see [1]) as well as some other shenanigans like + # .smil/manifest URLs that actually serve an ISM (see [2]) and + # so on. + # Thus the most reasonable way to solve this is to delegate + # to generic extractor in order to look into the contents of + # the manifest itself. + # 1. https://azure.microsoft.com/en-us/documentation/articles/media-services-deliver-content-overview/#streaming-url-formats + # 2. https://svs.itworkscdn.net/lbcivod/smil:itwfcdn/lbci/170976.smil/Manifest + entry_info_dict = self.url_result( + smuggle_url(video_url, {'to_generic': True}), + GenericIE.ie_key()) else: entry_info_dict['url'] = video_url diff --git a/youtube_dl/extractor/giantbomb.py b/youtube_dl/extractor/giantbomb.py index 87cd19147..29b684d35 100644 --- a/youtube_dl/extractor/giantbomb.py +++ b/youtube_dl/extractor/giantbomb.py @@ -23,7 +23,7 @@ class GiantBombIE(InfoExtractor): 'title': 'Quick Look: Destiny: The Dark Below', 'description': 'md5:0aa3aaf2772a41b91d44c63f30dfad24', 'duration': 2399, - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', } } diff --git a/youtube_dl/extractor/giga.py b/youtube_dl/extractor/giga.py index 28eb733e2..5a9992a27 100644 --- a/youtube_dl/extractor/giga.py +++ b/youtube_dl/extractor/giga.py @@ -24,7 +24,7 @@ class GigaIE(InfoExtractor): 'ext': 'mp4', 'title': 'Anime Awesome: Chihiros Reise ins Zauberland – Das Beste kommt zum Schluss', 'description': 'md5:afdf5862241aded4718a30dff6a57baf', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 578, 'timestamp': 1414749706, 'upload_date': '20141031', diff --git a/youtube_dl/extractor/glide.py b/youtube_dl/extractor/glide.py index 62ff84835..d94dfbf09 100644 --- a/youtube_dl/extractor/glide.py +++ b/youtube_dl/extractor/glide.py @@ -2,7 +2,6 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import unified_strdate class GlideIE(InfoExtractor): @@ -14,10 +13,8 @@ class GlideIE(InfoExtractor): 'info_dict': { 'id': 'UZF8zlmuQbe4mr+7dCiQ0w==', 'ext': 'mp4', - 'title': 'Damon Timm\'s Glide message', - 'thumbnail': 're:^https?://.*?\.cloudfront\.net/.*\.jpg$', - 'uploader': 'Damon Timm', - 'upload_date': '20140919', + 'title': "Damon's Glide message", + 'thumbnail': r're:^https?://.*?\.cloudfront\.net/.*\.jpg$', } } @@ -27,7 +24,8 @@ class GlideIE(InfoExtractor): webpage = self._download_webpage(url, video_id) title = self._html_search_regex( - r'<title>(.+?)', webpage, 'title') + r'(.+?)', webpage, + 'title', default=None) or self._og_search_title(webpage) video_url = self._proto_relative_url(self._search_regex( r']+src=(["\'])(?P.+?)\1', webpage, 'video URL', default=None, @@ -36,18 +34,10 @@ class GlideIE(InfoExtractor): r']+id=["\']video-thumbnail["\'][^>]+src=(["\'])(?P.+?)\1', webpage, 'thumbnail url', default=None, group='url')) or self._og_search_thumbnail(webpage) - uploader = self._search_regex( - r']+class=["\']info-name["\'][^>]*>([^<]+)', - webpage, 'uploader', fatal=False) - upload_date = unified_strdate(self._search_regex( - r']+class="info-date"[^>]*>([^<]+)', - webpage, 'upload date', fatal=False)) return { 'id': video_id, 'title': title, 'url': video_url, 'thumbnail': thumbnail, - 'uploader': uploader, - 'upload_date': upload_date, } diff --git a/youtube_dl/extractor/globo.py b/youtube_dl/extractor/globo.py index 3de8356f6..dc7b2661c 100644 --- a/youtube_dl/extractor/globo.py +++ b/youtube_dl/extractor/globo.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals import random +import re import math from .common import InfoExtractor @@ -14,12 +15,13 @@ from ..utils import ( ExtractorError, float_or_none, int_or_none, + orderedSet, str_or_none, ) class GloboIE(InfoExtractor): - _VALID_URL = '(?:globo:|https?://.+?\.globo\.com/(?:[^/]+/)*(?:v/(?:[^/]+/)?|videos/))(?P\d{7,})' + _VALID_URL = r'(?:globo:|https?://.+?\.globo\.com/(?:[^/]+/)*(?:v/(?:[^/]+/)?|videos/))(?P\d{7,})' _API_URL_TEMPLATE = 'http://api.globovideos.com/videos/%s/playlist' _SECURITY_URL_TEMPLATE = 'http://security.video.globo.com/videos/%s/hash?player=flash&version=17.0.0.132&resource_id=%s' @@ -63,6 +65,9 @@ class GloboIE(InfoExtractor): }, { 'url': 'http://canaloff.globo.com/programas/desejar-profundo/videos/4518560.html', 'only_matching': True, + }, { + 'url': 'globo:3607726', + 'only_matching': True, }] class MD5(object): @@ -396,33 +401,41 @@ class GloboIE(InfoExtractor): class GloboArticleIE(InfoExtractor): - _VALID_URL = 'https?://.+?\.globo\.com/(?:[^/]+/)*(?P[^/]+)\.html' + _VALID_URL = r'https?://.+?\.globo\.com/(?:[^/]+/)*(?P[^/.]+)(?:\.html)?' _VIDEOID_REGEXES = [ r'\bdata-video-id=["\'](\d{7,})', r'\bdata-player-videosids=["\'](\d{7,})', - r'\bvideosIDs\s*:\s*["\'](\d{7,})', + r'\bvideosIDs\s*:\s*["\']?(\d{7,})', r'\bdata-id=["\'](\d{7,})', r']+\bid=["\'](\d{7,})', ] _TESTS = [{ 'url': 'http://g1.globo.com/jornal-nacional/noticia/2014/09/novidade-na-fiscalizacao-de-bagagem-pela-receita-provoca-discussoes.html', - 'md5': '307fdeae4390ccfe6ba1aa198cf6e72b', 'info_dict': { - 'id': '3652183', - 'ext': 'mp4', - 'title': 'Receita Federal explica como vai fiscalizar bagagens de quem retorna ao Brasil de avião', - 'duration': 110.711, - 'uploader': 'Rede Globo', - 'uploader_id': '196', - } + 'id': 'novidade-na-fiscalizacao-de-bagagem-pela-receita-provoca-discussoes', + 'title': 'Novidade na fiscalização de bagagem pela Receita provoca discussões', + 'description': 'md5:c3c4b4d4c30c32fce460040b1ac46b12', + }, + 'playlist_count': 1, + }, { + 'url': 'http://g1.globo.com/pr/parana/noticia/2016/09/mpf-denuncia-lula-marisa-e-mais-seis-na-operacao-lava-jato.html', + 'info_dict': { + 'id': 'mpf-denuncia-lula-marisa-e-mais-seis-na-operacao-lava-jato', + 'title': "Lula era o 'comandante máximo' do esquema da Lava Jato, diz MPF", + 'description': 'md5:8aa7cc8beda4dc71cc8553e00b77c54c', + }, + 'playlist_count': 6, }, { 'url': 'http://gq.globo.com/Prazeres/Poder/noticia/2015/10/all-o-desafio-assista-ao-segundo-capitulo-da-serie.html', 'only_matching': True, }, { 'url': 'http://gshow.globo.com/programas/tv-xuxa/O-Programa/noticia/2014/01/xuxa-e-junno-namoram-muuuito-em-luau-de-zeze-di-camargo-e-luciano.html', 'only_matching': True, + }, { + 'url': 'http://oglobo.globo.com/rio/a-amizade-entre-um-entregador-de-farmacia-um-piano-19946271', + 'only_matching': True, }] @classmethod @@ -432,5 +445,12 @@ class GloboArticleIE(InfoExtractor): def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - video_id = self._search_regex(self._VIDEOID_REGEXES, webpage, 'video id') - return self.url_result('globo:%s' % video_id, 'Globo') + video_ids = [] + for video_regex in self._VIDEOID_REGEXES: + video_ids.extend(re.findall(video_regex, webpage)) + entries = [ + self.url_result('globo:%s' % video_id, GloboIE.ie_key()) + for video_id in orderedSet(video_ids)] + title = self._og_search_title(webpage, fatal=False) + description = self._html_search_meta('description', webpage) + return self.playlist_result(entries, display_id, title, description) diff --git a/youtube_dl/extractor/go.py b/youtube_dl/extractor/go.py new file mode 100644 index 000000000..a34779b16 --- /dev/null +++ b/youtube_dl/extractor/go.py @@ -0,0 +1,125 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + determine_ext, + parse_age_limit, + urlencode_postdata, + ExtractorError, +) + + +class GoIE(InfoExtractor): + _BRANDS = { + 'abc': '001', + 'freeform': '002', + 'watchdisneychannel': '004', + 'watchdisneyjunior': '008', + 'watchdisneyxd': '009', + } + _VALID_URL = r'https?://(?:(?P%s)\.)?go\.com/(?:[^/]+/)*(?:vdka(?P\w+)|season-\d+/\d+-(?P[^/?#]+))' % '|'.join(_BRANDS.keys()) + _TESTS = [{ + 'url': 'http://abc.go.com/shows/castle/video/most-recent/vdka0_g86w5onx', + 'info_dict': { + 'id': '0_g86w5onx', + 'ext': 'mp4', + 'title': 'Sneak Peek: Language Arts', + 'description': 'md5:7dcdab3b2d17e5217c953256af964e9c', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + 'url': 'http://abc.go.com/shows/after-paradise/video/most-recent/vdka3335601', + 'only_matching': True, + }] + + def _real_extract(self, url): + sub_domain, video_id, display_id = re.match(self._VALID_URL, url).groups() + if not video_id: + webpage = self._download_webpage(url, display_id) + video_id = self._search_regex( + # There may be inner quotes, e.g. data-video-id="'VDKA3609139'" + # from http://freeform.go.com/shows/shadowhunters/episodes/season-2/1-this-guilty-blood + r'data-video-id=["\']*VDKA(\w+)', webpage, 'video id') + brand = self._BRANDS[sub_domain] + video_data = self._download_json( + 'http://api.contents.watchabc.go.com/vp2/ws/contents/3000/videos/%s/001/-1/-1/-1/%s/-1/-1.json' % (brand, video_id), + video_id)['video'][0] + title = video_data['title'] + + formats = [] + for asset in video_data.get('assets', {}).get('asset', []): + asset_url = asset.get('value') + if not asset_url: + continue + format_id = asset.get('format') + ext = determine_ext(asset_url) + if ext == 'm3u8': + video_type = video_data.get('type') + if video_type == 'lf': + entitlement = self._download_json( + 'https://api.entitlement.watchabc.go.com/vp2/ws-secure/entitlement/2020/authorize.json', + video_id, data=urlencode_postdata({ + 'video_id': video_data['id'], + 'video_type': video_type, + 'brand': brand, + 'device': '001', + })) + errors = entitlement.get('errors', {}).get('errors', []) + if errors: + error_message = ', '.join([error['message'] for error in errors]) + raise ExtractorError('%s said: %s' % (self.IE_NAME, error_message), expected=True) + asset_url += '?' + entitlement['uplynkData']['sessionKey'] + formats.extend(self._extract_m3u8_formats( + asset_url, video_id, 'mp4', m3u8_id=format_id or 'hls', fatal=False)) + else: + formats.append({ + 'format_id': format_id, + 'url': asset_url, + 'ext': ext, + }) + self._sort_formats(formats) + + subtitles = {} + for cc in video_data.get('closedcaption', {}).get('src', []): + cc_url = cc.get('value') + if not cc_url: + continue + ext = determine_ext(cc_url) + if ext == 'xml': + ext = 'ttml' + subtitles.setdefault(cc.get('lang'), []).append({ + 'url': cc_url, + 'ext': ext, + }) + + thumbnails = [] + for thumbnail in video_data.get('thumbnails', {}).get('thumbnail', []): + thumbnail_url = thumbnail.get('value') + if not thumbnail_url: + continue + thumbnails.append({ + 'url': thumbnail_url, + 'width': int_or_none(thumbnail.get('width')), + 'height': int_or_none(thumbnail.get('height')), + }) + + return { + 'id': video_id, + 'title': title, + 'description': video_data.get('longdescription') or video_data.get('description'), + 'duration': int_or_none(video_data.get('duration', {}).get('value'), 1000), + 'age_limit': parse_age_limit(video_data.get('tvrating', {}).get('rating')), + 'episode_number': int_or_none(video_data.get('episodenumber')), + 'series': video_data.get('show', {}).get('title'), + 'season_number': int_or_none(video_data.get('season', {}).get('num')), + 'thumbnails': thumbnails, + 'formats': formats, + 'subtitles': subtitles, + } diff --git a/youtube_dl/extractor/godtube.py b/youtube_dl/extractor/godtube.py index 363dc6608..92efd16b3 100644 --- a/youtube_dl/extractor/godtube.py +++ b/youtube_dl/extractor/godtube.py @@ -23,7 +23,7 @@ class GodTubeIE(InfoExtractor): 'timestamp': 1205712000, 'uploader': 'beverlybmusic', 'upload_date': '20080317', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', }, }, ] diff --git a/youtube_dl/extractor/goldenmoustache.py b/youtube_dl/extractor/goldenmoustache.py deleted file mode 100644 index 0fb509724..000000000 --- a/youtube_dl/extractor/goldenmoustache.py +++ /dev/null @@ -1,48 +0,0 @@ -from __future__ import unicode_literals - -from .common import InfoExtractor - - -class GoldenMoustacheIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?goldenmoustache\.com/(?P[\w-]+)-(?P\d+)' - _TESTS = [{ - 'url': 'http://www.goldenmoustache.com/suricate-le-poker-3700/', - 'md5': '0f904432fa07da5054d6c8beb5efb51a', - 'info_dict': { - 'id': '3700', - 'ext': 'mp4', - 'title': 'Suricate - Le Poker', - 'description': 'md5:3d1f242f44f8c8cb0a106f1fd08e5dc9', - 'thumbnail': 're:^https?://.*\.jpg$', - } - }, { - 'url': 'http://www.goldenmoustache.com/le-lab-tout-effacer-mc-fly-et-carlito-55249/', - 'md5': '27f0c50fb4dd5f01dc9082fc67cd5700', - 'info_dict': { - 'id': '55249', - 'ext': 'mp4', - 'title': 'Le LAB - Tout Effacer (Mc Fly et Carlito)', - 'description': 'md5:9b7fbf11023fb2250bd4b185e3de3b2a', - 'thumbnail': 're:^https?://.*\.(?:png|jpg)$', - } - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - video_url = self._html_search_regex( - r'data-src-type="mp4" data-src="([^"]+)"', webpage, 'video URL') - title = self._html_search_regex( - r'(.*?)(?: - Golden Moustache)?', webpage, 'title') - thumbnail = self._og_search_thumbnail(webpage) - description = self._og_search_description(webpage) - - return { - 'id': video_id, - 'url': video_url, - 'ext': 'mp4', - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - } diff --git a/youtube_dl/extractor/googleplus.py b/youtube_dl/extractor/googleplus.py index 731bacd67..427499b11 100644 --- a/youtube_dl/extractor/googleplus.py +++ b/youtube_dl/extractor/googleplus.py @@ -10,7 +10,7 @@ from ..utils import unified_strdate class GooglePlusIE(InfoExtractor): IE_DESC = 'Google Plus' - _VALID_URL = r'https://plus\.google\.com/(?:[^/]+/)*?posts/(?P\w+)' + _VALID_URL = r'https?://plus\.google\.com/(?:[^/]+/)*?posts/(?P\w+)' IE_NAME = 'plus.google' _TEST = { 'url': 'https://plus.google.com/u/0/108897254135232129896/posts/ZButuJc6CtH', diff --git a/youtube_dl/extractor/googlesearch.py b/youtube_dl/extractor/googlesearch.py index 498304cb2..5279fa807 100644 --- a/youtube_dl/extractor/googlesearch.py +++ b/youtube_dl/extractor/googlesearch.py @@ -4,9 +4,6 @@ import itertools import re from .common import SearchInfoExtractor -from ..compat import ( - compat_urllib_parse, -) class GoogleSearchIE(SearchInfoExtractor): @@ -34,13 +31,16 @@ class GoogleSearchIE(SearchInfoExtractor): } for pagenum in itertools.count(): - result_url = ( - 'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en' - % (compat_urllib_parse.quote_plus(query), pagenum * 10)) - webpage = self._download_webpage( - result_url, 'gvsearch:' + query, - note='Downloading result page ' + str(pagenum + 1)) + 'http://www.google.com/search', + 'gvsearch:' + query, + note='Downloading result page %s' % (pagenum + 1), + query={ + 'tbm': 'vid', + 'q': query, + 'start': pagenum * 10, + 'hl': 'en', + }) for hit_idx, mobj in enumerate(re.finditer( r'

\d+?)($|/)' + _VALID_URL = r'https?://(?:www\.)?goshgay\.com/video(?P\d+?)($|/)' _TEST = { 'url': 'http://www.goshgay.com/video299069/diesel_sfw_xxx_video', 'md5': '4b6db9a0a333142eb9f15913142b0ed1', @@ -19,7 +19,7 @@ class GoshgayIE(InfoExtractor): 'id': '299069', 'ext': 'flv', 'title': 'DIESEL SFW XXX Video', - 'thumbnail': 're:^http://.*\.jpg$', + 'thumbnail': r're:^http://.*\.jpg$', 'duration': 80, 'age_limit': 18, } diff --git a/youtube_dl/extractor/hark.py b/youtube_dl/extractor/hark.py index b6cc15b6f..342a6130e 100644 --- a/youtube_dl/extractor/hark.py +++ b/youtube_dl/extractor/hark.py @@ -1,11 +1,11 @@ -# -*- coding: utf-8 -*- +# coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor class HarkIE(InfoExtractor): - _VALID_URL = r'https?://www\.hark\.com/clips/(?P.+?)-.+' + _VALID_URL = r'https?://(?:www\.)?hark\.com/clips/(?P.+?)-.+' _TEST = { 'url': 'http://www.hark.com/clips/mmbzyhkgny-obama-beyond-the-afghan-theater-we-only-target-al-qaeda-on-may-23-2013', 'md5': '6783a58491b47b92c7c1af5a77d4cbee', diff --git a/youtube_dl/extractor/hbo.py b/youtube_dl/extractor/hbo.py index dad0f3994..8116ad9bd 100644 --- a/youtube_dl/extractor/hbo.py +++ b/youtube_dl/extractor/hbo.py @@ -12,17 +12,7 @@ from ..utils import ( ) -class HBOIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?hbo\.com/video/video\.html\?.*vid=(?P[0-9]+)' - _TEST = { - 'url': 'http://www.hbo.com/video/video.html?autoplay=true&g=u&vid=1437839', - 'md5': '1c33253f0c7782142c993c0ba62a8753', - 'info_dict': { - 'id': '1437839', - 'ext': 'mp4', - 'title': 'Ep. 64 Clip: Encryption', - } - } +class HBOBaseIE(InfoExtractor): _FORMATS_INFO = { '1920': { 'width': 1280, @@ -50,8 +40,7 @@ class HBOIE(InfoExtractor): }, } - def _real_extract(self, url): - video_id = self._match_id(url) + def _extract_from_id(self, video_id): video_data = self._download_xml( 'http://render.lv3.hbo.com/data/content/global/videos/data/%s.xml' % video_id, video_id) title = xpath_text(video_data, 'title', 'title', True) @@ -116,7 +105,60 @@ class HBOIE(InfoExtractor): return { 'id': video_id, 'title': title, - 'duration': parse_duration(xpath_element(video_data, 'duration/tv14')), + 'duration': parse_duration(xpath_text(video_data, 'duration/tv14')), 'formats': formats, 'thumbnails': thumbnails, } + + +class HBOIE(HBOBaseIE): + _VALID_URL = r'https?://(?:www\.)?hbo\.com/video/video\.html\?.*vid=(?P[0-9]+)' + _TEST = { + 'url': 'http://www.hbo.com/video/video.html?autoplay=true&g=u&vid=1437839', + 'md5': '1c33253f0c7782142c993c0ba62a8753', + 'info_dict': { + 'id': '1437839', + 'ext': 'mp4', + 'title': 'Ep. 64 Clip: Encryption', + 'thumbnail': r're:https?://.*\.jpg$', + 'duration': 1072, + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + return self._extract_from_id(video_id) + + +class HBOEpisodeIE(HBOBaseIE): + _VALID_URL = r'https?://(?:www\.)?hbo\.com/(?!video)([^/]+/)+video/(?P[0-9a-z-]+)\.html' + + _TESTS = [{ + 'url': 'http://www.hbo.com/girls/episodes/5/52-i-love-you-baby/video/ep-52-inside-the-episode.html?autoplay=true', + 'md5': '689132b253cc0ab7434237fc3a293210', + 'info_dict': { + 'id': '1439518', + 'display_id': 'ep-52-inside-the-episode', + 'ext': 'mp4', + 'title': 'Ep. 52: Inside the Episode', + 'thumbnail': r're:https?://.*\.jpg$', + 'duration': 240, + }, + }, { + 'url': 'http://www.hbo.com/game-of-thrones/about/video/season-5-invitation-to-the-set.html?autoplay=true', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + video_id = self._search_regex( + r'(?P[\'"])videoId(?P=q1)\s*:\s*(?P[\'"])(?P\d+)(?P=q2)', + webpage, 'video ID', group='video_id') + + info_dict = self._extract_from_id(video_id) + info_dict['display_id'] = display_id + + return info_dict diff --git a/youtube_dl/extractor/hearthisat.py b/youtube_dl/extractor/hearthisat.py index 256453882..18c252012 100644 --- a/youtube_dl/extractor/hearthisat.py +++ b/youtube_dl/extractor/hearthisat.py @@ -25,7 +25,7 @@ class HearThisAtIE(InfoExtractor): 'id': '150939', 'ext': 'wav', 'title': 'Moofi - Dr. Kreep', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'timestamp': 1421564134, 'description': 'Listen to Dr. Kreep by Moofi on hearthis.at - Modular, Eurorack, Mutable Intruments Braids, Valhalla-DSP', 'upload_date': '20150118', @@ -46,7 +46,7 @@ class HearThisAtIE(InfoExtractor): 'description': 'Listen to DJ Jim Hopkins - Totally Bitchin\' 80\'s Dance Mix! by TwitchSF on hearthis.at - Dance', 'upload_date': '20160328', 'timestamp': 1459186146, - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'comment_count': int, 'view_count': int, 'like_count': int, diff --git a/youtube_dl/extractor/heise.py b/youtube_dl/extractor/heise.py index 278d9f527..1629cdb8d 100644 --- a/youtube_dl/extractor/heise.py +++ b/youtube_dl/extractor/heise.py @@ -29,7 +29,7 @@ class HeiseIE(InfoExtractor): 'timestamp': 1411812600, 'upload_date': '20140927', 'description': 'In uplink-Episode 3.3 geht es darum, wie man sich von Cloud-Anbietern emanzipieren kann, worauf man beim Kauf einer Tastatur achten sollte und was Smartphones über uns verraten.', - 'thumbnail': 're:^https?://.*\.jpe?g$', + 'thumbnail': r're:^https?://.*\.jpe?g$', } } diff --git a/youtube_dl/extractor/hellporno.py b/youtube_dl/extractor/hellporno.py index 7a1c75b65..0ee8ea712 100644 --- a/youtube_dl/extractor/hellporno.py +++ b/youtube_dl/extractor/hellporno.py @@ -6,12 +6,13 @@ from .common import InfoExtractor from ..utils import ( js_to_json, remove_end, + determine_ext, ) class HellPornoIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?hellporno\.com/videos/(?P[^/]+)' - _TEST = { + _VALID_URL = r'https?://(?:www\.)?hellporno\.(?:com/videos|net/v)/(?P[^/]+)' + _TESTS = [{ 'url': 'http://hellporno.com/videos/dixie-is-posing-with-naked-ass-very-erotic/', 'md5': '1fee339c610d2049699ef2aa699439f1', 'info_dict': { @@ -19,10 +20,13 @@ class HellPornoIE(InfoExtractor): 'display_id': 'dixie-is-posing-with-naked-ass-very-erotic', 'ext': 'mp4', 'title': 'Dixie is posing with naked ass very erotic', - 'thumbnail': 're:https?://.*\.jpg$', + 'thumbnail': r're:https?://.*\.jpg$', 'age_limit': 18, } - } + }, { + 'url': 'http://hellporno.net/v/186271/', + 'only_matching': True, + }] def _real_extract(self, url): display_id = self._match_id(url) @@ -38,7 +42,7 @@ class HellPornoIE(InfoExtractor): video_id = flashvars.get('video_id') thumbnail = flashvars.get('preview_url') - ext = flashvars.get('postfix', '.mp4')[1:] + ext = determine_ext(flashvars.get('postfix'), 'mp4') formats = [] for video_url_key in ['video_url', 'video_alt_url']: diff --git a/youtube_dl/extractor/helsinki.py b/youtube_dl/extractor/helsinki.py index 93107b306..575fb332a 100644 --- a/youtube_dl/extractor/helsinki.py +++ b/youtube_dl/extractor/helsinki.py @@ -1,4 +1,4 @@ -# -*- coding: utf-8 -*- +# coding: utf-8 from __future__ import unicode_literals diff --git a/youtube_dl/extractor/hgtv.py b/youtube_dl/extractor/hgtv.py new file mode 100644 index 000000000..69543bff2 --- /dev/null +++ b/youtube_dl/extractor/hgtv.py @@ -0,0 +1,79 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + js_to_json, + smuggle_url, +) + + +class HGTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?hgtv\.ca/[^/]+/video/(?P[^/]+)/video.html' + _TEST = { + 'url': 'http://www.hgtv.ca/homefree/video/overnight-success/video.html?v=738081859718&p=1&s=da#video', + 'md5': '', + 'info_dict': { + 'id': 'aFH__I_5FBOX', + 'ext': 'mp4', + 'title': 'Overnight Success', + 'description': 'After weeks of hard work, high stakes, breakdowns and pep talks, the final 2 contestants compete to win the ultimate dream.', + 'uploader': 'SHWM-NEW', + 'timestamp': 1470320034, + 'upload_date': '20160804', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + } + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + embed_vars = self._parse_json(self._search_regex( + r'(?s)embed_vars\s*=\s*({.*?});', + webpage, 'embed vars'), display_id, js_to_json) + return { + '_type': 'url_transparent', + 'url': smuggle_url( + 'http://link.theplatform.com/s/dtjsEC/%s?mbr=true&manifest=m3u' % embed_vars['pid'], { + 'force_smil_url': True + }), + 'series': embed_vars.get('show'), + 'season_number': int_or_none(embed_vars.get('season')), + 'episode_number': int_or_none(embed_vars.get('episode')), + 'ie_key': 'ThePlatform', + } + + +class HGTVComShowIE(InfoExtractor): + IE_NAME = 'hgtv.com:show' + _VALID_URL = r'https?://(?:www\.)?hgtv\.com/shows/[^/]+/(?P[^/?#&]+)' + _TEST = { + 'url': 'http://www.hgtv.com/shows/flip-or-flop/flip-or-flop-full-episodes-videos', + 'info_dict': { + 'id': 'flip-or-flop-full-episodes-videos', + 'title': 'Flip or Flop Full Episodes', + }, + 'playlist_mincount': 15, + } + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + config = self._parse_json( + self._search_regex( + r'(?s)data-module=["\']video["\'][^>]*>.*?]+type=["\']text/x-config["\'][^>]*>(.+?)\d+)' + _TEST = { + 'url': 'https://hitrecord.org/records/2954362', + 'md5': 'fe1cdc2023bce0bbb95c39c57426aa71', + 'info_dict': { + 'id': '2954362', + 'ext': 'mp4', + 'title': 'A Very Different World (HITRECORD x ACLU)', + 'description': 'md5:e62defaffab5075a5277736bead95a3d', + 'duration': 139.327, + 'timestamp': 1471557582, + 'upload_date': '20160818', + 'uploader': 'Zuzi.C12', + 'uploader_id': '362811', + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'tags': list, + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + video = self._download_json( + 'https://hitrecord.org/api/web/records/%s' % video_id, video_id) + + title = video['title'] + video_url = video['source_url']['mp4_url'] + + tags = None + tags_list = try_get(video, lambda x: x['tags'], list) + if tags_list: + tags = [ + t['text'] + for t in tags_list + if isinstance(t, dict) and t.get('text') and + isinstance(t['text'], compat_str)] + + return { + 'id': video_id, + 'url': video_url, + 'title': title, + 'description': clean_html(video.get('body')), + 'duration': float_or_none(video.get('duration'), 1000), + 'timestamp': int_or_none(video.get('created_at_i')), + 'uploader': try_get( + video, lambda x: x['user']['username'], compat_str), + 'uploader_id': try_get( + video, lambda x: compat_str(x['user']['id'])), + 'view_count': int_or_none(video.get('total_views_count')), + 'like_count': int_or_none(video.get('hearts_count')), + 'comment_count': int_or_none(video.get('comments_count')), + 'tags': tags, + } diff --git a/youtube_dl/extractor/hornbunny.py b/youtube_dl/extractor/hornbunny.py index 5b6efb27e..c458a959d 100644 --- a/youtube_dl/extractor/hornbunny.py +++ b/youtube_dl/extractor/hornbunny.py @@ -1,8 +1,6 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..utils import ( int_or_none, @@ -14,29 +12,24 @@ class HornBunnyIE(InfoExtractor): _VALID_URL = r'http?://(?:www\.)?hornbunny\.com/videos/(?P[a-z-]+)-(?P\d+)\.html' _TEST = { 'url': 'http://hornbunny.com/videos/panty-slut-jerk-off-instruction-5227.html', - 'md5': '95e40865aedd08eff60272b704852ad7', + 'md5': 'e20fd862d1894b67564c96f180f43924', 'info_dict': { 'id': '5227', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'panty slut jerk off instruction', 'duration': 550, 'age_limit': 18, + 'view_count': int, + 'thumbnail': r're:^https?://.*\.jpg$', } } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) - webpage = self._download_webpage( - url, video_id, note='Downloading initial webpage') - title = self._html_search_regex( - r'class="title">(.*?)

', webpage, 'title') - redirect_url = self._html_search_regex( - r'pg&settings=(.*?)\|0"\);', webpage, 'title') - webpage2 = self._download_webpage(redirect_url, video_id) - video_url = self._html_search_regex( - r'flvMask:(.*?);', webpage2, 'video_url') + webpage = self._download_webpage(url, video_id) + title = self._og_search_title(webpage) + info_dict = self._parse_html5_media_entries(url, webpage, video_id)[0] duration = parse_duration(self._search_regex( r'Runtime:\s*([0-9:]+)
', @@ -45,12 +38,12 @@ class HornBunnyIE(InfoExtractor): r'Views:\s*(\d+)
', webpage, 'view count', fatal=False)) - return { + info_dict.update({ 'id': video_id, - 'url': video_url, 'title': title, - 'ext': 'flv', 'duration': duration, 'view_count': view_count, 'age_limit': 18, - } + }) + + return info_dict diff --git a/youtube_dl/extractor/hotnewhiphop.py b/youtube_dl/extractor/hotnewhiphop.py index 9db565209..34163725f 100644 --- a/youtube_dl/extractor/hotnewhiphop.py +++ b/youtube_dl/extractor/hotnewhiphop.py @@ -12,7 +12,7 @@ from ..utils import ( class HotNewHipHopIE(InfoExtractor): - _VALID_URL = r'https?://www\.hotnewhiphop\.com/.*\.(?P.*)\.html' + _VALID_URL = r'https?://(?:www\.)?hotnewhiphop\.com/.*\.(?P.*)\.html' _TEST = { 'url': 'http://www.hotnewhiphop.com/freddie-gibbs-lay-it-down-song.1435540.html', 'md5': '2c2cd2f76ef11a9b3b581e8b232f3d96', diff --git a/youtube_dl/extractor/howstuffworks.py b/youtube_dl/extractor/howstuffworks.py index 65ba2a48b..2be68abad 100644 --- a/youtube_dl/extractor/howstuffworks.py +++ b/youtube_dl/extractor/howstuffworks.py @@ -21,7 +21,7 @@ class HowStuffWorksIE(InfoExtractor): 'title': 'Cool Jobs - Iditarod Musher', 'description': 'Cold sleds, freezing temps and warm dog breath... an Iditarod musher\'s dream. Kasey-Dee Gardner jumps on a sled to find out what the big deal is.', 'display_id': 'cool-jobs-iditarod-musher', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 161, }, 'skip': 'Video broken', @@ -34,7 +34,7 @@ class HowStuffWorksIE(InfoExtractor): 'title': 'Survival Zone: Food and Water In the Savanna', 'description': 'Learn how to find both food and water while trekking in the African savannah. In this video from the Discovery Channel.', 'display_id': 'survival-zone-food-and-water-in-the-savanna', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', }, }, { @@ -45,7 +45,7 @@ class HowStuffWorksIE(InfoExtractor): 'title': 'Sword Swallowing #1 by Dan Meyer', 'description': 'Video footage (1 of 3) used by permission of the owner Dan Meyer through Sword Swallowers Association International ', 'display_id': 'sword-swallowing-1-by-dan-meyer', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', }, }, { diff --git a/youtube_dl/extractor/huajiao.py b/youtube_dl/extractor/huajiao.py new file mode 100644 index 000000000..4ca275dda --- /dev/null +++ b/youtube_dl/extractor/huajiao.py @@ -0,0 +1,56 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + parse_duration, + parse_iso8601, +) + + +class HuajiaoIE(InfoExtractor): + IE_DESC = '花椒直播' + _VALID_URL = r'https?://(?:www\.)?huajiao\.com/l/(?P[0-9]+)' + _TEST = { + 'url': 'http://www.huajiao.com/l/38941232', + 'md5': 'd08bf9ac98787d24d1e4c0283f2d372d', + 'info_dict': { + 'id': '38941232', + 'ext': 'mp4', + 'title': '#新人求关注#', + 'description': 're:.*', + 'duration': 2424.0, + 'thumbnail': r're:^https?://.*\.jpg$', + 'timestamp': 1475866459, + 'upload_date': '20161007', + 'uploader': 'Penny_余姿昀', + 'uploader_id': '75206005', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + feed_json = self._search_regex( + r'var\s+feed\s*=\s*({.+})', webpage, 'feed json') + feed = self._parse_json(feed_json, video_id) + + description = self._html_search_meta( + 'description', webpage, 'description', fatal=False) + + def get(section, field): + return feed.get(section, {}).get(field) + + return { + 'id': video_id, + 'title': feed['feed']['formated_title'], + 'description': description, + 'duration': parse_duration(get('feed', 'duration')), + 'thumbnail': get('feed', 'image'), + 'timestamp': parse_iso8601(feed.get('creatime'), ' '), + 'uploader': get('author', 'nickname'), + 'uploader_id': get('author', 'uid'), + 'formats': self._extract_m3u8_formats( + feed['feed']['m3u8'], video_id, 'mp4', 'm3u8_native'), + } diff --git a/youtube_dl/extractor/huffpost.py b/youtube_dl/extractor/huffpost.py index 059073749..97e36f056 100644 --- a/youtube_dl/extractor/huffpost.py +++ b/youtube_dl/extractor/huffpost.py @@ -52,7 +52,7 @@ class HuffPostIE(InfoExtractor): thumbnails = [] for url in filter(None, data['images'].values()): - m = re.match('.*-([0-9]+x[0-9]+)\.', url) + m = re.match(r'.*-([0-9]+x[0-9]+)\.', url) if not m: continue thumbnails.append({ diff --git a/youtube_dl/extractor/imdb.py b/youtube_dl/extractor/imdb.py index 0acce9f4c..f95c00c73 100644 --- a/youtube_dl/extractor/imdb.py +++ b/youtube_dl/extractor/imdb.py @@ -6,20 +6,21 @@ from .common import InfoExtractor from ..utils import ( mimetype2ext, qualities, + remove_end, ) class ImdbIE(InfoExtractor): IE_NAME = 'imdb' IE_DESC = 'Internet Movie Database trailers' - _VALID_URL = r'https?://(?:www|m)\.imdb\.com/(?:video/[^/]+/|title/tt\d+.*?#lb-)vi(?P\d+)' + _VALID_URL = r'https?://(?:www|m)\.imdb\.com/(?:video/[^/]+/|title/tt\d+.*?#lb-|videoplayer/)vi(?P\d+)' _TESTS = [{ 'url': 'http://www.imdb.com/video/imdb/vi2524815897', 'info_dict': { 'id': '2524815897', 'ext': 'mp4', - 'title': 'Ice Age: Continental Drift Trailer (No. 2) - IMDb', + 'title': 'Ice Age: Continental Drift Trailer (No. 2)', 'description': 'md5:9061c2219254e5d14e03c25c98e96a81', } }, { @@ -31,6 +32,9 @@ class ImdbIE(InfoExtractor): }, { 'url': 'http://www.imdb.com/title/tt1667889/#lb-vi2524815897', 'only_matching': True, + }, { + 'url': 'http://www.imdb.com/videoplayer/vi1562949145', + 'only_matching': True, }] def _real_extract(self, url): @@ -83,17 +87,17 @@ class ImdbIE(InfoExtractor): return { 'id': video_id, - 'title': self._og_search_title(webpage), + 'title': remove_end(self._og_search_title(webpage), ' - IMDb'), 'formats': formats, 'description': descr, - 'thumbnail': format_info['slate'], + 'thumbnail': format_info.get('slate'), } class ImdbListIE(InfoExtractor): IE_NAME = 'imdb:list' IE_DESC = 'Internet Movie Database lists' - _VALID_URL = r'https?://www\.imdb\.com/list/(?P[\da-zA-Z_-]{11})' + _VALID_URL = r'https?://(?:www\.)?imdb\.com/list/(?P[\da-zA-Z_-]{11})' _TEST = { 'url': 'http://www.imdb.com/list/JFs9NWw6XI0', 'info_dict': { diff --git a/youtube_dl/extractor/imgur.py b/youtube_dl/extractor/imgur.py index 85e9344aa..67c24a51c 100644 --- a/youtube_dl/extractor/imgur.py +++ b/youtube_dl/extractor/imgur.py @@ -13,7 +13,7 @@ from ..utils import ( class ImgurIE(InfoExtractor): - _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?:(?:gallery|topic/[^/]+)/)?(?P[a-zA-Z0-9]{6,})(?:[/?#&]+|\.[a-z]+)?$' + _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?:(?:gallery|(?:topic|r)/[^/]+)/)?(?P[a-zA-Z0-9]{6,})(?:[/?#&]+|\.[a-z]+)?$' _TESTS = [{ 'url': 'https://i.imgur.com/A61SaA1.gifv', @@ -43,6 +43,9 @@ class ImgurIE(InfoExtractor): }, { 'url': 'http://imgur.com/topic/Funny/N8rOudd', 'only_matching': True, + }, { + 'url': 'http://imgur.com/r/aww/VQcQPhM', + 'only_matching': True, }] def _real_extract(self, url): @@ -50,12 +53,10 @@ class ImgurIE(InfoExtractor): webpage = self._download_webpage( compat_urlparse.urljoin(url, video_id), video_id) - width = int_or_none(self._search_regex( - r'(.*?)', diff --git a/youtube_dl/extractor/ina.py b/youtube_dl/extractor/ina.py index 65712abc2..9544ff9d4 100644 --- a/youtube_dl/extractor/ina.py +++ b/youtube_dl/extractor/ina.py @@ -1,4 +1,4 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals import re diff --git a/youtube_dl/extractor/inc.py b/youtube_dl/extractor/inc.py new file mode 100644 index 000000000..241ec83c4 --- /dev/null +++ b/youtube_dl/extractor/inc.py @@ -0,0 +1,41 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from .kaltura import KalturaIE + + +class IncIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?inc\.com/(?:[^/]+/)+(?P[^.]+).html' + _TESTS = [{ + 'url': 'http://www.inc.com/tip-sheet/bill-gates-says-these-5-books-will-make-you-smarter.html', + 'md5': '7416739c9c16438c09fa35619d6ba5cb', + 'info_dict': { + 'id': '1_wqig47aq', + 'ext': 'mov', + 'title': 'Bill Gates Says These 5 Books Will Make You Smarter', + 'description': 'md5:bea7ff6cce100886fc1995acb743237e', + 'timestamp': 1474414430, + 'upload_date': '20160920', + 'uploader_id': 'video@inc.com', + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://www.inc.com/video/david-whitford/founders-forum-tripadvisor-steve-kaufer-most-enjoyable-moment-for-entrepreneur.html', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + partner_id = self._search_regex( + r'var\s+_?bizo_data_partner_id\s*=\s*["\'](\d+)', webpage, 'partner id') + + kaltura_id = self._parse_json(self._search_regex( + r'pageInfo\.videos\s*=\s*\[(.+)\];', webpage, 'kaltura id'), + display_id)['vid_kaltura_id'] + + return self.url_result( + 'kaltura:%s:%s' % (partner_id, kaltura_id), KalturaIE.ie_key()) diff --git a/youtube_dl/extractor/indavideo.py b/youtube_dl/extractor/indavideo.py index c6f080484..11cf3c609 100644 --- a/youtube_dl/extractor/indavideo.py +++ b/youtube_dl/extractor/indavideo.py @@ -19,7 +19,7 @@ class IndavideoEmbedIE(InfoExtractor): 'ext': 'mp4', 'title': 'Cicatánc', 'description': '', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'uploader': 'cukiajanlo', 'uploader_id': '83729', 'timestamp': 1439193826, @@ -102,7 +102,7 @@ class IndavideoIE(InfoExtractor): 'ext': 'mp4', 'title': 'Vicces cica', 'description': 'Játszik a tablettel. :D', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'uploader': 'Jet_Pack', 'uploader_id': '491217', 'timestamp': 1390821212, diff --git a/youtube_dl/extractor/infoq.py b/youtube_dl/extractor/infoq.py index cca0b8a93..9fb71e8ef 100644 --- a/youtube_dl/extractor/infoq.py +++ b/youtube_dl/extractor/infoq.py @@ -4,7 +4,10 @@ from __future__ import unicode_literals import base64 -from ..compat import compat_urllib_parse_unquote +from ..compat import ( + compat_urllib_parse_unquote, + compat_urlparse, +) from ..utils import determine_ext from .bokecc import BokeCCBaseIE @@ -33,9 +36,21 @@ class InfoQIE(BokeCCBaseIE): 'ext': 'flv', 'description': 'md5:308d981fb28fa42f49f9568322c683ff', }, + }, { + 'url': 'https://www.infoq.com/presentations/Simple-Made-Easy', + 'md5': '0e34642d4d9ef44bf86f66f6399672db', + 'info_dict': { + 'id': 'Simple-Made-Easy', + 'title': 'Simple Made Easy', + 'ext': 'mp3', + 'description': 'md5:3e0e213a8bbd074796ef89ea35ada25b', + }, + 'params': { + 'format': 'bestaudio', + }, }] - def _extract_rtmp_videos(self, webpage): + def _extract_rtmp_video(self, webpage): # The server URL is hardcoded video_url = 'rtmpe://video.infoq.com/cfx/st/' @@ -47,28 +62,53 @@ class InfoQIE(BokeCCBaseIE): playpath = 'mp4:' + real_id return [{ - 'format_id': 'rtmp', + 'format_id': 'rtmp_video', 'url': video_url, 'ext': determine_ext(playpath), 'play_path': playpath, }] - def _extract_http_videos(self, webpage): - http_video_url = self._search_regex(r'P\.s\s*=\s*\'([^\']+)\'', webpage, 'video URL') - + def _extract_cookies(self, webpage): policy = self._search_regex(r'InfoQConstants.scp\s*=\s*\'([^\']+)\'', webpage, 'policy') signature = self._search_regex(r'InfoQConstants.scs\s*=\s*\'([^\']+)\'', webpage, 'signature') key_pair_id = self._search_regex(r'InfoQConstants.sck\s*=\s*\'([^\']+)\'', webpage, 'key-pair-id') + return 'CloudFront-Policy=%s; CloudFront-Signature=%s; CloudFront-Key-Pair-Id=%s' % ( + policy, signature, key_pair_id) + def _extract_http_video(self, webpage): + http_video_url = self._search_regex(r'P\.s\s*=\s*\'([^\']+)\'', webpage, 'video URL') return [{ - 'format_id': 'http', + 'format_id': 'http_video', 'url': http_video_url, 'http_headers': { - 'Cookie': 'CloudFront-Policy=%s; CloudFront-Signature=%s; CloudFront-Key-Pair-Id=%s' % ( - policy, signature, key_pair_id), + 'Cookie': self._extract_cookies(webpage) }, }] + def _extract_http_audio(self, webpage, video_id): + fields = self._hidden_inputs(webpage) + http_audio_url = fields['filename'] + if http_audio_url is None: + return [] + + cookies_header = {'Cookie': self._extract_cookies(webpage)} + + # base URL is found in the Location header in the response returned by + # GET https://www.infoq.com/mp3download.action?filename=... when logged in. + http_audio_url = compat_urlparse.urljoin('http://res.infoq.com/downloads/mp3downloads/', http_audio_url) + + # audio file seem to be missing some times even if there is a download link + # so probe URL to make sure + if not self._is_valid_url(http_audio_url, video_id, headers=cookies_header): + return [] + + return [{ + 'format_id': 'http_audio', + 'url': http_audio_url, + 'vcodec': 'none', + 'http_headers': cookies_header, + }] + def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) @@ -80,7 +120,10 @@ class InfoQIE(BokeCCBaseIE): # for China videos, HTTP video URL exists but always fails with 403 formats = self._extract_bokecc_formats(webpage, video_id) else: - formats = self._extract_rtmp_videos(webpage) + self._extract_http_videos(webpage) + formats = ( + self._extract_rtmp_video(webpage) + + self._extract_http_video(webpage) + + self._extract_http_audio(webpage, video_id)) self._sort_formats(formats) diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py index 8f7f232be..98f408c18 100644 --- a/youtube_dl/extractor/instagram.py +++ b/youtube_dl/extractor/instagram.py @@ -22,13 +22,14 @@ class InstagramIE(InfoExtractor): 'ext': 'mp4', 'title': 'Video by naomipq', 'description': 'md5:1f17f0ab29bd6fe2bfad705f58de3cb8', - 'thumbnail': 're:^https?://.*\.jpg', + 'thumbnail': r're:^https?://.*\.jpg', 'timestamp': 1371748545, 'upload_date': '20130620', 'uploader_id': 'naomipq', 'uploader': 'Naomi Leonor Phan-Quang', 'like_count': int, 'comment_count': int, + 'comments': list, }, }, { # missing description @@ -37,13 +38,14 @@ class InstagramIE(InfoExtractor): 'id': 'BA-pQFBG8HZ', 'ext': 'mp4', 'title': 'Video by britneyspears', - 'thumbnail': 're:^https?://.*\.jpg', + 'thumbnail': r're:^https?://.*\.jpg', 'timestamp': 1453760977, 'upload_date': '20160125', 'uploader_id': 'britneyspears', 'uploader': 'Britney Spears', 'like_count': int, 'comment_count': int, + 'comments': list, }, 'params': { 'skip_download': True, @@ -82,7 +84,7 @@ class InstagramIE(InfoExtractor): webpage = self._download_webpage(url, video_id) (video_url, description, thumbnail, timestamp, uploader, - uploader_id, like_count, comment_count) = [None] * 8 + uploader_id, like_count, comment_count, height, width) = [None] * 10 shared_data = self._parse_json( self._search_regex( @@ -94,6 +96,8 @@ class InstagramIE(InfoExtractor): shared_data, lambda x: x['entry_data']['PostPage'][0]['media'], dict) if media: video_url = media.get('video_url') + height = int_or_none(media.get('dimensions', {}).get('height')) + width = int_or_none(media.get('dimensions', {}).get('width')) description = media.get('caption') thumbnail = media.get('display_src') timestamp = int_or_none(media.get('date')) @@ -101,10 +105,24 @@ class InstagramIE(InfoExtractor): uploader_id = media.get('owner', {}).get('username') like_count = int_or_none(media.get('likes', {}).get('count')) comment_count = int_or_none(media.get('comments', {}).get('count')) + comments = [{ + 'author': comment.get('user', {}).get('username'), + 'author_id': comment.get('user', {}).get('id'), + 'id': comment.get('id'), + 'text': comment.get('text'), + 'timestamp': int_or_none(comment.get('created_at')), + } for comment in media.get( + 'comments', {}).get('nodes', []) if comment.get('text')] if not video_url: video_url = self._og_search_video_url(webpage, secure=False) + formats = [{ + 'url': video_url, + 'width': width, + 'height': height, + }] + if not uploader_id: uploader_id = self._search_regex( r'"owner"\s*:\s*{\s*"username"\s*:\s*"(.+?)"', @@ -121,7 +139,7 @@ class InstagramIE(InfoExtractor): return { 'id': video_id, - 'url': video_url, + 'formats': formats, 'ext': 'mp4', 'title': 'Video by %s' % uploader_id, 'description': description, @@ -131,6 +149,7 @@ class InstagramIE(InfoExtractor): 'uploader': uploader, 'like_count': like_count, 'comment_count': comment_count, + 'comments': comments, } @@ -150,7 +169,7 @@ class InstagramUserIE(InfoExtractor): 'id': '614605558512799803_462752227', 'ext': 'mp4', 'title': '#Porsche Intelligent Performance.', - 'thumbnail': 're:^https?://.*\.jpg', + 'thumbnail': r're:^https?://.*\.jpg', 'uploader': 'Porsche', 'uploader_id': 'porsche', 'timestamp': 1387486713, diff --git a/youtube_dl/extractor/internetvideoarchive.py b/youtube_dl/extractor/internetvideoarchive.py index 45add007f..76cc5ec3e 100644 --- a/youtube_dl/extractor/internetvideoarchive.py +++ b/youtube_dl/extractor/internetvideoarchive.py @@ -48,13 +48,23 @@ class InternetVideoArchiveIE(InfoExtractor): # There are multiple videos in the playlist whlie only the first one # matches the video played in browsers video_info = configuration['playlist'][0] + title = video_info['title'] formats = [] for source in video_info['sources']: file_url = source['file'] if determine_ext(file_url) == 'm3u8': - formats.extend(self._extract_m3u8_formats( - file_url, video_id, ext='mp4', m3u8_id='hls')) + m3u8_formats = self._extract_m3u8_formats( + file_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) + if m3u8_formats: + formats.extend(m3u8_formats) + file_url = m3u8_formats[0]['url'] + formats.extend(self._extract_f4m_formats( + file_url.replace('.m3u8', '.f4m'), + video_id, f4m_id='hds', fatal=False)) + formats.extend(self._extract_mpd_formats( + file_url.replace('.m3u8', '.mpd'), + video_id, mpd_id='dash', fatal=False)) else: a_format = { 'url': file_url, @@ -70,7 +80,6 @@ class InternetVideoArchiveIE(InfoExtractor): self._sort_formats(formats) - title = video_info['title'] description = video_info.get('description') thumbnail = video_info.get('image') else: diff --git a/youtube_dl/extractor/iprima.py b/youtube_dl/extractor/iprima.py index 788bbe0d5..0fe576883 100644 --- a/youtube_dl/extractor/iprima.py +++ b/youtube_dl/extractor/iprima.py @@ -65,7 +65,7 @@ class IPrimaIE(InfoExtractor): options = self._parse_json( self._search_regex( - r'(?s)var\s+playerOptions\s*=\s*({.+?});', + r'(?s)(?:TDIPlayerOptions|playerOptions)\s*=\s*({.+?});\s*\]\]', playerpage, 'player options', default='{}'), video_id, transform_source=js_to_json, fatal=False) if options: @@ -81,6 +81,9 @@ class IPrimaIE(InfoExtractor): for _, src in re.findall(r'src["\']\s*:\s*(["\'])(.+?)\1', playerpage): extract_formats(src) + if not formats and '>GEO_IP_NOT_ALLOWED<' in playerpage: + self.raise_geo_restricted() + self._sort_formats(formats) return { diff --git a/youtube_dl/extractor/ir90tv.py b/youtube_dl/extractor/ir90tv.py index 214bcd5b5..d5a3f6fa5 100644 --- a/youtube_dl/extractor/ir90tv.py +++ b/youtube_dl/extractor/ir90tv.py @@ -14,7 +14,7 @@ class Ir90TvIE(InfoExtractor): 'id': '95719', 'ext': 'mp4', 'title': 'شایعات نقل و انتقالات مهم فوتبال اروپا 94/02/18', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', } }, { 'url': 'http://www.90tv.ir/video/95719/%D8%B4%D8%A7%DB%8C%D8%B9%D8%A7%D8%AA-%D9%86%D9%82%D9%84-%D9%88-%D8%A7%D9%86%D8%AA%D9%82%D8%A7%D9%84%D8%A7%D8%AA-%D9%85%D9%87%D9%85-%D9%81%D9%88%D8%AA%D8%A8%D8%A7%D9%84-%D8%A7%D8%B1%D9%88%D9%BE%D8%A7-940218', diff --git a/youtube_dl/extractor/itv.py b/youtube_dl/extractor/itv.py new file mode 100644 index 000000000..b0d860452 --- /dev/null +++ b/youtube_dl/extractor/itv.py @@ -0,0 +1,196 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import uuid +import xml.etree.ElementTree as etree +import json + +from .common import InfoExtractor +from ..compat import ( + compat_str, + compat_etree_register_namespace, +) +from ..utils import ( + extract_attributes, + xpath_with_ns, + xpath_element, + xpath_text, + int_or_none, + parse_duration, + ExtractorError, + determine_ext, +) + + +class ITVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?itv\.com/hub/[^/]+/(?P[0-9a-zA-Z]+)' + _TEST = { + 'url': 'http://www.itv.com/hub/mr-bean-animated-series/2a2936a0053', + 'info_dict': { + 'id': '2a2936a0053', + 'ext': 'flv', + 'title': 'Home Movie', + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + params = extract_attributes(self._search_regex( + r'(?s)(<[^>]+id="video"[^>]*>)', webpage, 'params')) + + ns_map = { + 'soapenv': 'http://schemas.xmlsoap.org/soap/envelope/', + 'tem': 'http://tempuri.org/', + 'itv': 'http://schemas.datacontract.org/2004/07/Itv.BB.Mercury.Common.Types', + 'com': 'http://schemas.itv.com/2009/05/Common', + } + for ns, full_ns in ns_map.items(): + compat_etree_register_namespace(ns, full_ns) + + def _add_ns(name): + return xpath_with_ns(name, ns_map) + + def _add_sub_element(element, name): + return etree.SubElement(element, _add_ns(name)) + + req_env = etree.Element(_add_ns('soapenv:Envelope')) + _add_sub_element(req_env, 'soapenv:Header') + body = _add_sub_element(req_env, 'soapenv:Body') + get_playlist = _add_sub_element(body, ('tem:GetPlaylist')) + request = _add_sub_element(get_playlist, 'tem:request') + _add_sub_element(request, 'itv:ProductionId').text = params['data-video-id'] + _add_sub_element(request, 'itv:RequestGuid').text = compat_str(uuid.uuid4()).upper() + vodcrid = _add_sub_element(request, 'itv:Vodcrid') + _add_sub_element(vodcrid, 'com:Id') + _add_sub_element(request, 'itv:Partition') + user_info = _add_sub_element(get_playlist, 'tem:userInfo') + _add_sub_element(user_info, 'itv:Broadcaster').text = 'Itv' + _add_sub_element(user_info, 'itv:DM') + _add_sub_element(user_info, 'itv:RevenueScienceValue') + _add_sub_element(user_info, 'itv:SessionId') + _add_sub_element(user_info, 'itv:SsoToken') + _add_sub_element(user_info, 'itv:UserToken') + site_info = _add_sub_element(get_playlist, 'tem:siteInfo') + _add_sub_element(site_info, 'itv:AdvertisingRestriction').text = 'None' + _add_sub_element(site_info, 'itv:AdvertisingSite').text = 'ITV' + _add_sub_element(site_info, 'itv:AdvertisingType').text = 'Any' + _add_sub_element(site_info, 'itv:Area').text = 'ITVPLAYER.VIDEO' + _add_sub_element(site_info, 'itv:Category') + _add_sub_element(site_info, 'itv:Platform').text = 'DotCom' + _add_sub_element(site_info, 'itv:Site').text = 'ItvCom' + device_info = _add_sub_element(get_playlist, 'tem:deviceInfo') + _add_sub_element(device_info, 'itv:ScreenSize').text = 'Big' + player_info = _add_sub_element(get_playlist, 'tem:playerInfo') + _add_sub_element(player_info, 'itv:Version').text = '2' + + headers = self.geo_verification_headers() + headers.update({ + 'Content-Type': 'text/xml; charset=utf-8', + 'SOAPAction': 'http://tempuri.org/PlaylistService/GetPlaylist', + }) + resp_env = self._download_xml( + params['data-playlist-url'], video_id, + headers=headers, data=etree.tostring(req_env)) + playlist = xpath_element(resp_env, './/Playlist') + if playlist is None: + fault_string = xpath_text(resp_env, './/faultstring') + raise ExtractorError('%s said: %s' % (self.IE_NAME, fault_string)) + title = xpath_text(playlist, 'EpisodeTitle', fatal=True) + video_element = xpath_element(playlist, 'VideoEntries/Video', fatal=True) + media_files = xpath_element(video_element, 'MediaFiles', fatal=True) + rtmp_url = media_files.attrib['base'] + + formats = [] + for media_file in media_files.findall('MediaFile'): + play_path = xpath_text(media_file, 'URL') + if not play_path: + continue + tbr = int_or_none(media_file.get('bitrate'), 1000) + formats.append({ + 'format_id': 'rtmp' + ('-%d' % tbr if tbr else ''), + 'url': rtmp_url, + 'play_path': play_path, + 'tbr': tbr, + 'ext': 'flv', + }) + + ios_playlist_url = params.get('data-video-playlist') + hmac = params.get('data-video-hmac') + if ios_playlist_url and hmac: + headers = self.geo_verification_headers() + headers.update({ + 'Accept': 'application/vnd.itv.vod.playlist.v2+json', + 'Content-Type': 'application/json', + 'hmac': hmac.upper(), + }) + ios_playlist = self._download_json( + ios_playlist_url, video_id, data=json.dumps({ + 'user': { + 'itvUserId': '', + 'entitlements': [], + 'token': '' + }, + 'device': { + 'manufacturer': 'Apple', + 'model': 'iPad', + 'os': { + 'name': 'iPhone OS', + 'version': '9.3', + 'type': 'ios' + } + }, + 'client': { + 'version': '4.1', + 'id': 'browser' + }, + 'variantAvailability': { + 'featureset': { + 'min': ['hls', 'aes'], + 'max': ['hls', 'aes'] + }, + 'platformTag': 'mobile' + } + }).encode(), headers=headers, fatal=False) + if ios_playlist: + video_data = ios_playlist.get('Playlist', {}).get('Video', {}) + ios_base_url = video_data.get('Base') + for media_file in video_data.get('MediaFiles', []): + href = media_file.get('Href') + if not href: + continue + if ios_base_url: + href = ios_base_url + href + ext = determine_ext(href) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats(href, video_id, 'mp4', m3u8_id='hls', fatal=False)) + else: + formats.append({ + 'url': href, + }) + self._sort_formats(formats) + + subtitles = {} + for caption_url in video_element.findall('ClosedCaptioningURIs/URL'): + if not caption_url.text: + continue + ext = determine_ext(caption_url.text, 'ttml') + subtitles.setdefault('en', []).append({ + 'url': caption_url.text, + 'ext': 'ttml' if ext == 'xml' else ext, + }) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'subtitles': subtitles, + 'episode_title': title, + 'episode_number': int_or_none(xpath_text(playlist, 'EpisodeNumber')), + 'series': xpath_text(playlist, 'ProgrammeTitle'), + 'duartion': parse_duration(xpath_text(playlist, 'Duration')), + } diff --git a/youtube_dl/extractor/ivi.py b/youtube_dl/extractor/ivi.py index 472d72b4c..3d3c15024 100644 --- a/youtube_dl/extractor/ivi.py +++ b/youtube_dl/extractor/ivi.py @@ -1,4 +1,4 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals import re @@ -8,7 +8,7 @@ from .common import InfoExtractor from ..utils import ( ExtractorError, int_or_none, - sanitized_Request, + qualities, ) @@ -28,7 +28,7 @@ class IviIE(InfoExtractor): 'title': 'Иван Васильевич меняет профессию', 'description': 'md5:b924063ea1677c8fe343d8a72ac2195f', 'duration': 5498, - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', }, 'skip': 'Only works from Russia', }, @@ -46,14 +46,30 @@ class IviIE(InfoExtractor): 'episode': 'Дело Гольдберга (1 часть)', 'episode_number': 1, 'duration': 2655, - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', + }, + 'skip': 'Only works from Russia', + }, + { + # with MP4-HD720 format + 'url': 'http://www.ivi.ru/watch/146500', + 'md5': 'd63d35cdbfa1ea61a5eafec7cc523e1e', + 'info_dict': { + 'id': '146500', + 'ext': 'mp4', + 'title': 'Кукла', + 'description': 'md5:ffca9372399976a2d260a407cc74cce6', + 'duration': 5599, + 'thumbnail': r're:^https?://.*\.jpg$', }, 'skip': 'Only works from Russia', } ] # Sorted by quality - _KNOWN_FORMATS = ['MP4-low-mobile', 'MP4-mobile', 'FLV-lo', 'MP4-lo', 'FLV-hi', 'MP4-hi', 'MP4-SHQ'] + _KNOWN_FORMATS = ( + 'MP4-low-mobile', 'MP4-mobile', 'FLV-lo', 'MP4-lo', 'FLV-hi', 'MP4-hi', + 'MP4-SHQ', 'MP4-HD720', 'MP4-HD1080') def _real_extract(self, url): video_id = self._match_id(url) @@ -69,10 +85,9 @@ class IviIE(InfoExtractor): ] } - request = sanitized_Request( - 'http://api.digitalaccess.ru/api/json/', json.dumps(data)) video_json = self._download_json( - request, video_id, 'Downloading video JSON') + 'http://api.digitalaccess.ru/api/json/', video_id, + 'Downloading video JSON', data=json.dumps(data)) if 'error' in video_json: error = video_json['error'] @@ -84,11 +99,13 @@ class IviIE(InfoExtractor): result = video_json['result'] + quality = qualities(self._KNOWN_FORMATS) + formats = [{ 'url': x['url'], - 'format_id': x['content_format'], - 'preference': self._KNOWN_FORMATS.index(x['content_format']), - } for x in result['files'] if x['content_format'] in self._KNOWN_FORMATS] + 'format_id': x.get('content_format'), + 'quality': quality(x.get('content_format')), + } for x in result['files'] if x.get('url')] self._sort_formats(formats) @@ -115,7 +132,7 @@ class IviIE(InfoExtractor): webpage, 'season number', default=None)) episode_number = int_or_none(self._search_regex( - r']+itemprop="episode"[^>]*>\s*]+itemprop="episodeNumber"[^>]+content="(\d+)', + r'[^>]+itemprop="episode"[^>]*>\s*]+itemprop="episodeNumber"[^>]+content="(\d+)', webpage, 'episode number', default=None)) description = self._og_search_description(webpage, default=None) or self._html_search_meta( diff --git a/youtube_dl/extractor/iwara.py b/youtube_dl/extractor/iwara.py new file mode 100644 index 000000000..8d7e7f472 --- /dev/null +++ b/youtube_dl/extractor/iwara.py @@ -0,0 +1,77 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_urllib_parse_urlparse +from ..utils import remove_end + + +class IwaraIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.|ecchi\.)?iwara\.tv/videos/(?P[a-zA-Z0-9]+)' + _TESTS = [{ + 'url': 'http://iwara.tv/videos/amVwUl1EHpAD9RD', + 'md5': '1d53866b2c514b23ed69e4352fdc9839', + 'info_dict': { + 'id': 'amVwUl1EHpAD9RD', + 'ext': 'mp4', + 'title': '【MMD R-18】ガールフレンド carry_me_off', + 'age_limit': 18, + }, + }, { + 'url': 'http://ecchi.iwara.tv/videos/Vb4yf2yZspkzkBO', + 'md5': '7e5f1f359cd51a027ba4a7b7710a50f0', + 'info_dict': { + 'id': '0B1LvuHnL-sRFNXB1WHNqbGw4SXc', + 'ext': 'mp4', + 'title': '[3D Hentai] Kyonyu Ã\x97 Genkai Ã\x97 Emaki Shinobi Girls.mp4', + 'age_limit': 18, + }, + 'add_ie': ['GoogleDrive'], + }, { + 'url': 'http://www.iwara.tv/videos/nawkaumd6ilezzgq', + 'md5': '1d85f1e5217d2791626cff5ec83bb189', + 'info_dict': { + 'id': '6liAP9s2Ojc', + 'ext': 'mp4', + 'age_limit': 0, + 'title': '[MMD] Do It Again Ver.2 [1080p 60FPS] (Motion,Camera,Wav+DL)', + 'description': 'md5:590c12c0df1443d833fbebe05da8c47a', + 'upload_date': '20160910', + 'uploader': 'aMMDsork', + 'uploader_id': 'UCVOFyOSCyFkXTYYHITtqB7A', + }, + 'add_ie': ['Youtube'], + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage, urlh = self._download_webpage_handle(url, video_id) + + hostname = compat_urllib_parse_urlparse(urlh.geturl()).hostname + # ecchi is 'sexy' in Japanese + age_limit = 18 if hostname.split('.')[0] == 'ecchi' else 0 + + entries = self._parse_html5_media_entries(url, webpage, video_id) + + if not entries: + iframe_url = self._html_search_regex( + r']+src=([\'"])(?P[^\'"]+)\1', + webpage, 'iframe URL', group='url') + return { + '_type': 'url_transparent', + 'url': iframe_url, + 'age_limit': age_limit, + } + + title = remove_end(self._html_search_regex( + r'([^<]+)', webpage, 'title'), ' | Iwara') + + info_dict = entries[0] + info_dict.update({ + 'id': video_id, + 'title': title, + 'age_limit': age_limit, + }) + + return info_dict diff --git a/youtube_dl/extractor/izlesene.py b/youtube_dl/extractor/izlesene.py index aa0728abc..b1d72177d 100644 --- a/youtube_dl/extractor/izlesene.py +++ b/youtube_dl/extractor/izlesene.py @@ -29,7 +29,7 @@ class IzleseneIE(InfoExtractor): 'ext': 'mp4', 'title': 'Sevinçten Çıldırtan Doğum Günü Hediyesi', 'description': 'md5:253753e2655dde93f59f74b572454f6d', - 'thumbnail': 're:^https?://.*\.jpg', + 'thumbnail': r're:^https?://.*\.jpg', 'uploader_id': 'pelikzzle', 'timestamp': int, 'upload_date': '20140702', @@ -44,7 +44,7 @@ class IzleseneIE(InfoExtractor): 'id': '17997', 'ext': 'mp4', 'title': 'Tarkan Dortmund 2006 Konseri', - 'thumbnail': 're:^https://.*\.jpg', + 'thumbnail': r're:^https://.*\.jpg', 'uploader_id': 'parlayankiz', 'timestamp': int, 'upload_date': '20061112', diff --git a/youtube_dl/extractor/jamendo.py b/youtube_dl/extractor/jamendo.py new file mode 100644 index 000000000..595d7a5b7 --- /dev/null +++ b/youtube_dl/extractor/jamendo.py @@ -0,0 +1,138 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from ..compat import compat_urlparse +from .common import InfoExtractor +from ..utils import parse_duration + + +class JamendoBaseIE(InfoExtractor): + def _extract_meta(self, webpage, fatal=True): + title = self._og_search_title( + webpage, default=None) or self._search_regex( + r'([^<]+)', webpage, + 'title', default=None) + if title: + title = self._search_regex( + r'(.+?)\s*\|\s*Jamendo Music', title, 'title', default=None) + if not title: + title = self._html_search_meta( + 'name', webpage, 'title', fatal=fatal) + mobj = re.search(r'(.+) - (.+)', title or '') + artist, second = mobj.groups() if mobj else [None] * 2 + return title, artist, second + + +class JamendoIE(JamendoBaseIE): + _VALID_URL = r'https?://(?:www\.)?jamendo\.com/track/(?P<id>[0-9]+)/(?P<display_id>[^/?#&]+)' + _TEST = { + 'url': 'https://www.jamendo.com/track/196219/stories-from-emona-i', + 'md5': '6e9e82ed6db98678f171c25a8ed09ffd', + 'info_dict': { + 'id': '196219', + 'display_id': 'stories-from-emona-i', + 'ext': 'flac', + 'title': 'Maya Filipič - Stories from Emona I', + 'artist': 'Maya Filipič', + 'track': 'Stories from Emona I', + 'duration': 210, + 'thumbnail': r're:^https?://.*\.jpg' + } + } + + def _real_extract(self, url): + mobj = self._VALID_URL_RE.match(url) + track_id = mobj.group('id') + display_id = mobj.group('display_id') + + webpage = self._download_webpage(url, display_id) + + title, artist, track = self._extract_meta(webpage) + + formats = [{ + 'url': 'https://%s.jamendo.com/?trackid=%s&format=%s&from=app-97dab294' + % (sub_domain, track_id, format_id), + 'format_id': format_id, + 'ext': ext, + 'quality': quality, + } for quality, (format_id, sub_domain, ext) in enumerate(( + ('mp31', 'mp3l', 'mp3'), + ('mp32', 'mp3d', 'mp3'), + ('ogg1', 'ogg', 'ogg'), + ('flac', 'flac', 'flac'), + ))] + self._sort_formats(formats) + + thumbnail = self._html_search_meta( + 'image', webpage, 'thumbnail', fatal=False) + duration = parse_duration(self._search_regex( + r'<span[^>]+itemprop=["\']duration["\'][^>]+content=["\'](.+?)["\']', + webpage, 'duration', fatal=False)) + + return { + 'id': track_id, + 'display_id': display_id, + 'thumbnail': thumbnail, + 'title': title, + 'duration': duration, + 'artist': artist, + 'track': track, + 'formats': formats + } + + +class JamendoAlbumIE(JamendoBaseIE): + _VALID_URL = r'https?://(?:www\.)?jamendo\.com/album/(?P<id>[0-9]+)/(?P<display_id>[\w-]+)' + _TEST = { + 'url': 'https://www.jamendo.com/album/121486/duck-on-cover', + 'info_dict': { + 'id': '121486', + 'title': 'Shearer - Duck On Cover' + }, + 'playlist': [{ + 'md5': 'e1a2fcb42bda30dfac990212924149a8', + 'info_dict': { + 'id': '1032333', + 'ext': 'flac', + 'title': 'Shearer - Warmachine', + 'artist': 'Shearer', + 'track': 'Warmachine', + } + }, { + 'md5': '1f358d7b2f98edfe90fd55dac0799d50', + 'info_dict': { + 'id': '1032330', + 'ext': 'flac', + 'title': 'Shearer - Without Your Ghost', + 'artist': 'Shearer', + 'track': 'Without Your Ghost', + } + }], + 'params': { + 'playlistend': 2 + } + } + + def _real_extract(self, url): + mobj = self._VALID_URL_RE.match(url) + album_id = mobj.group('id') + + webpage = self._download_webpage(url, mobj.group('display_id')) + + title, artist, album = self._extract_meta(webpage, fatal=False) + + entries = [{ + '_type': 'url_transparent', + 'url': compat_urlparse.urljoin(url, m.group('path')), + 'ie_key': JamendoIE.ie_key(), + 'id': self._search_regex( + r'/track/(\d+)', m.group('path'), 'track id', default=None), + 'artist': artist, + 'album': album, + } for m in re.finditer( + r'<a[^>]+href=(["\'])(?P<path>(?:(?!\1).)+)\1[^>]+class=["\'][^>]*js-trackrow-albumpage-link', + webpage)] + + return self.playlist_result(entries, album_id, title) diff --git a/youtube_dl/extractor/jove.py b/youtube_dl/extractor/jove.py index cf73cd753..f9a034b78 100644 --- a/youtube_dl/extractor/jove.py +++ b/youtube_dl/extractor/jove.py @@ -21,7 +21,7 @@ class JoveIE(InfoExtractor): 'ext': 'mp4', 'title': 'Electrode Positioning and Montage in Transcranial Direct Current Stimulation', 'description': 'md5:015dd4509649c0908bc27f049e0262c6', - 'thumbnail': 're:^https?://.*\.png$', + 'thumbnail': r're:^https?://.*\.png$', 'upload_date': '20110523', } }, @@ -33,7 +33,7 @@ class JoveIE(InfoExtractor): 'ext': 'mp4', 'title': 'Culturing Caenorhabditis elegans in Axenic Liquid Media and Creation of Transgenic Worms by Microparticle Bombardment', 'description': 'md5:35ff029261900583970c4023b70f1dc9', - 'thumbnail': 're:^https?://.*\.png$', + 'thumbnail': r're:^https?://.*\.png$', 'upload_date': '20140802', } }, diff --git a/youtube_dl/extractor/jpopsukitv.py b/youtube_dl/extractor/jpopsukitv.py index 122e2dd8c..4b5f346d1 100644 --- a/youtube_dl/extractor/jpopsukitv.py +++ b/youtube_dl/extractor/jpopsukitv.py @@ -1,4 +1,4 @@ -# coding=utf-8 +# coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor diff --git a/youtube_dl/extractor/jwplatform.py b/youtube_dl/extractor/jwplatform.py index 2a499bb77..aff7ab49a 100644 --- a/youtube_dl/extractor/jwplatform.py +++ b/youtube_dl/extractor/jwplatform.py @@ -9,7 +9,9 @@ from ..utils import ( determine_ext, float_or_none, int_or_none, + js_to_json, mimetype2ext, + urljoin, ) @@ -19,30 +21,40 @@ class JWPlatformBaseIE(InfoExtractor): # TODO: Merge this with JWPlayer-related codes in generic.py mobj = re.search( - 'jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)\.setup\((?P<options>[^)]+)\)', + r'jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)\.setup\s*\((?P<options>[^)]+)\)', webpage) if mobj: return mobj.group('options') def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs): jwplayer_data = self._parse_json( - self._find_jwplayer_data(webpage), video_id) + self._find_jwplayer_data(webpage), video_id, + transform_source=js_to_json) return self._parse_jwplayer_data( jwplayer_data, video_id, *args, **kwargs) - def _parse_jwplayer_data(self, jwplayer_data, video_id, require_title=True, m3u8_id=None, rtmp_params=None, base_url=None): + def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True, + m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None): # JWPlayer backward compatibility: flattened playlists # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96 if 'playlist' not in jwplayer_data: jwplayer_data = {'playlist': [jwplayer_data]} entries = [] + + # JWPlayer backward compatibility: single playlist item + # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10 + if not isinstance(jwplayer_data['playlist'], list): + jwplayer_data['playlist'] = [jwplayer_data['playlist']] + for video_data in jwplayer_data['playlist']: # JWPlayer backward compatibility: flattened sources # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35 if 'sources' not in video_data: video_data['sources'] = [video_data] + this_video_id = video_id or video_data['mediaid'] + formats = [] for source in video_data['sources']: source_url = self._proto_relative_url(source['file']) @@ -52,7 +64,10 @@ class JWPlatformBaseIE(InfoExtractor): ext = mimetype2ext(source_type) or determine_ext(source_url) if source_type == 'hls' or ext == 'm3u8': formats.extend(self._extract_m3u8_formats( - source_url, video_id, 'mp4', 'm3u8_native', m3u8_id=m3u8_id, fatal=False)) + source_url, this_video_id, 'mp4', 'm3u8_native', m3u8_id=m3u8_id, fatal=False)) + elif ext == 'mpd': + formats.extend(self._extract_mpd_formats( + source_url, this_video_id, mpd_id=mpd_id, fatal=False)) # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67 elif source_type.startswith('audio') or ext in ('oga', 'aac', 'mp3', 'mpeg', 'vorbis'): formats.append({ @@ -61,14 +76,21 @@ class JWPlatformBaseIE(InfoExtractor): 'ext': ext, }) else: + height = int_or_none(source.get('height')) + if height is None: + # Often no height is provided but there is a label in + # format like 1080p. + height = int_or_none(self._search_regex( + r'^(\d{3,})[pP]$', source.get('label') or '', + 'height', default=None)) a_format = { 'url': source_url, 'width': int_or_none(source.get('width')), - 'height': int_or_none(source.get('height')), + 'height': height, 'ext': ext, } if source_url.startswith('rtmp'): - a_format['ext'] = 'flv', + a_format['ext'] = 'flv' # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as # of jwplayer.flash.swf @@ -89,18 +111,22 @@ class JWPlatformBaseIE(InfoExtractor): tracks = video_data.get('tracks') if tracks and isinstance(tracks, list): for track in tracks: - if track.get('file') and track.get('kind') == 'captions': - subtitles.setdefault(track.get('label') or 'en', []).append({ - 'url': self._proto_relative_url(track['file']) - }) + if track.get('kind') != 'captions': + continue + track_url = urljoin(base_url, track.get('file')) + if not track_url: + continue + subtitles.setdefault(track.get('label') or 'en', []).append({ + 'url': self._proto_relative_url(track_url) + }) entries.append({ - 'id': video_id, + 'id': this_video_id, 'title': video_data['title'] if require_title else video_data.get('title'), 'description': video_data.get('description'), 'thumbnail': self._proto_relative_url(video_data.get('image')), 'timestamp': int_or_none(video_data.get('pubdate')), - 'duration': float_or_none(jwplayer_data.get('duration')), + 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')), 'subtitles': subtitles, 'formats': formats, }) diff --git a/youtube_dl/extractor/kaltura.py b/youtube_dl/extractor/kaltura.py index ddf1165ff..5ef382f9f 100644 --- a/youtube_dl/extractor/kaltura.py +++ b/youtube_dl/extractor/kaltura.py @@ -36,6 +36,12 @@ class KalturaIE(InfoExtractor): ''' _SERVICE_URL = 'http://cdnapi.kaltura.com' _SERVICE_BASE = '/api_v3/index.php' + # See https://github.com/kaltura/server/blob/master/plugins/content/caption/base/lib/model/enums/CaptionType.php + _CAPTION_TYPES = { + 1: 'srt', + 2: 'ttml', + 3: 'vtt', + } _TESTS = [ { 'url': 'kaltura:269692:1_1jc2y3e4', @@ -67,6 +73,27 @@ class KalturaIE(InfoExtractor): # video with subtitles 'url': 'kaltura:111032:1_cw786r8q', 'only_matching': True, + }, + { + # video with ttml subtitles (no fileExt) + 'url': 'kaltura:1926081:0_l5ye1133', + 'info_dict': { + 'id': '0_l5ye1133', + 'ext': 'mp4', + 'title': 'What Can You Do With Python?', + 'upload_date': '20160221', + 'uploader_id': 'stork', + 'thumbnail': 're:^https?://.*/thumbnail/.*', + 'timestamp': int, + 'subtitles': { + 'en': [{ + 'ext': 'ttml', + }], + }, + }, + 'params': { + 'skip_download': True, + }, } ] @@ -78,20 +105,20 @@ class KalturaIE(InfoExtractor): kWidget\.(?:thumb)?[Ee]mbed\( \{.*? (?P<q1>['\"])wid(?P=q1)\s*:\s* - (?P<q2>['\"])_?(?P<partner_id>[^'\"]+)(?P=q2),.*? + (?P<q2>['\"])_?(?P<partner_id>(?:(?!(?P=q2)).)+)(?P=q2),.*? (?P<q3>['\"])entry_?[Ii]d(?P=q3)\s*:\s* - (?P<q4>['\"])(?P<id>[^'\"]+)(?P=q4), + (?P<q4>['\"])(?P<id>(?:(?!(?P=q4)).)+)(?P=q4)(?:,|\s*\}) """, webpage) or re.search( r'''(?xs) (?P<q1>["\']) - (?:https?:)?//cdnapi(?:sec)?\.kaltura\.com/.*?(?:p|partner_id)/(?P<partner_id>\d+).*? + (?:https?:)?//cdnapi(?:sec)?\.kaltura\.com/(?:(?!(?P=q1)).)*(?:p|partner_id)/(?P<partner_id>\d+)(?:(?!(?P=q1)).)* (?P=q1).*? (?: entry_?[Ii]d| (?P<q2>["\'])entry_?[Ii]d(?P=q2) )\s*:\s* - (?P<q3>["\'])(?P<id>.+?)(?P=q3) + (?P<q3>["\'])(?P<id>(?:(?!(?P=q3)).)+)(?P=q3) ''', webpage)) if mobj: embed_info = mobj.groupdict() @@ -122,18 +149,6 @@ class KalturaIE(InfoExtractor): return data - def _get_kaltura_signature(self, video_id, partner_id, service_url=None): - actions = [{ - 'apiVersion': '3.1', - 'expiry': 86400, - 'format': 1, - 'service': 'session', - 'action': 'startWidgetSession', - 'widgetId': '_%s' % partner_id, - }] - return self._kaltura_api_call( - video_id, actions, service_url, note='Downloading Kaltura signature')['ks'] - def _get_video_info(self, video_id, partner_id, service_url=None): actions = [ { @@ -208,6 +223,17 @@ class KalturaIE(InfoExtractor): reference_id)['entryResult'] info, flavor_assets = entry_data['meta'], entry_data['contextData']['flavorAssets'] entry_id = info['id'] + # Unfortunately, data returned in kalturaIframePackageData lacks + # captions so we will try requesting the complete data using + # regular approach since we now know the entry_id + try: + _, info, flavor_assets, captions = self._get_video_info( + entry_id, partner_id) + except ExtractorError: + # Regular scenario failed but we already have everything + # extracted apart from captions and can process at least + # with this + pass else: raise ExtractorError('Invalid URL', expected=True) ks = params.get('flashvars[ks]', [None])[0] @@ -236,8 +262,22 @@ class KalturaIE(InfoExtractor): # Continue if asset is not ready if f.get('status') != 2: continue + # Original format that's not available (e.g. kaltura:1926081:0_c03e1b5g) + # skip for now. + if f.get('fileExt') == 'chun': + continue + if not f.get('fileExt'): + # QT indicates QuickTime; some videos have broken fileExt + if f.get('containerFormat') == 'qt': + f['fileExt'] = 'mov' + else: + f['fileExt'] = 'mp4' video_url = sign_url( '%s/flavorId/%s' % (data_url, f['id'])) + # audio-only has no videoCodecId (e.g. kaltura:1926081:0_c03e1b5g + # -f mp4-56) + vcodec = 'none' if 'videoCodecId' not in f and f.get( + 'frameRate') == 0 else f.get('videoCodecId') formats.append({ 'format_id': '%(fileExt)s-%(bitrate)s' % f, 'ext': f.get('fileExt'), @@ -245,7 +285,7 @@ class KalturaIE(InfoExtractor): 'fps': int_or_none(f.get('frameRate')), 'filesize_approx': int_or_none(f.get('size'), invscale=1024), 'container': f.get('containerFormat'), - 'vcodec': f.get('videoCodecId'), + 'vcodec': vcodec, 'height': int_or_none(f.get('height')), 'width': int_or_none(f.get('width')), 'url': video_url, @@ -265,9 +305,12 @@ class KalturaIE(InfoExtractor): # Continue if caption is not ready if f.get('status') != 2: continue + if not caption.get('id'): + continue + caption_format = int_or_none(caption.get('format')) subtitles.setdefault(caption.get('languageCode') or caption.get('language'), []).append({ 'url': '%s/api_v3/service/caption_captionasset/action/serve/captionAssetId/%s' % (self._SERVICE_URL, caption['id']), - 'ext': caption.get('fileExt'), + 'ext': caption.get('fileExt') or self._CAPTION_TYPES.get(caption_format) or 'ttml', }) return { @@ -279,6 +322,6 @@ class KalturaIE(InfoExtractor): 'thumbnail': info.get('thumbnailUrl'), 'duration': info.get('duration'), 'timestamp': info.get('createdAt'), - 'uploader_id': info.get('userId'), + 'uploader_id': info.get('userId') if info.get('userId') != 'None' else None, 'view_count': info.get('plays'), } diff --git a/youtube_dl/extractor/karaoketv.py b/youtube_dl/extractor/karaoketv.py index a6050c4de..bfccf89b0 100644 --- a/youtube_dl/extractor/karaoketv.py +++ b/youtube_dl/extractor/karaoketv.py @@ -5,7 +5,7 @@ from .common import InfoExtractor class KaraoketvIE(InfoExtractor): - _VALID_URL = r'http://www.karaoketv.co.il/[^/]+/(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?karaoketv\.co\.il/[^/]+/(?P<id>\d+)' _TEST = { 'url': 'http://www.karaoketv.co.il/%D7%A9%D7%99%D7%A8%D7%99_%D7%A7%D7%A8%D7%99%D7%95%D7%A7%D7%99/58356/%D7%90%D7%99%D7%96%D7%95%D7%9F', 'info_dict': { diff --git a/youtube_dl/extractor/karrierevideos.py b/youtube_dl/extractor/karrierevideos.py index c05263e61..4e9eb67bf 100644 --- a/youtube_dl/extractor/karrierevideos.py +++ b/youtube_dl/extractor/karrierevideos.py @@ -20,7 +20,7 @@ class KarriereVideosIE(InfoExtractor): 'ext': 'flv', 'title': 'AltenpflegerIn', 'description': 'md5:dbadd1259fde2159a9b28667cb664ae2', - 'thumbnail': 're:^http://.*\.png', + 'thumbnail': r're:^http://.*\.png', }, 'params': { # rtmp download @@ -34,7 +34,7 @@ class KarriereVideosIE(InfoExtractor): 'ext': 'flv', 'title': 'Väterkarenz und neue Chancen für Mütter - "Baby - was nun?"', 'description': 'md5:97092c6ad1fd7d38e9d6a5fdeb2bcc33', - 'thumbnail': 're:^http://.*\.png', + 'thumbnail': r're:^http://.*\.png', }, 'params': { # rtmp download diff --git a/youtube_dl/extractor/keezmovies.py b/youtube_dl/extractor/keezmovies.py index 126ca13df..e83115e2a 100644 --- a/youtube_dl/extractor/keezmovies.py +++ b/youtube_dl/extractor/keezmovies.py @@ -3,64 +3,126 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..aes import aes_decrypt_text +from ..compat import ( + compat_str, + compat_urllib_parse_unquote, +) from ..utils import ( - sanitized_Request, - url_basename, + determine_ext, + ExtractorError, + int_or_none, + str_to_int, + strip_or_none, ) class KeezMoviesIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?keezmovies\.com/video/.+?(?P<id>[0-9]+)(?:[/?&]|$)' - _TEST = { + _VALID_URL = r'https?://(?:www\.)?keezmovies\.com/video/(?:(?P<display_id>[^/]+)-)?(?P<id>\d+)' + _TESTS = [{ 'url': 'http://www.keezmovies.com/video/petite-asian-lady-mai-playing-in-bathtub-1214711', 'md5': '1c1e75d22ffa53320f45eeb07bc4cdc0', 'info_dict': { 'id': '1214711', + 'display_id': 'petite-asian-lady-mai-playing-in-bathtub', 'ext': 'mp4', 'title': 'Petite Asian Lady Mai Playing In Bathtub', + 'thumbnail': r're:^https?://.*\.jpg$', + 'view_count': int, 'age_limit': 18, - 'thumbnail': 're:^https?://.*\.jpg$', } - } + }, { + 'url': 'http://www.keezmovies.com/video/1214711', + 'only_matching': True, + }] - def _real_extract(self, url): - video_id = self._match_id(url) + def _extract_info(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + display_id = (mobj.group('display_id') + if 'display_id' in mobj.groupdict() + else None) or mobj.group('id') - req = sanitized_Request(url) - req.add_header('Cookie', 'age_verified=1') - webpage = self._download_webpage(req, video_id) - - # embedded video - mobj = re.search(r'href="([^"]+)"></iframe>', webpage) - if mobj: - embedded_url = mobj.group(1) - return self.url_result(embedded_url) - - video_title = self._html_search_regex( - r'<h1 [^>]*>([^<]+)', webpage, 'title') - flashvars = self._parse_json(self._search_regex( - r'var\s+flashvars\s*=\s*([^;]+);', webpage, 'flashvars'), video_id) + webpage = self._download_webpage( + url, display_id, headers={'Cookie': 'age_verified=1'}) formats = [] - for height in (180, 240, 480): - if flashvars.get('quality_%dp' % height): - video_url = flashvars['quality_%dp' % height] - a_format = { - 'url': video_url, - 'height': height, - 'format_id': '%dp' % height, - } - filename_parts = url_basename(video_url).split('_') - if len(filename_parts) >= 2 and re.match(r'\d+[Kk]', filename_parts[1]): - a_format['tbr'] = int(filename_parts[1][:-1]) - formats.append(a_format) + format_urls = set() - age_limit = self._rta_search(webpage) + title = None + thumbnail = None + duration = None + encrypted = False - return { + def extract_format(format_url, height=None): + if not isinstance(format_url, compat_str) or not format_url.startswith('http'): + return + if format_url in format_urls: + return + format_urls.add(format_url) + tbr = int_or_none(self._search_regex( + r'[/_](\d+)[kK][/_]', format_url, 'tbr', default=None)) + if not height: + height = int_or_none(self._search_regex( + r'[/_](\d+)[pP][/_]', format_url, 'height', default=None)) + if encrypted: + format_url = aes_decrypt_text( + video_url, title, 32).decode('utf-8') + formats.append({ + 'url': format_url, + 'format_id': '%dp' % height if height else None, + 'height': height, + 'tbr': tbr, + }) + + flashvars = self._parse_json( + self._search_regex( + r'flashvars\s*=\s*({.+?});', webpage, + 'flashvars', default='{}'), + display_id, fatal=False) + + if flashvars: + title = flashvars.get('video_title') + thumbnail = flashvars.get('image_url') + duration = int_or_none(flashvars.get('video_duration')) + encrypted = flashvars.get('encrypted') is True + for key, value in flashvars.items(): + mobj = re.search(r'quality_(\d+)[pP]', key) + if mobj: + extract_format(value, int(mobj.group(1))) + video_url = flashvars.get('video_url') + if video_url and determine_ext(video_url, None): + extract_format(video_url) + + video_url = self._html_search_regex( + r'flashvars\.video_url\s*=\s*(["\'])(?P<url>http.+?)\1', + webpage, 'video url', default=None, group='url') + if video_url: + extract_format(compat_urllib_parse_unquote(video_url)) + + if not formats: + if 'title="This video is no longer available"' in webpage: + raise ExtractorError( + 'Video %s is no longer available' % video_id, expected=True) + + self._sort_formats(formats) + + if not title: + title = self._html_search_regex( + r'<h1[^>]*>([^<]+)', webpage, 'title') + + return webpage, { 'id': video_id, - 'title': video_title, + 'display_id': display_id, + 'title': strip_or_none(title), + 'thumbnail': thumbnail, + 'duration': duration, + 'age_limit': 18, 'formats': formats, - 'age_limit': age_limit, - 'thumbnail': flashvars.get('image_url') } + + def _real_extract(self, url): + webpage, info = self._extract_info(url) + info['view_count'] = str_to_int(self._search_regex( + r'<b>([\d,.]+)</b> Views?', webpage, 'view count', fatal=False)) + return info diff --git a/youtube_dl/extractor/ketnet.py b/youtube_dl/extractor/ketnet.py new file mode 100644 index 000000000..fb9c2dbd4 --- /dev/null +++ b/youtube_dl/extractor/ketnet.py @@ -0,0 +1,72 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class KetnetIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?ketnet\.be/(?:[^/]+/)*(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://www.ketnet.be/kijken/zomerse-filmpjes', + 'md5': 'd907f7b1814ef0fa285c0475d9994ed7', + 'info_dict': { + 'id': 'zomerse-filmpjes', + 'ext': 'mp4', + 'title': 'Gluur mee op de filmset en op Pennenzakkenrock', + 'description': 'Gluur mee met Ghost Rockers op de filmset', + 'thumbnail': r're:^https?://.*\.jpg$', + } + }, { + 'url': 'https://www.ketnet.be/kijken/karrewiet/uitzending-8-september-2016', + 'only_matching': True, + }, { + 'url': 'https://www.ketnet.be/achter-de-schermen/sien-repeteert-voor-stars-for-life', + 'only_matching': True, + }, { + # mzsource, geo restricted to Belgium + 'url': 'https://www.ketnet.be/kijken/nachtwacht/de-bermadoe', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + config = self._parse_json( + self._search_regex( + r'(?s)playerConfig\s*=\s*({.+?})\s*;', webpage, + 'player config'), + video_id) + + title = config['title'] + + formats = [] + for source_key in ('', 'mz'): + source = config.get('%ssource' % source_key) + if not isinstance(source, dict): + continue + for format_id, format_url in source.items(): + if format_id == 'hls': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id=format_id, + fatal=False)) + elif format_id == 'hds': + formats.extend(self._extract_f4m_formats( + format_url, video_id, f4m_id=format_id, fatal=False)) + else: + formats.append({ + 'url': format_url, + 'format_id': format_id, + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': config.get('description'), + 'thumbnail': config.get('image'), + 'series': config.get('program'), + 'episode': config.get('episode'), + 'formats': formats, + } diff --git a/youtube_dl/extractor/kickstarter.py b/youtube_dl/extractor/kickstarter.py index 9f1ade2e4..d4da8f484 100644 --- a/youtube_dl/extractor/kickstarter.py +++ b/youtube_dl/extractor/kickstarter.py @@ -1,4 +1,4 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor @@ -6,7 +6,7 @@ from ..utils import smuggle_url class KickStarterIE(InfoExtractor): - _VALID_URL = r'https?://www\.kickstarter\.com/projects/(?P<id>[^/]*)/.*' + _VALID_URL = r'https?://(?:www\.)?kickstarter\.com/projects/(?P<id>[^/]*)/.*' _TESTS = [{ 'url': 'https://www.kickstarter.com/projects/1404461844/intersection-the-story-of-josh-grant/description', 'md5': 'c81addca81327ffa66c642b5d8b08cab', @@ -37,7 +37,6 @@ class KickStarterIE(InfoExtractor): 'ext': 'mp4', 'title': 'Power Drive 2000', }, - 'expected_warnings': ['OpenGraph description'], }] def _real_extract(self, url): @@ -67,6 +66,6 @@ class KickStarterIE(InfoExtractor): 'id': video_id, 'url': video_url, 'title': title, - 'description': self._og_search_description(webpage), + 'description': self._og_search_description(webpage, default=None), 'thumbnail': thumbnail, } diff --git a/youtube_dl/extractor/konserthusetplay.py b/youtube_dl/extractor/konserthusetplay.py index 55291c66f..c11cbcf47 100644 --- a/youtube_dl/extractor/konserthusetplay.py +++ b/youtube_dl/extractor/konserthusetplay.py @@ -2,29 +2,31 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( + determine_ext, float_or_none, int_or_none, ) class KonserthusetPlayIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?konserthusetplay\.se/\?.*\bm=(?P<id>[^&]+)' - _TEST = { + _VALID_URL = r'https?://(?:www\.)?(?:konserthusetplay|rspoplay)\.se/\?.*\bm=(?P<id>[^&]+)' + _TESTS = [{ 'url': 'http://www.konserthusetplay.se/?m=CKDDnlCY-dhWAAqiMERd-A', + 'md5': 'e3fd47bf44e864bd23c08e487abe1967', 'info_dict': { 'id': 'CKDDnlCY-dhWAAqiMERd-A', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Orkesterns instrument: Valthornen', 'description': 'md5:f10e1f0030202020396a4d712d2fa827', 'thumbnail': 're:^https?://.*$', - 'duration': 398.8, + 'duration': 398.76, }, - 'params': { - # rtmp download - 'skip_download': True, - }, - } + }, { + 'url': 'http://rspoplay.se/?m=elWuEH34SMKvaO4wO_cHBw', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) @@ -42,12 +44,18 @@ class KonserthusetPlayIE(InfoExtractor): player_config = media['playerconfig'] playlist = player_config['playlist'] - source = next(f for f in playlist if f.get('bitrates')) + source = next(f for f in playlist if f.get('bitrates') or f.get('provider')) FORMAT_ID_REGEX = r'_([^_]+)_h264m\.mp4' formats = [] + m3u8_url = source.get('url') + if m3u8_url and determine_ext(m3u8_url) == 'm3u8': + formats.extend(self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + fallback_url = source.get('fallbackUrl') fallback_format_id = None if fallback_url: @@ -97,6 +105,13 @@ class KonserthusetPlayIE(InfoExtractor): thumbnail = media.get('image') duration = float_or_none(media.get('duration'), 1000) + subtitles = {} + captions = source.get('captionsAvailableLanguages') + if isinstance(captions, dict): + for lang, subtitle_url in captions.items(): + if lang != 'none' and isinstance(subtitle_url, compat_str): + subtitles.setdefault(lang, []).append({'url': subtitle_url}) + return { 'id': video_id, 'title': title, @@ -104,4 +119,5 @@ class KonserthusetPlayIE(InfoExtractor): 'thumbnail': thumbnail, 'duration': duration, 'formats': formats, + 'subtitles': subtitles, } diff --git a/youtube_dl/extractor/kontrtube.py b/youtube_dl/extractor/kontrtube.py index 704bd7b34..1fda45107 100644 --- a/youtube_dl/extractor/kontrtube.py +++ b/youtube_dl/extractor/kontrtube.py @@ -1,4 +1,4 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals import re diff --git a/youtube_dl/extractor/krasview.py b/youtube_dl/extractor/krasview.py index 0ae8ebd68..d27d052ff 100644 --- a/youtube_dl/extractor/krasview.py +++ b/youtube_dl/extractor/krasview.py @@ -1,4 +1,4 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals import json @@ -23,7 +23,7 @@ class KrasViewIE(InfoExtractor): 'title': 'Снег, лёд, заносы', 'description': 'Снято в городе Нягань, в Ханты-Мансийском автономном округе.', 'duration': 27, - 'thumbnail': 're:^https?://.*\.jpg', + 'thumbnail': r're:^https?://.*\.jpg', }, 'params': { 'skip_download': 'Not accessible from Travis CI server', diff --git a/youtube_dl/extractor/kusi.py b/youtube_dl/extractor/kusi.py index 12cc56e44..6a7e3baa7 100644 --- a/youtube_dl/extractor/kusi.py +++ b/youtube_dl/extractor/kusi.py @@ -18,31 +18,20 @@ from ..utils import ( class KUSIIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?kusi\.com/(?P<path>story/.+|video\?clipId=(?P<clipId>\d+))' _TESTS = [{ - 'url': 'http://www.kusi.com/story/31183873/turko-files-case-closed-put-on-hold', - 'md5': 'f926e7684294cf8cb7bdf8858e1b3988', + 'url': 'http://www.kusi.com/story/32849881/turko-files-refused-to-help-it-aint-right', + 'md5': '4e76ce8e53660ce9697d06c0ba6fc47d', 'info_dict': { - 'id': '12203019', + 'id': '12689020', 'ext': 'mp4', - 'title': 'Turko Files: Case Closed! & Put On Hold!', - 'duration': 231.0, - 'upload_date': '20160210', - 'timestamp': 1455087571, - 'thumbnail': 're:^https?://.*\.jpg$' + 'title': "Turko Files: Refused to Help, It Ain't Right!", + 'duration': 223.586, + 'upload_date': '20160826', + 'timestamp': 1472233118, + 'thumbnail': r're:^https?://.*\.jpg$' }, }, { 'url': 'http://kusi.com/video?clipId=12203019', - 'info_dict': { - 'id': '12203019', - 'ext': 'mp4', - 'title': 'Turko Files: Case Closed! & Put On Hold!', - 'duration': 231.0, - 'upload_date': '20160210', - 'timestamp': 1455087571, - 'thumbnail': 're:^https?://.*\.jpg$' - }, - 'params': { - 'skip_download': True, # Same as previous one - }, + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/kuwo.py b/youtube_dl/extractor/kuwo.py index b1d460599..63e10125e 100644 --- a/youtube_dl/extractor/kuwo.py +++ b/youtube_dl/extractor/kuwo.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_urlparse from ..utils import ( get_element_by_id, clean_html, @@ -58,7 +59,7 @@ class KuwoBaseIE(InfoExtractor): class KuwoIE(KuwoBaseIE): IE_NAME = 'kuwo:song' IE_DESC = '酷我音乐' - _VALID_URL = r'https?://www\.kuwo\.cn/yinyue/(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?kuwo\.cn/yinyue/(?P<id>\d+)' _TESTS = [{ 'url': 'http://www.kuwo.cn/yinyue/635632/', 'info_dict': { @@ -81,7 +82,7 @@ class KuwoIE(KuwoBaseIE): 'upload_date': '20150518', }, 'params': { - 'format': 'mp3-320' + 'format': 'mp3-320', }, }, { 'url': 'http://www.kuwo.cn/yinyue/3197154?catalog=yueku2016', @@ -90,10 +91,10 @@ class KuwoIE(KuwoBaseIE): def _real_extract(self, url): song_id = self._match_id(url) - webpage = self._download_webpage( + webpage, urlh = self._download_webpage_handle( url, song_id, note='Download song detail info', errnote='Unable to get song detail info') - if '对不起,该歌曲由于版权问题已被下线,将返回网站首页' in webpage: + if song_id not in urlh.geturl() or '对不起,该歌曲由于版权问题已被下线,将返回网站首页' in webpage: raise ExtractorError('this song has been offline because of copyright issues', expected=True) song_name = self._html_search_regex( @@ -138,7 +139,7 @@ class KuwoIE(KuwoBaseIE): class KuwoAlbumIE(InfoExtractor): IE_NAME = 'kuwo:album' IE_DESC = '酷我音乐 - 专辑' - _VALID_URL = r'https?://www\.kuwo\.cn/album/(?P<id>\d+?)/' + _VALID_URL = r'https?://(?:www\.)?kuwo\.cn/album/(?P<id>\d+?)/' _TEST = { 'url': 'http://www.kuwo.cn/album/502294/', 'info_dict': { @@ -180,7 +181,7 @@ class KuwoChartIE(InfoExtractor): 'info_dict': { 'id': '香港中文龙虎榜', }, - 'playlist_mincount': 10, + 'playlist_mincount': 7, } def _real_extract(self, url): @@ -199,7 +200,7 @@ class KuwoChartIE(InfoExtractor): class KuwoSingerIE(InfoExtractor): IE_NAME = 'kuwo:singer' IE_DESC = '酷我音乐 - 歌手' - _VALID_URL = r'https?://www\.kuwo\.cn/mingxing/(?P<id>[^/]+)' + _VALID_URL = r'https?://(?:www\.)?kuwo\.cn/mingxing/(?P<id>[^/]+)' _TESTS = [{ 'url': 'http://www.kuwo.cn/mingxing/bruno+mars/', 'info_dict': { @@ -242,8 +243,9 @@ class KuwoSingerIE(InfoExtractor): query={'artistId': artist_id, 'pn': page_num, 'rn': self.PAGE_SIZE}) return [ - self.url_result(song_url, 'Kuwo') for song_url in re.findall( - r'<div[^>]+class="name"><a[^>]+href="(http://www\.kuwo\.cn/yinyue/\d+)', + self.url_result(compat_urlparse.urljoin(url, song_url), 'Kuwo') + for song_url in re.findall( + r'<div[^>]+class="name"><a[^>]+href="(/yinyue/\d+)', webpage) ] @@ -294,14 +296,14 @@ class KuwoCategoryIE(InfoExtractor): class KuwoMvIE(KuwoBaseIE): IE_NAME = 'kuwo:mv' IE_DESC = '酷我音乐 - MV' - _VALID_URL = r'https?://www\.kuwo\.cn/mv/(?P<id>\d+?)/' + _VALID_URL = r'https?://(?:www\.)?kuwo\.cn/mv/(?P<id>\d+?)/' _TEST = { 'url': 'http://www.kuwo.cn/mv/6480076/', 'info_dict': { 'id': '6480076', 'ext': 'mp4', 'title': 'My HouseMV', - 'creator': 'PM02:00', + 'creator': '2PM', }, # In this video, music URLs (anti.s) are blocked outside China and # USA, while the MV URL (mvurl) is available globally, so force the MV diff --git a/youtube_dl/extractor/laola1tv.py b/youtube_dl/extractor/laola1tv.py index 2fab38079..3190b187c 100644 --- a/youtube_dl/extractor/laola1tv.py +++ b/youtube_dl/extractor/laola1tv.py @@ -1,25 +1,115 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor -from ..compat import ( - compat_urllib_parse_urlencode, - compat_urlparse, -) from ..utils import ( ExtractorError, - sanitized_Request, unified_strdate, urlencode_postdata, xpath_element, xpath_text, + urljoin, + update_url_query, ) +class Laola1TvEmbedIE(InfoExtractor): + IE_NAME = 'laola1tv:embed' + _VALID_URL = r'https?://(?:www\.)?laola1\.tv/titanplayer\.php\?.*?\bvideoid=(?P<id>\d+)' + _TEST = { + # flashvars.premium = "false"; + 'url': 'https://www.laola1.tv/titanplayer.php?videoid=708065&type=V&lang=en&portal=int&customer=1024', + 'info_dict': { + 'id': '708065', + 'ext': 'mp4', + 'title': 'MA Long CHN - FAN Zhendong CHN', + 'uploader': 'ITTF - International Table Tennis Federation', + 'upload_date': '20161211', + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + flash_vars = self._search_regex( + r'(?s)flashvars\s*=\s*({.+?});', webpage, 'flash vars') + + def get_flashvar(x, *args, **kwargs): + flash_var = self._search_regex( + r'%s\s*:\s*"([^"]+)"' % x, + flash_vars, x, default=None) + if not flash_var: + flash_var = self._search_regex([ + r'flashvars\.%s\s*=\s*"([^"]+)"' % x, + r'%s\s*=\s*"([^"]+)"' % x], + webpage, x, *args, **kwargs) + return flash_var + + hd_doc = self._download_xml( + 'http://www.laola1.tv/server/hd_video.php', video_id, query={ + 'play': get_flashvar('streamid'), + 'partner': get_flashvar('partnerid'), + 'portal': get_flashvar('portalid'), + 'lang': get_flashvar('sprache'), + 'v5ident': '', + }) + + _v = lambda x, **k: xpath_text(hd_doc, './/video/' + x, **k) + title = _v('title', fatal=True) + + token_url = None + premium = get_flashvar('premium', default=None) + if premium: + token_url = update_url_query( + _v('url', fatal=True), { + 'timestamp': get_flashvar('timestamp'), + 'auth': get_flashvar('auth'), + }) + else: + data_abo = urlencode_postdata( + dict((i, v) for i, v in enumerate(_v('req_liga_abos').split(',')))) + token_url = self._download_json( + 'https://club.laola1.tv/sp/laola1/api/v3/user/session/premium/player/stream-access', + video_id, query={ + 'videoId': _v('id'), + 'target': self._search_regex(r'vs_target = (\d+);', webpage, 'vs target'), + 'label': _v('label'), + 'area': _v('area'), + }, data=data_abo)['data']['stream-access'][0] + + token_doc = self._download_xml( + token_url, video_id, 'Downloading token', + headers=self.geo_verification_headers()) + + token_attrib = xpath_element(token_doc, './/token').attrib + + if token_attrib['status'] != '0': + raise ExtractorError( + 'Token error: %s' % token_attrib['comment'], expected=True) + + formats = self._extract_akamai_formats( + '%s?hdnea=%s' % (token_attrib['url'], token_attrib['auth']), + video_id) + self._sort_formats(formats) + + categories_str = _v('meta_sports') + categories = categories_str.split(',') if categories_str else [] + is_live = _v('islive') == 'true' + + return { + 'id': video_id, + 'title': self._live_title(title) if is_live else title, + 'upload_date': unified_strdate(_v('time_date')), + 'uploader': _v('meta_organisation'), + 'categories': categories, + 'is_live': is_live, + 'formats': formats, + } + + class Laola1TvIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?laola1\.tv/(?P<lang>[a-z]+)-(?P<portal>[a-z]+)/(?P<kind>[^/]+)/(?P<slug>[^/?#&]+)' + IE_NAME = 'laola1tv' + _VALID_URL = r'https?://(?:www\.)?laola1\.tv/[a-z]+-[a-z]+/[^/]+/(?P<id>[^/?#&]+)' _TESTS = [{ 'url': 'http://www.laola1.tv/de-de/video/straubing-tigers-koelner-haie/227883.html', 'info_dict': { @@ -67,85 +157,20 @@ class Laola1TvIE(InfoExtractor): }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - display_id = mobj.group('slug') - kind = mobj.group('kind') - lang = mobj.group('lang') - portal = mobj.group('portal') + display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) if 'Dieser Livestream ist bereits beendet.' in webpage: raise ExtractorError('This live stream has already finished.', expected=True) - iframe_url = self._search_regex( + iframe_url = urljoin(url, self._search_regex( r'<iframe[^>]*?id="videoplayer"[^>]*?src="([^"]+)"', - webpage, 'iframe url') - - video_id = self._search_regex( - r'videoid=(\d+)', iframe_url, 'video id') - - iframe = self._download_webpage(compat_urlparse.urljoin( - url, iframe_url), display_id, 'Downloading iframe') - - partner_id = self._search_regex( - r'partnerid\s*:\s*(["\'])(?P<partner_id>.+?)\1', - iframe, 'partner id', group='partner_id') - - hd_doc = self._download_xml( - 'http://www.laola1.tv/server/hd_video.php?%s' - % compat_urllib_parse_urlencode({ - 'play': video_id, - 'partner': partner_id, - 'portal': portal, - 'lang': lang, - 'v5ident': '', - }), display_id) - - _v = lambda x, **k: xpath_text(hd_doc, './/video/' + x, **k) - title = _v('title', fatal=True) - - VS_TARGETS = { - 'video': '2', - 'livestream': '17', - } - - req = sanitized_Request( - 'https://club.laola1.tv/sp/laola1/api/v3/user/session/premium/player/stream-access?%s' % - compat_urllib_parse_urlencode({ - 'videoId': video_id, - 'target': VS_TARGETS.get(kind, '2'), - 'label': _v('label'), - 'area': _v('area'), - }), - urlencode_postdata( - dict((i, v) for i, v in enumerate(_v('req_liga_abos').split(','))))) - - token_url = self._download_json(req, display_id)['data']['stream-access'][0] - token_doc = self._download_xml(token_url, display_id, 'Downloading token') - - token_attrib = xpath_element(token_doc, './/token').attrib - token_auth = token_attrib['auth'] - - if token_auth in ('blocked', 'restricted', 'error'): - raise ExtractorError( - 'Token error: %s' % token_attrib['comment'], expected=True) - - formats = self._extract_f4m_formats( - '%s?hdnea=%s&hdcore=3.2.0' % (token_attrib['url'], token_auth), - video_id, f4m_id='hds') - self._sort_formats(formats) - - categories_str = _v('meta_sports') - categories = categories_str.split(',') if categories_str else [] + webpage, 'iframe url')) return { - 'id': video_id, + '_type': 'url', 'display_id': display_id, - 'title': title, - 'upload_date': unified_strdate(_v('time_date')), - 'uploader': _v('meta_organisation'), - 'categories': categories, - 'is_live': _v('islive') == 'true', - 'formats': formats, + 'url': iframe_url, + 'ie_key': 'Laola1TvEmbed', } diff --git a/youtube_dl/extractor/lci.py b/youtube_dl/extractor/lci.py new file mode 100644 index 000000000..af34829e7 --- /dev/null +++ b/youtube_dl/extractor/lci.py @@ -0,0 +1,24 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class LCIIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?lci\.fr/[^/]+/[\w-]+-(?P<id>\d+)\.html' + _TEST = { + 'url': 'http://www.lci.fr/international/etats-unis-a-j-62-hillary-clinton-reste-sans-voix-2001679.html', + 'md5': '2fdb2538b884d4d695f9bd2bde137e6c', + 'info_dict': { + 'id': '13244802', + 'ext': 'mp4', + 'title': 'Hillary Clinton et sa quinte de toux, en plein meeting', + 'description': 'md5:a4363e3a960860132f8124b62f4a01c9', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + wat_id = self._search_regex(r'data-watid=[\'"](\d+)', webpage, 'wat id') + return self.url_result('wat:' + wat_id, 'Wat', wat_id) diff --git a/youtube_dl/extractor/leeco.py b/youtube_dl/extractor/leeco.py index e9cc9aa59..4321f90c8 100644 --- a/youtube_dl/extractor/leeco.py +++ b/youtube_dl/extractor/leeco.py @@ -29,7 +29,7 @@ from ..utils import ( class LeIE(InfoExtractor): IE_DESC = '乐视网' - _VALID_URL = r'https?://(?:www\.le\.com/ptv/vplay|sports\.le\.com/video)/(?P<id>\d+)\.html' + _VALID_URL = r'https?://(?:www\.le\.com/ptv/vplay|(?:sports\.le|(?:www\.)?lesports)\.com/(?:match|video))/(?P<id>\d+)\.html' _URL_TEMPLATE = 'http://www.le.com/ptv/vplay/%s.html' @@ -73,6 +73,12 @@ class LeIE(InfoExtractor): }, { 'url': 'http://sports.le.com/video/25737697.html', 'only_matching': True, + }, { + 'url': 'http://www.lesports.com/match/1023203003.html', + 'only_matching': True, + }, { + 'url': 'http://sports.le.com/match/1023203003.html', + 'only_matching': True, }] # ror() and calc_time_key() are reversed from a embedded swf file in KLetvPlayer.swf @@ -380,8 +386,8 @@ class LetvCloudIE(InfoExtractor): return formats def _real_extract(self, url): - uu_mobj = re.search('uu=([\w]+)', url) - vu_mobj = re.search('vu=([\w]+)', url) + uu_mobj = re.search(r'uu=([\w]+)', url) + vu_mobj = re.search(r'vu=([\w]+)', url) if not uu_mobj or not vu_mobj: raise ExtractorError('Invalid URL: %s' % url, expected=True) diff --git a/youtube_dl/extractor/lego.py b/youtube_dl/extractor/lego.py new file mode 100644 index 000000000..d3bca6435 --- /dev/null +++ b/youtube_dl/extractor/lego.py @@ -0,0 +1,128 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + unescapeHTML, + parse_duration, + get_element_by_class, +) + + +class LEGOIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?lego\.com/(?P<locale>[^/]+)/(?:[^/]+/)*videos/(?:[^/]+/)*[^/?#]+-(?P<id>[0-9a-f]+)' + _TESTS = [{ + 'url': 'http://www.lego.com/en-us/videos/themes/club/blocumentary-kawaguchi-55492d823b1b4d5e985787fa8c2973b1', + 'md5': 'f34468f176cfd76488767fc162c405fa', + 'info_dict': { + 'id': '55492d823b1b4d5e985787fa8c2973b1', + 'ext': 'mp4', + 'title': 'Blocumentary Great Creations: Akiyuki Kawaguchi', + 'description': 'Blocumentary Great Creations: Akiyuki Kawaguchi', + }, + }, { + # geo-restricted but the contentUrl contain a valid url + 'url': 'http://www.lego.com/nl-nl/videos/themes/nexoknights/episode-20-kingdom-of-heroes-13bdc2299ab24d9685701a915b3d71e7##sp=399', + 'md5': '4c3fec48a12e40c6e5995abc3d36cc2e', + 'info_dict': { + 'id': '13bdc2299ab24d9685701a915b3d71e7', + 'ext': 'mp4', + 'title': 'Aflevering 20 - Helden van het koninkrijk', + 'description': 'md5:8ee499aac26d7fa8bcb0cedb7f9c3941', + }, + }, { + # special characters in title + 'url': 'http://www.lego.com/en-us/starwars/videos/lego-star-wars-force-surprise-9685ee9d12e84ff38e84b4e3d0db533d', + 'info_dict': { + 'id': '9685ee9d12e84ff38e84b4e3d0db533d', + 'ext': 'mp4', + 'title': 'Force Surprise – LEGO® Star Wars™ Microfighters', + 'description': 'md5:9c673c96ce6f6271b88563fe9dc56de3', + }, + 'params': { + 'skip_download': True, + }, + }] + _BITRATES = [256, 512, 1024, 1536, 2560] + + def _real_extract(self, url): + locale, video_id = re.match(self._VALID_URL, url).groups() + webpage = self._download_webpage(url, video_id) + title = get_element_by_class('video-header', webpage).strip() + progressive_base = 'https://lc-mediaplayerns-live-s.legocdn.com/' + streaming_base = 'http://legoprod-f.akamaihd.net/' + content_url = self._html_search_meta('contentUrl', webpage) + path = self._search_regex( + r'(?:https?:)?//[^/]+/(?:[iz]/s/)?public/(.+)_[0-9,]+\.(?:mp4|webm)', + content_url, 'video path', default=None) + if not path: + player_url = self._proto_relative_url(self._search_regex( + r'<iframe[^>]+src="((?:https?)?//(?:www\.)?lego\.com/[^/]+/mediaplayer/video/[^"]+)', + webpage, 'player url', default=None)) + if not player_url: + base_url = self._proto_relative_url(self._search_regex( + r'data-baseurl="([^"]+)"', webpage, 'base url', + default='http://www.lego.com/%s/mediaplayer/video/' % locale)) + player_url = base_url + video_id + player_webpage = self._download_webpage(player_url, video_id) + video_data = self._parse_json(unescapeHTML(self._search_regex( + r"video='([^']+)'", player_webpage, 'video data')), video_id) + progressive_base = self._search_regex( + r'data-video-progressive-url="([^"]+)"', + player_webpage, 'progressive base', default='https://lc-mediaplayerns-live-s.legocdn.com/') + streaming_base = self._search_regex( + r'data-video-streaming-url="([^"]+)"', + player_webpage, 'streaming base', default='http://legoprod-f.akamaihd.net/') + item_id = video_data['ItemId'] + + net_storage_path = video_data.get('NetStoragePath') or '/'.join([item_id[:2], item_id[2:4]]) + base_path = '_'.join([item_id, video_data['VideoId'], video_data['Locale'], compat_str(video_data['VideoVersion'])]) + path = '/'.join([net_storage_path, base_path]) + streaming_path = ','.join(map(lambda bitrate: compat_str(bitrate), self._BITRATES)) + + formats = self._extract_akamai_formats( + '%si/s/public/%s_,%s,.mp4.csmil/master.m3u8' % (streaming_base, path, streaming_path), video_id) + m3u8_formats = list(filter( + lambda f: f.get('protocol') == 'm3u8_native' and f.get('vcodec') != 'none' and f.get('resolution') != 'multiple', + formats)) + if len(m3u8_formats) == len(self._BITRATES): + self._sort_formats(m3u8_formats) + for bitrate, m3u8_format in zip(self._BITRATES, m3u8_formats): + progressive_base_url = '%spublic/%s_%d.' % (progressive_base, path, bitrate) + mp4_f = m3u8_format.copy() + mp4_f.update({ + 'url': progressive_base_url + 'mp4', + 'format_id': m3u8_format['format_id'].replace('hls', 'mp4'), + 'protocol': 'http', + }) + web_f = { + 'url': progressive_base_url + 'webm', + 'format_id': m3u8_format['format_id'].replace('hls', 'webm'), + 'width': m3u8_format['width'], + 'height': m3u8_format['height'], + 'tbr': m3u8_format.get('tbr'), + 'ext': 'webm', + } + formats.extend([web_f, mp4_f]) + else: + for bitrate in self._BITRATES: + for ext in ('web', 'mp4'): + formats.append({ + 'format_id': '%s-%s' % (ext, bitrate), + 'url': '%spublic/%s_%d.%s' % (progressive_base, path, bitrate, ext), + 'tbr': bitrate, + 'ext': ext, + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': self._html_search_meta('description', webpage), + 'thumbnail': self._html_search_meta('thumbnail', webpage), + 'duration': parse_duration(self._html_search_meta('duration', webpage)), + 'formats': formats, + } diff --git a/youtube_dl/extractor/lemonde.py b/youtube_dl/extractor/lemonde.py index be66fff03..42568f315 100644 --- a/youtube_dl/extractor/lemonde.py +++ b/youtube_dl/extractor/lemonde.py @@ -12,7 +12,7 @@ class LemondeIE(InfoExtractor): 'id': 'lqm3kl', 'ext': 'mp4', 'title': "Comprendre l'affaire Bygmalion en 5 minutes", - 'thumbnail': 're:^https?://.*\.jpg', + 'thumbnail': r're:^https?://.*\.jpg', 'duration': 320, 'upload_date': '20160119', 'timestamp': 1453194778, diff --git a/youtube_dl/extractor/libraryofcongress.py b/youtube_dl/extractor/libraryofcongress.py index 0a94366fd..40295a30b 100644 --- a/youtube_dl/extractor/libraryofcongress.py +++ b/youtube_dl/extractor/libraryofcongress.py @@ -25,7 +25,7 @@ class LibraryOfCongressIE(InfoExtractor): 'id': '90716351', 'ext': 'mp4', 'title': "Pa's trip to Mars", - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 0, 'view_count': int, }, diff --git a/youtube_dl/extractor/libsyn.py b/youtube_dl/extractor/libsyn.py index d375695f5..4750b03a3 100644 --- a/youtube_dl/extractor/libsyn.py +++ b/youtube_dl/extractor/libsyn.py @@ -41,7 +41,7 @@ class LibsynIE(InfoExtractor): formats = [{ 'url': media_url, - } for media_url in set(re.findall('var\s+mediaURL(?:Libsyn)?\s*=\s*"([^"]+)"', webpage))] + } for media_url in set(re.findall(r'var\s+mediaURL(?:Libsyn)?\s*=\s*"([^"]+)"', webpage))] podcast_title = self._search_regex( r'<h2>([^<]+)</h2>', webpage, 'podcast title', default=None) diff --git a/youtube_dl/extractor/lifenews.py b/youtube_dl/extractor/lifenews.py index c2b4490c4..42e263bfa 100644 --- a/youtube_dl/extractor/lifenews.py +++ b/youtube_dl/extractor/lifenews.py @@ -1,10 +1,13 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_urlparse +from ..compat import ( + compat_str, + compat_urlparse, +) from ..utils import ( determine_ext, ExtractorError, @@ -96,7 +99,7 @@ class LifeNewsIE(InfoExtractor): r'<video[^>]+><source[^>]+src=["\'](.+?)["\']', webpage) iframe_links = re.findall( - r'<iframe[^>]+src=["\']((?:https?:)?//embed\.life\.ru/embed/.+?)["\']', + r'<iframe[^>]+src=["\']((?:https?:)?//embed\.life\.ru/(?:embed|video)/.+?)["\']', webpage) if not video_urls and not iframe_links: @@ -164,41 +167,68 @@ class LifeNewsIE(InfoExtractor): class LifeEmbedIE(InfoExtractor): IE_NAME = 'life:embed' - _VALID_URL = r'https?://embed\.life\.ru/embed/(?P<id>[\da-f]{32})' + _VALID_URL = r'https?://embed\.life\.ru/(?:embed|video)/(?P<id>[\da-f]{32})' - _TEST = { + _TESTS = [{ 'url': 'http://embed.life.ru/embed/e50c2dec2867350528e2574c899b8291', 'md5': 'b889715c9e49cb1981281d0e5458fbbe', 'info_dict': { 'id': 'e50c2dec2867350528e2574c899b8291', 'ext': 'mp4', 'title': 'e50c2dec2867350528e2574c899b8291', - 'thumbnail': 're:http://.*\.jpg', + 'thumbnail': r're:http://.*\.jpg', } - } + }, { + # with 1080p + 'url': 'https://embed.life.ru/video/e50c2dec2867350528e2574c899b8291', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) + thumbnail = None formats = [] - for video_url in re.findall(r'"file"\s*:\s*"([^"]+)', webpage): - video_url = compat_urlparse.urljoin(url, video_url) - ext = determine_ext(video_url) - if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - video_url, video_id, 'mp4', - entry_protocol='m3u8_native', m3u8_id='m3u8')) - else: - formats.append({ - 'url': video_url, - 'format_id': ext, - 'preference': 1, - }) + + def extract_m3u8(manifest_url): + formats.extend(self._extract_m3u8_formats( + manifest_url, video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='m3u8')) + + def extract_original(original_url): + formats.append({ + 'url': original_url, + 'format_id': determine_ext(original_url, None), + 'preference': 1, + }) + + playlist = self._parse_json( + self._search_regex( + r'options\s*=\s*({.+?});', webpage, 'options', default='{}'), + video_id).get('playlist', {}) + if playlist: + master = playlist.get('master') + if isinstance(master, compat_str) and determine_ext(master) == 'm3u8': + extract_m3u8(compat_urlparse.urljoin(url, master)) + original = playlist.get('original') + if isinstance(original, compat_str): + extract_original(original) + thumbnail = playlist.get('image') + + # Old rendition fallback + if not formats: + for video_url in re.findall(r'"file"\s*:\s*"([^"]+)', webpage): + video_url = compat_urlparse.urljoin(url, video_url) + if determine_ext(video_url) == 'm3u8': + extract_m3u8(video_url) + else: + extract_original(video_url) + self._sort_formats(formats) - thumbnail = self._search_regex( + thumbnail = thumbnail or self._search_regex( r'"image"\s*:\s*"([^"]+)', webpage, 'thumbnail', default=None) return { diff --git a/youtube_dl/extractor/limelight.py b/youtube_dl/extractor/limelight.py index a425bafe3..e635f3c4d 100644 --- a/youtube_dl/extractor/limelight.py +++ b/youtube_dl/extractor/limelight.py @@ -34,11 +34,12 @@ class LimelightBaseIE(InfoExtractor): def _extract_info(self, streams, mobile_urls, properties): video_id = properties['media_id'] formats = [] - + urls = [] for stream in streams: stream_url = stream.get('url') - if not stream_url or stream.get('drmProtected'): + if not stream_url or stream.get('drmProtected') or stream_url in urls: continue + urls.append(stream_url) ext = determine_ext(stream_url) if ext == 'f4m': formats.extend(self._extract_f4m_formats( @@ -58,12 +59,26 @@ class LimelightBaseIE(InfoExtractor): format_id = 'rtmp' if stream.get('videoBitRate'): format_id += '-%d' % int_or_none(stream['videoBitRate']) - http_fmt = fmt.copy() - http_fmt.update({ - 'url': 'http://%s/%s' % (rtmp.group('host').replace('csl.', 'cpl.'), rtmp.group('playpath')[4:]), - 'format_id': format_id.replace('rtmp', 'http'), - }) - formats.append(http_fmt) + http_format_id = format_id.replace('rtmp', 'http') + + CDN_HOSTS = ( + ('delvenetworks.com', 'cpl.delvenetworks.com'), + ('video.llnw.net', 's2.content.video.llnw.net'), + ) + for cdn_host, http_host in CDN_HOSTS: + if cdn_host not in rtmp.group('host').lower(): + continue + http_url = 'http://%s/%s' % (http_host, rtmp.group('playpath')[4:]) + urls.append(http_url) + if self._is_valid_url(http_url, video_id, http_format_id): + http_fmt = fmt.copy() + http_fmt.update({ + 'url': http_url, + 'format_id': http_format_id, + }) + formats.append(http_fmt) + break + fmt.update({ 'url': rtmp.group('url'), 'play_path': rtmp.group('playpath'), @@ -76,8 +91,9 @@ class LimelightBaseIE(InfoExtractor): for mobile_url in mobile_urls: media_url = mobile_url.get('mobileUrl') format_id = mobile_url.get('targetMediaPlatform') - if not media_url or format_id == 'Widevine': + if not media_url or format_id in ('Widevine', 'SmoothStreaming') or media_url in urls: continue + urls.append(media_url) ext = determine_ext(media_url) if ext == 'm3u8': formats.extend(self._extract_m3u8_formats( @@ -160,7 +176,7 @@ class LimelightMediaIE(LimelightBaseIE): 'ext': 'mp4', 'title': 'HaP and the HB Prince Trailer', 'description': 'md5:8005b944181778e313d95c1237ddb640', - 'thumbnail': 're:^https?://.*\.jpeg$', + 'thumbnail': r're:^https?://.*\.jpeg$', 'duration': 144.23, 'timestamp': 1244136834, 'upload_date': '20090604', @@ -177,7 +193,7 @@ class LimelightMediaIE(LimelightBaseIE): 'id': 'a3e00274d4564ec4a9b29b9466432335', 'ext': 'mp4', 'title': '3Play Media Overview Video', - 'thumbnail': 're:^https?://.*\.jpeg$', + 'thumbnail': r're:^https?://.*\.jpeg$', 'duration': 78.101, 'timestamp': 1338929955, 'upload_date': '20120605', diff --git a/youtube_dl/extractor/litv.py b/youtube_dl/extractor/litv.py index 3356d015d..337b1b15c 100644 --- a/youtube_dl/extractor/litv.py +++ b/youtube_dl/extractor/litv.py @@ -2,7 +2,6 @@ from __future__ import unicode_literals import json -import re from .common import InfoExtractor from ..utils import ( @@ -14,7 +13,7 @@ from ..utils import ( class LiTVIE(InfoExtractor): - _VALID_URL = r'https?://www\.litv\.tv/vod/[^/]+/content\.do\?.*?\bid=(?P<id>[^&]+)' + _VALID_URL = r'https?://(?:www\.)?litv\.tv/(?:vod|promo)/[^/]+/(?:content\.do)?\?.*?\b(?:content_)?id=(?P<id>[^&]+)' _URL_TEMPLATE = 'https://www.litv.tv/vod/%s/content.do?id=%s' @@ -27,23 +26,33 @@ class LiTVIE(InfoExtractor): 'playlist_count': 50, }, { 'url': 'https://www.litv.tv/vod/drama/content.do?brc_id=root&id=VOD00041610&isUHEnabled=true&autoPlay=1', + 'md5': '969e343d9244778cb29acec608e53640', 'info_dict': { 'id': 'VOD00041610', 'ext': 'mp4', 'title': '花千骨第1集', - 'thumbnail': 're:https?://.*\.jpg$', + 'thumbnail': r're:https?://.*\.jpg$', 'description': 'md5:c7017aa144c87467c4fb2909c4b05d6f', 'episode_number': 1, }, 'params': { 'noplaylist': True, - 'skip_download': True, # m3u8 download + }, + 'skip': 'Georestricted to Taiwan', + }, { + 'url': 'https://www.litv.tv/promo/miyuezhuan/?content_id=VOD00044841&', + 'md5': '88322ea132f848d6e3e18b32a832b918', + 'info_dict': { + 'id': 'VOD00044841', + 'ext': 'mp4', + 'title': '芈月傳第1集 霸星芈月降世楚國', + 'description': '楚威王二年,太史令唐昧夜觀星象,發現霸星即將現世。王后得知霸星的預言後,想盡辦法不讓孩子順利出生,幸得莒姬相護化解危機。沒想到眾人期待下出生的霸星卻是位公主,楚威王對此失望至極。楚王后命人將女嬰丟棄河中,居然奇蹟似的被少司命像攔下,楚威王認為此女非同凡響,為她取名芈月。', }, 'skip': 'Georestricted to Taiwan', }] - def _extract_playlist(self, season_list, video_id, vod_data, view_data, prompt=True): - episode_title = view_data['title'] + def _extract_playlist(self, season_list, video_id, program_info, prompt=True): + episode_title = program_info['title'] content_id = season_list['contentId'] if prompt: @@ -51,7 +60,7 @@ class LiTVIE(InfoExtractor): all_episodes = [ self.url_result(smuggle_url( - self._URL_TEMPLATE % (view_data['contentType'], episode['contentId']), + self._URL_TEMPLATE % (program_info['contentType'], episode['contentId']), {'force_noplaylist': True})) # To prevent infinite recursion for episode in season_list['episode']] @@ -70,19 +79,15 @@ class LiTVIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - view_data = dict(map(lambda t: (t[0], t[2]), re.findall( - r'viewData\.([a-zA-Z]+)\s*=\s*(["\'])([^"\']+)\2', - webpage))) - - vod_data = self._parse_json(self._search_regex( - 'var\s+vod\s*=\s*([^;]+)', webpage, 'VOD data', default='{}'), + program_info = self._parse_json(self._search_regex( + r'var\s+programInfo\s*=\s*([^;]+)', webpage, 'VOD data', default='{}'), video_id) - season_list = list(vod_data.get('seasonList', {}).values()) + season_list = list(program_info.get('seasonList', {}).values()) if season_list: if not noplaylist: return self._extract_playlist( - season_list[0], video_id, vod_data, view_data, + season_list[0], video_id, program_info, prompt=noplaylist_prompt) if noplaylist_prompt: @@ -92,14 +97,19 @@ class LiTVIE(InfoExtractor): # endpoint gives the same result as the data embedded in the webpage. # If georestricted, there are no embedded data, so an extra request is # necessary to get the error code + if 'assetId' not in program_info: + program_info = self._download_json( + 'https://www.litv.tv/vod/ajax/getProgramInfo', video_id, + query={'contentId': video_id}, + headers={'Accept': 'application/json'}) video_data = self._parse_json(self._search_regex( r'uiHlsUrl\s*=\s*testBackendData\(([^;]+)\);', webpage, 'video data', default='{}'), video_id) if not video_data: payload = { - 'assetId': view_data['assetId'], - 'watchDevices': vod_data['watchDevices'], - 'contentType': view_data['contentType'], + 'assetId': program_info['assetId'], + 'watchDevices': program_info['watchDevices'], + 'contentType': program_info['contentType'], } video_data = self._download_json( 'https://www.litv.tv/vod/getMainUrl', video_id, @@ -115,16 +125,17 @@ class LiTVIE(InfoExtractor): raise ExtractorError('Unexpected result from %s' % self.IE_NAME) formats = self._extract_m3u8_formats( - video_data['fullpath'], video_id, ext='mp4', m3u8_id='hls') + video_data['fullpath'], video_id, ext='mp4', + entry_protocol='m3u8_native', m3u8_id='hls') for a_format in formats: # LiTV HLS segments doesn't like compressions a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = True - title = view_data['title'] + view_data.get('secondaryMark', '') - description = view_data.get('description') - thumbnail = view_data.get('imageFile') - categories = [item['name'] for item in vod_data.get('category', [])] - episode = int_or_none(view_data.get('episode')) + title = program_info['title'] + program_info.get('secondaryMark', '') + description = program_info.get('description') + thumbnail = program_info.get('imageFile') + categories = [item['name'] for item in program_info.get('category', [])] + episode = int_or_none(program_info.get('episode')) return { 'id': video_id, diff --git a/youtube_dl/extractor/liveleak.py b/youtube_dl/extractor/liveleak.py index ea0565ac0..c7de65353 100644 --- a/youtube_dl/extractor/liveleak.py +++ b/youtube_dl/extractor/liveleak.py @@ -18,7 +18,7 @@ class LiveLeakIE(InfoExtractor): 'description': 'extremely bad day for this guy..!', 'uploader': 'ljfriel2', 'title': 'Most unlucky car accident', - 'thumbnail': 're:^https?://.*\.jpg$' + 'thumbnail': r're:^https?://.*\.jpg$' } }, { 'url': 'http://www.liveleak.com/view?i=f93_1390833151', @@ -29,7 +29,7 @@ class LiveLeakIE(InfoExtractor): 'description': 'German Television Channel NDR does an exclusive interview with Edward Snowden.\r\nUploaded on LiveLeak cause German Television thinks the rest of the world isn\'t intereseted in Edward Snowden.', 'uploader': 'ARD_Stinkt', 'title': 'German Television does first Edward Snowden Interview (ENGLISH)', - 'thumbnail': 're:^https?://.*\.jpg$' + 'thumbnail': r're:^https?://.*\.jpg$' } }, { 'url': 'http://www.liveleak.com/view?i=4f7_1392687779', @@ -52,8 +52,24 @@ class LiveLeakIE(InfoExtractor): 'description': 'Happened on 27.7.2014. \r\nAt 0:53 you can see people still swimming at near beach.', 'uploader': 'bony333', 'title': 'Crazy Hungarian tourist films close call waterspout in Croatia', - 'thumbnail': 're:^https?://.*\.jpg$' + 'thumbnail': r're:^https?://.*\.jpg$' } + }, { + # Covers https://github.com/rg3/youtube-dl/pull/10664#issuecomment-247439521 + 'url': 'http://m.liveleak.com/view?i=763_1473349649', + 'add_ie': ['Youtube'], + 'info_dict': { + 'id': '763_1473349649', + 'ext': 'mp4', + 'title': 'Reporters and public officials ignore epidemic of black on asian violence in Sacramento | Colin Flaherty', + 'description': 'Colin being the warrior he is and showing the injustice Asians in Sacramento are being subjected to.', + 'uploader': 'Ziz', + 'upload_date': '20160908', + 'uploader_id': 'UCEbta5E_jqlZmEJsriTEtnw' + }, + 'params': { + 'skip_download': True, + }, }] @staticmethod @@ -87,7 +103,7 @@ class LiveLeakIE(InfoExtractor): else: # Maybe an embed? embed_url = self._search_regex( - r'<iframe[^>]+src="(http://www.prochan.com/embed\?[^"]+)"', + r'<iframe[^>]+src="(https?://(?:www\.)?(?:prochan|youtube)\.com/embed[^"]+)"', webpage, 'embed URL') return { '_type': 'url_transparent', @@ -107,6 +123,7 @@ class LiveLeakIE(InfoExtractor): 'format_note': s.get('label'), 'url': s['file'], } for i, s in enumerate(sources)] + for i, s in enumerate(sources): # Removing '.h264_*.mp4' gives the raw video, which is essentially # the same video without the LiveLeak logo at the top (see diff --git a/youtube_dl/extractor/livestream.py b/youtube_dl/extractor/livestream.py index bc7894bf1..c863413bf 100644 --- a/youtube_dl/extractor/livestream.py +++ b/youtube_dl/extractor/livestream.py @@ -37,7 +37,7 @@ class LivestreamIE(InfoExtractor): 'duration': 5968.0, 'like_count': int, 'view_count': int, - 'thumbnail': 're:^http://.*\.jpg$' + 'thumbnail': r're:^http://.*\.jpg$' } }, { 'url': 'http://new.livestream.com/tedx/cityenglish', diff --git a/youtube_dl/extractor/lnkgo.py b/youtube_dl/extractor/lnkgo.py index fd23b0b43..068378c9c 100644 --- a/youtube_dl/extractor/lnkgo.py +++ b/youtube_dl/extractor/lnkgo.py @@ -22,7 +22,7 @@ class LnkGoIE(InfoExtractor): 'description': 'md5:d82a5e36b775b7048617f263a0e3475e', 'age_limit': 7, 'duration': 3019, - 'thumbnail': 're:^https?://.*\.jpg$' + 'thumbnail': r're:^https?://.*\.jpg$' }, 'params': { 'skip_download': True, # HLS download @@ -37,7 +37,7 @@ class LnkGoIE(InfoExtractor): 'description': 'md5:7352d113a242a808676ff17e69db6a69', 'age_limit': 18, 'duration': 346, - 'thumbnail': 're:^https?://.*\.jpg$' + 'thumbnail': r're:^https?://.*\.jpg$' }, 'params': { 'skip_download': True, # HLS download diff --git a/youtube_dl/extractor/lrt.py b/youtube_dl/extractor/lrt.py index 1072405b3..f5c997ef4 100644 --- a/youtube_dl/extractor/lrt.py +++ b/youtube_dl/extractor/lrt.py @@ -1,8 +1,11 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..utils import ( + determine_ext, int_or_none, parse_duration, remove_end, @@ -12,8 +15,10 @@ from ..utils import ( class LRTIE(InfoExtractor): IE_NAME = 'lrt.lt' _VALID_URL = r'https?://(?:www\.)?lrt\.lt/mediateka/irasas/(?P<id>[0-9]+)' - _TEST = { + _TESTS = [{ + # m3u8 download 'url': 'http://www.lrt.lt/mediateka/irasas/54391/', + 'md5': 'fe44cf7e4ab3198055f2c598fc175cb0', 'info_dict': { 'id': '54391', 'ext': 'mp4', @@ -23,20 +28,45 @@ class LRTIE(InfoExtractor): 'view_count': int, 'like_count': int, }, - 'params': { - 'skip_download': True, # m3u8 download + }, { + # direct mp3 download + 'url': 'http://www.lrt.lt/mediateka/irasas/1013074524/', + 'md5': '389da8ca3cad0f51d12bed0c844f6a0a', + 'info_dict': { + 'id': '1013074524', + 'ext': 'mp3', + 'title': 'Kita tema 2016-09-05 15:05', + 'description': 'md5:1b295a8fc7219ed0d543fc228c931fb5', + 'duration': 3008, + 'view_count': int, + 'like_count': int, }, - } + }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) title = remove_end(self._og_search_title(webpage), ' - LRT') - m3u8_url = self._search_regex( - r'file\s*:\s*(["\'])(?P<url>.+?)\1\s*\+\s*location\.hash\.substring\(1\)', - webpage, 'm3u8 url', group='url') - formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4') + + formats = [] + for _, file_url in re.findall( + r'file\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage): + ext = determine_ext(file_url) + if ext not in ('m3u8', 'mp3'): + continue + # mp3 served as m3u8 produces stuttered media file + if ext == 'm3u8' and '.mp3' in file_url: + continue + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + file_url, video_id, 'mp4', entry_protocol='m3u8_native', + fatal=False)) + elif ext == 'mp3': + formats.append({ + 'url': file_url, + 'vcodec': 'none', + }) self._sort_formats(formats) thumbnail = self._og_search_thumbnail(webpage) diff --git a/youtube_dl/extractor/lynda.py b/youtube_dl/extractor/lynda.py index a98c4c530..da94eab56 100644 --- a/youtube_dl/extractor/lynda.py +++ b/youtube_dl/extractor/lynda.py @@ -73,7 +73,7 @@ class LyndaBaseIE(InfoExtractor): # Already logged in if any(re.search(p, signin_page) for p in ( - 'isLoggedIn\s*:\s*true', r'logout\.aspx', r'>Log out<')): + r'isLoggedIn\s*:\s*true', r'logout\.aspx', r'>Log out<')): return # Step 2: submit email @@ -94,12 +94,12 @@ class LyndaBaseIE(InfoExtractor): class LyndaIE(LyndaBaseIE): IE_NAME = 'lynda' IE_DESC = 'lynda.com videos' - _VALID_URL = r'https?://www\.lynda\.com/(?:[^/]+/[^/]+/\d+|player/embed)/(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?lynda\.com/(?:[^/]+/[^/]+/(?P<course_id>\d+)|player/embed)/(?P<id>\d+)' _TIMECODE_REGEX = r'\[(?P<timecode>\d+:\d+:\d+[\.,]\d+)\]' _TESTS = [{ - 'url': 'http://www.lynda.com/Bootstrap-tutorials/Using-exercise-files/110885/114408-4.html', + 'url': 'https://www.lynda.com/Bootstrap-tutorials/Using-exercise-files/110885/114408-4.html', # md5 is unstable 'info_dict': { 'id': '114408', @@ -112,19 +112,71 @@ class LyndaIE(LyndaBaseIE): 'only_matching': True, }] + def _raise_unavailable(self, video_id): + self.raise_login_required( + 'Video %s is only available for members' % video_id) + def _real_extract(self, url): - video_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + course_id = mobj.group('course_id') + + query = { + 'videoId': video_id, + 'type': 'video', + } video = self._download_json( - 'http://www.lynda.com/ajax/player?videoId=%s&type=video' % video_id, - video_id, 'Downloading video JSON') + 'https://www.lynda.com/ajax/player', video_id, + 'Downloading video JSON', fatal=False, query=query) + + # Fallback scenario + if not video: + query['courseId'] = course_id + + play = self._download_json( + 'https://www.lynda.com/ajax/course/%s/%s/play' + % (course_id, video_id), video_id, 'Downloading play JSON') + + if not play: + self._raise_unavailable(video_id) + + formats = [] + for formats_dict in play: + urls = formats_dict.get('urls') + if not isinstance(urls, dict): + continue + cdn = formats_dict.get('name') + for format_id, format_url in urls.items(): + if not format_url: + continue + formats.append({ + 'url': format_url, + 'format_id': '%s-%s' % (cdn, format_id) if cdn else format_id, + 'height': int_or_none(format_id), + }) + self._sort_formats(formats) + + conviva = self._download_json( + 'https://www.lynda.com/ajax/player/conviva', video_id, + 'Downloading conviva JSON', query=query) + + return { + 'id': video_id, + 'title': conviva['VideoTitle'], + 'description': conviva.get('VideoDescription'), + 'release_year': int_or_none(conviva.get('ReleaseYear')), + 'duration': int_or_none(conviva.get('Duration')), + 'creator': conviva.get('Author'), + 'formats': formats, + } if 'Status' in video: raise ExtractorError( 'lynda returned error: %s' % video['Message'], expected=True) if video.get('HasAccess') is False: - self.raise_login_required('Video %s is only available for members' % video_id) + self._raise_unavailable(video_id) video_id = compat_str(video.get('ID') or video_id) duration = int_or_none(video.get('DurationInSeconds')) @@ -148,7 +200,7 @@ class LyndaIE(LyndaBaseIE): for prioritized_stream_id, prioritized_stream in prioritized_streams.items(): formats.extend([{ 'url': video_url, - 'width': int_or_none(format_id), + 'height': int_or_none(format_id), 'format_id': '%s-%s' % (prioritized_stream_id, format_id), } for format_id, video_url in prioritized_stream.items()]) @@ -187,7 +239,7 @@ class LyndaIE(LyndaBaseIE): return srt def _get_subtitles(self, video_id): - url = 'http://www.lynda.com/ajax/player?videoId=%s&type=transcript' % video_id + url = 'https://www.lynda.com/ajax/player?videoId=%s&type=transcript' % video_id subs = self._download_json(url, None, False) if subs: return {'en': [{'ext': 'srt', 'data': self._fix_subtitles(subs)}]} @@ -209,7 +261,7 @@ class LyndaCourseIE(LyndaBaseIE): course_id = mobj.group('courseid') course = self._download_json( - 'http://www.lynda.com/ajax/player?courseId=%s&type=course' % course_id, + 'https://www.lynda.com/ajax/player?courseId=%s&type=course' % course_id, course_id, 'Downloading course JSON') if course.get('Status') == 'NotFound': @@ -231,7 +283,7 @@ class LyndaCourseIE(LyndaBaseIE): if video_id: entries.append({ '_type': 'url_transparent', - 'url': 'http://www.lynda.com/%s/%s-4.html' % (course_path, video_id), + 'url': 'https://www.lynda.com/%s/%s-4.html' % (course_path, video_id), 'ie_key': LyndaIE.ie_key(), 'chapter': chapter.get('Title'), 'chapter_number': int_or_none(chapter.get('ChapterIndex')), diff --git a/youtube_dl/extractor/m6.py b/youtube_dl/extractor/m6.py index 39d2742c8..9806875e8 100644 --- a/youtube_dl/extractor/m6.py +++ b/youtube_dl/extractor/m6.py @@ -1,4 +1,4 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor diff --git a/youtube_dl/extractor/macgamestore.py b/youtube_dl/extractor/macgamestore.py index 3cd4a3a19..43db9929c 100644 --- a/youtube_dl/extractor/macgamestore.py +++ b/youtube_dl/extractor/macgamestore.py @@ -7,7 +7,7 @@ from ..utils import ExtractorError class MacGameStoreIE(InfoExtractor): IE_NAME = 'macgamestore' IE_DESC = 'MacGameStore trailers' - _VALID_URL = r'https?://www\.macgamestore\.com/mediaviewer\.php\?trailer=(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?macgamestore\.com/mediaviewer\.php\?trailer=(?P<id>\d+)' _TEST = { 'url': 'http://www.macgamestore.com/mediaviewer.php?trailer=2450', diff --git a/youtube_dl/extractor/mailru.py b/youtube_dl/extractor/mailru.py index 9a7098c43..f7cc3c832 100644 --- a/youtube_dl/extractor/mailru.py +++ b/youtube_dl/extractor/mailru.py @@ -1,4 +1,4 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals import re diff --git a/youtube_dl/extractor/mangomolo.py b/youtube_dl/extractor/mangomolo.py new file mode 100644 index 000000000..1885ac7df --- /dev/null +++ b/youtube_dl/extractor/mangomolo.py @@ -0,0 +1,54 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import base64 + +from .common import InfoExtractor +from ..compat import compat_urllib_parse_unquote +from ..utils import ( + int_or_none, +) + + +class MangomoloBaseIE(InfoExtractor): + def _get_real_id(self, page_id): + return page_id + + def _real_extract(self, url): + page_id = self._get_real_id(self._match_id(url)) + webpage = self._download_webpage(url, page_id) + hidden_inputs = self._hidden_inputs(webpage) + m3u8_entry_protocol = 'm3u8' if self._IS_LIVE else 'm3u8_native' + + format_url = self._html_search_regex( + [ + r'file\s*:\s*"(https?://[^"]+?/playlist.m3u8)', + r'<a[^>]+href="(rtsp://[^"]+)"' + ], webpage, 'format url') + formats = self._extract_wowza_formats( + format_url, page_id, m3u8_entry_protocol, ['smil']) + self._sort_formats(formats) + + return { + 'id': page_id, + 'title': self._live_title(page_id) if self._IS_LIVE else page_id, + 'uploader_id': hidden_inputs.get('userid'), + 'duration': int_or_none(hidden_inputs.get('duration')), + 'is_live': self._IS_LIVE, + 'formats': formats, + } + + +class MangomoloVideoIE(MangomoloBaseIE): + IE_NAME = 'mangomolo:video' + _VALID_URL = r'https?://admin\.mangomolo\.com/analytics/index\.php/customers/embed/video\?.*?\bid=(?P<id>\d+)' + _IS_LIVE = False + + +class MangomoloLiveIE(MangomoloBaseIE): + IE_NAME = 'mangomolo:live' + _VALID_URL = r'https?://admin\.mangomolo\.com/analytics/index\.php/customers/embed/index\?.*?\bchannelid=(?P<id>(?:[A-Za-z0-9+/=]|%2B|%2F|%3D)+)' + _IS_LIVE = True + + def _get_real_id(self, page_id): + return base64.b64decode(compat_urllib_parse_unquote(page_id).encode()).decode() diff --git a/youtube_dl/extractor/matchtv.py b/youtube_dl/extractor/matchtv.py index 33b0b539f..bc9933a81 100644 --- a/youtube_dl/extractor/matchtv.py +++ b/youtube_dl/extractor/matchtv.py @@ -14,7 +14,7 @@ class MatchTVIE(InfoExtractor): 'info_dict': { 'id': 'matchtv-live', 'ext': 'flv', - 'title': 're:^Матч ТВ - Прямой эфир \d{4}-\d{2}-\d{2} \d{2}:\d{2}$', + 'title': r're:^Матч ТВ - Прямой эфир \d{4}-\d{2}-\d{2} \d{2}:\d{2}$', 'is_live': True, }, 'params': { diff --git a/youtube_dl/extractor/mdr.py b/youtube_dl/extractor/mdr.py index 2100583df..6e4290aad 100644 --- a/youtube_dl/extractor/mdr.py +++ b/youtube_dl/extractor/mdr.py @@ -72,7 +72,7 @@ class MDRIE(InfoExtractor): data_url = self._search_regex( r'(?:dataURL|playerXml(?:["\'])?)\s*:\s*(["\'])(?P<url>.+/(?:video|audio)-?[0-9]+-avCustom\.xml)\1', - webpage, 'data url', group='url').replace('\/', '/') + webpage, 'data url', group='url').replace(r'\/', '/') doc = self._download_xml( compat_urlparse.urljoin(url, data_url), video_id) diff --git a/youtube_dl/extractor/meipai.py b/youtube_dl/extractor/meipai.py new file mode 100644 index 000000000..c8eacb4f4 --- /dev/null +++ b/youtube_dl/extractor/meipai.py @@ -0,0 +1,104 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_duration, + unified_timestamp, +) + + +class MeipaiIE(InfoExtractor): + IE_DESC = '美拍' + _VALID_URL = r'https?://(?:www\.)?meipai.com/media/(?P<id>[0-9]+)' + _TESTS = [{ + # regular uploaded video + 'url': 'http://www.meipai.com/media/531697625', + 'md5': 'e3e9600f9e55a302daecc90825854b4f', + 'info_dict': { + 'id': '531697625', + 'ext': 'mp4', + 'title': '#葉子##阿桑##余姿昀##超級女聲#', + 'description': '#葉子##阿桑##余姿昀##超級女聲#', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 152, + 'timestamp': 1465492420, + 'upload_date': '20160609', + 'view_count': 35511, + 'creator': '她她-TATA', + 'tags': ['葉子', '阿桑', '余姿昀', '超級女聲'], + } + }, { + # record of live streaming + 'url': 'http://www.meipai.com/media/585526361', + 'md5': 'ff7d6afdbc6143342408223d4f5fb99a', + 'info_dict': { + 'id': '585526361', + 'ext': 'mp4', + 'title': '姿昀和善願 練歌練琴啦😁😁😁', + 'description': '姿昀和善願 練歌練琴啦😁😁😁', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 5975, + 'timestamp': 1474311799, + 'upload_date': '20160919', + 'view_count': 1215, + 'creator': '她她-TATA', + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + title = self._og_search_title( + webpage, default=None) or self._html_search_regex( + r'<title[^>]*>([^<]+)', webpage, 'title') + + formats = [] + + # recorded playback of live streaming + m3u8_url = self._html_search_regex( + r'file:\s*encodeURIComponent\((["\'])(?P(?:(?!\1).)+)\1\)', + webpage, 'm3u8 url', group='url', default=None) + if m3u8_url: + formats.extend(self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + + if not formats: + # regular uploaded video + video_url = self._search_regex( + r'data-video=(["\'])(?P(?:(?!\1).)+)\1', webpage, 'video url', + group='url', default=None) + if video_url: + formats.append({ + 'url': video_url, + 'format_id': 'http', + }) + + timestamp = unified_timestamp(self._og_search_property( + 'video:release_date', webpage, 'release date', fatal=False)) + + tags = self._og_search_property( + 'video:tag', webpage, 'tags', default='').split(',') + + view_count = int_or_none(self._html_search_meta( + 'interactionCount', webpage, 'view count')) + duration = parse_duration(self._html_search_meta( + 'duration', webpage, 'duration')) + creator = self._og_search_property( + 'video:director', webpage, 'creator', fatal=False) + + return { + 'id': video_id, + 'title': title, + 'description': self._og_search_description(webpage), + 'thumbnail': self._og_search_thumbnail(webpage), + 'duration': duration, + 'timestamp': timestamp, + 'view_count': view_count, + 'creator': creator, + 'tags': tags, + 'formats': formats, + } diff --git a/youtube_dl/extractor/melonvod.py b/youtube_dl/extractor/melonvod.py new file mode 100644 index 000000000..bd8cf13ab --- /dev/null +++ b/youtube_dl/extractor/melonvod.py @@ -0,0 +1,72 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + urljoin, +) + + +class MelonVODIE(InfoExtractor): + _VALID_URL = r'https?://vod\.melon\.com/video/detail2\.html?\?.*?mvId=(?P[0-9]+)' + _TEST = { + 'url': 'http://vod.melon.com/video/detail2.htm?mvId=50158734', + 'info_dict': { + 'id': '50158734', + 'ext': 'mp4', + 'title': "Jessica 'Wonderland' MV Making Film", + 'thumbnail': r're:^https?://.*\.jpg$', + 'artist': 'Jessica (제시카)', + 'upload_date': '20161212', + 'duration': 203, + }, + 'params': { + 'skip_download': 'm3u8 download', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + play_info = self._download_json( + 'http://vod.melon.com/video/playerInfo.json', video_id, + note='Downloading player info JSON', query={'mvId': video_id}) + + title = play_info['mvInfo']['MVTITLE'] + + info = self._download_json( + 'http://vod.melon.com/delivery/streamingInfo.json', video_id, + note='Downloading streaming info JSON', + query={ + 'contsId': video_id, + 'contsType': 'VIDEO', + }) + + stream_info = info['streamingInfo'] + + formats = self._extract_m3u8_formats( + stream_info['encUrl'], video_id, 'mp4', m3u8_id='hls') + self._sort_formats(formats) + + artist_list = play_info.get('artistList') + artist = None + if isinstance(artist_list, list): + artist = ', '.join( + [a['ARTISTNAMEWEBLIST'] + for a in artist_list if a.get('ARTISTNAMEWEBLIST')]) + + thumbnail = urljoin(info.get('staticDomain'), stream_info.get('imgPath')) + + duration = int_or_none(stream_info.get('playTime')) + upload_date = stream_info.get('mvSvcOpenDt', '')[:8] or None + + return { + 'id': video_id, + 'title': title, + 'artist': artist, + 'thumbnail': thumbnail, + 'upload_date': upload_date, + 'duration': duration, + 'formats': formats + } diff --git a/youtube_dl/extractor/metacafe.py b/youtube_dl/extractor/metacafe.py index e6e7659a1..9880924e6 100644 --- a/youtube_dl/extractor/metacafe.py +++ b/youtube_dl/extractor/metacafe.py @@ -133,7 +133,7 @@ class MetacafeIE(InfoExtractor): video_id, display_id = re.match(self._VALID_URL, url).groups() # the video may come from an external site - m_external = re.match('^(\w{2})-(.*)$', video_id) + m_external = re.match(r'^(\w{2})-(.*)$', video_id) if m_external is not None: prefix, ext_id = m_external.groups() # Check if video comes from YouTube diff --git a/youtube_dl/extractor/metacritic.py b/youtube_dl/extractor/metacritic.py index 444ec0310..7d468d78b 100644 --- a/youtube_dl/extractor/metacritic.py +++ b/youtube_dl/extractor/metacritic.py @@ -9,7 +9,7 @@ from ..utils import ( class MetacriticIE(InfoExtractor): - _VALID_URL = r'https?://www\.metacritic\.com/.+?/trailers/(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?metacritic\.com/.+?/trailers/(?P\d+)' _TESTS = [{ 'url': 'http://www.metacritic.com/game/playstation-4/infamous-second-son/trailers/3698222', diff --git a/youtube_dl/extractor/mgoon.py b/youtube_dl/extractor/mgoon.py index 94bc87b00..7bb473900 100644 --- a/youtube_dl/extractor/mgoon.py +++ b/youtube_dl/extractor/mgoon.py @@ -27,7 +27,7 @@ class MgoonIE(InfoExtractor): 'upload_date': '20131220', 'ext': 'mp4', 'title': 'md5:543aa4c27a4931d371c3f433e8cebebc', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', } }, { diff --git a/youtube_dl/extractor/mgtv.py b/youtube_dl/extractor/mgtv.py index 27bdff8b2..659ede8c2 100644 --- a/youtube_dl/extractor/mgtv.py +++ b/youtube_dl/extractor/mgtv.py @@ -6,7 +6,7 @@ from ..utils import int_or_none class MGTVIE(InfoExtractor): - _VALID_URL = r'https?://www\.mgtv\.com/v/(?:[^/]+/)*(?P\d+)\.html' + _VALID_URL = r'https?://(?:www\.)?mgtv\.com/v/(?:[^/]+/)*(?P\d+)\.html' IE_DESC = '芒果TV' _TESTS = [{ @@ -18,7 +18,7 @@ class MGTVIE(InfoExtractor): 'title': '我是歌手第四季双年巅峰会:韩红李玟“双王”领军对抗', 'description': '我是歌手第四季双年巅峰会', 'duration': 7461, - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', }, }, { # no tbr extracted from stream_url diff --git a/youtube_dl/extractor/miaopai.py b/youtube_dl/extractor/miaopai.py new file mode 100644 index 000000000..f9e35ac7f --- /dev/null +++ b/youtube_dl/extractor/miaopai.py @@ -0,0 +1,40 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class MiaoPaiIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?miaopai\.com/show/(?P[-A-Za-z0-9~_]+)' + _TEST = { + 'url': 'http://www.miaopai.com/show/n~0hO7sfV1nBEw4Y29-Hqg__.htm', + 'md5': '095ed3f1cd96b821add957bdc29f845b', + 'info_dict': { + 'id': 'n~0hO7sfV1nBEw4Y29-Hqg__', + 'ext': 'mp4', + 'title': '西游记音乐会的秒拍视频', + 'thumbnail': 're:^https?://.*/n~0hO7sfV1nBEw4Y29-Hqg___m.jpg', + } + } + + _USER_AGENT_IPAD = 'Mozilla/5.0 (iPad; CPU OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1' + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage( + url, video_id, headers={'User-Agent': self._USER_AGENT_IPAD}) + + title = self._html_search_regex( + r'([^<]+)', webpage, 'title') + thumbnail = self._html_search_regex( + r']+class=(?P[\'"]).*\bvideo_img\b.*(?P=q1)[^>]+data-url=(?P[\'"])(?P[^\'"]+)(?P=q2)', + webpage, 'thumbnail', fatal=False, group='url') + videos = self._parse_html5_media_entries(url, webpage, video_id) + info = videos[0] + + info.update({ + 'id': video_id, + 'title': title, + 'thumbnail': thumbnail, + }) + return info diff --git a/youtube_dl/extractor/microsoftvirtualacademy.py b/youtube_dl/extractor/microsoftvirtualacademy.py index afd3e98ec..8e0aee0e6 100644 --- a/youtube_dl/extractor/microsoftvirtualacademy.py +++ b/youtube_dl/extractor/microsoftvirtualacademy.py @@ -71,12 +71,15 @@ class MicrosoftVirtualAcademyIE(MicrosoftVirtualAcademyBaseIE): formats = [] for sources in settings.findall(compat_xpath('.//MediaSources')): - if sources.get('videoType') == 'smoothstreaming': - continue + sources_type = sources.get('videoType') for source in sources.findall(compat_xpath('./MediaSource')): video_url = source.text if not video_url or not video_url.startswith('http'): continue + if sources_type == 'smoothstreaming': + formats.extend(self._extract_ism_formats( + video_url, video_id, 'mss', fatal=False)) + continue video_mode = source.get('videoMode') height = int_or_none(self._search_regex( r'^(\d+)[pP]$', video_mode or '', 'height', default=None)) diff --git a/youtube_dl/extractor/minhateca.py b/youtube_dl/extractor/minhateca.py index e6730b75a..dccc54249 100644 --- a/youtube_dl/extractor/minhateca.py +++ b/youtube_dl/extractor/minhateca.py @@ -19,7 +19,7 @@ class MinhatecaIE(InfoExtractor): 'id': '125848331', 'ext': 'mp4', 'title': 'youtube-dl test video', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'filesize_approx': 1530000, 'duration': 9, 'view_count': int, diff --git a/youtube_dl/extractor/ministrygrid.py b/youtube_dl/extractor/ministrygrid.py index e48eba3fa..8ad9239c5 100644 --- a/youtube_dl/extractor/ministrygrid.py +++ b/youtube_dl/extractor/ministrygrid.py @@ -8,7 +8,7 @@ from ..utils import ( class MinistryGridIE(InfoExtractor): - _VALID_URL = r'https?://www\.ministrygrid.com/([^/?#]*/)*(?P[^/#?]+)/?(?:$|[?#])' + _VALID_URL = r'https?://(?:www\.)?ministrygrid\.com/([^/?#]*/)*(?P[^/#?]+)/?(?:$|[?#])' _TEST = { 'url': 'http://www.ministrygrid.com/training-viewer/-/training/t4g-2014-conference/the-gospel-by-numbers-4/the-gospel-by-numbers', @@ -17,7 +17,7 @@ class MinistryGridIE(InfoExtractor): 'id': '3453494717001', 'ext': 'mp4', 'title': 'The Gospel by Numbers', - 'thumbnail': 're:^https?://.*\.jpg', + 'thumbnail': r're:^https?://.*\.jpg', 'upload_date': '20140410', 'description': 'Coming soon from T4G 2014!', 'uploader_id': '2034960640001', diff --git a/youtube_dl/extractor/miomio.py b/youtube_dl/extractor/miomio.py index 937ba0f28..ec1b4c4fe 100644 --- a/youtube_dl/extractor/miomio.py +++ b/youtube_dl/extractor/miomio.py @@ -25,10 +25,7 @@ class MioMioIE(InfoExtractor): 'title': '【SKY】字幕 铠武昭和VS平成 假面骑士大战FEAT战队 魔星字幕组 字幕', 'duration': 5923, }, - 'params': { - # The server provides broken file - 'skip_download': True, - } + 'skip': 'Unable to load videos', }, { 'url': 'http://www.miomio.tv/watch/cc184024/', 'info_dict': { @@ -47,16 +44,12 @@ class MioMioIE(InfoExtractor): 'skip': 'Unable to load videos', }, { # new 'h5' player - 'url': 'http://www.miomio.tv/watch/cc273295/', - 'md5': '', + 'url': 'http://www.miomio.tv/watch/cc273997/', + 'md5': '0b27a4b4495055d826813f8c3a6b2070', 'info_dict': { - 'id': '273295', + 'id': '273997', 'ext': 'mp4', - 'title': 'アウト×デラックス 20160526', - }, - 'params': { - # intermittent HTTP 500 - 'skip_download': True, + 'title': 'マツコの知らない世界【劇的進化SP!ビニール傘&冷凍食品2016】 1_2 - 16 05 31', }, }] @@ -116,7 +109,7 @@ class MioMioIE(InfoExtractor): player_webpage = self._download_webpage( player_url, video_id, note='Downloading player webpage', headers={'Referer': url}) - entries = self._parse_html5_media_entries(player_url, player_webpage) + entries = self._parse_html5_media_entries(player_url, player_webpage, video_id) http_headers = {'Referer': player_url} else: http_headers = {'Referer': 'http://www.miomio.tv%s' % mioplayer_path} diff --git a/youtube_dl/extractor/mitele.py b/youtube_dl/extractor/mitele.py index cd169f361..79e0b8ada 100644 --- a/youtube_dl/extractor/mitele.py +++ b/youtube_dl/extractor/mitele.py @@ -1,19 +1,20 @@ # coding: utf-8 from __future__ import unicode_literals -import re +import uuid from .common import InfoExtractor from ..compat import ( + compat_str, compat_urllib_parse_urlencode, compat_urlparse, ) from ..utils import ( - get_element_by_attribute, int_or_none, - remove_start, extract_attributes, determine_ext, + smuggle_url, + parse_duration, ) @@ -72,76 +73,133 @@ class MiTeleBaseIE(InfoExtractor): } -class MiTeleIE(MiTeleBaseIE): +class MiTeleIE(InfoExtractor): IE_DESC = 'mitele.es' - _VALID_URL = r'https?://www\.mitele\.es/(?:[^/]+/){3}(?P[^/]+)/' + _VALID_URL = r'https?://(?:www\.)?mitele\.es/(?:[^/]+/)+(?P[^/]+)/player' _TESTS = [{ - 'url': 'http://www.mitele.es/programas-tv/diario-de/la-redaccion/programa-144/', - # MD5 is unstable + 'url': 'http://www.mitele.es/programas-tv/diario-de/57b0dfb9c715da65618b4afa/player', 'info_dict': { - 'id': '0NF1jJnxS1Wu3pHrmvFyw2', - 'display_id': 'programa-144', + 'id': '57b0dfb9c715da65618b4afa', 'ext': 'mp4', 'title': 'Tor, la web invisible', 'description': 'md5:3b6fce7eaa41b2d97358726378d9369f', 'series': 'Diario de', 'season': 'La redacción', + 'season_number': 14, + 'season_id': 'diario_de_t14_11981', 'episode': 'Programa 144', - 'thumbnail': 're:(?i)^https?://.*\.jpg$', + 'episode_number': 3, + 'thumbnail': r're:(?i)^https?://.*\.jpg$', 'duration': 2913, }, + 'add_ie': ['Ooyala'], }, { # no explicit title - 'url': 'http://www.mitele.es/programas-tv/cuarto-milenio/temporada-6/programa-226/', + 'url': 'http://www.mitele.es/programas-tv/cuarto-milenio/57b0de3dc915da14058b4876/player', 'info_dict': { - 'id': 'eLZSwoEd1S3pVyUm8lc6F', - 'display_id': 'programa-226', + 'id': '57b0de3dc915da14058b4876', 'ext': 'mp4', - 'title': 'Cuarto Milenio - Temporada 6 - Programa 226', - 'description': 'md5:50daf9fadefa4e62d9fc866d0c015701', + 'title': 'Cuarto Milenio Temporada 6 Programa 226', + 'description': 'md5:5ff132013f0cd968ffbf1f5f3538a65f', 'series': 'Cuarto Milenio', 'season': 'Temporada 6', + 'season_number': 6, + 'season_id': 'cuarto_milenio_t06_12715', 'episode': 'Programa 226', - 'thumbnail': 're:(?i)^https?://.*\.jpg$', - 'duration': 7312, + 'episode_number': 24, + 'thumbnail': r're:(?i)^https?://.*\.jpg$', + 'duration': 7313, }, 'params': { 'skip_download': True, }, + 'add_ie': ['Ooyala'], + }, { + 'url': 'http://www.mitele.es/series-online/la-que-se-avecina/57aac5c1c915da951a8b45ed/player', + 'only_matching': True, }] def _real_extract(self, url): - display_id = self._match_id(url) + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) - webpage = self._download_webpage(url, display_id) + gigya_url = self._search_regex( + r'[^>]*[^>]*[^>]*', + webpage, 'gigya', default=None) + gigya_sc = self._download_webpage( + compat_urlparse.urljoin('http://www.mitele.es/', gigya_url), + video_id, 'Downloading gigya script') - info = self._get_player_info(url, webpage) + # Get a appKey/uuid for getting the session key + appKey_var = self._search_regex( + r'value\s*\(\s*["\']appGridApplicationKey["\']\s*,\s*([0-9a-f]+)', + gigya_sc, 'appKey variable') + appKey = self._search_regex( + r'var\s+%s\s*=\s*["\']([0-9a-f]+)' % appKey_var, gigya_sc, 'appKey') - title = self._search_regex( - r'class="Destacado-text"[^>]*>\s*([^<]+)', - webpage, 'title', default=None) + session_json = self._download_json( + 'https://appgrid-api.cloud.accedo.tv/session', + video_id, 'Downloading session keys', query={ + 'appKey': appKey, + 'uuid': compat_str(uuid.uuid4()), + }) - mobj = re.search(r'''(?sx) - class="Destacado-text"[^>]*>.*?

\s* - (?P[^<]+)\s* - (?P[^<]+)\s* - (?P[^<]+)''', webpage) - series, season, episode = mobj.groups() if mobj else [None] * 3 + paths = self._download_json( + 'https://appgrid-api.cloud.accedo.tv/metadata/general_configuration,%20web_configuration', + video_id, 'Downloading paths JSON', + query={'sessionKey': compat_str(session_json['sessionKey'])}) - if not title: - if mobj: - title = '%s - %s - %s' % (series, season, episode) - else: - title = remove_start(self._search_regex( - r'([^<]+)', webpage, 'title'), 'Ver online ') + ooyala_s = paths['general_configuration']['api_configuration']['ooyala_search'] + source = self._download_json( + 'http://%s%s%s/docs/%s' % ( + ooyala_s['base_url'], ooyala_s['full_path'], + ooyala_s['provider_id'], video_id), + video_id, 'Downloading data JSON', query={ + 'include_titles': 'Series,Season', + 'product_name': 'test', + 'format': 'full', + })['hits']['hits'][0]['_source'] - info.update({ - 'display_id': display_id, + embedCode = source['offers'][0]['embed_codes'][0] + titles = source['localizable_titles'][0] + + title = titles.get('title_medium') or titles['title_long'] + + description = titles.get('summary_long') or titles.get('summary_medium') + + def get(key1, key2): + value1 = source.get(key1) + if not value1 or not isinstance(value1, list): + return + if not isinstance(value1[0], dict): + return + return value1[0].get(key2) + + series = get('localizable_titles_series', 'title_medium') + + season = get('localizable_titles_season', 'title_medium') + season_number = int_or_none(source.get('season_number')) + season_id = source.get('season_id') + + episode = titles.get('title_sort_name') + episode_number = int_or_none(source.get('episode_number')) + + duration = parse_duration(get('videos', 'duration')) + + return { + '_type': 'url_transparent', + # for some reason only HLS is supported + 'url': smuggle_url('ooyala:' + embedCode, {'supportedformats': 'm3u8,dash'}), + 'id': video_id, 'title': title, - 'description': get_element_by_attribute('class', 'text', webpage), + 'description': description, 'series': series, 'season': season, + 'season_number': season_number, + 'season_id': season_id, 'episode': episode, - }) - return info + 'episode_number': episode_number, + 'duration': duration, + 'thumbnail': get('images', 'url'), + } diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py index 560fe188b..a24b3165a 100644 --- a/youtube_dl/extractor/mixcloud.py +++ b/youtube_dl/extractor/mixcloud.py @@ -16,13 +16,12 @@ from ..utils import ( clean_html, ExtractorError, OnDemandPagedList, - parse_count, str_to_int, ) class MixcloudIE(InfoExtractor): - _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([^/]+)/(?!stream|uploads|favorites|listens|playlists)([^/]+)' + _VALID_URL = r'https?://(?:(?:www|beta|m)\.)?mixcloud\.com/([^/]+)/(?!stream|uploads|favorites|listens|playlists)([^/]+)' IE_NAME = 'mixcloud' _TESTS = [{ @@ -34,9 +33,8 @@ class MixcloudIE(InfoExtractor): 'description': 'After quite a long silence from myself, finally another Drum\'n\'Bass mix with my favourite current dance floor bangers.', 'uploader': 'Daniel Holbach', 'uploader_id': 'dholbach', - 'thumbnail': 're:https?://.*\.jpg', + 'thumbnail': r're:https?://.*\.jpg', 'view_count': int, - 'like_count': int, }, }, { 'url': 'http://www.mixcloud.com/gillespeterson/caribou-7-inch-vinyl-mix-chat/', @@ -49,8 +47,10 @@ class MixcloudIE(InfoExtractor): 'uploader_id': 'gillespeterson', 'thumbnail': 're:https?://.*', 'view_count': int, - 'like_count': int, }, + }, { + 'url': 'https://beta.mixcloud.com/RedLightRadio/nosedrip-15-red-light-radio-01-18-2016/', + 'only_matching': True, }] # See https://www.mixcloud.com/media/js2/www_js_2.9e23256562c080482435196ca3975ab5.js @@ -86,26 +86,18 @@ class MixcloudIE(InfoExtractor): song_url = play_info['stream_url'] - PREFIX = ( - r'm-play-on-spacebar[^>]+' - r'(?:\s+[a-zA-Z0-9-]+(?:="[^"]+")?)*?\s+') - title = self._html_search_regex( - PREFIX + r'm-title="([^"]+)"', webpage, 'title') + title = self._html_search_regex(r'm-title="([^"]+)"', webpage, 'title') thumbnail = self._proto_relative_url(self._html_search_regex( - PREFIX + r'm-thumbnail-url="([^"]+)"', webpage, 'thumbnail', - fatal=False)) + r'm-thumbnail-url="([^"]+)"', webpage, 'thumbnail', fatal=False)) uploader = self._html_search_regex( - PREFIX + r'm-owner-name="([^"]+)"', - webpage, 'uploader', fatal=False) + r'm-owner-name="([^"]+)"', webpage, 'uploader', fatal=False) uploader_id = self._search_regex( r'\s+"profile": "([^"]+)",', webpage, 'uploader id', fatal=False) description = self._og_search_description(webpage) - like_count = parse_count(self._search_regex( - r'\bbutton-favorite[^>]+>.*?]+class=["\']toggle-number[^>]+>\s*([^<]+)', - webpage, 'like count', default=None)) view_count = str_to_int(self._search_regex( [r'([0-9,.]+)'], + r'/listeners/?">([0-9,.]+)', + r'm-tooltip=["\']([\d,.]+) plays'], webpage, 'play count', default=None)) return { @@ -117,7 +109,6 @@ class MixcloudIE(InfoExtractor): 'uploader': uploader, 'uploader_id': uploader_id, 'view_count': view_count, - 'like_count': like_count, } diff --git a/youtube_dl/extractor/mlb.py b/youtube_dl/extractor/mlb.py index e242b897f..59cd4b838 100644 --- a/youtube_dl/extractor/mlb.py +++ b/youtube_dl/extractor/mlb.py @@ -37,7 +37,7 @@ class MLBIE(InfoExtractor): 'duration': 66, 'timestamp': 1405980600, 'upload_date': '20140721', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', }, }, { @@ -51,7 +51,7 @@ class MLBIE(InfoExtractor): 'duration': 46, 'timestamp': 1405105800, 'upload_date': '20140711', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', }, }, { @@ -65,7 +65,7 @@ class MLBIE(InfoExtractor): 'duration': 488, 'timestamp': 1405399936, 'upload_date': '20140715', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', }, }, { @@ -79,7 +79,7 @@ class MLBIE(InfoExtractor): 'duration': 52, 'timestamp': 1405390722, 'upload_date': '20140715', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', }, }, { diff --git a/youtube_dl/extractor/mnet.py b/youtube_dl/extractor/mnet.py index e3f42e7bd..6a85dcbd5 100644 --- a/youtube_dl/extractor/mnet.py +++ b/youtube_dl/extractor/mnet.py @@ -22,7 +22,7 @@ class MnetIE(InfoExtractor): 'timestamp': 1451564040, 'age_limit': 0, 'thumbnails': 'mincount:5', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'ext': 'flv', }, 'params': { diff --git a/youtube_dl/extractor/moevideo.py b/youtube_dl/extractor/moevideo.py index 978d5d5bf..44bcc4982 100644 --- a/youtube_dl/extractor/moevideo.py +++ b/youtube_dl/extractor/moevideo.py @@ -30,12 +30,13 @@ class MoeVideoIE(InfoExtractor): 'ext': 'flv', 'title': 'Sink cut out machine', 'description': 'md5:f29ff97b663aefa760bf7ca63c8ca8a8', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'width': 540, 'height': 360, 'duration': 179, 'filesize': 17822500, - } + }, + 'skip': 'Video has been removed', }, { 'url': 'http://playreplay.net/video/77107.7f325710a627383d40540d8e991a', @@ -45,7 +46,7 @@ class MoeVideoIE(InfoExtractor): 'ext': 'flv', 'title': 'Operacion Condor.', 'description': 'md5:7e68cb2fcda66833d5081c542491a9a3', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'width': 480, 'height': 296, 'duration': 6027, diff --git a/youtube_dl/extractor/mofosex.py b/youtube_dl/extractor/mofosex.py index e47c80119..54716f5c7 100644 --- a/youtube_dl/extractor/mofosex.py +++ b/youtube_dl/extractor/mofosex.py @@ -1,53 +1,56 @@ from __future__ import unicode_literals -import os -import re - -from .common import InfoExtractor -from ..compat import ( - compat_urllib_parse_unquote, - compat_urllib_parse_urlparse, +from ..utils import ( + int_or_none, + str_to_int, + unified_strdate, ) -from ..utils import sanitized_Request +from .keezmovies import KeezMoviesIE -class MofosexIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?Pmofosex\.com/videos/(?P[0-9]+)/.*?\.html)' - _TEST = { - 'url': 'http://www.mofosex.com/videos/5018/japanese-teen-music-video.html', - 'md5': '1b2eb47ac33cc75d4a80e3026b613c5a', +class MofosexIE(KeezMoviesIE): + _VALID_URL = r'https?://(?:www\.)?mofosex\.com/videos/(?P\d+)/(?P[^/?#&.]+)\.html' + _TESTS = [{ + 'url': 'http://www.mofosex.com/videos/318131/amateur-teen-playing-and-masturbating-318131.html', + 'md5': '39a15853632b7b2e5679f92f69b78e91', 'info_dict': { - 'id': '5018', + 'id': '318131', + 'display_id': 'amateur-teen-playing-and-masturbating-318131', 'ext': 'mp4', - 'title': 'Japanese Teen Music Video', + 'title': 'amateur teen playing and masturbating', + 'thumbnail': r're:^https?://.*\.jpg$', + 'upload_date': '20121114', + 'view_count': int, + 'like_count': int, + 'dislike_count': int, 'age_limit': 18, } - } + }, { + # This video is no longer available + 'url': 'http://www.mofosex.com/videos/5018/japanese-teen-music-video.html', + 'only_matching': True, + }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - url = 'http://www.' + mobj.group('url') + webpage, info = self._extract_info(url) - req = sanitized_Request(url) - req.add_header('Cookie', 'age_verified=1') - webpage = self._download_webpage(req, video_id) + view_count = str_to_int(self._search_regex( + r'VIEWS:\s*([\d,.]+)', webpage, 'view count', fatal=False)) + like_count = int_or_none(self._search_regex( + r'id=["\']amountLikes["\'][^>]*>(\d+)', webpage, + 'like count', fatal=False)) + dislike_count = int_or_none(self._search_regex( + r'id=["\']amountDislikes["\'][^>]*>(\d+)', webpage, + 'like count', fatal=False)) + upload_date = unified_strdate(self._html_search_regex( + r'Added:([^<]+)', webpage, 'upload date', fatal=False)) - video_title = self._html_search_regex(r'

(.+?)<', webpage, 'title') - video_url = compat_urllib_parse_unquote(self._html_search_regex(r'flashvars.video_url = \'([^\']+)', webpage, 'video_url')) - path = compat_urllib_parse_urlparse(video_url).path - extension = os.path.splitext(path)[1][1:] - format = path.split('/')[5].split('_')[:2] - format = '-'.join(format) + info.update({ + 'view_count': view_count, + 'like_count': like_count, + 'dislike_count': dislike_count, + 'upload_date': upload_date, + 'thumbnail': self._og_search_thumbnail(webpage), + }) - age_limit = self._rta_search(webpage) - - return { - 'id': video_id, - 'title': video_title, - 'url': video_url, - 'ext': extension, - 'format': format, - 'format_id': format, - 'age_limit': age_limit, - } + return info diff --git a/youtube_dl/extractor/mojvideo.py b/youtube_dl/extractor/mojvideo.py index 0ba435dc5..165e658c9 100644 --- a/youtube_dl/extractor/mojvideo.py +++ b/youtube_dl/extractor/mojvideo.py @@ -20,7 +20,7 @@ class MojvideoIE(InfoExtractor): 'display_id': 'v-avtu-pred-mano-rdecelaska-alfi-nipic', 'ext': 'mp4', 'title': 'V avtu pred mano rdečelaska - Alfi Nipič', - 'thumbnail': 're:^http://.*\.jpg$', + 'thumbnail': r're:^http://.*\.jpg$', 'duration': 242, } } diff --git a/youtube_dl/extractor/motherless.py b/youtube_dl/extractor/motherless.py index 5e1a8a71a..6fe3b6049 100644 --- a/youtube_dl/extractor/motherless.py +++ b/youtube_dl/extractor/motherless.py @@ -23,7 +23,7 @@ class MotherlessIE(InfoExtractor): 'categories': ['Gaming', 'anal', 'reluctant', 'rough', 'Wife'], 'upload_date': '20100913', 'uploader_id': 'famouslyfuckedup', - 'thumbnail': 're:http://.*\.jpg', + 'thumbnail': r're:http://.*\.jpg', 'age_limit': 18, } }, { @@ -37,7 +37,7 @@ class MotherlessIE(InfoExtractor): 'game', 'hairy'], 'upload_date': '20140622', 'uploader_id': 'Sulivana7x', - 'thumbnail': 're:http://.*\.jpg', + 'thumbnail': r're:http://.*\.jpg', 'age_limit': 18, }, 'skip': '404', @@ -51,7 +51,7 @@ class MotherlessIE(InfoExtractor): 'categories': ['superheroine heroine superher'], 'upload_date': '20140827', 'uploader_id': 'shade0230', - 'thumbnail': 're:http://.*\.jpg', + 'thumbnail': r're:http://.*\.jpg', 'age_limit': 18, } }, { diff --git a/youtube_dl/extractor/motorsport.py b/youtube_dl/extractor/motorsport.py index 370328b36..c9d1ab64d 100644 --- a/youtube_dl/extractor/motorsport.py +++ b/youtube_dl/extractor/motorsport.py @@ -9,7 +9,7 @@ from ..compat import ( class MotorsportIE(InfoExtractor): IE_DESC = 'motorsport.com' - _VALID_URL = r'https?://www\.motorsport\.com/[^/?#]+/video/(?:[^/?#]+/)(?P[^/]+)/?(?:$|[?#])' + _VALID_URL = r'https?://(?:www\.)?motorsport\.com/[^/?#]+/video/(?:[^/?#]+/)(?P[^/]+)/?(?:$|[?#])' _TEST = { 'url': 'http://www.motorsport.com/f1/video/main-gallery/red-bull-racing-2014-rules-explained/', 'info_dict': { diff --git a/youtube_dl/extractor/movieclips.py b/youtube_dl/extractor/movieclips.py index d0cb8278e..5453da1ac 100644 --- a/youtube_dl/extractor/movieclips.py +++ b/youtube_dl/extractor/movieclips.py @@ -11,7 +11,7 @@ from ..utils import ( class MovieClipsIE(InfoExtractor): - _VALID_URL = r'https?://(?:www.)?movieclips\.com/videos/.+-(?P\d+)(?:\?|$)' + _VALID_URL = r'https?://(?:www\.)?movieclips\.com/videos/.+-(?P\d+)(?:\?|$)' _TEST = { 'url': 'http://www.movieclips.com/videos/warcraft-trailer-1-561180739597', 'md5': '42b5a0352d4933a7bd54f2104f481244', @@ -20,7 +20,7 @@ class MovieClipsIE(InfoExtractor): 'ext': 'mp4', 'title': 'Warcraft Trailer 1', 'description': 'Watch Trailer 1 from Warcraft (2016). Legendary’s WARCRAFT is a 3D epic adventure of world-colliding conflict based.', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'timestamp': 1446843055, 'upload_date': '20151106', 'uploader': 'Movieclips', diff --git a/youtube_dl/extractor/moviezine.py b/youtube_dl/extractor/moviezine.py index f130b75c4..85cc6e22f 100644 --- a/youtube_dl/extractor/moviezine.py +++ b/youtube_dl/extractor/moviezine.py @@ -1,4 +1,4 @@ -# -*- coding: utf-8 -*- +# coding: utf-8 from __future__ import unicode_literals import re @@ -7,7 +7,7 @@ from .common import InfoExtractor class MoviezineIE(InfoExtractor): - _VALID_URL = r'https?://www\.moviezine\.se/video/(?P[^?#]+)' + _VALID_URL = r'https?://(?:www\.)?moviezine\.se/video/(?P[^?#]+)' _TEST = { 'url': 'http://www.moviezine.se/video/205866', @@ -16,7 +16,7 @@ class MoviezineIE(InfoExtractor): 'ext': 'mp4', 'title': 'Oculus - Trailer 1', 'description': 'md5:40cc6790fc81d931850ca9249b40e8a4', - 'thumbnail': 're:http://.*\.jpg', + 'thumbnail': r're:http://.*\.jpg', }, } diff --git a/youtube_dl/extractor/ssa.py b/youtube_dl/extractor/movingimage.py similarity index 63% rename from youtube_dl/extractor/ssa.py rename to youtube_dl/extractor/movingimage.py index 54d1843f2..4f62d628a 100644 --- a/youtube_dl/extractor/ssa.py +++ b/youtube_dl/extractor/movingimage.py @@ -7,21 +7,18 @@ from ..utils import ( ) -class SSAIE(InfoExtractor): - _VALID_URL = r'https?://ssa\.nls\.uk/film/(?P\d+)' +class MovingImageIE(InfoExtractor): + _VALID_URL = r'https?://movingimage\.nls\.uk/film/(?P\d+)' _TEST = { - 'url': 'http://ssa.nls.uk/film/3561', + 'url': 'http://movingimage.nls.uk/film/3561', + 'md5': '4caa05c2b38453e6f862197571a7be2f', 'info_dict': { 'id': '3561', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'SHETLAND WOOL', 'description': 'md5:c5afca6871ad59b4271e7704fe50ab04', 'duration': 900, - 'thumbnail': 're:^https?://.*\.jpg$', - }, - 'params': { - # rtmp download - 'skip_download': True, + 'thumbnail': r're:^https?://.*\.jpg$', }, } @@ -30,10 +27,9 @@ class SSAIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - streamer = self._search_regex( - r"'streamer'\s*,\S*'(rtmp[^']+)'", webpage, 'streamer') - play_path = self._search_regex( - r"'file'\s*,\s*'([^']+)'", webpage, 'file').rpartition('.')[0] + formats = self._extract_m3u8_formats( + self._html_search_regex(r'file\s*:\s*"([^"]+)"', webpage, 'm3u8 manifest URL'), + video_id, ext='mp4', entry_protocol='m3u8_native') def search_field(field_name, fatal=False): return self._search_regex( @@ -44,13 +40,11 @@ class SSAIE(InfoExtractor): description = unescapeHTML(search_field('Description')) duration = parse_duration(search_field('Running time')) thumbnail = self._search_regex( - r"'image'\s*,\s*'([^']+)'", webpage, 'thumbnails', fatal=False) + r"image\s*:\s*'([^']+)'", webpage, 'thumbnail', fatal=False) return { 'id': video_id, - 'url': streamer, - 'play_path': play_path, - 'ext': 'flv', + 'formats': formats, 'title': title, 'description': description, 'duration': duration, diff --git a/youtube_dl/extractor/msn.py b/youtube_dl/extractor/msn.py index 1ec8e0f50..1473bcf48 100644 --- a/youtube_dl/extractor/msn.py +++ b/youtube_dl/extractor/msn.py @@ -69,21 +69,15 @@ class MSNIE(InfoExtractor): if not format_url: continue ext = determine_ext(format_url) - # .ism is not yet supported (see - # https://github.com/rg3/youtube-dl/issues/8118) if ext == 'ism': - continue + formats.extend(self._extract_ism_formats( + format_url + '/Manifest', display_id, 'mss', fatal=False)) if 'm3u8' in format_url: # m3u8_native should not be used here until # https://github.com/rg3/youtube-dl/issues/9913 is fixed m3u8_formats = self._extract_m3u8_formats( format_url, display_id, 'mp4', m3u8_id='hls', fatal=False) - # Despite metadata in m3u8 all video+audio formats are - # actually video-only (no audio) - for f in m3u8_formats: - if f.get('acodec') != 'none' and f.get('vcodec') != 'none': - f['acodec'] = 'none' formats.extend(m3u8_formats) else: formats.append({ diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index 2f455680e..8acea1461 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -4,7 +4,6 @@ import re from .common import InfoExtractor from ..compat import ( - compat_urllib_parse_urlencode, compat_str, compat_xpath, ) @@ -14,12 +13,14 @@ from ..utils import ( fix_xml_ampersands, float_or_none, HEADRequest, + RegexNotFoundError, sanitized_Request, strip_or_none, timeconvert, + try_get, unescapeHTML, + update_url_query, url_basename, - RegexNotFoundError, xpath_text, ) @@ -36,14 +37,10 @@ class MTVServicesInfoExtractor(InfoExtractor): def _id_from_uri(uri): return uri.split(':')[-1] - # This was originally implemented for ComedyCentral, but it also works here - @classmethod - def _transform_rtmp_url(cls, rtmp_video_url): - m = re.match(r'^rtmpe?://.*?/(?Pgsp\..+?/.*)$', rtmp_video_url) - if not m: - return {'rtmp': rtmp_video_url} - base = 'http://viacommtvstrmfs.fplive.net/' - return {'http': base + m.group('finalid')} + @staticmethod + def _remove_template_parameter(url): + # Remove the templates, like &device={device} + return re.sub(r'&[^=]*?={.*?}(?=(&|$))', '', url) def _get_feed_url(self, uri): return self._FEED_URL @@ -71,7 +68,7 @@ class MTVServicesInfoExtractor(InfoExtractor): url = re.sub(r'.+pxE=mp4', 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=0+_pxK=18639+_pxE=mp4', url, 1) return [{'url': url, 'ext': 'mp4'}] - def _extract_video_formats(self, mdoc, mtvn_id): + def _extract_video_formats(self, mdoc, mtvn_id, video_id): if re.match(r'.*/(error_country_block\.swf|geoblock\.mp4|copyright_error\.flv(?:\?geo\b.+?)?)$', mdoc.find('.//src').text) is not None: if mtvn_id is not None and self._MOBILE_TEMPLATE is not None: self.to_screen('The normal version is not available from your ' @@ -82,21 +79,33 @@ class MTVServicesInfoExtractor(InfoExtractor): formats = [] for rendition in mdoc.findall('.//rendition'): - try: - _, _, ext = rendition.attrib['type'].partition('/') - rtmp_video_url = rendition.find('./src').text - if rtmp_video_url.endswith('siteunavail.png'): - continue - new_urls = self._transform_rtmp_url(rtmp_video_url) - formats.extend([{ - 'ext': 'flv' if new_url.startswith('rtmp') else ext, - 'url': new_url, - 'format_id': '-'.join(filter(None, [kind, rendition.get('bitrate')])), - 'width': int(rendition.get('width')), - 'height': int(rendition.get('height')), - } for kind, new_url in new_urls.items()]) - except (KeyError, TypeError): - raise ExtractorError('Invalid rendition field.') + if rendition.get('method') == 'hls': + hls_url = rendition.find('./src').text + formats.extend(self._extract_m3u8_formats( + hls_url, video_id, ext='mp4', entry_protocol='m3u8_native', + m3u8_id='hls')) + else: + # fms + try: + _, _, ext = rendition.attrib['type'].partition('/') + rtmp_video_url = rendition.find('./src').text + if 'error_not_available.swf' in rtmp_video_url: + raise ExtractorError( + '%s said: video is not available' % self.IE_NAME, + expected=True) + if rtmp_video_url.endswith('siteunavail.png'): + continue + formats.extend([{ + 'ext': 'flv' if rtmp_video_url.startswith('rtmp') else ext, + 'url': rtmp_video_url, + 'format_id': '-'.join(filter(None, [ + 'rtmp' if rtmp_video_url.startswith('rtmp') else None, + rendition.get('bitrate')])), + 'width': int(rendition.get('width')), + 'height': int(rendition.get('height')), + }]) + except (KeyError, TypeError): + raise ExtractorError('Invalid rendition field.') self._sort_formats(formats) return formats @@ -112,17 +121,17 @@ class MTVServicesInfoExtractor(InfoExtractor): } for typographic in transcript.findall('./typographic')] return subtitles - def _get_video_info(self, itemdoc): + def _get_video_info(self, itemdoc, use_hls=True): uri = itemdoc.find('guid').text video_id = self._id_from_uri(uri) self.report_extraction(video_id) content_el = itemdoc.find('%s/%s' % (_media_xml_tag('group'), _media_xml_tag('content'))) - mediagen_url = content_el.attrib['url'] - # Remove the templates, like &device={device} - mediagen_url = re.sub(r'&[^=]*?={.*?}(?=(&|$))', '', mediagen_url) + mediagen_url = self._remove_template_parameter(content_el.attrib['url']) + mediagen_url = mediagen_url.replace('device={device}', '') if 'acceptMethods' not in mediagen_url: mediagen_url += '&' if '?' in mediagen_url else '?' - mediagen_url += 'acceptMethods=fms' + mediagen_url += 'acceptMethods=' + mediagen_url += 'hls' if use_hls else 'fms' mediagen_doc = self._download_xml(mediagen_url, video_id, 'Downloading video urls') @@ -163,9 +172,11 @@ class MTVServicesInfoExtractor(InfoExtractor): if mtvn_id_node is not None: mtvn_id = mtvn_id_node.text + formats = self._extract_video_formats(mediagen_doc, mtvn_id, video_id) + return { 'title': title, - 'formats': self._extract_video_formats(mediagen_doc, mtvn_id), + 'formats': formats, 'subtitles': self._extract_subtitles(mediagen_doc, mtvn_id), 'id': video_id, 'thumbnail': self._get_thumbnail_url(uri, itemdoc), @@ -178,15 +189,15 @@ class MTVServicesInfoExtractor(InfoExtractor): data = {'uri': uri} if self._LANG: data['lang'] = self._LANG - return compat_urllib_parse_urlencode(data) + return data - def _get_videos_info(self, uri): + def _get_videos_info(self, uri, use_hls=True): video_id = self._id_from_uri(uri) feed_url = self._get_feed_url(uri) - info_url = feed_url + '?' + self._get_feed_query(uri) - return self._get_videos_info_from_url(info_url, video_id) + info_url = update_url_query(feed_url, self._get_feed_query(uri)) + return self._get_videos_info_from_url(info_url, video_id, use_hls) - def _get_videos_info_from_url(self, url, video_id): + def _get_videos_info_from_url(self, url, video_id, use_hls=True): idoc = self._download_xml( url, video_id, 'Downloading info', transform_source=fix_xml_ampersands) @@ -195,9 +206,30 @@ class MTVServicesInfoExtractor(InfoExtractor): description = xpath_text(idoc, './channel/description') return self.playlist_result( - [self._get_video_info(item) for item in idoc.findall('.//item')], + [self._get_video_info(item, use_hls) for item in idoc.findall('.//item')], playlist_title=title, playlist_description=description) + def _extract_triforce_mgid(self, webpage, data_zone=None, video_id=None): + triforce_feed = self._parse_json(self._search_regex( + r'triforceManifestFeed\s*=\s*({.+?})\s*;\s*\n', webpage, + 'triforce feed', default='{}'), video_id, fatal=False) + + data_zone = self._search_regex( + r'data-zone=(["\'])(?P.+?_lc_promo.*?)\1', webpage, + 'data zone', default=data_zone, group='zone') + + feed_url = try_get( + triforce_feed, lambda x: x['manifest']['zones'][data_zone]['feed'], + compat_str) + if not feed_url: + return + + feed = self._download_json(feed_url, video_id, fatal=False) + if not feed: + return + + return try_get(feed, lambda x: x['result']['data']['id'], compat_str) + def _extract_mgid(self, webpage): try: # the url can be http://media.mtvnservices.com/fb/{mgid}.swf @@ -218,7 +250,11 @@ class MTVServicesInfoExtractor(InfoExtractor): sm4_embed = self._html_search_meta( 'sm4:video:embed', webpage, 'sm4 embed', default='') mgid = self._search_regex( - r'embed/(mgid:.+?)["\'&?/]', sm4_embed, 'mgid') + r'embed/(mgid:.+?)["\'&?/]', sm4_embed, 'mgid', default=None) + + if not mgid: + mgid = self._extract_triforce_mgid(webpage) + return mgid def _real_extract(self, url): @@ -256,13 +292,9 @@ class MTVServicesEmbeddedIE(MTVServicesInfoExtractor): def _get_feed_url(self, uri): video_id = self._id_from_uri(uri) - site_id = uri.replace(video_id, '') - config_url = ('http://media.mtvnservices.com/pmt/e1/players/{0}/' - 'context4/context5/config.xml'.format(site_id)) - config_doc = self._download_xml(config_url, video_id) - feed_node = config_doc.find('.//feed') - feed_url = feed_node.text.strip().split('?')[0] - return feed_url + config = self._download_json( + 'http://media.mtvnservices.com/pmt/e1/access/index.html?uri=%s&configtype=edge' % uri, video_id) + return self._remove_template_parameter(config['feedWithQueryParams']) def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -271,6 +303,61 @@ class MTVServicesEmbeddedIE(MTVServicesInfoExtractor): class MTVIE(MTVServicesInfoExtractor): + IE_NAME = 'mtv' + _VALID_URL = r'https?://(?:www\.)?mtv\.com/(?:video-clips|(?:full-)?episodes)/(?P[^/?#.]+)' + _FEED_URL = 'http://www.mtv.com/feeds/mrss/' + + _TESTS = [{ + 'url': 'http://www.mtv.com/video-clips/vl8qof/unlocking-the-truth-trailer', + 'md5': '1edbcdf1e7628e414a8c5dcebca3d32b', + 'info_dict': { + 'id': '5e14040d-18a4-47c4-a582-43ff602de88e', + 'ext': 'mp4', + 'title': 'Unlocking The Truth|July 18, 2016|1|101|Trailer', + 'description': '"Unlocking the Truth" premieres August 17th at 11/10c.', + 'timestamp': 1468846800, + 'upload_date': '20160718', + }, + }, { + 'url': 'http://www.mtv.com/full-episodes/94tujl/unlocking-the-truth-gates-of-hell-season-1-ep-101', + 'only_matching': True, + }, { + 'url': 'http://www.mtv.com/episodes/g8xu7q/teen-mom-2-breaking-the-wall-season-7-ep-713', + 'only_matching': True, + }] + + +class MTV81IE(InfoExtractor): + IE_NAME = 'mtv81' + _VALID_URL = r'https?://(?:www\.)?mtv81\.com/videos/(?P[^/?#.]+)' + + _TEST = { + 'url': 'http://www.mtv81.com/videos/artist-to-watch/the-godfather-of-japanese-hip-hop-segment-1/', + 'md5': '1edbcdf1e7628e414a8c5dcebca3d32b', + 'info_dict': { + 'id': '5e14040d-18a4-47c4-a582-43ff602de88e', + 'ext': 'mp4', + 'title': 'Unlocking The Truth|July 18, 2016|1|101|Trailer', + 'description': '"Unlocking the Truth" premieres August 17th at 11/10c.', + 'timestamp': 1468846800, + 'upload_date': '20160718', + }, + } + + def _extract_mgid(self, webpage): + return self._search_regex( + r'getTheVideo\((["\'])(?Pmgid:.+?)\1', webpage, + 'mgid', group='id') + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + mgid = self._extract_mgid(webpage) + return self.url_result('http://media.mtvnservices.com/embed/%s' % mgid) + + +class MTVVideoIE(MTVServicesInfoExtractor): + IE_NAME = 'mtv:video' _VALID_URL = r'''(?x)^https?:// (?:(?:www\.)?mtv\.com/videos/.+?/(?P[0-9]+)/[^/]+$| m\.mtv\.com/videos/video\.rbml\?.*?id=(?P[^&]+))''' diff --git a/youtube_dl/extractor/muenchentv.py b/youtube_dl/extractor/muenchentv.py index b4e8ad17e..2cc2bf229 100644 --- a/youtube_dl/extractor/muenchentv.py +++ b/youtube_dl/extractor/muenchentv.py @@ -22,7 +22,7 @@ class MuenchenTVIE(InfoExtractor): 'ext': 'mp4', 'title': 're:^münchen.tv-Livestream [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', 'is_live': True, - 'thumbnail': 're:^https?://.*\.jpg$' + 'thumbnail': r're:^https?://.*\.jpg$' }, 'params': { 'skip_download': True, @@ -36,7 +36,7 @@ class MuenchenTVIE(InfoExtractor): title = self._live_title(self._og_search_title(webpage)) data_js = self._search_regex( - r'(?s)\nplaylist:\s*(\[.*?}\]),related:', + r'(?s)\nplaylist:\s*(\[.*?}\]),', webpage, 'playlist configuration') data_json = js_to_json(data_js) data = json.loads(data_json)[0] diff --git a/youtube_dl/extractor/musicplayon.py b/youtube_dl/extractor/musicplayon.py index 2174e5665..1854d59a5 100644 --- a/youtube_dl/extractor/musicplayon.py +++ b/youtube_dl/extractor/musicplayon.py @@ -1,4 +1,4 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor diff --git a/youtube_dl/extractor/mwave.py b/youtube_dl/extractor/mwave.py index a103e0323..a67276596 100644 --- a/youtube_dl/extractor/mwave.py +++ b/youtube_dl/extractor/mwave.py @@ -9,21 +9,24 @@ from ..utils import ( class MwaveIE(InfoExtractor): - _VALID_URL = r'https?://mwave\.interest\.me/mnettv/videodetail\.m\?searchVideoDetailVO\.clip_id=(?P[0-9]+)' + _VALID_URL = r'https?://mwave\.interest\.me/(?:[^/]+/)?mnettv/videodetail\.m\?searchVideoDetailVO\.clip_id=(?P[0-9]+)' _URL_TEMPLATE = 'http://mwave.interest.me/mnettv/videodetail.m?searchVideoDetailVO.clip_id=%s' - _TEST = { + _TESTS = [{ 'url': 'http://mwave.interest.me/mnettv/videodetail.m?searchVideoDetailVO.clip_id=168859', # md5 is unstable 'info_dict': { 'id': '168859', 'ext': 'flv', 'title': '[M COUNTDOWN] SISTAR - SHAKE IT', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'uploader': 'M COUNTDOWN', 'duration': 206, 'view_count': int, } - } + }, { + 'url': 'http://mwave.interest.me/en/mnettv/videodetail.m?searchVideoDetailVO.clip_id=176199', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) @@ -60,19 +63,22 @@ class MwaveIE(InfoExtractor): class MwaveMeetGreetIE(InfoExtractor): - _VALID_URL = r'https?://mwave\.interest\.me/meetgreet/view/(?P\d+)' - _TEST = { + _VALID_URL = r'https?://mwave\.interest\.me/(?:[^/]+/)?meetgreet/view/(?P\d+)' + _TESTS = [{ 'url': 'http://mwave.interest.me/meetgreet/view/256', 'info_dict': { 'id': '173294', 'ext': 'flv', 'title': '[MEET&GREET] Park BoRam', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'uploader': 'Mwave', 'duration': 3634, 'view_count': int, } - } + }, { + 'url': 'http://mwave.interest.me/en/meetgreet/view/256', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) diff --git a/youtube_dl/extractor/myspace.py b/youtube_dl/extractor/myspace.py index 0d5238d77..f281238c9 100644 --- a/youtube_dl/extractor/myspace.py +++ b/youtube_dl/extractor/myspace.py @@ -1,4 +1,4 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals import re @@ -17,9 +17,10 @@ class MySpaceIE(InfoExtractor): _TESTS = [ { 'url': 'https://myspace.com/fiveminutestothestage/video/little-big-town/109594919', + 'md5': '9c1483c106f4a695c47d2911feed50a7', 'info_dict': { 'id': '109594919', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Little Big Town', 'description': 'This country quartet was all smiles while playing a sold out show at the Pacific Amphitheatre in Orange County, California.', 'uploader': 'Five Minutes to the Stage', @@ -27,37 +28,30 @@ class MySpaceIE(InfoExtractor): 'timestamp': 1414108751, 'upload_date': '20141023', }, - 'params': { - # rtmp download - 'skip_download': True, - }, }, # songs { 'url': 'https://myspace.com/killsorrow/music/song/of-weakened-soul...-93388656-103880681', + 'md5': '1d7ee4604a3da226dd69a123f748b262', 'info_dict': { 'id': '93388656', - 'ext': 'flv', + 'ext': 'm4a', 'title': 'Of weakened soul...', 'uploader': 'Killsorrow', 'uploader_id': 'killsorrow', }, - 'params': { - # rtmp download - 'skip_download': True, - }, }, { - 'add_ie': ['Vevo'], + 'add_ie': ['Youtube'], 'url': 'https://myspace.com/threedaysgrace/music/song/animal-i-have-become-28400208-28218041', 'info_dict': { - 'id': 'USZM20600099', - 'ext': 'mp4', - 'title': 'Animal I Have Become', - 'uploader': 'Three Days Grace', - 'timestamp': int, - 'upload_date': '20060502', + 'id': 'xqds0B_meys', + 'ext': 'webm', + 'title': 'Three Days Grace - Animal I Have Become', + 'description': 'md5:8bd86b3693e72a077cf863a8530c54bb', + 'uploader': 'ThreeDaysGraceVEVO', + 'uploader_id': 'ThreeDaysGraceVEVO', + 'upload_date': '20091002', }, - 'skip': 'VEVO is only available in some countries', }, { 'add_ie': ['Youtube'], 'url': 'https://myspace.com/starset2/music/song/first-light-95799905-106964426', @@ -76,24 +70,46 @@ class MySpaceIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') + is_song = mobj.group('mediatype').startswith('music/song') webpage = self._download_webpage(url, video_id) player_url = self._search_regex( - r'playerSwf":"([^"?]*)', webpage, 'player URL') + r'videoSwf":"([^"?]*)', webpage, 'player URL', fatal=False) - def rtmp_format_from_stream_url(stream_url, width=None, height=None): - rtmp_url, play_path = stream_url.split(';', 1) - return { - 'format_id': 'rtmp', - 'url': rtmp_url, - 'play_path': play_path, - 'player_url': player_url, - 'protocol': 'rtmp', - 'ext': 'flv', - 'width': width, - 'height': height, - } + def formats_from_stream_urls(stream_url, hls_stream_url, http_stream_url, width=None, height=None): + formats = [] + vcodec = 'none' if is_song else None + if hls_stream_url: + formats.append({ + 'format_id': 'hls', + 'url': hls_stream_url, + 'protocol': 'm3u8_native', + 'ext': 'm4a' if is_song else 'mp4', + 'vcodec': vcodec, + }) + if stream_url and player_url: + rtmp_url, play_path = stream_url.split(';', 1) + formats.append({ + 'format_id': 'rtmp', + 'url': rtmp_url, + 'play_path': play_path, + 'player_url': player_url, + 'protocol': 'rtmp', + 'ext': 'flv', + 'width': width, + 'height': height, + 'vcodec': vcodec, + }) + if http_stream_url: + formats.append({ + 'format_id': 'http', + 'url': http_stream_url, + 'width': width, + 'height': height, + 'vcodec': vcodec, + }) + return formats - if mobj.group('mediatype').startswith('music/song'): + if is_song: # songs don't store any useful info in the 'context' variable song_data = self._search_regex( r'''.*?)\1''' % name, song_data, name, default='', group='data') - stream_url = search_data('stream-url') - if not stream_url: + formats = formats_from_stream_urls( + search_data('stream-url'), search_data('hls-stream-url'), + search_data('http-stream-url')) + if not formats: vevo_id = search_data('vevo-id') youtube_id = search_data('youtube-id') if vevo_id: @@ -121,6 +139,7 @@ class MySpaceIE(InfoExtractor): else: raise ExtractorError( 'Found song but don\'t know how to download it') + self._sort_formats(formats) return { 'id': video_id, 'title': self._og_search_title(webpage), @@ -128,27 +147,16 @@ class MySpaceIE(InfoExtractor): 'uploader_id': search_data('artist-username'), 'thumbnail': self._og_search_thumbnail(webpage), 'duration': int_or_none(search_data('duration')), - 'formats': [rtmp_format_from_stream_url(stream_url)] + 'formats': formats, } else: video = self._parse_json(self._search_regex( r'context = ({.*?});', webpage, 'context'), video_id)['video'] - formats = [] - hls_stream_url = video.get('hlsStreamUrl') - if hls_stream_url: - formats.append({ - 'format_id': 'hls', - 'url': hls_stream_url, - 'protocol': 'm3u8_native', - 'ext': 'mp4', - }) - stream_url = video.get('streamUrl') - if stream_url: - formats.append(rtmp_format_from_stream_url( - stream_url, - int_or_none(video.get('width')), - int_or_none(video.get('height')))) + formats = formats_from_stream_urls( + video.get('streamUrl'), video.get('hlsStreamUrl'), + video.get('mp4StreamUrl'), int_or_none(video.get('width')), + int_or_none(video.get('height'))) self._sort_formats(formats) return { 'id': video_id, diff --git a/youtube_dl/extractor/myspass.py b/youtube_dl/extractor/myspass.py index 1ca7b1a9e..2afe535b5 100644 --- a/youtube_dl/extractor/myspass.py +++ b/youtube_dl/extractor/myspass.py @@ -11,7 +11,7 @@ from ..utils import ( class MySpassIE(InfoExtractor): - _VALID_URL = r'https?://www\.myspass\.de/.*' + _VALID_URL = r'https?://(?:www\.)?myspass\.de/.*' _TEST = { 'url': 'http://www.myspass.de/myspass/shows/tvshows/absolute-mehrheit/Absolute-Mehrheit-vom-17022013-Die-Highlights-Teil-2--/11741/', 'md5': '0b49f4844a068f8b33f4b7c88405862b', diff --git a/youtube_dl/extractor/myvi.py b/youtube_dl/extractor/myvi.py index 4c65be122..621ae74a7 100644 --- a/youtube_dl/extractor/myvi.py +++ b/youtube_dl/extractor/myvi.py @@ -27,7 +27,7 @@ class MyviIE(SprutoBaseIE): 'id': 'f16b2bbd-cde8-481c-a981-7cd48605df43', 'ext': 'mp4', 'title': 'хозяин жизни', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 25, }, }, { diff --git a/youtube_dl/extractor/myvideo.py b/youtube_dl/extractor/myvideo.py index 6d447a493..6bb64eb63 100644 --- a/youtube_dl/extractor/myvideo.py +++ b/youtube_dl/extractor/myvideo.py @@ -160,7 +160,7 @@ class MyVideoIE(InfoExtractor): else: video_playpath = '' - video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, 'swfobj') + video_swfobj = self._search_regex(r'swfobject.embedSWF\(\'(.+?)\'', webpage, 'swfobj') video_swfobj = compat_urllib_parse_unquote(video_swfobj) video_title = self._html_search_regex("(.*?)

", diff --git a/youtube_dl/extractor/myvidster.py b/youtube_dl/extractor/myvidster.py index 731c24542..2117d302d 100644 --- a/youtube_dl/extractor/myvidster.py +++ b/youtube_dl/extractor/myvidster.py @@ -13,7 +13,7 @@ class MyVidsterIE(InfoExtractor): 'id': '3685814', 'title': 'md5:7d8427d6d02c4fbcef50fe269980c749', 'upload_date': '20141027', - 'uploader_id': 'utkualp', + 'uploader': 'utkualp', 'ext': 'mp4', 'age_limit': 18, }, diff --git a/youtube_dl/extractor/nationalgeographic.py b/youtube_dl/extractor/nationalgeographic.py index 0027ff1b8..b91d86528 100644 --- a/youtube_dl/extractor/nationalgeographic.py +++ b/youtube_dl/extractor/nationalgeographic.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from .adobepass import AdobePassIE from .theplatform import ThePlatformIE from ..utils import ( smuggle_url, @@ -65,7 +66,7 @@ class NationalGeographicVideoIE(InfoExtractor): } -class NationalGeographicIE(ThePlatformIE): +class NationalGeographicIE(ThePlatformIE, AdobePassIE): IE_NAME = 'natgeo' _VALID_URL = r'https?://channel\.nationalgeographic\.com/(?:wild/)?[^/]+/(?:videos|episodes)/(?P[^/?]+)' @@ -110,28 +111,42 @@ class NationalGeographicIE(ThePlatformIE): release_url = self._search_regex( r'video_auth_playlist_url\s*=\s*"([^"]+)"', webpage, 'release url') + theplatform_path = self._search_regex(r'https?://link.theplatform.com/s/([^?]+)', release_url, 'theplatform path') + video_id = theplatform_path.split('/')[-1] query = { 'mbr': 'true', - 'switch': 'http', } is_auth = self._search_regex(r'video_is_auth\s*=\s*"([^"]+)"', webpage, 'is auth', fatal=False) if is_auth == 'auth': auth_resource_id = self._search_regex( r"video_auth_resourceId\s*=\s*'([^']+)'", webpage, 'auth resource id') - query['auth'] = self._extract_mvpd_auth(url, display_id, 'natgeo', auth_resource_id) or '' + query['auth'] = self._extract_mvpd_auth(url, video_id, 'natgeo', auth_resource_id) - return { - '_type': 'url_transparent', - 'ie_key': 'ThePlatform', - 'url': smuggle_url( - update_url_query(release_url, query), - {'force_smil_url': True}), + formats = [] + subtitles = {} + for key, value in (('switch', 'http'), ('manifest', 'm3u')): + tp_query = query.copy() + tp_query.update({ + key: value, + }) + tp_formats, tp_subtitles = self._extract_theplatform_smil( + update_url_query(release_url, tp_query), video_id, 'Downloading %s SMIL data' % value) + formats.extend(tp_formats) + subtitles = self._merge_subtitles(subtitles, tp_subtitles) + self._sort_formats(formats) + + info = self._extract_theplatform_metadata(theplatform_path, display_id) + info.update({ + 'id': video_id, + 'formats': formats, + 'subtitles': subtitles, 'display_id': display_id, - } + }) + return info -class NationalGeographicEpisodeGuideIE(ThePlatformIE): +class NationalGeographicEpisodeGuideIE(InfoExtractor): IE_NAME = 'natgeo:episodeguide' _VALID_URL = r'https?://channel\.nationalgeographic\.com/(?:wild/)?(?P[^/]+)/episode-guide' _TESTS = [ diff --git a/youtube_dl/extractor/naver.py b/youtube_dl/extractor/naver.py index 0891d2772..e8131333f 100644 --- a/youtube_dl/extractor/naver.py +++ b/youtube_dl/extractor/naver.py @@ -1,4 +1,4 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals import re @@ -12,10 +12,10 @@ from ..utils import ( class NaverIE(InfoExtractor): - _VALID_URL = r'https?://(?:m\.)?tvcast\.naver\.com/v/(?P\d+)' + _VALID_URL = r'https?://(?:m\.)?tv(?:cast)?\.naver\.com/v/(?P\d+)' _TESTS = [{ - 'url': 'http://tvcast.naver.com/v/81652', + 'url': 'http://tv.naver.com/v/81652', 'info_dict': { 'id': '81652', 'ext': 'mp4', @@ -24,7 +24,7 @@ class NaverIE(InfoExtractor): 'upload_date': '20130903', }, }, { - 'url': 'http://tvcast.naver.com/v/395837', + 'url': 'http://tv.naver.com/v/395837', 'md5': '638ed4c12012c458fefcddfd01f173cd', 'info_dict': { 'id': '395837', @@ -34,6 +34,9 @@ class NaverIE(InfoExtractor): 'upload_date': '20150519', }, 'skip': 'Georestricted', + }, { + 'url': 'http://tvcast.naver.com/v/81652', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/nba.py b/youtube_dl/extractor/nba.py index d896b0d04..53561961c 100644 --- a/youtube_dl/extractor/nba.py +++ b/youtube_dl/extractor/nba.py @@ -1,25 +1,20 @@ from __future__ import unicode_literals import functools -import os.path import re -from .common import InfoExtractor +from .turner import TurnerBaseIE from ..compat import ( compat_urllib_parse_urlencode, compat_urlparse, ) from ..utils import ( - int_or_none, OnDemandPagedList, - parse_duration, remove_start, - xpath_text, - xpath_attr, ) -class NBAIE(InfoExtractor): +class NBAIE(TurnerBaseIE): _VALID_URL = r'https?://(?:watch\.|www\.)?nba\.com/(?P(?:[^/]+/)+(?P[^?]*?))/?(?:/index\.html)?(?:\?.*)?$' _TESTS = [{ 'url': 'http://www.nba.com/video/games/nets/2012/12/04/0021200253-okc-bkn-recap.nba/index.html', @@ -44,28 +39,30 @@ class NBAIE(InfoExtractor): 'url': 'http://watch.nba.com/video/channels/playoffs/2015/05/20/0041400301-cle-atl-recap.nba', 'md5': 'b2b39b81cf28615ae0c3360a3f9668c4', 'info_dict': { - 'id': '0041400301-cle-atl-recap', + 'id': 'channels/playoffs/2015/05/20/0041400301-cle-atl-recap.nba', 'ext': 'mp4', 'title': 'Hawks vs. Cavaliers Game 1', 'description': 'md5:8094c3498d35a9bd6b1a8c396a071b4d', 'duration': 228, 'timestamp': 1432134543, 'upload_date': '20150520', - } + }, + 'expected_warnings': ['Unable to download f4m manifest'], }, { 'url': 'http://www.nba.com/clippers/news/doc-rivers-were-not-trading-blake', 'info_dict': { - 'id': '1455672027478-Doc_Feb16_720', + 'id': 'teams/clippers/2016/02/17/1455672027478-Doc_Feb16_720.mov-297324', 'ext': 'mp4', 'title': 'Practice: Doc Rivers - 2/16/16', 'description': 'Head Coach Doc Rivers addresses the media following practice.', - 'upload_date': '20160217', + 'upload_date': '20160216', 'timestamp': 1455672000, }, 'params': { # m3u8 download 'skip_download': True, }, + 'expected_warnings': ['Unable to download f4m manifest'], }, { 'url': 'http://www.nba.com/timberwolves/wiggins-shootaround#', 'info_dict': { @@ -80,7 +77,7 @@ class NBAIE(InfoExtractor): }, { 'url': 'http://www.nba.com/timberwolves/wiggins-shootaround#', 'info_dict': { - 'id': 'Wigginsmp4', + 'id': 'teams/timberwolves/2014/12/12/Wigginsmp4-3462601', 'ext': 'mp4', 'title': 'Shootaround Access - Dec. 12 | Andrew Wiggins', 'description': 'Wolves rookie Andrew Wiggins addresses the media after Friday\'s shootaround.', @@ -92,6 +89,7 @@ class NBAIE(InfoExtractor): # m3u8 download 'skip_download': True, }, + 'expected_warnings': ['Unable to download f4m manifest'], }] _PAGE_SIZE = 30 @@ -145,53 +143,12 @@ class NBAIE(InfoExtractor): if path.startswith('video/teams'): path = 'video/channels/proxy/' + path[6:] - video_info = self._download_xml('http://www.nba.com/%s.xml' % path, video_id) - video_id = os.path.splitext(xpath_text(video_info, 'slug'))[0] - title = xpath_text(video_info, 'headline') - description = xpath_text(video_info, 'description') - duration = parse_duration(xpath_text(video_info, 'length')) - timestamp = int_or_none(xpath_attr(video_info, 'dateCreated', 'uts')) - - thumbnails = [] - for image in video_info.find('images'): - thumbnails.append({ - 'id': image.attrib.get('cut'), - 'url': image.text, - 'width': int_or_none(image.attrib.get('width')), - 'height': int_or_none(image.attrib.get('height')), + return self._extract_cvp_info( + 'http://www.nba.com/%s.xml' % path, video_id, { + 'default': { + 'media_src': 'http://nba.cdn.turner.com/nba/big', + }, + 'm3u8': { + 'media_src': 'http://nbavod-f.akamaihd.net', + }, }) - - formats = [] - for video_file in video_info.findall('.//file'): - video_url = video_file.text - if video_url.startswith('/'): - continue - if video_url.endswith('.m3u8'): - formats.extend(self._extract_m3u8_formats(video_url, video_id, ext='mp4', m3u8_id='hls', fatal=False)) - elif video_url.endswith('.f4m'): - formats.extend(self._extract_f4m_formats(video_url + '?hdcore=3.4.1.1', video_id, f4m_id='hds', fatal=False)) - else: - key = video_file.attrib.get('bitrate') - format_info = { - 'format_id': key, - 'url': video_url, - } - mobj = re.search(r'(\d+)x(\d+)(?:_(\d+))?', key) - if mobj: - format_info.update({ - 'width': int(mobj.group(1)), - 'height': int(mobj.group(2)), - 'tbr': int_or_none(mobj.group(3)), - }) - formats.append(format_info) - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'duration': duration, - 'timestamp': timestamp, - 'thumbnails': thumbnails, - 'formats': formats, - } diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py index f694e210b..434a94de4 100644 --- a/youtube_dl/extractor/nbc.py +++ b/youtube_dl/extractor/nbc.py @@ -9,11 +9,12 @@ from ..utils import ( lowercase_escape, smuggle_url, unescapeHTML, + update_url_query, ) class NBCIE(InfoExtractor): - _VALID_URL = r'https?://www\.nbc\.com/(?:[^/]+/)+(?Pn?\d+)' + _VALID_URL = r'https?://(?:www\.)?nbc\.com/(?:[^/]+/)+(?Pn?\d+)' _TESTS = [ { @@ -138,7 +139,7 @@ class NBCSportsVPlayerIE(InfoExtractor): class NBCSportsIE(InfoExtractor): # Does not include https because its certificate is invalid - _VALID_URL = r'https?://www\.nbcsports\.com//?(?:[^/]+/)+(?P[0-9a-z-]+)' + _VALID_URL = r'https?://(?:www\.)?nbcsports\.com//?(?:[^/]+/)+(?P[0-9a-z-]+)' _TEST = { 'url': 'http://www.nbcsports.com//college-basketball/ncaab/tom-izzo-michigan-st-has-so-much-respect-duke', @@ -161,7 +162,7 @@ class NBCSportsIE(InfoExtractor): class CSNNEIE(InfoExtractor): - _VALID_URL = r'https?://www\.csnne\.com/video/(?P[0-9a-z-]+)' + _VALID_URL = r'https?://(?:www\.)?csnne\.com/video/(?P[0-9a-z-]+)' _TEST = { 'url': 'http://www.csnne.com/video/snc-evening-update-wright-named-red-sox-no-5-starter', @@ -208,7 +209,7 @@ class NBCNewsIE(ThePlatformIE): 'url': 'http://www.nbcnews.com/watch/nbcnews-com/how-twitter-reacted-to-the-snowden-interview-269389891880', 'md5': 'af1adfa51312291a017720403826bb64', 'info_dict': { - 'id': '269389891880', + 'id': 'p_tweet_snow_140529', 'ext': 'mp4', 'title': 'How Twitter Reacted To The Snowden Interview', 'description': 'md5:65a0bd5d76fe114f3c2727aa3a81fe64', @@ -232,7 +233,7 @@ class NBCNewsIE(ThePlatformIE): 'url': 'http://www.nbcnews.com/nightly-news/video/nightly-news-with-brian-williams-full-broadcast-february-4-394064451844', 'md5': '73135a2e0ef819107bbb55a5a9b2a802', 'info_dict': { - 'id': '394064451844', + 'id': 'nn_netcast_150204', 'ext': 'mp4', 'title': 'Nightly News with Brian Williams Full Broadcast (February 4)', 'description': 'md5:1c10c1eccbe84a26e5debb4381e2d3c5', @@ -245,7 +246,7 @@ class NBCNewsIE(ThePlatformIE): 'url': 'http://www.nbcnews.com/business/autos/volkswagen-11-million-vehicles-could-have-suspect-software-emissions-scandal-n431456', 'md5': 'a49e173825e5fcd15c13fc297fced39d', 'info_dict': { - 'id': '529953347624', + 'id': 'x_lon_vwhorn_150922', 'ext': 'mp4', 'title': 'Volkswagen U.S. Chief:\xa0 We Have Totally Screwed Up', 'description': 'md5:c8be487b2d80ff0594c005add88d8351', @@ -258,7 +259,7 @@ class NBCNewsIE(ThePlatformIE): 'url': 'http://www.today.com/video/see-the-aurora-borealis-from-space-in-stunning-new-nasa-video-669831235788', 'md5': '118d7ca3f0bea6534f119c68ef539f71', 'info_dict': { - 'id': '669831235788', + 'id': 'tdy_al_space_160420', 'ext': 'mp4', 'title': 'See the aurora borealis from space in stunning new NASA video', 'description': 'md5:74752b7358afb99939c5f8bb2d1d04b1', @@ -271,15 +272,14 @@ class NBCNewsIE(ThePlatformIE): 'url': 'http://www.msnbc.com/all-in-with-chris-hayes/watch/the-chaotic-gop-immigration-vote-314487875924', 'md5': '6d236bf4f3dddc226633ce6e2c3f814d', 'info_dict': { - 'id': '314487875924', + 'id': 'n_hayes_Aimm_140801_272214', 'ext': 'mp4', 'title': 'The chaotic GOP immigration vote', 'description': 'The Republican House votes on a border bill that has no chance of getting through the Senate or signed by the President and is drawing criticism from all sides.', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'timestamp': 1406937606, 'upload_date': '20140802', 'uploader': 'NBCU-NEWS', - 'categories': ['MSNBC/Topics/Franchise/Best of last night', 'MSNBC/Topics/General/Congress'], }, }, { @@ -311,27 +311,80 @@ class NBCNewsIE(ThePlatformIE): else: # "feature" and "nightly-news" pages use theplatform.com video_id = mobj.group('mpx_id') - if not video_id.isdigit(): - webpage = self._download_webpage(url, video_id) - info = None - bootstrap_json = self._search_regex( - [r'(?m)(?:var\s+(?:bootstrapJson|playlistData)|NEWS\.videoObj)\s*=\s*({.+});?\s*$', - r'videoObj\s*:\s*({.+})', r'data-video="([^"]+)"'], - webpage, 'bootstrap json', default=None) + webpage = self._download_webpage(url, video_id) + + filter_param = 'byId' + bootstrap_json = self._search_regex( + [r'(?m)(?:var\s+(?:bootstrapJson|playlistData)|NEWS\.videoObj)\s*=\s*({.+});?\s*$', + r'videoObj\s*:\s*({.+})', r'data-video="([^"]+)"', + r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);'], + webpage, 'bootstrap json', default=None) + if bootstrap_json: bootstrap = self._parse_json( bootstrap_json, video_id, transform_source=unescapeHTML) + + info = None if 'results' in bootstrap: info = bootstrap['results'][0]['video'] elif 'video' in bootstrap: info = bootstrap['video'] + elif 'msnbcVideoInfo' in bootstrap: + info = bootstrap['msnbcVideoInfo']['meta'] + elif 'msnbcThePlatform' in bootstrap: + info = bootstrap['msnbcThePlatform']['videoPlayer']['video'] else: info = bootstrap - video_id = info['mpxId'] + + if 'guid' in info: + video_id = info['guid'] + filter_param = 'byGuid' + elif 'mpxId' in info: + video_id = info['mpxId'] return { '_type': 'url_transparent', 'id': video_id, # http://feed.theplatform.com/f/2E2eJC/nbcnews also works - 'url': 'http://feed.theplatform.com/f/2E2eJC/nnd_NBCNews?byId=%s' % video_id, + 'url': update_url_query('http://feed.theplatform.com/f/2E2eJC/nnd_NBCNews', {filter_param: video_id}), 'ie_key': 'ThePlatformFeed', } + + +class NBCOlympicsIE(InfoExtractor): + _VALID_URL = r'https?://www\.nbcolympics\.com/video/(?P[a-z-]+)' + + _TEST = { + # Geo-restricted to US + 'url': 'http://www.nbcolympics.com/video/justin-roses-son-leo-was-tears-after-his-dad-won-gold', + 'md5': '54fecf846d05429fbaa18af557ee523a', + 'info_dict': { + 'id': 'WjTBzDXx5AUq', + 'display_id': 'justin-roses-son-leo-was-tears-after-his-dad-won-gold', + 'ext': 'mp4', + 'title': 'Rose\'s son Leo was in tears after his dad won gold', + 'description': 'Olympic gold medalist Justin Rose gets emotional talking to the impact his win in men\'s golf has already had on his children.', + 'timestamp': 1471274964, + 'upload_date': '20160815', + 'uploader': 'NBCU-SPORTS', + }, + } + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + drupal_settings = self._parse_json(self._search_regex( + r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);', + webpage, 'drupal settings'), display_id) + + iframe_url = drupal_settings['vod']['iframe_url'] + theplatform_url = iframe_url.replace( + 'vplayer.nbcolympics.com', 'player.theplatform.com') + + return { + '_type': 'url_transparent', + 'url': theplatform_url, + 'ie_key': ThePlatformIE.ie_key(), + 'display_id': display_id, + } diff --git a/youtube_dl/extractor/ndr.py b/youtube_dl/extractor/ndr.py index 0cded6b5c..07528d140 100644 --- a/youtube_dl/extractor/ndr.py +++ b/youtube_dl/extractor/ndr.py @@ -23,7 +23,7 @@ class NDRBaseIE(InfoExtractor): class NDRIE(NDRBaseIE): IE_NAME = 'ndr' IE_DESC = 'NDR.de - Norddeutscher Rundfunk' - _VALID_URL = r'https?://www\.ndr\.de/(?:[^/]+/)*(?P[^/?#]+),[\da-z]+\.html' + _VALID_URL = r'https?://(?:www\.)?ndr\.de/(?:[^/]+/)*(?P[^/?#]+),[\da-z]+\.html' _TESTS = [{ # httpVideo, same content id 'url': 'http://www.ndr.de/fernsehen/Party-Poette-und-Parade,hafengeburtstag988.html', @@ -105,7 +105,7 @@ class NDRIE(NDRBaseIE): class NJoyIE(NDRBaseIE): IE_NAME = 'njoy' IE_DESC = 'N-JOY' - _VALID_URL = r'https?://www\.n-joy\.de/(?:[^/]+/)*(?:(?P[^/?#]+),)?(?P[\da-z]+)\.html' + _VALID_URL = r'https?://(?:www\.)?n-joy\.de/(?:[^/]+/)*(?:(?P[^/?#]+),)?(?P[\da-z]+)\.html' _TESTS = [{ # httpVideo, same content id 'url': 'http://www.n-joy.de/entertainment/comedy/comedy_contest/Benaissa-beim-NDR-Comedy-Contest,comedycontest2480.html', @@ -238,7 +238,7 @@ class NDREmbedBaseIE(InfoExtractor): class NDREmbedIE(NDREmbedBaseIE): IE_NAME = 'ndr:embed' - _VALID_URL = r'https?://www\.ndr\.de/(?:[^/]+/)*(?P[\da-z]+)-(?:player|externalPlayer)\.html' + _VALID_URL = r'https?://(?:www\.)?ndr\.de/(?:[^/]+/)*(?P[\da-z]+)-(?:player|externalPlayer)\.html' _TESTS = [{ 'url': 'http://www.ndr.de/fernsehen/sendungen/ndr_aktuell/ndraktuell28488-player.html', 'md5': '8b9306142fe65bbdefb5ce24edb6b0a9', @@ -302,7 +302,7 @@ class NDREmbedIE(NDREmbedBaseIE): 'info_dict': { 'id': 'livestream217', 'ext': 'flv', - 'title': 're:^NDR Fernsehen Niedersachsen \d{4}-\d{2}-\d{2} \d{2}:\d{2}$', + 'title': r're:^NDR Fernsehen Niedersachsen \d{4}-\d{2}-\d{2} \d{2}:\d{2}$', 'is_live': True, 'upload_date': '20150910', }, @@ -332,7 +332,7 @@ class NDREmbedIE(NDREmbedBaseIE): class NJoyEmbedIE(NDREmbedBaseIE): IE_NAME = 'njoy:embed' - _VALID_URL = r'https?://www\.n-joy\.de/(?:[^/]+/)*(?P[\da-z]+)-(?:player|externalPlayer)_[^/]+\.html' + _VALID_URL = r'https?://(?:www\.)?n-joy\.de/(?:[^/]+/)*(?P[\da-z]+)-(?:player|externalPlayer)_[^/]+\.html' _TESTS = [{ # httpVideo 'url': 'http://www.n-joy.de/events/reeperbahnfestival/doku948-player_image-bc168e87-5263-4d6d-bd27-bb643005a6de_theme-n-joy.html', @@ -367,7 +367,7 @@ class NJoyEmbedIE(NDREmbedBaseIE): 'info_dict': { 'id': 'webradioweltweit100', 'ext': 'mp3', - 'title': 're:^N-JOY Weltweit \d{4}-\d{2}-\d{2} \d{2}:\d{2}$', + 'title': r're:^N-JOY Weltweit \d{4}-\d{2}-\d{2} \d{2}:\d{2}$', 'is_live': True, 'uploader': 'njoy', 'upload_date': '20150810', diff --git a/youtube_dl/extractor/ndtv.py b/youtube_dl/extractor/ndtv.py index 96528f649..255f60878 100644 --- a/youtube_dl/extractor/ndtv.py +++ b/youtube_dl/extractor/ndtv.py @@ -21,7 +21,7 @@ class NDTVIE(InfoExtractor): 'description': 'md5:ab2d4b4a6056c5cb4caa6d729deabf02', 'upload_date': '20131208', 'duration': 1327, - 'thumbnail': 're:https?://.*\.jpg', + 'thumbnail': r're:https?://.*\.jpg', }, } diff --git a/youtube_dl/extractor/netzkino.py b/youtube_dl/extractor/netzkino.py index 0d165a82a..aec3026b1 100644 --- a/youtube_dl/extractor/netzkino.py +++ b/youtube_dl/extractor/netzkino.py @@ -25,7 +25,7 @@ class NetzkinoIE(InfoExtractor): 'comments': 'mincount:3', 'description': 'md5:1eddeacc7e62d5a25a2d1a7290c64a28', 'upload_date': '20120813', - 'thumbnail': 're:https?://.*\.jpg$', + 'thumbnail': r're:https?://.*\.jpg$', 'timestamp': 1344858571, 'age_limit': 12, }, diff --git a/youtube_dl/extractor/newgrounds.py b/youtube_dl/extractor/newgrounds.py index 705940323..9bea610c8 100644 --- a/youtube_dl/extractor/newgrounds.py +++ b/youtube_dl/extractor/newgrounds.py @@ -1,15 +1,12 @@ from __future__ import unicode_literals -import json -import re - from .common import InfoExtractor class NewgroundsIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?newgrounds\.com/(?:audio/listen|portal/view)/(?P[0-9]+)' _TESTS = [{ - 'url': 'http://www.newgrounds.com/audio/listen/549479', + 'url': 'https://www.newgrounds.com/audio/listen/549479', 'md5': 'fe6033d297591288fa1c1f780386f07a', 'info_dict': { 'id': '549479', @@ -18,7 +15,7 @@ class NewgroundsIE(InfoExtractor): 'uploader': 'Burn7', } }, { - 'url': 'http://www.newgrounds.com/portal/view/673111', + 'url': 'https://www.newgrounds.com/portal/view/673111', 'md5': '3394735822aab2478c31b1004fe5e5bc', 'info_dict': { 'id': '673111', @@ -29,24 +26,20 @@ class NewgroundsIE(InfoExtractor): }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - music_id = mobj.group('id') - webpage = self._download_webpage(url, music_id) + media_id = self._match_id(url) + webpage = self._download_webpage(url, media_id) title = self._html_search_regex( r'([^>]+)', webpage, 'title') uploader = self._html_search_regex( - [r',"artist":"([^"]+)",', r'[\'"]owner[\'"]\s*:\s*[\'"]([^\'"]+)[\'"],'], - webpage, 'uploader') + r'Author\s*]+>([^<]+)', webpage, 'uploader', fatal=False) - music_url_json_string = self._html_search_regex( - r'({"url":"[^"]+"),', webpage, 'music url') + '}' - music_url_json = json.loads(music_url_json_string) - music_url = music_url_json['url'] + music_url = self._parse_json(self._search_regex( + r'"url":("[^"]+"),', webpage, ''), media_id) return { - 'id': music_id, + 'id': media_id, 'title': title, 'url': music_url, 'uploader': uploader, diff --git a/youtube_dl/extractor/newstube.py b/youtube_dl/extractor/newstube.py index 0092b85ce..e3f35f1d8 100644 --- a/youtube_dl/extractor/newstube.py +++ b/youtube_dl/extractor/newstube.py @@ -1,4 +1,4 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals import re diff --git a/youtube_dl/extractor/nextmedia.py b/youtube_dl/extractor/nextmedia.py index aae7aeeeb..680f03aad 100644 --- a/youtube_dl/extractor/nextmedia.py +++ b/youtube_dl/extractor/nextmedia.py @@ -2,12 +2,20 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import parse_iso8601 +from ..compat import compat_urlparse +from ..utils import ( + clean_html, + get_element_by_class, + int_or_none, + parse_iso8601, + remove_start, + unified_timestamp, +) class NextMediaIE(InfoExtractor): IE_DESC = '蘋果日報' - _VALID_URL = r'https?://hk.apple.nextmedia.com/[^/]+/[^/]+/(?P\d+)/(?P\d+)' + _VALID_URL = r'https?://hk\.apple\.nextmedia\.com/[^/]+/[^/]+/(?P\d+)/(?P\d+)' _TESTS = [{ 'url': 'http://hk.apple.nextmedia.com/realtime/news/20141108/53109199', 'md5': 'dff9fad7009311c421176d1ac90bfe4f', @@ -15,7 +23,7 @@ class NextMediaIE(InfoExtractor): 'id': '53109199', 'ext': 'mp4', 'title': '【佔領金鐘】50外國領事議員撐場 讚學生勇敢香港有希望', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'description': 'md5:28222b9912b6665a21011b034c70fcc7', 'timestamp': 1415456273, 'upload_date': '20141108', @@ -30,6 +38,12 @@ class NextMediaIE(InfoExtractor): return self._extract_from_nextmedia_page(news_id, url, page) def _extract_from_nextmedia_page(self, news_id, url, page): + redirection_url = self._search_regex( + r'window\.location\.href\s*=\s*([\'"])(?P(?!\1).+)\1', + page, 'redirection URL', default=None, group='url') + if redirection_url: + return self.url_result(compat_urlparse.urljoin(url, redirection_url)) + title = self._fetch_title(page) video_url = self._search_regex(self._URL_PATTERN, page, 'video url') @@ -68,7 +82,7 @@ class NextMediaIE(InfoExtractor): class NextMediaActionNewsIE(NextMediaIE): IE_DESC = '蘋果日報 - 動新聞' - _VALID_URL = r'https?://hk.dv.nextmedia.com/actionnews/[^/]+/(?P\d+)/(?P\d+)/\d+' + _VALID_URL = r'https?://hk\.dv\.nextmedia\.com/actionnews/[^/]+/(?P\d+)/(?P\d+)/\d+' _TESTS = [{ 'url': 'http://hk.dv.nextmedia.com/actionnews/hit/20150121/19009428/20061460', 'md5': '05fce8ffeed7a5e00665d4b7cf0f9201', @@ -76,7 +90,7 @@ class NextMediaActionNewsIE(NextMediaIE): 'id': '19009428', 'ext': 'mp4', 'title': '【壹週刊】細10年男友偷食 50歲邵美琪再失戀', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'description': 'md5:cd802fad1f40fd9ea178c1e2af02d659', 'timestamp': 1421791200, 'upload_date': '20150120', @@ -93,7 +107,7 @@ class NextMediaActionNewsIE(NextMediaIE): class AppleDailyIE(NextMediaIE): IE_DESC = '臺灣蘋果日報' - _VALID_URL = r'https?://(www|ent).appledaily.com.tw/(?:animation|appledaily|enews|realtimenews)/[^/]+/[^/]+/(?P\d+)/(?P\d+)(/.*)?' + _VALID_URL = r'https?://(www|ent)\.appledaily\.com\.tw/[^/]+/[^/]+/[^/]+/(?P\d+)/(?P\d+)(/.*)?' _TESTS = [{ 'url': 'http://ent.appledaily.com.tw/enews/article/entertainment/20150128/36354694', 'md5': 'a843ab23d150977cc55ef94f1e2c1e4d', @@ -101,7 +115,7 @@ class AppleDailyIE(NextMediaIE): 'id': '36354694', 'ext': 'mp4', 'title': '周亭羽走過摩鐵陰霾2男陪吃 九把刀孤寒看醫生', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'description': 'md5:2acd430e59956dc47cd7f67cb3c003f4', 'upload_date': '20150128', } @@ -112,7 +126,7 @@ class AppleDailyIE(NextMediaIE): 'id': '550549', 'ext': 'mp4', 'title': '不滿被踩腳 山東兩大媽一路打下車', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'description': 'md5:175b4260c1d7c085993474217e4ab1b4', 'upload_date': '20150128', } @@ -123,7 +137,7 @@ class AppleDailyIE(NextMediaIE): 'id': '5003671', 'ext': 'mp4', 'title': '20正妹熱舞 《刀龍傳說Online》火辣上市', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'description': 'md5:23c0aac567dc08c9c16a3161a2c2e3cd', 'upload_date': '20150128', }, @@ -150,10 +164,17 @@ class AppleDailyIE(NextMediaIE): 'id': '35770334', 'ext': 'mp4', 'title': '咖啡占卜測 XU裝熟指數', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'description': 'md5:7b859991a6a4fedbdf3dd3b66545c748', 'upload_date': '20140417', }, + }, { + 'url': 'http://www.appledaily.com.tw/actionnews/appledaily/7/20161003/960588/', + 'only_matching': True, + }, { + # Redirected from http://ent.appledaily.com.tw/enews/article/entertainment/20150128/36354694 + 'url': 'http://ent.appledaily.com.tw/section/article/headline/20150128/36354694', + 'only_matching': True, }] _URL_PATTERN = r'\{url: \'(.+)\'\}' @@ -170,3 +191,48 @@ class AppleDailyIE(NextMediaIE): def _fetch_description(self, page): return self._html_search_meta('description', page, 'news description') + + +class NextTVIE(InfoExtractor): + IE_DESC = '壹電視' + _VALID_URL = r'https?://(?:www\.)?nexttv\.com\.tw/(?:[^/]+/)+(?P\d+)' + + _TEST = { + 'url': 'http://www.nexttv.com.tw/news/realtime/politics/11779671', + 'info_dict': { + 'id': '11779671', + 'ext': 'mp4', + 'title': '「超收稅」近4千億! 藍議員籲發消費券', + 'thumbnail': r're:^https?://.*\.jpg$', + 'timestamp': 1484825400, + 'upload_date': '20170119', + 'view_count': int, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + title = self._html_search_regex( + r']*>([^<]+)

', webpage, 'title') + + data = self._hidden_inputs(webpage) + + video_url = data['ntt-vod-src-detailview'] + + date_str = get_element_by_class('date', webpage) + timestamp = unified_timestamp(date_str + '+0800') if date_str else None + + view_count = int_or_none(remove_start( + clean_html(get_element_by_class('click', webpage)), '點閱:')) + + return { + 'id': video_id, + 'title': title, + 'url': video_url, + 'thumbnail': data.get('ntt-vod-img-src'), + 'timestamp': timestamp, + 'view_count': view_count, + } diff --git a/youtube_dl/extractor/nfl.py b/youtube_dl/extractor/nfl.py index 200874d68..460deb162 100644 --- a/youtube_dl/extractor/nfl.py +++ b/youtube_dl/extractor/nfl.py @@ -72,7 +72,7 @@ class NFLIE(InfoExtractor): 'description': 'md5:56323bfb0ac4ee5ab24bd05fdf3bf478', 'upload_date': '20140921', 'timestamp': 1411337580, - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', } }, { 'url': 'http://prod.www.steelers.clubs.nfl.com/video-and-audio/videos/LIVE_Post_Game_vs_Browns/9d72f26a-9e2b-4718-84d3-09fb4046c266', @@ -84,7 +84,7 @@ class NFLIE(InfoExtractor): 'description': 'md5:6a97f7e5ebeb4c0e69a418a89e0636e8', 'upload_date': '20131229', 'timestamp': 1388354455, - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', } }, { 'url': 'http://www.nfl.com/news/story/0ap3000000467586/article/patriots-seahawks-involved-in-lategame-skirmish', @@ -165,7 +165,7 @@ class NFLIE(InfoExtractor): group='config')) # For articles, the id in the url is not the video id video_id = self._search_regex( - r'(?:]+data-content[Ii]d\s*=\s*|content[Ii]d\s*:\s*)(["\'])(?P.+?)\1', + r'(?:]+data-content[Ii]d\s*=\s*|content[Ii]d\s*:\s*)(["\'])(?P(?:(?!\1).)+)\1', webpage, 'video id', default=video_id, group='id') config = self._download_json(config_url, video_id, 'Downloading player config') url_template = NFLIE.prepend_host( diff --git a/youtube_dl/extractor/nhk.py b/youtube_dl/extractor/nhk.py new file mode 100644 index 000000000..5c8cd76dc --- /dev/null +++ b/youtube_dl/extractor/nhk.py @@ -0,0 +1,51 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ExtractorError + + +class NhkVodIE(InfoExtractor): + _VALID_URL = r'https?://www3\.nhk\.or\.jp/nhkworld/en/vod/(?P[^/]+/[^/?#&]+)' + _TEST = { + # Videos available only for a limited period of time. Visit + # http://www3.nhk.or.jp/nhkworld/en/vod/ for working samples. + 'url': 'http://www3.nhk.or.jp/nhkworld/en/vod/tokyofashion/20160815', + 'info_dict': { + 'id': 'A1bnNiNTE6nY3jLllS-BIISfcC_PpvF5', + 'ext': 'flv', + 'title': 'TOKYO FASHION EXPRESS - The Kimono as Global Fashion', + 'description': 'md5:db338ee6ce8204f415b754782f819824', + 'series': 'TOKYO FASHION EXPRESS', + 'episode': 'The Kimono as Global Fashion', + }, + 'skip': 'Videos available only for a limited period of time', + } + _API_URL = 'http://api.nhk.or.jp/nhkworld/vodesdlist/v1/all/all/all.json?apikey=EJfK8jdS57GqlupFgAfAAwr573q01y6k' + + def _real_extract(self, url): + video_id = self._match_id(url) + + data = self._download_json(self._API_URL, video_id) + + try: + episode = next( + e for e in data['data']['episodes'] + if e.get('url') and video_id in e['url']) + except StopIteration: + raise ExtractorError('Unable to find episode') + + embed_code = episode['vod_id'] + + title = episode.get('sub_title_clean') or episode['sub_title'] + description = episode.get('description_clean') or episode.get('description') + series = episode.get('title_clean') or episode.get('title') + + return { + '_type': 'url_transparent', + 'ie_key': 'Ooyala', + 'url': 'ooyala:%s' % embed_code, + 'title': '%s - %s' % (series, title) if series and title else title, + 'description': description, + 'series': series, + 'episode': title, + } diff --git a/youtube_dl/extractor/nhl.py b/youtube_dl/extractor/nhl.py index b04d21113..62ce800c0 100644 --- a/youtube_dl/extractor/nhl.py +++ b/youtube_dl/extractor/nhl.py @@ -245,7 +245,11 @@ class NHLVideocenterCategoryIE(NHLBaseInfoExtractor): class NHLIE(InfoExtractor): IE_NAME = 'nhl.com' - _VALID_URL = r'https?://(?:www\.)?nhl\.com/([^/]+/)*c-(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?(?Pnhl|wch2016)\.com/(?:[^/]+/)*c-(?P\d+)' + _SITES_MAP = { + 'nhl': 'nhl', + 'wch2016': 'wch', + } _TESTS = [{ # type=video 'url': 'https://www.nhl.com/video/anisimov-cleans-up-mess/t-277752844/c-43663503', @@ -270,13 +274,32 @@ class NHLIE(InfoExtractor): 'upload_date': '20160204', 'timestamp': 1454544904, }, + }, { + # Some m3u8 URLs are invalid (https://github.com/rg3/youtube-dl/issues/10713) + 'url': 'https://www.nhl.com/predators/video/poile-laviolette-on-subban-trade/t-277437416/c-44315003', + 'md5': '50b2bb47f405121484dda3ccbea25459', + 'info_dict': { + 'id': '44315003', + 'ext': 'mp4', + 'title': 'Poile, Laviolette on Subban trade', + 'description': 'General manager David Poile and head coach Peter Laviolette share their thoughts on acquiring P.K. Subban from Montreal (06/29/16)', + 'timestamp': 1467242866, + 'upload_date': '20160629', + }, + }, { + 'url': 'https://www.wch2016.com/video/caneur-best-of-game-2-micd-up/t-281230378/c-44983703', + 'only_matching': True, + }, { + 'url': 'https://www.wch2016.com/news/3-stars-team-europe-vs-team-canada/c-282195068', + 'only_matching': True, }] def _real_extract(self, url): - tmp_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + tmp_id, site = mobj.group('id'), mobj.group('site') video_data = self._download_json( - 'https://nhl.bamcontent.com/nhl/id/v1/%s/details/web-v1.json' % tmp_id, - tmp_id) + 'https://nhl.bamcontent.com/%s/id/v1/%s/details/web-v1.json' + % (self._SITES_MAP[site], tmp_id), tmp_id) if video_data.get('type') == 'article': video_data = video_data['media'] @@ -290,9 +313,11 @@ class NHLIE(InfoExtractor): continue ext = determine_ext(playback_url) if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( + m3u8_formats = self._extract_m3u8_formats( playback_url, video_id, 'mp4', 'm3u8_native', - m3u8_id=playback.get('name', 'hls'), fatal=False)) + m3u8_id=playback.get('name', 'hls'), fatal=False) + self._check_formats(m3u8_formats, video_id) + formats.extend(m3u8_formats) else: height = int_or_none(playback.get('height')) formats.append({ diff --git a/youtube_dl/extractor/nick.py b/youtube_dl/extractor/nick.py index 9c54846e1..08a75929e 100644 --- a/youtube_dl/extractor/nick.py +++ b/youtube_dl/extractor/nick.py @@ -1,15 +1,16 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .mtv import MTVServicesInfoExtractor -from ..compat import compat_urllib_parse_urlencode from ..utils import update_url_query class NickIE(MTVServicesInfoExtractor): # None of videos on the website are still alive? IE_NAME = 'nick.com' - _VALID_URL = r'https?://(?:www\.)?nick(?:jr)?\.com/(?:videos/clip|[^/]+/videos)/(?P[^/?#.]+)' + _VALID_URL = r'https?://(?:(?:www|beta)\.)?nick(?:jr)?\.com/(?:[^/]+/)?(?:videos/clip|[^/]+/videos)/(?P[^/?#.]+)' _FEED_URL = 'http://udat.mtvnservices.com/service1/dispatch.htm' _TESTS = [{ 'url': 'http://www.nick.com/videos/clip/alvinnn-and-the-chipmunks-112-full-episode.html', @@ -56,13 +57,16 @@ class NickIE(MTVServicesInfoExtractor): }, { 'url': 'http://www.nickjr.com/paw-patrol/videos/pups-save-a-goldrush-s3-ep302-full-episode/', 'only_matching': True, + }, { + 'url': 'http://beta.nick.com/nicky-ricky-dicky-and-dawn/videos/nicky-ricky-dicky-dawn-301-full-episode/', + 'only_matching': True, }] def _get_feed_query(self, uri): - return compat_urllib_parse_urlencode({ + return { 'feed': 'nick_arc_player_prime', 'mgid': uri, - }) + } def _extract_mgid(self, webpage): return self._search_regex(r'data-contenturi="([^"]+)', webpage, 'mgid') @@ -70,22 +74,53 @@ class NickIE(MTVServicesInfoExtractor): class NickDeIE(MTVServicesInfoExtractor): IE_NAME = 'nick.de' - _VALID_URL = r'https?://(?:www\.)?nick\.de/(?:playlist|shows)/(?:[^/]+/)*(?P[^/?#&]+)' + _VALID_URL = r'https?://(?:www\.)?(?Pnick\.de|nickelodeon\.(?:nl|at))/(?:playlist|shows)/(?:[^/]+/)*(?P[^/?#&]+)' _TESTS = [{ 'url': 'http://www.nick.de/playlist/3773-top-videos/videos/episode/17306-zu-wasser-und-zu-land-rauchende-erdnusse', 'only_matching': True, }, { 'url': 'http://www.nick.de/shows/342-icarly', 'only_matching': True, + }, { + 'url': 'http://www.nickelodeon.nl/shows/474-spongebob/videos/17403-een-kijkje-in-de-keuken-met-sandy-van-binnenuit', + 'only_matching': True, + }, { + 'url': 'http://www.nickelodeon.at/playlist/3773-top-videos/videos/episode/77993-das-letzte-gefecht', + 'only_matching': True, }] + def _extract_mrss_url(self, webpage, host): + return update_url_query(self._search_regex( + r'data-mrss=(["\'])(?Phttp.+?)\1', webpage, 'mrss url', group='url'), + {'siteKey': host}) + def _real_extract(self, url): - video_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + host = mobj.group('host') webpage = self._download_webpage(url, video_id) - mrss_url = update_url_query(self._search_regex( - r'data-mrss=(["\'])(?Phttp.+?)\1', webpage, 'mrss url', group='url'), - {'siteKey': 'nick.de'}) + mrss_url = self._extract_mrss_url(webpage, host) return self._get_videos_info_from_url(mrss_url, video_id) + + +class NickNightIE(NickDeIE): + IE_NAME = 'nicknight' + _VALID_URL = r'https?://(?:www\.)(?Pnicknight\.(?:de|at|tv))/(?:playlist|shows)/(?:[^/]+/)*(?P[^/?#&]+)' + _TESTS = [{ + 'url': 'http://www.nicknight.at/shows/977-awkward/videos/85987-nimmer-beste-freunde', + 'only_matching': True, + }, { + 'url': 'http://www.nicknight.at/shows/977-awkward', + 'only_matching': True, + }, { + 'url': 'http://www.nicknight.at/shows/1900-faking-it', + 'only_matching': True, + }] + + def _extract_mrss_url(self, webpage, *args): + return self._search_regex( + r'mrss\s*:\s*(["\'])(?Phttp.+?)\1', webpage, + 'mrss url', group='url') diff --git a/youtube_dl/extractor/niconico.py b/youtube_dl/extractor/niconico.py index dd75a48af..8baac23e4 100644 --- a/youtube_dl/extractor/niconico.py +++ b/youtube_dl/extractor/niconico.py @@ -1,4 +1,4 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals import re @@ -7,7 +7,6 @@ import datetime from .common import InfoExtractor from ..compat import ( - compat_urllib_parse_urlencode, compat_urlparse, ) from ..utils import ( @@ -40,6 +39,7 @@ class NiconicoIE(InfoExtractor): 'description': '(c) copyright 2008, Blender Foundation / www.bigbuckbunny.org', 'duration': 33, }, + 'skip': 'Requires an account', }, { # File downloaded with and without credentials are different, so omit # the md5 field @@ -55,6 +55,7 @@ class NiconicoIE(InfoExtractor): 'timestamp': 1304065916, 'duration': 209, }, + 'skip': 'Requires an account', }, { # 'video exists but is marked as "deleted" # md5 is unstable @@ -65,9 +66,10 @@ class NiconicoIE(InfoExtractor): 'description': 'deleted', 'title': 'ドラえもんエターナル第3話「決戦第3新東京市」<前編>', 'upload_date': '20071224', - 'timestamp': 1198527840, # timestamp field has different value if logged in + 'timestamp': int, # timestamp field has different value if logged in 'duration': 304, }, + 'skip': 'Requires an account', }, { 'url': 'http://www.nicovideo.jp/watch/so22543406', 'info_dict': { @@ -79,13 +81,12 @@ class NiconicoIE(InfoExtractor): 'upload_date': '20140104', 'uploader': 'アニメロチャンネル', 'uploader_id': '312', - } + }, + 'skip': 'The viewing period of the video you were searching for has expired.', }] _VALID_URL = r'https?://(?:www\.|secure\.)?nicovideo\.jp/watch/(?P(?:[a-z]{2})?[0-9]+)' _NETRC_MACHINE = 'niconico' - # Determine whether the downloader used authentication to download video - _AUTHENTICATED = False def _real_initialize(self): self._login() @@ -109,8 +110,6 @@ class NiconicoIE(InfoExtractor): if re.search(r'(?i)

Log in error

', login_results) is not None: self._downloader.report_warning('unable to log in: bad username or password') return False - # Successful login - self._AUTHENTICATED = True return True def _real_extract(self, url): @@ -128,35 +127,19 @@ class NiconicoIE(InfoExtractor): 'http://ext.nicovideo.jp/api/getthumbinfo/' + video_id, video_id, note='Downloading video info page') - if self._AUTHENTICATED: - # Get flv info - flv_info_webpage = self._download_webpage( - 'http://flapi.nicovideo.jp/api/getflv/' + video_id + '?as3=1', - video_id, 'Downloading flv info') - else: - # Get external player info - ext_player_info = self._download_webpage( - 'http://ext.nicovideo.jp/thumb_watch/' + video_id, video_id) - thumb_play_key = self._search_regex( - r'\'thumbPlayKey\'\s*:\s*\'(.*?)\'', ext_player_info, 'thumbPlayKey') - - # Get flv info - flv_info_data = compat_urllib_parse_urlencode({ - 'k': thumb_play_key, - 'v': video_id - }) - flv_info_request = sanitized_Request( - 'http://ext.nicovideo.jp/thumb_watch', flv_info_data, - {'Content-Type': 'application/x-www-form-urlencoded'}) - flv_info_webpage = self._download_webpage( - flv_info_request, video_id, - note='Downloading flv info', errnote='Unable to download flv info') + # Get flv info + flv_info_webpage = self._download_webpage( + 'http://flapi.nicovideo.jp/api/getflv/' + video_id + '?as3=1', + video_id, 'Downloading flv info') flv_info = compat_urlparse.parse_qs(flv_info_webpage) if 'url' not in flv_info: if 'deleted' in flv_info: raise ExtractorError('The video has been deleted.', expected=True) + elif 'closed' in flv_info: + raise ExtractorError('Niconico videos now require logging in', + expected=True) else: raise ExtractorError('Unable to find video URL') @@ -252,7 +235,7 @@ class NiconicoIE(InfoExtractor): class NiconicoPlaylistIE(InfoExtractor): - _VALID_URL = r'https?://www\.nicovideo\.jp/mylist/(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?nicovideo\.jp/mylist/(?P\d+)' _TEST = { 'url': 'http://www.nicovideo.jp/mylist/27411728', diff --git a/youtube_dl/extractor/ninecninemedia.py b/youtube_dl/extractor/ninecninemedia.py index d889245ad..ec4d675e2 100644 --- a/youtube_dl/extractor/ninecninemedia.py +++ b/youtube_dl/extractor/ninecninemedia.py @@ -4,40 +4,36 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( parse_iso8601, - parse_duration, - ExtractorError + float_or_none, + ExtractorError, + int_or_none, ) -class NineCNineMediaIE(InfoExtractor): - _VALID_URL = r'9c9media:(?P[^:]+):(?P\d+)' +class NineCNineMediaBaseIE(InfoExtractor): + _API_BASE_TEMPLATE = 'http://capi.9c9media.com/destinations/%s/platforms/desktop/contents/%s/' + + +class NineCNineMediaStackIE(NineCNineMediaBaseIE): + IE_NAME = '9c9media:stack' + _VALID_URL = r'9c9media:stack:(?P[^:]+):(?P\d+):(?P\d+):(?P\d+)' def _real_extract(self, url): - destination_code, video_id = re.match(self._VALID_URL, url).groups() - api_base_url = 'http://capi.9c9media.com/destinations/%s/platforms/desktop/contents/%s/' % (destination_code, video_id) - content = self._download_json(api_base_url, video_id, query={ - '$include': '[contentpackages]', - }) - title = content['Name'] - if len(content['ContentPackages']) > 1: - raise ExtractorError('multiple content packages') - content_package = content['ContentPackages'][0] - stacks_base_url = api_base_url + 'contentpackages/%s/stacks/' % content_package['Id'] - stacks = self._download_json(stacks_base_url, video_id)['Items'] - if len(stacks) > 1: - raise ExtractorError('multiple stacks') - stack = stacks[0] - stack_base_url = '%s%s/manifest.' % (stacks_base_url, stack['Id']) + destination_code, content_id, package_id, stack_id = re.match(self._VALID_URL, url).groups() + stack_base_url_template = self._API_BASE_TEMPLATE + 'contentpackages/%s/stacks/%s/manifest.' + stack_base_url = stack_base_url_template % (destination_code, content_id, package_id, stack_id) + formats = [] formats.extend(self._extract_m3u8_formats( - stack_base_url + 'm3u8', video_id, 'mp4', + stack_base_url + 'm3u8', stack_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) formats.extend(self._extract_f4m_formats( - stack_base_url + 'f4m', video_id, + stack_base_url + 'f4m', stack_id, f4m_id='hds', fatal=False)) - mp4_url = self._download_webpage(stack_base_url + 'pd', video_id, fatal=False) + mp4_url = self._download_webpage(stack_base_url + 'pd', stack_id, fatal=False) if mp4_url: formats.append({ 'url': mp4_url, @@ -46,10 +42,86 @@ class NineCNineMediaIE(InfoExtractor): self._sort_formats(formats) return { - 'id': video_id, - 'title': title, - 'description': content.get('Desc') or content.get('ShortDesc'), - 'timestamp': parse_iso8601(content.get('BroadcastDateTime')), - 'duration': parse_duration(content.get('BroadcastTime')), + 'id': stack_id, 'formats': formats, } + + +class NineCNineMediaIE(NineCNineMediaBaseIE): + IE_NAME = '9c9media' + _VALID_URL = r'9c9media:(?P[^:]+):(?P\d+)' + + def _real_extract(self, url): + destination_code, content_id = re.match(self._VALID_URL, url).groups() + api_base_url = self._API_BASE_TEMPLATE % (destination_code, content_id) + content = self._download_json(api_base_url, content_id, query={ + '$include': '[Media,Season,ContentPackages]', + }) + title = content['Name'] + if len(content['ContentPackages']) > 1: + raise ExtractorError('multiple content packages') + content_package = content['ContentPackages'][0] + package_id = content_package['Id'] + content_package_url = api_base_url + 'contentpackages/%s/' % package_id + content_package = self._download_json(content_package_url, content_id) + + if content_package.get('Constraints', {}).get('Security', {}).get('Type') == 'adobe-drm': + raise ExtractorError('This video is DRM protected.', expected=True) + + stacks = self._download_json(content_package_url + 'stacks/', package_id)['Items'] + multistacks = len(stacks) > 1 + + thumbnails = [] + for image in content.get('Images', []): + image_url = image.get('Url') + if not image_url: + continue + thumbnails.append({ + 'url': image_url, + 'width': int_or_none(image.get('Width')), + 'height': int_or_none(image.get('Height')), + }) + + tags, categories = [], [] + for source_name, container in (('Tags', tags), ('Genres', categories)): + for e in content.get(source_name, []): + e_name = e.get('Name') + if not e_name: + continue + container.append(e_name) + + description = content.get('Desc') or content.get('ShortDesc') + season = content.get('Season', {}) + base_info = { + 'description': description, + 'timestamp': parse_iso8601(content.get('BroadcastDateTime')), + 'episode_number': int_or_none(content.get('Episode')), + 'season': season.get('Name'), + 'season_number': season.get('Number'), + 'season_id': season.get('Id'), + 'series': content.get('Media', {}).get('Name'), + 'tags': tags, + 'categories': categories, + } + + entries = [] + for stack in stacks: + stack_id = compat_str(stack['Id']) + entry = { + '_type': 'url_transparent', + 'url': '9c9media:stack:%s:%s:%s:%s' % (destination_code, content_id, package_id, stack_id), + 'id': stack_id, + 'title': '%s_part%s' % (title, stack['Name']) if multistacks else title, + 'duration': float_or_none(stack.get('Duration')), + 'ie_key': 'NineCNineMediaStack', + } + entry.update(base_info) + entries.append(entry) + + return { + '_type': 'multi_video', + 'id': content_id, + 'title': title, + 'description': description, + 'entries': entries, + } diff --git a/youtube_dl/extractor/ninenow.py b/youtube_dl/extractor/ninenow.py index faa577237..351bea7ba 100644 --- a/youtube_dl/extractor/ninenow.py +++ b/youtube_dl/extractor/ninenow.py @@ -44,7 +44,20 @@ class NineNowIE(InfoExtractor): page_data = self._parse_json(self._search_regex( r'window\.__data\s*=\s*({.*?});', webpage, 'page data'), display_id) - common_data = page_data.get('episode', {}).get('episode') or page_data.get('clip', {}).get('clip') + + for kind in ('episode', 'clip'): + current_key = page_data.get(kind, {}).get( + 'current%sKey' % kind.capitalize()) + if not current_key: + continue + cache = page_data.get(kind, {}).get('%sCache' % kind, {}) + if not cache: + continue + common_data = (cache.get(current_key) or list(cache.values())[0])[kind] + break + else: + raise ExtractorError('Unable to find video data') + video_data = common_data['video'] if video_data.get('drm'): diff --git a/youtube_dl/extractor/nobelprize.py b/youtube_dl/extractor/nobelprize.py new file mode 100644 index 000000000..4dfdb09d6 --- /dev/null +++ b/youtube_dl/extractor/nobelprize.py @@ -0,0 +1,62 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + js_to_json, + mimetype2ext, + determine_ext, + update_url_query, + get_element_by_attribute, + int_or_none, +) + + +class NobelPrizeIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?nobelprize\.org/mediaplayer.*?\bid=(?P\d+)' + _TEST = { + 'url': 'http://www.nobelprize.org/mediaplayer/?id=2636', + 'md5': '04c81e5714bb36cc4e2232fee1d8157f', + 'info_dict': { + 'id': '2636', + 'ext': 'mp4', + 'title': 'Announcement of the 2016 Nobel Prize in Physics', + 'description': 'md5:05beba57f4f5a4bbd4cf2ef28fcff739', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + media = self._parse_json(self._search_regex( + r'(?s)var\s*config\s*=\s*({.+?});', webpage, + 'config'), video_id, js_to_json)['media'] + title = media['title'] + + formats = [] + for source in media.get('source', []): + source_src = source.get('src') + if not source_src: + continue + ext = mimetype2ext(source.get('type')) or determine_ext(source_src) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + source_src, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + elif ext == 'f4m': + formats.extend(self._extract_f4m_formats( + update_url_query(source_src, {'hdcore': '3.7.0'}), + video_id, f4m_id='hds', fatal=False)) + else: + formats.append({ + 'url': source_src, + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': get_element_by_attribute('itemprop', 'description', webpage), + 'duration': int_or_none(media.get('duration')), + 'formats': formats, + } diff --git a/youtube_dl/extractor/noco.py b/youtube_dl/extractor/noco.py index 06f2bda07..70ff2ab36 100644 --- a/youtube_dl/extractor/noco.py +++ b/youtube_dl/extractor/noco.py @@ -1,4 +1,4 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals import re diff --git a/youtube_dl/extractor/normalboots.py b/youtube_dl/extractor/normalboots.py index af44c3bb5..61fe571df 100644 --- a/youtube_dl/extractor/normalboots.py +++ b/youtube_dl/extractor/normalboots.py @@ -1,8 +1,8 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor -from .screenwavemedia import ScreenwaveMediaIE +from .jwplatform import JWPlatformIE from ..utils import ( unified_strdate, @@ -25,7 +25,7 @@ class NormalbootsIE(InfoExtractor): # m3u8 download 'skip_download': True, }, - 'add_ie': ['ScreenwaveMedia'], + 'add_ie': ['JWPlatform'], } def _real_extract(self, url): @@ -39,15 +39,13 @@ class NormalbootsIE(InfoExtractor): r'[A-Za-z]+, (?P.*)', webpage, 'date', fatal=False)) - screenwavemedia_url = self._html_search_regex( - ScreenwaveMediaIE.EMBED_PATTERN, webpage, 'screenwave URL', - group='url') + jwplatform_url = JWPlatformIE._extract_url(webpage) return { '_type': 'url_transparent', 'id': video_id, - 'url': screenwavemedia_url, - 'ie_key': ScreenwaveMediaIE.ie_key(), + 'url': jwplatform_url, + 'ie_key': JWPlatformIE.ie_key(), 'title': self._og_search_title(webpage), 'description': self._og_search_description(webpage), 'thumbnail': self._og_search_thumbnail(webpage), diff --git a/youtube_dl/extractor/nosvideo.py b/youtube_dl/extractor/nosvideo.py index eab816e49..53c500c35 100644 --- a/youtube_dl/extractor/nosvideo.py +++ b/youtube_dl/extractor/nosvideo.py @@ -17,7 +17,7 @@ _x = lambda p: xpath_with_ns(p, {'xspf': 'http://xspf.org/ns/0/'}) class NosVideoIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?nosvideo\.com/' + \ - '(?:embed/|\?v=)(?P[A-Za-z0-9]{12})/?' + r'(?:embed/|\?v=)(?P[A-Za-z0-9]{12})/?' _PLAYLIST_URL = 'http://nosvideo.com/xml/{xml_id:s}.xml' _FILE_DELETED_REGEX = r'File Not Found' _TEST = { @@ -27,7 +27,7 @@ class NosVideoIE(InfoExtractor): 'id': 'mu8fle7g7rpq', 'ext': 'mp4', 'title': 'big_buck_bunny_480p_surround-fix.avi.mp4', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', } } diff --git a/youtube_dl/extractor/nova.py b/youtube_dl/extractor/nova.py index 17671ad39..06cb8cb3f 100644 --- a/youtube_dl/extractor/nova.py +++ b/youtube_dl/extractor/nova.py @@ -1,4 +1,4 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals import re @@ -21,7 +21,7 @@ class NovaIE(InfoExtractor): 'ext': 'flv', 'title': 'Duel: Michal Hrdlička a Petr Suchoň', 'description': 'md5:d0cc509858eee1b1374111c588c6f5d5', - 'thumbnail': 're:^https?://.*\.(?:jpg)', + 'thumbnail': r're:^https?://.*\.(?:jpg)', }, 'params': { # rtmp download @@ -36,7 +36,7 @@ class NovaIE(InfoExtractor): 'ext': 'mp4', 'title': 'Podzemní nemocnice v pražské Krči', 'description': 'md5:f0a42dd239c26f61c28f19e62d20ef53', - 'thumbnail': 're:^https?://.*\.(?:jpg)', + 'thumbnail': r're:^https?://.*\.(?:jpg)', } }, { 'url': 'http://novaplus.nova.cz/porad/policie-modrava/video/5591-policie-modrava-15-dil-blondynka-na-hrbitove', @@ -46,7 +46,7 @@ class NovaIE(InfoExtractor): 'ext': 'flv', 'title': 'Policie Modrava - 15. díl - Blondýnka na hřbitově', 'description': 'md5:dc24e50be5908df83348e50d1431295e', # Make sure this description is clean of html tags - 'thumbnail': 're:^https?://.*\.(?:jpg)', + 'thumbnail': r're:^https?://.*\.(?:jpg)', }, 'params': { # rtmp download @@ -58,7 +58,7 @@ class NovaIE(InfoExtractor): 'id': '1756858', 'ext': 'flv', 'title': 'Televizní noviny - 30. 5. 2015', - 'thumbnail': 're:^https?://.*\.(?:jpg)', + 'thumbnail': r're:^https?://.*\.(?:jpg)', 'upload_date': '20150530', }, 'params': { @@ -72,7 +72,7 @@ class NovaIE(InfoExtractor): 'ext': 'mp4', 'title': 'Zaklínač 3: Divoký hon', 'description': 're:.*Pokud se stejně jako my nemůžete.*', - 'thumbnail': 're:https?://.*\.jpg(\?.*)?', + 'thumbnail': r're:https?://.*\.jpg(\?.*)?', 'upload_date': '20150521', }, 'params': { diff --git a/youtube_dl/extractor/novamov.py b/youtube_dl/extractor/novamov.py index 3bbd47355..829c71960 100644 --- a/youtube_dl/extractor/novamov.py +++ b/youtube_dl/extractor/novamov.py @@ -24,7 +24,7 @@ class NovaMovIE(InfoExtractor): ) (?P[a-z\d]{13}) ''' - _VALID_URL = _VALID_URL_TEMPLATE % {'host': 'novamov\.com'} + _VALID_URL = _VALID_URL_TEMPLATE % {'host': r'novamov\.com'} _HOST = 'www.novamov.com' @@ -104,7 +104,7 @@ class WholeCloudIE(NovaMovIE): IE_NAME = 'wholecloud' IE_DESC = 'WholeCloud' - _VALID_URL = NovaMovIE._VALID_URL_TEMPLATE % {'host': '(?:wholecloud\.net|movshare\.(?:net|sx|ag))'} + _VALID_URL = NovaMovIE._VALID_URL_TEMPLATE % {'host': r'(?:wholecloud\.net|movshare\.(?:net|sx|ag))'} _HOST = 'www.wholecloud.net' @@ -128,7 +128,7 @@ class NowVideoIE(NovaMovIE): IE_NAME = 'nowvideo' IE_DESC = 'NowVideo' - _VALID_URL = NovaMovIE._VALID_URL_TEMPLATE % {'host': 'nowvideo\.(?:to|ch|ec|sx|eu|at|ag|co|li)'} + _VALID_URL = NovaMovIE._VALID_URL_TEMPLATE % {'host': r'nowvideo\.(?:to|ch|ec|sx|eu|at|ag|co|li)'} _HOST = 'www.nowvideo.to' @@ -152,7 +152,7 @@ class VideoWeedIE(NovaMovIE): IE_NAME = 'videoweed' IE_DESC = 'VideoWeed' - _VALID_URL = NovaMovIE._VALID_URL_TEMPLATE % {'host': 'videoweed\.(?:es|com)'} + _VALID_URL = NovaMovIE._VALID_URL_TEMPLATE % {'host': r'videoweed\.(?:es|com)'} _HOST = 'www.videoweed.es' @@ -176,7 +176,7 @@ class CloudTimeIE(NovaMovIE): IE_NAME = 'cloudtime' IE_DESC = 'CloudTime' - _VALID_URL = NovaMovIE._VALID_URL_TEMPLATE % {'host': 'cloudtime\.to'} + _VALID_URL = NovaMovIE._VALID_URL_TEMPLATE % {'host': r'cloudtime\.to'} _HOST = 'www.cloudtime.to' @@ -190,7 +190,7 @@ class AuroraVidIE(NovaMovIE): IE_NAME = 'auroravid' IE_DESC = 'AuroraVid' - _VALID_URL = NovaMovIE._VALID_URL_TEMPLATE % {'host': 'auroravid\.to'} + _VALID_URL = NovaMovIE._VALID_URL_TEMPLATE % {'host': r'auroravid\.to'} _HOST = 'www.auroravid.to' diff --git a/youtube_dl/extractor/nowness.py b/youtube_dl/extractor/nowness.py index 74860eb20..b6c5ee6e4 100644 --- a/youtube_dl/extractor/nowness.py +++ b/youtube_dl/extractor/nowness.py @@ -1,4 +1,4 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals from .brightcove import ( @@ -62,7 +62,7 @@ class NownessIE(NownessBaseIE): 'ext': 'mp4', 'title': 'Candor: The Art of Gesticulation', 'description': 'Candor: The Art of Gesticulation', - 'thumbnail': 're:^https?://.*\.jpg', + 'thumbnail': r're:^https?://.*\.jpg', 'timestamp': 1446745676, 'upload_date': '20151105', 'uploader_id': '2385340575001', @@ -76,7 +76,7 @@ class NownessIE(NownessBaseIE): 'ext': 'mp4', 'title': 'Kasper Bjørke ft. Jaakko Eino Kalevi: TNR', 'description': 'Kasper Bjørke ft. Jaakko Eino Kalevi: TNR', - 'thumbnail': 're:^https?://.*\.jpg', + 'thumbnail': r're:^https?://.*\.jpg', 'timestamp': 1407315371, 'upload_date': '20140806', 'uploader_id': '2385340575001', @@ -91,7 +91,7 @@ class NownessIE(NownessBaseIE): 'ext': 'mp4', 'title': 'Bleu, Blanc, Rouge - A Godard Supercut', 'description': 'md5:f0ea5f1857dffca02dbd37875d742cec', - 'thumbnail': 're:^https?://.*\.jpg', + 'thumbnail': r're:^https?://.*\.jpg', 'upload_date': '20150607', 'uploader': 'Cinema Sem Lei', 'uploader_id': 'cinemasemlei', diff --git a/youtube_dl/extractor/nowtv.py b/youtube_dl/extractor/nowtv.py index 916a102bf..e43b37136 100644 --- a/youtube_dl/extractor/nowtv.py +++ b/youtube_dl/extractor/nowtv.py @@ -83,7 +83,7 @@ class NowTVIE(NowTVBaseIE): 'ext': 'flv', 'title': 'Inka Bause stellt die neuen Bauern vor', 'description': 'md5:e234e1ed6d63cf06be5c070442612e7e', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'timestamp': 1432580700, 'upload_date': '20150525', 'duration': 2786, @@ -101,7 +101,7 @@ class NowTVIE(NowTVBaseIE): 'ext': 'flv', 'title': 'Berlin - Tag & Nacht (Folge 934)', 'description': 'md5:c85e88c2e36c552dfe63433bc9506dd0', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'timestamp': 1432666800, 'upload_date': '20150526', 'duration': 2641, @@ -119,7 +119,7 @@ class NowTVIE(NowTVBaseIE): 'ext': 'flv', 'title': 'Hals- und Beinbruch', 'description': 'md5:b50d248efffe244e6f56737f0911ca57', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'timestamp': 1432415400, 'upload_date': '20150523', 'duration': 2742, @@ -137,7 +137,7 @@ class NowTVIE(NowTVBaseIE): 'ext': 'flv', 'title': 'Angst!', 'description': 'md5:30cbc4c0b73ec98bcd73c9f2a8c17c4e', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'timestamp': 1222632900, 'upload_date': '20080928', 'duration': 3025, @@ -155,7 +155,7 @@ class NowTVIE(NowTVBaseIE): 'ext': 'flv', 'title': 'Thema u.a.: Der erste Blick: Die Apple Watch', 'description': 'md5:4312b6c9d839ffe7d8caf03865a531af', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'timestamp': 1432751700, 'upload_date': '20150527', 'duration': 1083, @@ -173,7 +173,7 @@ class NowTVIE(NowTVBaseIE): 'ext': 'flv', 'title': "Büro-Fall / Chihuahua 'Joel'", 'description': 'md5:e62cb6bf7c3cc669179d4f1eb279ad8d', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'timestamp': 1432408200, 'upload_date': '20150523', 'duration': 3092, diff --git a/youtube_dl/extractor/noz.py b/youtube_dl/extractor/noz.py index c47a33d15..ccafd7723 100644 --- a/youtube_dl/extractor/noz.py +++ b/youtube_dl/extractor/noz.py @@ -24,7 +24,7 @@ class NozIE(InfoExtractor): 'duration': 215, 'title': '3:2 - Deutschland gewinnt Badminton-Länderspiel in Melle', 'description': 'Vor rund 370 Zuschauern gewinnt die deutsche Badminton-Nationalmannschaft am Donnerstag ein EM-Vorbereitungsspiel gegen Frankreich in Melle. Video Moritz Frankenberg.', - 'thumbnail': 're:^http://.*\.jpg', + 'thumbnail': r're:^http://.*\.jpg', }, }] diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index 87f5675c7..962437145 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -3,12 +3,15 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_HTTPError from ..utils import ( fix_xml_ampersands, + orderedSet, parse_duration, qualities, strip_jsonp, unified_strdate, + ExtractorError, ) @@ -180,9 +183,16 @@ class NPOIE(NPOBaseIE): continue streams = format_info.get('streams') if streams: - video_info = self._download_json( - streams[0] + '&type=json', - video_id, 'Downloading %s stream JSON' % format_id) + try: + video_info = self._download_json( + streams[0] + '&type=json', + video_id, 'Downloading %s stream JSON' % format_id) + except ExtractorError as ee: + if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404: + error = (self._parse_json(ee.cause.read().decode(), video_id, fatal=False) or {}).get('errorstring') + if error: + raise ExtractorError(error, expected=True) + raise else: video_info = format_info video_url = video_info.get('url') @@ -231,7 +241,7 @@ class NPOIE(NPOBaseIE): if metadata.get('tt888') == 'ja': subtitles['nl'] = [{ 'ext': 'vtt', - 'url': 'http://e.omroep.nl/tt888/%s' % video_id, + 'url': 'http://tt888.omroep.nl/tt888/%s' % video_id, }] return { @@ -429,7 +439,7 @@ class SchoolTVIE(InfoExtractor): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) video_id = self._search_regex( - r'data-mid=(["\'])(?P.+?)\1', webpage, 'video_id', group='id') + r'data-mid=(["\'])(?P(?:(?!\1).)+)\1', webpage, 'video_id', group='id') return { '_type': 'url_transparent', 'ie_key': 'NPO', @@ -438,9 +448,30 @@ class SchoolTVIE(InfoExtractor): } -class VPROIE(NPOIE): +class NPOPlaylistBaseIE(NPOIE): + def _real_extract(self, url): + playlist_id = self._match_id(url) + + webpage = self._download_webpage(url, playlist_id) + + entries = [ + self.url_result('npo:%s' % video_id if not video_id.startswith('http') else video_id) + for video_id in orderedSet(re.findall(self._PLAYLIST_ENTRY_RE, webpage)) + ] + + playlist_title = self._html_search_regex( + self._PLAYLIST_TITLE_RE, webpage, 'playlist title', + default=None) or self._og_search_title(webpage) + + return self.playlist_result(entries, playlist_id, playlist_title) + + +class VPROIE(NPOPlaylistBaseIE): IE_NAME = 'vpro' - _VALID_URL = r'https?://(?:www\.)?(?:tegenlicht\.)?vpro\.nl/(?:[^/]+/){2,}(?P[^/]+)\.html' + _VALID_URL = r'https?://(?:www\.)?(?:(?:tegenlicht\.)?vpro|2doc)\.nl/(?:[^/]+/)*(?P[^/]+)\.html' + _PLAYLIST_TITLE_RE = (r']+class=["\'].*?\bmedia-platform-title\b.*?["\'][^>]*>([^<]+)', + r']+class=["\'].*?\bmedia-platform-subtitle\b.*?["\'][^>]*>([^<]+)') + _PLAYLIST_ENTRY_RE = r'data-media-id="([^"]+)"' _TESTS = [ { @@ -453,12 +484,13 @@ class VPROIE(NPOIE): 'description': 'md5:52cf4eefbc96fffcbdc06d024147abea', 'upload_date': '20130225', }, + 'skip': 'Video gone', }, { 'url': 'http://www.vpro.nl/programmas/2doc/2015/sergio-herman.html', 'info_dict': { 'id': 'sergio-herman', - 'title': 'Sergio Herman: Fucking perfect', + 'title': 'sergio herman: fucking perfect', }, 'playlist_count': 2, }, @@ -467,54 +499,61 @@ class VPROIE(NPOIE): 'url': 'http://www.vpro.nl/programmas/2doc/2015/education-education.html', 'info_dict': { 'id': 'education-education', - 'title': '2Doc', + 'title': 'education education', }, 'playlist_count': 2, + }, + { + 'url': 'http://www.2doc.nl/documentaires/series/2doc/2015/oktober/de-tegenprestatie.html', + 'info_dict': { + 'id': 'de-tegenprestatie', + 'title': 'De Tegenprestatie', + }, + 'playlist_count': 2, + }, { + 'url': 'http://www.2doc.nl/speel~VARA_101375237~mh17-het-verdriet-van-nederland~.html', + 'info_dict': { + 'id': 'VARA_101375237', + 'ext': 'm4v', + 'title': 'MH17: Het verdriet van Nederland', + 'description': 'md5:09e1a37c1fdb144621e22479691a9f18', + 'upload_date': '20150716', + }, + 'params': { + # Skip because of m3u8 download + 'skip_download': True + }, } ] - def _real_extract(self, url): - playlist_id = self._match_id(url) - webpage = self._download_webpage(url, playlist_id) - - entries = [ - self.url_result('npo:%s' % video_id if not video_id.startswith('http') else video_id) - for video_id in re.findall(r'data-media-id="([^"]+)"', webpage) - ] - - playlist_title = self._search_regex( - r'\s*([^>]+?)\s*-\s*Teledoc\s*-\s*VPRO\s*', - webpage, 'playlist title', default=None) or self._og_search_title(webpage) - - return self.playlist_result(entries, playlist_id, playlist_title) - - -class WNLIE(InfoExtractor): +class WNLIE(NPOPlaylistBaseIE): + IE_NAME = 'wnl' _VALID_URL = r'https?://(?:www\.)?omroepwnl\.nl/video/detail/(?P[^/]+)__\d+' + _PLAYLIST_TITLE_RE = r'(?s)]+class="subject"[^>]*>(.+?)' + _PLAYLIST_ENTRY_RE = r']+href="([^"]+)"[^>]+class="js-mid"[^>]*>Deel \d+' - _TEST = { + _TESTS = [{ 'url': 'http://www.omroepwnl.nl/video/detail/vandaag-de-dag-6-mei__060515', 'info_dict': { 'id': 'vandaag-de-dag-6-mei', 'title': 'Vandaag de Dag 6 mei', }, 'playlist_count': 4, - } + }] - def _real_extract(self, url): - playlist_id = self._match_id(url) - webpage = self._download_webpage(url, playlist_id) +class AndereTijdenIE(NPOPlaylistBaseIE): + IE_NAME = 'anderetijden' + _VALID_URL = r'https?://(?:www\.)?anderetijden\.nl/programma/(?:[^/]+/)+(?P[^/?#&]+)' + _PLAYLIST_TITLE_RE = r'(?s)]+class=["\'].*?\bpage-title\b.*?["\'][^>]*>(.+?)' + _PLAYLIST_ENTRY_RE = r']+class=["\']episode-container episode-page["\'][^>]+data-prid=["\'](.+?)["\']' - entries = [ - self.url_result('npo:%s' % video_id, 'NPO') - for video_id, part in re.findall( - r']+href="([^"]+)"[^>]+class="js-mid"[^>]*>(Deel \d+)', webpage) - ] - - playlist_title = self._html_search_regex( - r'(?s)]+class="subject"[^>]*>(.+?)', - webpage, 'playlist title') - - return self.playlist_result(entries, playlist_id, playlist_title) + _TESTS = [{ + 'url': 'http://anderetijden.nl/programma/1/Andere-Tijden/aflevering/676/Duitse-soldaten-over-de-Slag-bij-Arnhem', + 'info_dict': { + 'id': 'Duitse-soldaten-over-de-Slag-bij-Arnhem', + 'title': 'Duitse soldaten over de Slag bij Arnhem', + }, + 'playlist_count': 3, + }] diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index 6ded5bd45..fc3c0cd3c 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -1,6 +1,7 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals +import random import re from .common import InfoExtractor @@ -14,15 +15,24 @@ from ..utils import ( class NRKBaseIE(InfoExtractor): - def _extract_formats(self, manifest_url, video_id, fatal=True): - formats = [] - formats.extend(self._extract_f4m_formats( - manifest_url + '?hdcore=3.5.0&plugin=aasp-3.5.0.151.81', - video_id, f4m_id='hds', fatal=fatal)) - formats.extend(self._extract_m3u8_formats(manifest_url.replace( - 'akamaihd.net/z/', 'akamaihd.net/i/').replace('/manifest.f4m', '/master.m3u8'), - video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=fatal)) - return formats + _faked_ip = None + + def _download_webpage_handle(self, *args, **kwargs): + # NRK checks X-Forwarded-For HTTP header in order to figure out the + # origin of the client behind proxy. This allows to bypass geo + # restriction by faking this header's value to some Norway IP. + # We will do so once we encounter any geo restriction error. + if self._faked_ip: + # NB: str is intentional + kwargs.setdefault(str('headers'), {})['X-Forwarded-For'] = self._faked_ip + return super(NRKBaseIE, self)._download_webpage_handle(*args, **kwargs) + + def _fake_ip(self): + # Use fake IP from 37.191.128.0/17 in order to workaround geo + # restriction + def octet(lb=0, ub=255): + return random.randint(lb, ub) + self._faked_ip = '37.191.%d.%d' % (octet(128), octet()) def _real_extract(self, url): video_id = self._match_id(url) @@ -34,8 +44,17 @@ class NRKBaseIE(InfoExtractor): title = data.get('fullTitle') or data.get('mainTitle') or data['title'] video_id = data.get('id') or video_id + http_headers = {'X-Forwarded-For': self._faked_ip} if self._faked_ip else {} + entries = [] + conviva = data.get('convivaStatistics') or {} + live = (data.get('mediaElementType') == 'Live' or + data.get('isLive') is True or conviva.get('isLive')) + + def make_title(t): + return self._live_title(t) if live else t + media_assets = data.get('mediaAssets') if media_assets and isinstance(media_assets, list): def video_id_and_title(idx): @@ -45,10 +64,17 @@ class NRKBaseIE(InfoExtractor): asset_url = asset.get('url') if not asset_url: continue - formats = self._extract_formats(asset_url, video_id, fatal=False) + formats = self._extract_akamai_formats(asset_url, video_id) if not formats: continue self._sort_formats(formats) + + # Some f4m streams may not work with hdcore in fragments' URLs + for f in formats: + extra_param = f.get('extra_param_to_segment_url') + if extra_param and 'hdcore' in extra_param: + del f['extra_param_to_segment_url'] + entry_id, entry_title = video_id_and_title(num) duration = parse_duration(asset.get('duration')) subtitles = {} @@ -60,35 +86,64 @@ class NRKBaseIE(InfoExtractor): }) entries.append({ 'id': asset.get('carrierId') or entry_id, - 'title': entry_title, + 'title': make_title(entry_title), 'duration': duration, 'subtitles': subtitles, 'formats': formats, + 'http_headers': http_headers, }) if not entries: media_url = data.get('mediaUrl') if media_url: - formats = self._extract_formats(media_url, video_id) + formats = self._extract_akamai_formats(media_url, video_id) self._sort_formats(formats) duration = parse_duration(data.get('duration')) entries = [{ 'id': video_id, - 'title': title, + 'title': make_title(title), 'duration': duration, 'formats': formats, }] if not entries: - if data.get('usageRights', {}).get('isGeoBlocked'): - raise ExtractorError( - 'NRK har ikke rettigheter til å vise dette programmet utenfor Norge', - expected=True) + message_type = data.get('messageType', '') + # Can be ProgramIsGeoBlocked or ChannelIsGeoBlocked* + if 'IsGeoBlocked' in message_type and not self._faked_ip: + self.report_warning( + 'Video is geo restricted, trying to fake IP') + self._fake_ip() + return self._real_extract(url) + + MESSAGES = { + 'ProgramRightsAreNotReady': 'Du kan dessverre ikke se eller høre programmet', + 'ProgramRightsHasExpired': 'Programmet har gått ut', + 'ProgramIsGeoBlocked': 'NRK har ikke rettigheter til å vise dette programmet utenfor Norge', + } + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, MESSAGES.get( + message_type, message_type)), + expected=True) - conviva = data.get('convivaStatistics') or {} series = conviva.get('seriesName') or data.get('seriesTitle') episode = conviva.get('episodeName') or data.get('episodeNumberOrDate') + season_number = None + episode_number = None + if data.get('mediaElementType') == 'Episode': + _season_episode = data.get('scoresStatistics', {}).get('springStreamStream') or \ + data.get('relativeOriginUrl', '') + EPISODENUM_RE = [ + r'/s(?P\d{,2})e(?P\d{,2})\.', + r'/sesong-(?P\d{,2})/episode-(?P\d{,2})', + ] + season_number = int_or_none(self._search_regex( + EPISODENUM_RE, _season_episode, 'season number', + default=None, group='season')) + episode_number = int_or_none(self._search_regex( + EPISODENUM_RE, _season_episode, 'episode number', + default=None, group='episode')) + thumbnails = None images = data.get('images') if images and isinstance(images, dict): @@ -101,11 +156,15 @@ class NRKBaseIE(InfoExtractor): } for image in web_images if image.get('imageUrl')] description = data.get('description') + category = data.get('mediaAnalytics', {}).get('category') common_info = { 'description': description, 'series': series, 'episode': episode, + 'season_number': season_number, + 'episode_number': episode_number, + 'categories': [category] if category else None, 'age_limit': parse_age_limit(data.get('legalAge')), 'thumbnails': thumbnails, } @@ -123,7 +182,17 @@ class NRKBaseIE(InfoExtractor): class NRKIE(NRKBaseIE): - _VALID_URL = r'(?:nrk:|https?://(?:www\.)?nrk\.no/video/PS\*)(?P\d+)' + _VALID_URL = r'''(?x) + (?: + nrk:| + https?:// + (?: + (?:www\.)?nrk\.no/video/PS\*| + v8-psapi\.nrk\.no/mediaelement/ + ) + ) + (?P[^/?#&]+) + ''' _API_HOST = 'v8.psapi.nrk.no' _TESTS = [{ # video @@ -147,12 +216,26 @@ class NRKIE(NRKBaseIE): 'description': 'md5:a621f5cc1bd75c8d5104cb048c6b8568', 'duration': 20, } + }, { + 'url': 'nrk:ecc1b952-96dc-4a98-81b9-5296dc7a98d9', + 'only_matching': True, + }, { + 'url': 'https://v8-psapi.nrk.no/mediaelement/ecc1b952-96dc-4a98-81b9-5296dc7a98d9', + 'only_matching': True, }] class NRKTVIE(NRKBaseIE): IE_DESC = 'NRK TV and NRK Radio' - _VALID_URL = r'https?://(?:tv|radio)\.nrk(?:super)?\.no/(?:serie/[^/]+|program)/(?P[a-zA-Z]{4}\d{8})(?:/\d{2}-\d{2}-\d{4})?(?:#del=(?P\d+))?' + _EPISODE_RE = r'(?P[a-zA-Z]{4}\d{8})' + _VALID_URL = r'''(?x) + https?:// + (?:tv|radio)\.nrk(?:super)?\.no/ + (?:serie/[^/]+|program)/ + (?![Ee]pisodes)%s + (?:/\d{2}-\d{2}-\d{4})? + (?:\#del=(?P\d+))? + ''' % _EPISODE_RE _API_HOST = 'psapi-we.nrk.no' _TESTS = [{ @@ -164,63 +247,145 @@ class NRKTVIE(NRKBaseIE): 'title': '20 spørsmål 23.05.2014', 'description': 'md5:bdea103bc35494c143c6a9acdd84887a', 'duration': 1741, + 'series': '20 spørsmål - TV', + 'episode': '23.05.2014', }, }, { 'url': 'https://tv.nrk.no/program/mdfp15000514', - 'md5': '43d0be26663d380603a9cf0c24366531', 'info_dict': { 'id': 'MDFP15000514CA', 'ext': 'mp4', 'title': 'Grunnlovsjubiléet - Stor ståhei for ingenting 24.05.2014', 'description': 'md5:89290c5ccde1b3a24bb8050ab67fe1db', 'duration': 4605, + 'series': 'Kunnskapskanalen', + 'episode': '24.05.2014', + }, + 'params': { + 'skip_download': True, }, }, { # single playlist video 'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015#del=2', - 'md5': 'adbd1dbd813edaf532b0a253780719c2', 'info_dict': { 'id': 'MSPO40010515-part2', 'ext': 'flv', 'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn 06.01.2015 (del 2:2)', 'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26', }, - 'skip': 'Only works from Norway', + 'params': { + 'skip_download': True, + }, + 'expected_warnings': ['Video is geo restricted'], + 'skip': 'particular part is not supported currently', }, { 'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015', 'playlist': [{ - 'md5': '9480285eff92d64f06e02a5367970a7a', 'info_dict': { - 'id': 'MSPO40010515-part1', - 'ext': 'flv', - 'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn 06.01.2015 (del 1:2)', - 'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26', + 'id': 'MSPO40010515AH', + 'ext': 'mp4', + 'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015 (Part 1)', + 'description': 'md5:c03aba1e917561eface5214020551b7a', + 'duration': 772, + 'series': 'Tour de Ski', + 'episode': '06.01.2015', + }, + 'params': { + 'skip_download': True, }, }, { - 'md5': 'adbd1dbd813edaf532b0a253780719c2', 'info_dict': { - 'id': 'MSPO40010515-part2', - 'ext': 'flv', - 'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn 06.01.2015 (del 2:2)', - 'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26', + 'id': 'MSPO40010515BH', + 'ext': 'mp4', + 'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015 (Part 2)', + 'description': 'md5:c03aba1e917561eface5214020551b7a', + 'duration': 6175, + 'series': 'Tour de Ski', + 'episode': '06.01.2015', + }, + 'params': { + 'skip_download': True, }, }], 'info_dict': { 'id': 'MSPO40010515', - 'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn', - 'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26', - 'duration': 6947.52, + 'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015', + 'description': 'md5:c03aba1e917561eface5214020551b7a', + }, + 'expected_warnings': ['Video is geo restricted'], + }, { + 'url': 'https://tv.nrk.no/serie/anno/KMTE50001317/sesong-3/episode-13', + 'info_dict': { + 'id': 'KMTE50001317AA', + 'ext': 'mp4', + 'title': 'Anno 13:30', + 'description': 'md5:11d9613661a8dbe6f9bef54e3a4cbbfa', + 'duration': 2340, + 'series': 'Anno', + 'episode': '13:30', + 'season_number': 3, + 'episode_number': 13, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://tv.nrk.no/serie/nytt-paa-nytt/MUHH46000317/27-01-2017', + 'info_dict': { + 'id': 'MUHH46000317AA', + 'ext': 'mp4', + 'title': 'Nytt på Nytt 27.01.2017', + 'description': 'md5:5358d6388fba0ea6f0b6d11c48b9eb4b', + 'duration': 1796, + 'series': 'Nytt på nytt', + 'episode': '27.01.2017', + }, + 'params': { + 'skip_download': True, }, - 'skip': 'Only works from Norway', }, { 'url': 'https://radio.nrk.no/serie/dagsnytt/NPUB21019315/12-07-2015#', 'only_matching': True, }] -class NRKPlaylistIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?nrk\.no/(?!video|skole)(?:[^/]+/)+(?P[^/]+)' +class NRKTVDirekteIE(NRKTVIE): + IE_DESC = 'NRK TV Direkte and NRK Radio Direkte' + _VALID_URL = r'https?://(?:tv|radio)\.nrk\.no/direkte/(?P[^/?#&]+)' + _TESTS = [{ + 'url': 'https://tv.nrk.no/direkte/nrk1', + 'only_matching': True, + }, { + 'url': 'https://radio.nrk.no/direkte/p1_oslo_akershus', + 'only_matching': True, + }] + + +class NRKPlaylistBaseIE(InfoExtractor): + def _extract_description(self, webpage): + pass + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + webpage = self._download_webpage(url, playlist_id) + + entries = [ + self.url_result('nrk:%s' % video_id, NRKIE.ie_key()) + for video_id in re.findall(self._ITEM_RE, webpage) + ] + + playlist_title = self. _extract_title(webpage) + playlist_description = self._extract_description(webpage) + + return self.playlist_result( + entries, playlist_id, playlist_title, playlist_description) + + +class NRKPlaylistIE(NRKPlaylistBaseIE): + _VALID_URL = r'https?://(?:www\.)?nrk\.no/(?!video|skole)(?:[^/]+/)+(?P[^/]+)' + _ITEM_RE = r'class="[^"]*\brich\b[^"]*"[^>]+data-video-id="([^"]+)"' _TESTS = [{ 'url': 'http://www.nrk.no/troms/gjenopplev-den-historiske-solformorkelsen-1.12270763', 'info_dict': { @@ -239,23 +404,86 @@ class NRKPlaylistIE(InfoExtractor): 'playlist_count': 5, }] - def _real_extract(self, url): - playlist_id = self._match_id(url) + def _extract_title(self, webpage): + return self._og_search_title(webpage, fatal=False) - webpage = self._download_webpage(url, playlist_id) + def _extract_description(self, webpage): + return self._og_search_description(webpage) + + +class NRKTVEpisodesIE(NRKPlaylistBaseIE): + _VALID_URL = r'https?://tv\.nrk\.no/program/[Ee]pisodes/[^/]+/(?P\d+)' + _ITEM_RE = r'data-episode=["\']%s' % NRKTVIE._EPISODE_RE + _TESTS = [{ + 'url': 'https://tv.nrk.no/program/episodes/nytt-paa-nytt/69031', + 'info_dict': { + 'id': '69031', + 'title': 'Nytt på nytt, sesong: 201210', + }, + 'playlist_count': 4, + }] + + def _extract_title(self, webpage): + return self._html_search_regex( + r'

([^<]+)

', webpage, 'title', fatal=False) + + +class NRKTVSeriesIE(InfoExtractor): + _VALID_URL = r'https?://(?:tv|radio)\.nrk(?:super)?\.no/serie/(?P[^/]+)' + _ITEM_RE = r'(?:data-season=["\']|id=["\']season-)(?P\d+)' + _TESTS = [{ + 'url': 'https://tv.nrk.no/serie/groenn-glede', + 'info_dict': { + 'id': 'groenn-glede', + 'title': 'Grønn glede', + 'description': 'md5:7576e92ae7f65da6993cf90ee29e4608', + }, + 'playlist_mincount': 9, + }, { + 'url': 'http://tv.nrksuper.no/serie/labyrint', + 'info_dict': { + 'id': 'labyrint', + 'title': 'Labyrint', + 'description': 'md5:58afd450974c89e27d5a19212eee7115', + }, + 'playlist_mincount': 3, + }, { + 'url': 'https://tv.nrk.no/serie/broedrene-dal-og-spektralsteinene', + 'only_matching': True, + }, { + 'url': 'https://tv.nrk.no/serie/saving-the-human-race', + 'only_matching': True, + }, { + 'url': 'https://tv.nrk.no/serie/postmann-pat', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if NRKTVIE.suitable(url) else super(NRKTVSeriesIE, cls).suitable(url) + + def _real_extract(self, url): + series_id = self._match_id(url) + + webpage = self._download_webpage(url, series_id) entries = [ - self.url_result('nrk:%s' % video_id, 'NRK') - for video_id in re.findall( - r'class="[^"]*\brich\b[^"]*"[^>]+data-video-id="([^"]+)"', - webpage) + self.url_result( + 'https://tv.nrk.no/program/Episodes/{series}/{season}'.format( + series=series_id, season=season_id)) + for season_id in re.findall(self._ITEM_RE, webpage) ] - playlist_title = self._og_search_title(webpage) - playlist_description = self._og_search_description(webpage) + title = self._html_search_meta( + 'seriestitle', webpage, + 'title', default=None) or self._og_search_title( + webpage, fatal=False) - return self.playlist_result( - entries, playlist_id, playlist_title, playlist_description) + description = self._html_search_meta( + 'series_description', webpage, + 'description', default=None) or self._og_search_description(webpage) + + return self.playlist_result(entries, series_id, title, description) class NRKSkoleIE(InfoExtractor): diff --git a/youtube_dl/extractor/ntvde.py b/youtube_dl/extractor/ntvde.py index a83e85cb8..101a5374c 100644 --- a/youtube_dl/extractor/ntvde.py +++ b/youtube_dl/extractor/ntvde.py @@ -1,6 +1,8 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..compat import compat_urlparse from ..utils import ( @@ -20,7 +22,7 @@ class NTVDeIE(InfoExtractor): 'info_dict': { 'id': '14438086', 'ext': 'mp4', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'title': 'Schnee und Glätte führen zu zahlreichen Unfällen und Staus', 'alt_title': 'Winterchaos auf deutschen Straßen', 'description': 'Schnee und Glätte sorgen deutschlandweit für einen chaotischen Start in die Woche: Auf den Straßen kommt es zu kilometerlangen Staus und Dutzenden Glätteunfällen. In Düsseldorf und München wirbelt der Schnee zudem den Flugplan durcheinander. Dutzende Flüge landen zu spät, einige fallen ganz aus.', @@ -40,8 +42,8 @@ class NTVDeIE(InfoExtractor): timestamp = int_or_none(info.get('publishedDateAsUnixTimeStamp')) vdata = self._parse_json(self._search_regex( r'(?s)\$\(\s*"\#player"\s*\)\s*\.data\(\s*"player",\s*(\{.*?\})\);', - webpage, 'player data'), - video_id, transform_source=js_to_json) + webpage, 'player data'), video_id, + transform_source=lambda s: js_to_json(re.sub(r'advertising:\s*{[^}]+},', '', s))) duration = parse_duration(vdata.get('duration')) formats = [] diff --git a/youtube_dl/extractor/ntvru.py b/youtube_dl/extractor/ntvru.py index e8702ebcd..4f9cedb84 100644 --- a/youtube_dl/extractor/ntvru.py +++ b/youtube_dl/extractor/ntvru.py @@ -1,4 +1,4 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor @@ -21,7 +21,7 @@ class NTVRuIE(InfoExtractor): 'ext': 'mp4', 'title': 'Командующий Черноморским флотом провел переговоры в штабе ВМС Украины', 'description': 'Командующий Черноморским флотом провел переговоры в штабе ВМС Украины', - 'thumbnail': 're:^http://.*\.jpg', + 'thumbnail': r're:^http://.*\.jpg', 'duration': 136, }, }, { @@ -32,7 +32,7 @@ class NTVRuIE(InfoExtractor): 'ext': 'mp4', 'title': 'Родные пассажиров пропавшего Boeing не верят в трагический исход', 'description': 'Родные пассажиров пропавшего Boeing не верят в трагический исход', - 'thumbnail': 're:^http://.*\.jpg', + 'thumbnail': r're:^http://.*\.jpg', 'duration': 172, }, }, { @@ -43,7 +43,7 @@ class NTVRuIE(InfoExtractor): 'ext': 'mp4', 'title': '«Сегодня». 21 марта 2014 года. 16:00', 'description': '«Сегодня». 21 марта 2014 года. 16:00', - 'thumbnail': 're:^http://.*\.jpg', + 'thumbnail': r're:^http://.*\.jpg', 'duration': 1496, }, }, { @@ -54,7 +54,7 @@ class NTVRuIE(InfoExtractor): 'ext': 'mp4', 'title': 'Остросюжетный фильм «Кома»', 'description': 'Остросюжетный фильм «Кома»', - 'thumbnail': 're:^http://.*\.jpg', + 'thumbnail': r're:^http://.*\.jpg', 'duration': 5592, }, }, { @@ -65,7 +65,7 @@ class NTVRuIE(InfoExtractor): 'ext': 'mp4', 'title': '«Дело врачей»: «Деревце жизни»', 'description': '«Дело врачей»: «Деревце жизни»', - 'thumbnail': 're:^http://.*\.jpg', + 'thumbnail': r're:^http://.*\.jpg', 'duration': 2590, }, }] diff --git a/youtube_dl/extractor/nuevo.py b/youtube_dl/extractor/nuevo.py index ef093dec2..87fb94d1f 100644 --- a/youtube_dl/extractor/nuevo.py +++ b/youtube_dl/extractor/nuevo.py @@ -1,4 +1,4 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor diff --git a/youtube_dl/extractor/nytimes.py b/youtube_dl/extractor/nytimes.py index 681683e86..2bb77ab24 100644 --- a/youtube_dl/extractor/nytimes.py +++ b/youtube_dl/extractor/nytimes.py @@ -1,26 +1,40 @@ +# coding: utf-8 from __future__ import unicode_literals +import hmac +import hashlib +import base64 + from .common import InfoExtractor from ..utils import ( + determine_ext, float_or_none, int_or_none, + js_to_json, + mimetype2ext, parse_iso8601, + remove_start, ) class NYTimesBaseIE(InfoExtractor): + _SECRET = b'pX(2MbU2);4N{7J8)>YwKRJ+/pQ3JkiU2Q^V>mFYv6g6gYvt6v' + def _extract_video_from_id(self, video_id): - video_data = self._download_json( - 'http://www.nytimes.com/svc/video/api/v2/video/%s' % video_id, - video_id, 'Downloading video JSON') + # Authorization generation algorithm is reverse engineered from `signer` in + # http://graphics8.nytimes.com/video/vhs/vhs-2.x.min.js + path = '/svc/video/api/v3/video/' + video_id + hm = hmac.new(self._SECRET, (path + ':vhs').encode(), hashlib.sha512).hexdigest() + video_data = self._download_json('http://www.nytimes.com' + path, video_id, 'Downloading video JSON', headers={ + 'Authorization': 'NYTV ' + base64.b64encode(hm.encode()).decode(), + 'X-NYTV': 'vhs', + }, fatal=False) + if not video_data: + video_data = self._download_json( + 'http://www.nytimes.com/svc/video/api/v2/video/' + video_id, + video_id, 'Downloading video JSON') title = video_data['headline'] - description = video_data.get('summary') - duration = float_or_none(video_data.get('duration'), 1000) - - uploader = video_data.get('byline') - publication_date = video_data.get('publication_date') - timestamp = parse_iso8601(publication_date[:-8]) if publication_date else None def get_file_size(file_size): if isinstance(file_size, int): @@ -28,35 +42,59 @@ class NYTimesBaseIE(InfoExtractor): elif isinstance(file_size, dict): return int(file_size.get('value', 0)) else: - return 0 + return None - formats = [ - { - 'url': video['url'], - 'format_id': video.get('type'), - 'vcodec': video.get('video_codec'), - 'width': int_or_none(video.get('width')), - 'height': int_or_none(video.get('height')), - 'filesize': get_file_size(video.get('fileSize')), - } for video in video_data['renditions'] if video.get('url') - ] + urls = [] + formats = [] + for video in video_data.get('renditions', []): + video_url = video.get('url') + format_id = video.get('type') + if not video_url or format_id == 'thumbs' or video_url in urls: + continue + urls.append(video_url) + ext = mimetype2ext(video.get('mimetype')) or determine_ext(video_url) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + video_url, video_id, 'mp4', 'm3u8_native', + m3u8_id=format_id or 'hls', fatal=False)) + elif ext == 'mpd': + continue + # formats.extend(self._extract_mpd_formats( + # video_url, video_id, format_id or 'dash', fatal=False)) + else: + formats.append({ + 'url': video_url, + 'format_id': format_id, + 'vcodec': video.get('videoencoding') or video.get('video_codec'), + 'width': int_or_none(video.get('width')), + 'height': int_or_none(video.get('height')), + 'filesize': get_file_size(video.get('file_size') or video.get('fileSize')), + 'tbr': int_or_none(video.get('bitrate'), 1000), + 'ext': ext, + }) self._sort_formats(formats) - thumbnails = [ - { - 'url': 'http://www.nytimes.com/%s' % image['url'], + thumbnails = [] + for image in video_data.get('images', []): + image_url = image.get('url') + if not image_url: + continue + thumbnails.append({ + 'url': 'http://www.nytimes.com/' + image_url, 'width': int_or_none(image.get('width')), 'height': int_or_none(image.get('height')), - } for image in video_data.get('images', []) if image.get('url') - ] + }) + + publication_date = video_data.get('publication_date') + timestamp = parse_iso8601(publication_date[:-8]) if publication_date else None return { 'id': video_id, 'title': title, - 'description': description, + 'description': video_data.get('summary'), 'timestamp': timestamp, - 'uploader': uploader, - 'duration': duration, + 'uploader': video_data.get('byline'), + 'duration': float_or_none(video_data.get('duration'), 1000), 'formats': formats, 'thumbnails': thumbnails, } @@ -67,7 +105,7 @@ class NYTimesIE(NYTimesBaseIE): _TESTS = [{ 'url': 'http://www.nytimes.com/video/opinion/100000002847155/verbatim-what-is-a-photocopier.html?playlistId=100000001150263', - 'md5': '18a525a510f942ada2720db5f31644c0', + 'md5': 'd665342765db043f7e225cff19df0f2d', 'info_dict': { 'id': '100000002847155', 'ext': 'mov', @@ -103,16 +141,83 @@ class NYTimesArticleIE(NYTimesBaseIE): 'upload_date': '20150414', 'uploader': 'Matthew Williams', } + }, { + 'url': 'http://www.nytimes.com/2016/10/14/podcasts/revelations-from-the-final-weeks.html', + 'md5': 'e0d52040cafb07662acf3c9132db3575', + 'info_dict': { + 'id': '100000004709062', + 'title': 'The Run-Up: ‘He Was Like an Octopus’', + 'ext': 'mp3', + 'description': 'md5:fb5c6b93b12efc51649b4847fe066ee4', + 'series': 'The Run-Up', + 'episode': '‘He Was Like an Octopus’', + 'episode_number': 20, + 'duration': 2130, + } + }, { + 'url': 'http://www.nytimes.com/2016/10/16/books/review/inside-the-new-york-times-book-review-the-rise-of-hitler.html', + 'info_dict': { + 'id': '100000004709479', + 'title': 'The Rise of Hitler', + 'ext': 'mp3', + 'description': 'md5:bce877fd9e3444990cb141875fab0028', + 'creator': 'Pamela Paul', + 'duration': 3475, + }, + 'params': { + 'skip_download': True, + }, }, { 'url': 'http://www.nytimes.com/news/minute/2014/03/17/times-minute-whats-next-in-crimea/?_php=true&_type=blogs&_php=true&_type=blogs&_r=1', 'only_matching': True, }] + def _extract_podcast_from_json(self, json, page_id, webpage): + podcast_audio = self._parse_json( + json, page_id, transform_source=js_to_json) + + audio_data = podcast_audio['data'] + track = audio_data['track'] + + episode_title = track['title'] + video_url = track['source'] + + description = track.get('description') or self._html_search_meta( + ['og:description', 'twitter:description'], webpage) + + podcast_title = audio_data.get('podcast', {}).get('title') + title = ('%s: %s' % (podcast_title, episode_title) + if podcast_title else episode_title) + + episode = audio_data.get('podcast', {}).get('episode') or '' + episode_number = int_or_none(self._search_regex( + r'[Ee]pisode\s+(\d+)', episode, 'episode number', default=None)) + + return { + 'id': remove_start(podcast_audio.get('target'), 'FT') or page_id, + 'url': video_url, + 'title': title, + 'description': description, + 'creator': track.get('credit'), + 'series': podcast_title, + 'episode': episode_title, + 'episode_number': episode_number, + 'duration': int_or_none(track.get('duration')), + } + def _real_extract(self, url): - video_id = self._match_id(url) + page_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + webpage = self._download_webpage(url, page_id) - video_id = self._html_search_regex(r'data-videoid="(\d+)"', webpage, 'video id') + video_id = self._search_regex( + r'data-videoid=["\'](\d+)', webpage, 'video id', + default=None, fatal=False) + if video_id is not None: + return self._extract_video_from_id(video_id) - return self._extract_video_from_id(video_id) + podcast_data = self._search_regex( + (r'NYTD\.FlexTypes\.push\s*\(\s*({.+?})\s*\)\s*;\s*\d+)' + _TEST = { + 'url': 'http://www.nzz.ch/zuerich/gymizyte/gymizyte-schreiben-schueler-heute-noch-diktate-ld.9153', + 'info_dict': { + 'id': '9153', + }, + 'playlist_mincount': 6, + } + + def _real_extract(self, url): + page_id = self._match_id(url) + webpage = self._download_webpage(url, page_id) + + entries = [] + for player_element in re.findall(r'(<[^>]+class="kalturaPlayer"[^>]*>)', webpage): + player_params = extract_attributes(player_element) + if player_params.get('data-type') not in ('kaltura_singleArticle',): + self.report_warning('Unsupported player type') + continue + entry_id = player_params['data-id'] + entries.append(self.url_result( + 'kaltura:1750922:' + entry_id, 'Kaltura', entry_id)) + + return self.playlist_result(entries, page_id) diff --git a/youtube_dl/extractor/oktoberfesttv.py b/youtube_dl/extractor/oktoberfesttv.py index 4a41c0542..a914068f9 100644 --- a/youtube_dl/extractor/oktoberfesttv.py +++ b/youtube_dl/extractor/oktoberfesttv.py @@ -1,11 +1,11 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor class OktoberfestTVIE(InfoExtractor): - _VALID_URL = r'https?://www\.oktoberfest-tv\.de/[^/]+/[^/]+/video/(?P[^/?#]+)' + _VALID_URL = r'https?://(?:www\.)?oktoberfest-tv\.de/[^/]+/[^/]+/video/(?P[^/?#]+)' _TEST = { 'url': 'http://www.oktoberfest-tv.de/de/kameras/video/hb-zelt', @@ -13,7 +13,7 @@ class OktoberfestTVIE(InfoExtractor): 'id': 'hb-zelt', 'ext': 'mp4', 'title': 're:^Live-Kamera: Hofbräuzelt [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'is_live': True, }, 'params': { diff --git a/youtube_dl/extractor/ondemandkorea.py b/youtube_dl/extractor/ondemandkorea.py new file mode 100644 index 000000000..de1d6b08a --- /dev/null +++ b/youtube_dl/extractor/ondemandkorea.py @@ -0,0 +1,60 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .jwplatform import JWPlatformBaseIE +from ..utils import ( + ExtractorError, + js_to_json, +) + + +class OnDemandKoreaIE(JWPlatformBaseIE): + _VALID_URL = r'https?://(?:www\.)?ondemandkorea\.com/(?P[^/]+)\.html' + _TEST = { + 'url': 'http://www.ondemandkorea.com/ask-us-anything-e43.html', + 'info_dict': { + 'id': 'ask-us-anything-e43', + 'ext': 'mp4', + 'title': 'Ask Us Anything : E43', + 'thumbnail': r're:^https?://.*\.jpg$', + }, + 'params': { + 'skip_download': 'm3u8 download' + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id, fatal=False) + + if not webpage: + # Page sometimes returns captcha page with HTTP 403 + raise ExtractorError( + 'Unable to access page. You may have been blocked.', + expected=True) + + if 'msg_block_01.png' in webpage: + self.raise_geo_restricted( + 'This content is not available in your region') + + if 'This video is only available to ODK PLUS members.' in webpage: + raise ExtractorError( + 'This video is only available to ODK PLUS members.', + expected=True) + + title = self._og_search_title(webpage) + + jw_config = self._parse_json( + self._search_regex( + r'(?s)jwplayer\(([\'"])(?:(?!\1).)+\1\)\.setup\s*\((?P.+?)\);', + webpage, 'jw config', group='options'), + video_id, transform_source=js_to_json) + info = self._parse_jwplayer_data( + jw_config, video_id, require_title=False, m3u8_id='hls', + base_url=url) + + info.update({ + 'title': title, + 'thumbnail': self._og_search_thumbnail(webpage), + }) + return info diff --git a/youtube_dl/extractor/onet.py b/youtube_dl/extractor/onet.py index fc22ad5eb..0a501b3e5 100644 --- a/youtube_dl/extractor/onet.py +++ b/youtube_dl/extractor/onet.py @@ -56,8 +56,8 @@ class OnetBaseIE(InfoExtractor): continue ext = determine_ext(video_url) if format_id == 'ism': - # TODO: Support Microsoft Smooth Streaming - continue + formats.extend(self._extract_ism_formats( + video_url, video_id, 'mss', fatal=False)) elif ext == 'mpd': formats.extend(self._extract_mpd_formats( video_url, video_id, mpd_id='dash', fatal=False)) @@ -90,7 +90,7 @@ class OnetBaseIE(InfoExtractor): class OnetIE(OnetBaseIE): - _VALID_URL = 'https?://(?:www\.)?onet\.tv/[a-z]/[a-z]+/(?P[0-9a-z-]+)/(?P[0-9a-z]+)' + _VALID_URL = r'https?://(?:www\.)?onet\.tv/[a-z]/[a-z]+/(?P[0-9a-z-]+)/(?P[0-9a-z]+)' IE_NAME = 'onet.tv' _TEST = { diff --git a/youtube_dl/extractor/onionstudios.py b/youtube_dl/extractor/onionstudios.py index 6fb1a3fcc..1d336cf30 100644 --- a/youtube_dl/extractor/onionstudios.py +++ b/youtube_dl/extractor/onionstudios.py @@ -22,7 +22,7 @@ class OnionStudiosIE(InfoExtractor): 'id': '2937', 'ext': 'mp4', 'title': 'Hannibal charges forward, stops for a cocktail', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'uploader': 'The A.V. Club', 'uploader_id': 'the-av-club', }, diff --git a/youtube_dl/extractor/ooyala.py b/youtube_dl/extractor/ooyala.py index 2038a6ba5..84be2b1e3 100644 --- a/youtube_dl/extractor/ooyala.py +++ b/youtube_dl/extractor/ooyala.py @@ -18,7 +18,7 @@ class OoyalaBaseIE(InfoExtractor): _CONTENT_TREE_BASE = _PLAYER_BASE + 'player_api/v1/content_tree/' _AUTHORIZATION_URL_TEMPLATE = _PLAYER_BASE + 'sas/player_api/v2/authorization/embed_code/%s/%s?' - def _extract(self, content_tree_url, video_id, domain='example.org'): + def _extract(self, content_tree_url, video_id, domain='example.org', supportedformats=None, embed_token=None): content_tree = self._download_json(content_tree_url, video_id)['content_tree'] metadata = content_tree[list(content_tree)[0]] embed_code = metadata['embed_code'] @@ -29,7 +29,8 @@ class OoyalaBaseIE(InfoExtractor): self._AUTHORIZATION_URL_TEMPLATE % (pcode, embed_code) + compat_urllib_parse_urlencode({ 'domain': domain, - 'supportedFormats': 'mp4,rtmp,m3u8,hds', + 'supportedFormats': supportedformats or 'mp4,rtmp,m3u8,hds,dash,smooth', + 'embedToken': embed_token, }), video_id) cur_auth_data = auth_data['authorization_data'][embed_code] @@ -47,11 +48,17 @@ class OoyalaBaseIE(InfoExtractor): delivery_type = stream['delivery_type'] if delivery_type == 'hls' or ext == 'm3u8': formats.extend(self._extract_m3u8_formats( - s_url, embed_code, 'mp4', 'm3u8_native', + re.sub(r'/ip(?:ad|hone)/', '/all/', s_url), embed_code, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) elif delivery_type == 'hds' or ext == 'f4m': formats.extend(self._extract_f4m_formats( s_url + '?hdcore=3.7.0', embed_code, f4m_id='hds', fatal=False)) + elif delivery_type == 'dash' or ext == 'mpd': + formats.extend(self._extract_mpd_formats( + s_url, embed_code, mpd_id='dash', fatal=False)) + elif delivery_type == 'smooth': + self._extract_ism_formats( + s_url, embed_code, ism_id='mss', fatal=False) elif ext == 'smil': formats.extend(self._extract_smil_formats( s_url, embed_code, fatal=False)) @@ -145,8 +152,10 @@ class OoyalaIE(OoyalaBaseIE): url, smuggled_data = unsmuggle_url(url, {}) embed_code = self._match_id(url) domain = smuggled_data.get('domain') + supportedformats = smuggled_data.get('supportedformats') + embed_token = smuggled_data.get('embed_token') content_tree_url = self._CONTENT_TREE_BASE + 'embed_code/%s/%s' % (embed_code, embed_code) - return self._extract(content_tree_url, embed_code, domain) + return self._extract(content_tree_url, embed_code, domain, supportedformats, embed_token) class OoyalaExternalIE(OoyalaBaseIE): diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index 6415b8fdc..32289d897 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -7,14 +7,12 @@ from .common import InfoExtractor from ..compat import compat_chr from ..utils import ( determine_ext, - encode_base_n, ExtractorError, - mimetype2ext, ) class OpenloadIE(InfoExtractor): - _VALID_URL = r'https://openload.(?:co|io)/(?:f|embed)/(?P[a-zA-Z0-9-_]+)' + _VALID_URL = r'https?://(?:openload\.(?:co|io)|oload\.tv)/(?:f|embed)/(?P[a-zA-Z0-9-_]+)' _TESTS = [{ 'url': 'https://openload.co/f/kUEfGclsU9o', @@ -23,7 +21,23 @@ class OpenloadIE(InfoExtractor): 'id': 'kUEfGclsU9o', 'ext': 'mp4', 'title': 'skyrim_no-audio_1080.mp4', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', + }, + }, { + 'url': 'https://openload.co/embed/rjC09fkPLYs', + 'info_dict': { + 'id': 'rjC09fkPLYs', + 'ext': 'mp4', + 'title': 'movie.mp4', + 'thumbnail': r're:^https?://.*\.jpg$', + 'subtitles': { + 'en': [{ + 'ext': 'vtt', + }], + }, + }, + 'params': { + 'skip_download': True, # test subtitles only }, }, { 'url': 'https://openload.co/embed/kUEfGclsU9o/skyrim_no-audio_1080.mp4', @@ -39,92 +53,55 @@ class OpenloadIE(InfoExtractor): # for title and ext 'url': 'https://openload.co/embed/Sxz5sADo82g/', 'only_matching': True, + }, { + 'url': 'https://oload.tv/embed/KnG-kKZdcfY/', + 'only_matching': True, }] @staticmethod - def openload_level2_debase(m): - radix, num = int(m.group(1)) + 27, int(m.group(2)) - return '"' + encode_base_n(num, radix) + '"' - - @classmethod - def openload_level2(cls, txt): - # The function name is ǃ \u01c3 - # Using escaped unicode literals does not work in Python 3.2 - return re.sub(r'ǃ\((\d+),(\d+)\)', cls.openload_level2_debase, txt, re.UNICODE).replace('"+"', '') - - # Openload uses a variant of aadecode - # openload_decode and related functions are originally written by - # vitas@matfyz.cz and released with public domain - # See https://github.com/rg3/youtube-dl/issues/8489 - @classmethod - def openload_decode(cls, txt): - symbol_table = [ - ('_', '(゚Д゚) [゚Θ゚]'), - ('a', '(゚Д゚) [゚ω゚ノ]'), - ('b', '(゚Д゚) [゚Θ゚ノ]'), - ('c', '(゚Д゚) [\'c\']'), - ('d', '(゚Д゚) [゚ー゚ノ]'), - ('e', '(゚Д゚) [゚Д゚ノ]'), - ('f', '(゚Д゚) [1]'), - - ('o', '(゚Д゚) [\'o\']'), - ('u', '(o゚ー゚o)'), - ('c', '(゚Д゚) [\'c\']'), - - ('7', '((゚ー゚) + (o^_^o))'), - ('6', '((o^_^o) +(o^_^o) +(c^_^o))'), - ('5', '((゚ー゚) + (゚Θ゚))'), - ('4', '(-~3)'), - ('3', '(-~-~1)'), - ('2', '(-~1)'), - ('1', '(-~0)'), - ('0', '((c^_^o)-(c^_^o))'), - ] - delim = '(゚Д゚)[゚ε゚]+' - ret = '' - for aachar in txt.split(delim): - for val, pat in symbol_table: - aachar = aachar.replace(pat, val) - aachar = aachar.replace('+ ', '') - m = re.match(r'^\d+', aachar) - if m: - ret += compat_chr(int(m.group(0), 8)) - else: - m = re.match(r'^u([\da-f]+)', aachar) - if m: - ret += compat_chr(int(m.group(1), 16)) - return cls.openload_level2(ret) + def _extract_urls(webpage): + return re.findall( + r']+src=["\']((?:https?://)?(?:openload\.(?:co|io)|oload\.tv)/embed/[a-zA-Z0-9-_]+)', + webpage) def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + webpage = self._download_webpage('https://openload.co/embed/%s/' % video_id, video_id) - if 'File not found' in webpage: + if 'File not found' in webpage or 'deleted by the owner' in webpage: raise ExtractorError('File not found', expected=True) - code = self._search_regex( - r'\s*\s*]+>[^>]+\s*]+>([^<]+)', - webpage, 'JS code') + ol_id = self._search_regex( + ']+id="[^"]+"[^>]*>([0-9]+)', + webpage, 'openload ID') - decoded = self.openload_decode(code) + first_three_chars = int(float(ol_id[0:][:3])) + fifth_char = int(float(ol_id[3:5])) + urlcode = '' + num = 5 - video_url = self._search_regex( - r'return\s+"(https?://[^"]+)"', decoded, 'video URL') + while num < len(ol_id): + urlcode += compat_chr(int(float(ol_id[num:][:3])) + + first_three_chars - fifth_char * int(float(ol_id[num + 3:][:2]))) + num += 5 + + video_url = 'https://openload.co/stream/' + urlcode title = self._og_search_title(webpage, default=None) or self._search_regex( r']+class=["\']title["\'][^>]*>([^<]+)', webpage, 'title', default=None) or self._html_search_meta( 'description', webpage, 'title', fatal=True) - ext = mimetype2ext(self._search_regex( - r'window\.vt\s*=\s*(["\'])(?P.+?)\1', decoded, - 'mimetype', default=None, group='mimetype')) or determine_ext( - video_url, 'mp4') + entries = self._parse_html5_media_entries(url, webpage, video_id) + subtitles = entries[0]['subtitles'] if entries else None - return { + info_dict = { 'id': video_id, 'title': title, - 'ext': ext, 'thumbnail': self._og_search_thumbnail(webpage, default=None), 'url': video_url, + # Seems all videos have extensions in their titles + 'ext': determine_ext(title, 'mp4'), + 'subtitles': subtitles, } + return info_dict diff --git a/youtube_dl/extractor/orf.py b/youtube_dl/extractor/orf.py index 6ae30679a..1e2c54e68 100644 --- a/youtube_dl/extractor/orf.py +++ b/youtube_dl/extractor/orf.py @@ -1,28 +1,28 @@ # coding: utf-8 from __future__ import unicode_literals -import json import re import calendar import datetime from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( HEADRequest, unified_strdate, - ExtractorError, strip_jsonp, int_or_none, float_or_none, determine_ext, remove_end, + unescapeHTML, ) class ORFTVthekIE(InfoExtractor): IE_NAME = 'orf:tvthek' IE_DESC = 'ORF TVthek' - _VALID_URL = r'https?://tvthek\.orf\.at/(?:programs/.+?/episodes|topics?/.+?|program/[^/]+)/(?P\d+)' + _VALID_URL = r'https?://tvthek\.orf\.at/(?:[^/]+/)+(?P\d+)' _TESTS = [{ 'url': 'http://tvthek.orf.at/program/Aufgetischt/2745173/Aufgetischt-Mit-der-Steirischen-Tafelrunde/8891389', @@ -51,26 +51,23 @@ class ORFTVthekIE(InfoExtractor): 'skip_download': True, # rtsp downloads }, '_skip': 'Blocked outside of Austria / Germany', + }, { + 'url': 'http://tvthek.orf.at/topic/Fluechtlingskrise/10463081/Heimat-Fremde-Heimat/13879132/Senioren-betreuen-Migrantenkinder/13879141', + 'skip_download': True, + }, { + 'url': 'http://tvthek.orf.at/profile/Universum/35429', + 'skip_download': True, }] def _real_extract(self, url): playlist_id = self._match_id(url) webpage = self._download_webpage(url, playlist_id) - data_json = self._search_regex( - r'initializeAdworx\((.+?)\);\n', webpage, 'video info') - all_data = json.loads(data_json) - - def get_segments(all_data): - for data in all_data: - if data['name'] in ( - 'Tracker::EPISODE_DETAIL_PAGE_OVER_PROGRAM', - 'Tracker::EPISODE_DETAIL_PAGE_OVER_TOPIC'): - return data['values']['segments'] - - sdata = get_segments(all_data) - if not sdata: - raise ExtractorError('Unable to extract segments') + data_jsb = self._parse_json( + self._search_regex( + r']+class=(["\']).*?VideoPlaylist.*?\1[^>]+data-jsb=(["\'])(?P.+?)\2', + webpage, 'playlist', group='json'), + playlist_id, transform_source=unescapeHTML)['playlist']['videos'] def quality_to_int(s): m = re.search('([0-9]+)', s) @@ -79,8 +76,11 @@ class ORFTVthekIE(InfoExtractor): return int(m.group(1)) entries = [] - for sd in sdata: - video_id = sd['id'] + for sd in data_jsb: + video_id, title = sd.get('id'), sd.get('title') + if not video_id or not title: + continue + video_id = compat_str(video_id) formats = [{ 'preference': -10 if fd['delivery'] == 'hls' else None, 'format_id': '%s-%s-%s' % ( @@ -88,7 +88,7 @@ class ORFTVthekIE(InfoExtractor): 'url': fd['src'], 'protocol': fd['protocol'], 'quality': quality_to_int(fd['quality']), - } for fd in sd['playlist_item_array']['sources']] + } for fd in sd['sources']] # Check for geoblocking. # There is a property is_geoprotection, but that's always false @@ -115,14 +115,24 @@ class ORFTVthekIE(InfoExtractor): self._check_formats(formats, video_id) self._sort_formats(formats) - upload_date = unified_strdate(sd['created_date']) + subtitles = {} + for sub in sd.get('subtitles', []): + sub_src = sub.get('src') + if not sub_src: + continue + subtitles.setdefault(sub.get('lang', 'de-AT'), []).append({ + 'url': sub_src, + }) + + upload_date = unified_strdate(sd.get('created_date')) entries.append({ '_type': 'video', 'id': video_id, - 'title': sd['header'], + 'title': title, 'formats': formats, + 'subtitles': subtitles, 'description': sd.get('description'), - 'duration': int(sd['duration_in_seconds']), + 'duration': int_or_none(sd.get('duration_in_seconds')), 'upload_date': upload_date, 'thumbnail': sd.get('image_full_url'), }) @@ -237,7 +247,7 @@ class ORFIPTVIE(InfoExtractor): 'title': 'Weitere Evakuierungen um Vulkan Calbuco', 'description': 'md5:d689c959bdbcf04efeddedbf2299d633', 'duration': 68.197, - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'upload_date': '20150425', }, } diff --git a/youtube_dl/extractor/pandatv.py b/youtube_dl/extractor/pandatv.py new file mode 100644 index 000000000..133cc9b88 --- /dev/null +++ b/youtube_dl/extractor/pandatv.py @@ -0,0 +1,91 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + qualities, +) + + +class PandaTVIE(InfoExtractor): + IE_DESC = '熊猫TV' + _VALID_URL = r'http://(?:www\.)?panda\.tv/(?P[0-9]+)' + _TEST = { + 'url': 'http://www.panda.tv/10091', + 'info_dict': { + 'id': '10091', + 'title': 're:.+', + 'uploader': '囚徒', + 'ext': 'flv', + 'is_live': True, + }, + 'params': { + 'skip_download': True, + }, + 'skip': 'Live stream is offline', + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + config = self._download_json( + 'http://www.panda.tv/api_room?roomid=%s' % video_id, video_id) + + error_code = config.get('errno', 0) + if error_code is not 0: + raise ExtractorError( + '%s returned error %s: %s' + % (self.IE_NAME, error_code, config['errmsg']), + expected=True) + + data = config['data'] + video_info = data['videoinfo'] + + # 2 = live, 3 = offline + if video_info.get('status') != '2': + raise ExtractorError( + 'Live stream is offline', expected=True) + + title = data['roominfo']['name'] + uploader = data.get('hostinfo', {}).get('name') + room_key = video_info['room_key'] + stream_addr = video_info.get( + 'stream_addr', {'OD': '1', 'HD': '1', 'SD': '1'}) + + # Reverse engineered from web player swf + # (http://s6.pdim.gs/static/07153e425f581151.swf at the moment of + # writing). + plflag0, plflag1 = video_info['plflag'].split('_') + plflag0 = int(plflag0) - 1 + if plflag1 == '21': + plflag0 = 10 + plflag1 = '4' + live_panda = 'live_panda' if plflag0 < 1 else '' + + quality_key = qualities(['OD', 'HD', 'SD']) + suffix = ['_small', '_mid', ''] + formats = [] + for k, v in stream_addr.items(): + if v != '1': + continue + quality = quality_key(k) + if quality <= 0: + continue + for pref, (ext, pl) in enumerate((('m3u8', '-hls'), ('flv', ''))): + formats.append({ + 'url': 'http://pl%s%s.live.panda.tv/live_panda/%s%s%s.%s' + % (pl, plflag1, room_key, live_panda, suffix[quality], ext), + 'format_id': '%s-%s' % (k, ext), + 'quality': quality, + 'source_preference': pref, + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': self._live_title(title), + 'uploader': uploader, + 'formats': formats, + 'is_live': True, + } diff --git a/youtube_dl/extractor/pandoratv.py b/youtube_dl/extractor/pandoratv.py index 8d49f5c4a..89c95fffb 100644 --- a/youtube_dl/extractor/pandoratv.py +++ b/youtube_dl/extractor/pandoratv.py @@ -1,4 +1,4 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor @@ -11,6 +11,7 @@ from ..utils import ( float_or_none, parse_duration, str_to_int, + urlencode_postdata, ) @@ -25,7 +26,7 @@ class PandoraTVIE(InfoExtractor): 'ext': 'flv', 'title': '頭を撫でてくれる?', 'description': '頭を撫でてくれる?', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 39, 'upload_date': '20151218', 'uploader': 'カワイイ動物まとめ', @@ -56,6 +57,22 @@ class PandoraTVIE(InfoExtractor): r'^v(\d+)[Uu]rl$', format_id, 'height', default=None) if not height: continue + + play_url = self._download_json( + 'http://m.pandora.tv/?c=api&m=play_url', video_id, + data=urlencode_postdata({ + 'prgid': video_id, + 'runtime': info.get('runtime'), + 'vod_url': format_url, + }), + headers={ + 'Origin': url, + 'Content-Type': 'application/x-www-form-urlencoded', + }) + format_url = play_url.get('url') + if not format_url: + continue + formats.append({ 'format_id': '%sp' % height, 'url': format_url, diff --git a/youtube_dl/extractor/parliamentliveuk.py b/youtube_dl/extractor/parliamentliveuk.py index 0a423a08f..ebdab8db9 100644 --- a/youtube_dl/extractor/parliamentliveuk.py +++ b/youtube_dl/extractor/parliamentliveuk.py @@ -1,53 +1,43 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor class ParliamentLiveUKIE(InfoExtractor): IE_NAME = 'parliamentlive.tv' IE_DESC = 'UK parliament videos' - _VALID_URL = r'https?://www\.parliamentlive\.tv/Main/Player\.aspx\?(?:[^&]+&)*?meetingId=(?P[0-9]+)' + _VALID_URL = r'(?i)https?://(?:www\.)?parliamentlive\.tv/Event/Index/(?P[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' - _TEST = { - 'url': 'http://www.parliamentlive.tv/Main/Player.aspx?meetingId=15121&player=windowsmedia', + _TESTS = [{ + 'url': 'http://parliamentlive.tv/Event/Index/c1e9d44d-fd6c-4263-b50f-97ed26cc998b', 'info_dict': { - 'id': '15121', - 'ext': 'asf', - 'title': 'hoc home affairs committee, 18 mar 2014.pm', - 'description': 'md5:033b3acdf83304cd43946b2d5e5798d1', + 'id': 'c1e9d44d-fd6c-4263-b50f-97ed26cc998b', + 'ext': 'mp4', + 'title': 'Home Affairs Committee', + 'uploader_id': 'FFMPEG-01', + 'timestamp': 1422696664, + 'upload_date': '20150131', }, - 'params': { - 'skip_download': True, # Requires mplayer (mms) - } - } + }, { + 'url': 'http://parliamentlive.tv/event/index/3f24936f-130f-40bf-9a5d-b3d6479da6a4', + 'only_matching': True, + }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - webpage = self._download_webpage(url, video_id) - - asx_url = self._html_search_regex( - r'embed.*?src="([^"]+)" name="MediaPlayer"', webpage, - 'metadata URL') - asx = self._download_xml(asx_url, video_id, 'Downloading ASX metadata') - video_url = asx.find('.//REF').attrib['HREF'] - - title = self._search_regex( - r'''(?x)player\.setClipDetails\( - (?:(?:[0-9]+|"[^"]+"),\s*){2} - "([^"]+",\s*"[^"]+)" - ''', - webpage, 'title').replace('", "', ', ') - description = self._html_search_regex( - r'(?s)(.*?)', - webpage, 'description') - + video_id = self._match_id(url) + webpage = self._download_webpage( + 'http://vodplayer.parliamentlive.tv/?mid=' + video_id, video_id) + widget_config = self._parse_json(self._search_regex( + r'kWidgetConfig\s*=\s*({.+});', + webpage, 'kaltura widget config'), video_id) + kaltura_url = 'kaltura:%s:%s' % (widget_config['wid'][1:], widget_config['entry_id']) + event_title = self._download_json( + 'http://parliamentlive.tv/Event/GetShareVideo/' + video_id, video_id)['event']['title'] return { + '_type': 'url_transparent', 'id': video_id, - 'ext': 'asf', - 'url': video_url, - 'title': title, - 'description': description, + 'title': event_title, + 'description': '', + 'url': kaltura_url, + 'ie_key': 'Kaltura', } diff --git a/youtube_dl/extractor/patreon.py b/youtube_dl/extractor/patreon.py index 229750665..a6a2c273f 100644 --- a/youtube_dl/extractor/patreon.py +++ b/youtube_dl/extractor/patreon.py @@ -1,4 +1,4 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index f6f423597..6baed773f 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -4,13 +4,13 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_HTTPError from ..utils import ( ExtractorError, determine_ext, int_or_none, js_to_json, strip_jsonp, + strip_or_none, unified_strdate, US_RATINGS, ) @@ -201,7 +201,7 @@ class PBSIE(InfoExtractor): 'id': '2365006249', 'ext': 'mp4', 'title': 'Constitution USA with Peter Sagal - A More Perfect Union', - 'description': 'md5:36f341ae62e251b8f5bd2b754b95a071', + 'description': 'md5:31b664af3c65fd07fa460d306b837d00', 'duration': 3190, }, }, @@ -212,7 +212,7 @@ class PBSIE(InfoExtractor): 'id': '2365297690', 'ext': 'mp4', 'title': 'FRONTLINE - Losing Iraq', - 'description': 'md5:4d3eaa01f94e61b3e73704735f1196d9', + 'description': 'md5:5979a4d069b157f622d02bff62fbe654', 'duration': 5050, }, }, @@ -223,7 +223,7 @@ class PBSIE(InfoExtractor): 'id': '2201174722', 'ext': 'mp4', 'title': 'PBS NewsHour - Cyber Schools Gain Popularity, but Quality Questions Persist', - 'description': 'md5:95a19f568689d09a166dff9edada3301', + 'description': 'md5:86ab9a3d04458b876147b355788b8781', 'duration': 801, }, }, @@ -236,7 +236,7 @@ class PBSIE(InfoExtractor): 'title': 'Great Performances - Dudamel Conducts Verdi Requiem at the Hollywood Bowl - Full', 'description': 'md5:657897370e09e2bc6bf0f8d2cd313c6b', 'duration': 6559, - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', }, }, { @@ -249,7 +249,7 @@ class PBSIE(InfoExtractor): 'description': 'md5:c741d14e979fc53228c575894094f157', 'title': 'NOVA - Killer Typhoon', 'duration': 3172, - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'upload_date': '20140122', 'age_limit': 10, }, @@ -268,9 +268,9 @@ class PBSIE(InfoExtractor): 'display_id': 'player', 'ext': 'mp4', 'title': 'American Experience - Death and the Civil War, Chapter 1', - 'description': 'md5:1b80a74e0380ed2a4fb335026de1600d', + 'description': 'md5:67fa89a9402e2ee7d08f53b920674c18', 'duration': 682, - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', }, 'params': { 'skip_download': True, # requires ffmpeg @@ -286,7 +286,7 @@ class PBSIE(InfoExtractor): 'title': 'FRONTLINE - United States of Secrets (Part One)', 'description': 'md5:55756bd5c551519cc4b7703e373e217e', 'duration': 6851, - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', }, }, { @@ -294,15 +294,15 @@ class PBSIE(InfoExtractor): # "', webpage): url = self._search_regex( @@ -423,10 +437,10 @@ class PBSIE(InfoExtractor): video_id = mobj.group('id') display_id = video_id - return video_id, display_id, None + return video_id, display_id, None, description def _real_extract(self, url): - video_id, display_id, upload_date = self._extract_webpage(url) + video_id, display_id, upload_date, description = self._extract_webpage(url) if isinstance(video_id, list): entries = [self.url_result( @@ -448,17 +462,6 @@ class PBSIE(InfoExtractor): redirects.append(redirect) redirect_urls.add(redirect_url) - try: - video_info = self._download_json( - 'http://player.pbs.org/videoInfo/%s?format=json&type=partner' % video_id, - display_id, 'Downloading video info JSON') - extract_redirect_urls(video_info) - info = video_info - except ExtractorError as e: - # videoInfo API may not work for some videos - if not isinstance(e.cause, compat_HTTPError) or e.cause.code != 404: - raise - # Player pages may also serve different qualities for page in ('widget/partnerplayer', 'portalplayer'): player = self._download_webpage( @@ -482,7 +485,8 @@ class PBSIE(InfoExtractor): redirect_info = self._download_json( '%s?format=json' % redirect['url'], display_id, - 'Downloading %s video url info' % (redirect_id or num)) + 'Downloading %s video url info' % (redirect_id or num), + headers=self.geo_verification_headers()) if redirect_info['status'] == 'error': raise ExtractorError( @@ -511,15 +515,19 @@ class PBSIE(InfoExtractor): formats)) if http_url: for m3u8_format in m3u8_formats: - bitrate = self._search_regex(r'(\d+k)', m3u8_format['url'], 'bitrate', default=None) - # extract only the formats that we know that they will be available as http format. - # https://projects.pbs.org/confluence/display/coveapi/COVE+Video+Specifications - if not bitrate or bitrate not in ('400k', '800k', '1200k', '2500k'): + bitrate = self._search_regex(r'(\d+)k', m3u8_format['url'], 'bitrate', default=None) + # Lower qualities (150k and 192k) are not available as HTTP formats (see [1]), + # we won't try extracting them. + # Since summer 2016 higher quality formats (4500k and 6500k) are also available + # albeit they are not documented in [2]. + # 1. https://github.com/rg3/youtube-dl/commit/cbc032c8b70a038a69259378c92b4ba97b42d491#commitcomment-17313656 + # 2. https://projects.pbs.org/confluence/display/coveapi/COVE+Video+Specifications + if not bitrate or int(bitrate) < 400: continue - f_url = re.sub(r'\d+k|baseline', bitrate, http_url) + f_url = re.sub(r'\d+k|baseline', bitrate + 'k', http_url) # This may produce invalid links sometimes (e.g. # http://www.pbs.org/wgbh/frontline/film/suicide-plan) - if not self._is_valid_url(f_url, display_id, 'http-%s video' % bitrate): + if not self._is_valid_url(f_url, display_id, 'http-%sk video' % bitrate): continue f = m3u8_format.copy() f.update({ @@ -560,13 +568,16 @@ class PBSIE(InfoExtractor): # Try turning it to 'program - title' naming scheme if possible alt_title = info.get('program', {}).get('title') if alt_title: - info['title'] = alt_title + ' - ' + re.sub(r'^' + alt_title + '[\s\-:]+', '', info['title']) + info['title'] = alt_title + ' - ' + re.sub(r'^' + alt_title + r'[\s\-:]+', '', info['title']) + + description = info.get('description') or info.get( + 'program', {}).get('description') or description return { 'id': video_id, 'display_id': display_id, 'title': info['title'], - 'description': info.get('description') or info.get('program', {}).get('description'), + 'description': description, 'thumbnail': info.get('image_url'), 'duration': int_or_none(info.get('duration')), 'age_limit': age_limit, diff --git a/youtube_dl/extractor/people.py b/youtube_dl/extractor/people.py index 9ecdbc13b..6ca95715e 100644 --- a/youtube_dl/extractor/people.py +++ b/youtube_dl/extractor/people.py @@ -14,7 +14,7 @@ class PeopleIE(InfoExtractor): 'ext': 'mp4', 'title': 'Astronaut Love Triangle Victim Speaks Out: “The Crime in 2007 Hasn’t Defined Us”', 'description': 'Colleen Shipman speaks to PEOPLE for the first time about life after the attack', - 'thumbnail': 're:^https?://.*\.jpg', + 'thumbnail': r're:^https?://.*\.jpg', 'duration': 246.318, 'timestamp': 1458720585, 'upload_date': '20160323', diff --git a/youtube_dl/extractor/periscope.py b/youtube_dl/extractor/periscope.py index 75f5884a9..0e3623024 100644 --- a/youtube_dl/extractor/periscope.py +++ b/youtube_dl/extractor/periscope.py @@ -1,6 +1,8 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..utils import ( parse_iso8601, @@ -8,7 +10,14 @@ from ..utils import ( ) -class PeriscopeIE(InfoExtractor): +class PeriscopeBaseIE(InfoExtractor): + def _call_api(self, method, query, item_id): + return self._download_json( + 'https://api.periscope.tv/api/v2/%s' % method, + item_id, query=query) + + +class PeriscopeIE(PeriscopeBaseIE): IE_DESC = 'Periscope' IE_NAME = 'periscope' _VALID_URL = r'https?://(?:www\.)?periscope\.tv/[^/]+/(?P[^/?#]+)' @@ -34,14 +43,18 @@ class PeriscopeIE(InfoExtractor): 'only_matching': True, }] - def _call_api(self, method, value): - return self._download_json( - 'https://api.periscope.tv/api/v2/%s?broadcast_id=%s' % (method, value), value) + @staticmethod + def _extract_url(webpage): + mobj = re.search( + r']+src=([\'"])(?P(?:https?:)?//(?:www\.)?periscope\.tv/(?:(?!\1).)+)\1', webpage) + if mobj: + return mobj.group('url') def _real_extract(self, url): token = self._match_id(url) - broadcast_data = self._call_api('getBroadcastPublic', token) + broadcast_data = self._call_api( + 'getBroadcastPublic', {'broadcast_id': token}, token) broadcast = broadcast_data['broadcast'] status = broadcast['status'] @@ -61,7 +74,8 @@ class PeriscopeIE(InfoExtractor): 'url': broadcast[image], } for image in ('image_url', 'image_url_small') if broadcast.get(image)] - stream = self._call_api('getAccessPublic', token) + stream = self._call_api( + 'getAccessPublic', {'broadcast_id': token}, token) formats = [] for format_id in ('replay', 'rtmp', 'hls', 'https_hls'): @@ -73,7 +87,7 @@ class PeriscopeIE(InfoExtractor): 'ext': 'flv' if format_id == 'rtmp' else 'mp4', } if format_id != 'rtmp': - f['protocol'] = 'm3u8_native' if state == 'ended' else 'm3u8' + f['protocol'] = 'm3u8_native' if state in ('ended', 'timed_out') else 'm3u8' formats.append(f) self._sort_formats(formats) @@ -88,8 +102,8 @@ class PeriscopeIE(InfoExtractor): } -class PeriscopeUserIE(InfoExtractor): - _VALID_URL = r'https?://www\.periscope\.tv/(?P[^/]+)/?$' +class PeriscopeUserIE(PeriscopeBaseIE): + _VALID_URL = r'https?://(?:www\.)?periscope\.tv/(?P[^/]+)/?$' IE_DESC = 'Periscope user videos' IE_NAME = 'periscope:user' @@ -106,26 +120,34 @@ class PeriscopeUserIE(InfoExtractor): } def _real_extract(self, url): - user_id = self._match_id(url) + user_name = self._match_id(url) - webpage = self._download_webpage(url, user_id) + webpage = self._download_webpage(url, user_name) data_store = self._parse_json( unescapeHTML(self._search_regex( r'data-store=(["\'])(?P.+?)\1', webpage, 'data store', default='{}', group='data')), - user_id) + user_name) - user = data_store.get('User', {}).get('user', {}) - title = user.get('display_name') or user.get('username') + user = list(data_store['UserCache']['users'].values())[0]['user'] + user_id = user['id'] + session_id = data_store['SessionToken']['public']['broadcastHistory']['token']['session_id'] + + broadcasts = self._call_api( + 'getUserBroadcastsPublic', + {'user_id': user_id, 'session_id': session_id}, + user_name)['broadcasts'] + + broadcast_ids = [ + broadcast['id'] for broadcast in broadcasts if broadcast.get('id')] + + title = user.get('display_name') or user.get('username') or user_name description = user.get('description') - broadcast_ids = (data_store.get('UserBroadcastHistory', {}).get('broadcastIds') or - data_store.get('BroadcastCache', {}).get('broadcastIds', [])) - entries = [ self.url_result( - 'https://www.periscope.tv/%s/%s' % (user_id, broadcast_id)) + 'https://www.periscope.tv/%s/%s' % (user_name, broadcast_id)) for broadcast_id in broadcast_ids] return self.playlist_result(entries, user_id, title, description) diff --git a/youtube_dl/extractor/phoenix.py b/youtube_dl/extractor/phoenix.py index ac009f60f..e435c28e1 100644 --- a/youtube_dl/extractor/phoenix.py +++ b/youtube_dl/extractor/phoenix.py @@ -1,9 +1,9 @@ from __future__ import unicode_literals -from .zdf import ZDFIE +from .dreisat import DreiSatIE -class PhoenixIE(ZDFIE): +class PhoenixIE(DreiSatIE): IE_NAME = 'phoenix.de' _VALID_URL = r'''(?x)https?://(?:www\.)?phoenix\.de/content/ (?: diff --git a/youtube_dl/extractor/piksel.py b/youtube_dl/extractor/piksel.py new file mode 100644 index 000000000..d44edcdfb --- /dev/null +++ b/youtube_dl/extractor/piksel.py @@ -0,0 +1,106 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + ExtractorError, + dict_get, + int_or_none, + unescapeHTML, + parse_iso8601, +) + + +class PikselIE(InfoExtractor): + _VALID_URL = r'https?://player\.piksel\.com/v/(?P[a-z0-9]+)' + _TEST = { + 'url': 'http://player.piksel.com/v/nv60p12f', + 'md5': 'd9c17bbe9c3386344f9cfd32fad8d235', + 'info_dict': { + 'id': 'nv60p12f', + 'ext': 'mp4', + 'title': 'فن الحياة - الحلقة 1', + 'description': 'احدث برامج الداعية الاسلامي " مصطفي حسني " فى رمضان 2016علي النهار نور', + 'timestamp': 1465231790, + 'upload_date': '20160606', + } + } + + @staticmethod + def _extract_url(webpage): + mobj = re.search( + r']+src=["\'](?P(?:https?:)?//player\.piksel\.com/v/[a-z0-9]+)', + webpage) + if mobj: + return mobj.group('url') + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + app_token = self._search_regex( + r'clientAPI\s*:\s*"([^"]+)"', webpage, 'app token') + response = self._download_json( + 'http://player.piksel.com/ws/ws_program/api/%s/mode/json/apiv/5' % app_token, + video_id, query={ + 'v': video_id + })['response'] + failure = response.get('failure') + if failure: + raise ExtractorError(response['failure']['reason'], expected=True) + video_data = response['WsProgramResponse']['program']['asset'] + title = video_data['title'] + + formats = [] + + m3u8_url = dict_get(video_data, [ + 'm3u8iPadURL', + 'ipadM3u8Url', + 'm3u8AndroidURL', + 'm3u8iPhoneURL', + 'iphoneM3u8Url']) + if m3u8_url: + formats.extend(self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + + asset_type = dict_get(video_data, ['assetType', 'asset_type']) + for asset_file in video_data.get('assetFiles', []): + # TODO: extract rtmp formats + http_url = asset_file.get('http_url') + if not http_url: + continue + tbr = None + vbr = int_or_none(asset_file.get('videoBitrate'), 1024) + abr = int_or_none(asset_file.get('audioBitrate'), 1024) + if asset_type == 'video': + tbr = vbr + abr + elif asset_type == 'audio': + tbr = abr + + format_id = ['http'] + if tbr: + format_id.append(compat_str(tbr)) + + formats.append({ + 'format_id': '-'.join(format_id), + 'url': unescapeHTML(http_url), + 'vbr': vbr, + 'abr': abr, + 'width': int_or_none(asset_file.get('videoWidth')), + 'height': int_or_none(asset_file.get('videoHeight')), + 'filesize': int_or_none(asset_file.get('filesize')), + 'tbr': tbr, + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': video_data.get('description'), + 'thumbnail': video_data.get('thumbnailUrl'), + 'timestamp': parse_iso8601(video_data.get('dateadd')), + 'formats': formats, + } diff --git a/youtube_dl/extractor/pinkbike.py b/youtube_dl/extractor/pinkbike.py index a52210fab..6a4580d54 100644 --- a/youtube_dl/extractor/pinkbike.py +++ b/youtube_dl/extractor/pinkbike.py @@ -23,7 +23,7 @@ class PinkbikeIE(InfoExtractor): 'ext': 'mp4', 'title': 'Brandon Semenuk - RAW 100', 'description': 'Official release: www.redbull.ca/rupertwalker', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 100, 'upload_date': '20150406', 'uploader': 'revelco', diff --git a/youtube_dl/extractor/pladform.py b/youtube_dl/extractor/pladform.py index 77e1211d6..e38c7618e 100644 --- a/youtube_dl/extractor/pladform.py +++ b/youtube_dl/extractor/pladform.py @@ -34,7 +34,7 @@ class PladformIE(InfoExtractor): 'ext': 'mp4', 'title': 'Тайны перевала Дятлова • 1 серия 2 часть', 'description': 'Документальный сериал-расследование одной из самых жутких тайн ХХ века', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 694, 'age_limit': 0, }, diff --git a/youtube_dl/extractor/played.py b/youtube_dl/extractor/played.py deleted file mode 100644 index 57c875ef0..000000000 --- a/youtube_dl/extractor/played.py +++ /dev/null @@ -1,60 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re -import os.path - -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - sanitized_Request, - urlencode_postdata, -) - - -class PlayedIE(InfoExtractor): - IE_NAME = 'played.to' - _VALID_URL = r'https?://(?:www\.)?played\.to/(?P[a-zA-Z0-9_-]+)' - - _TEST = { - 'url': 'http://played.to/j2f2sfiiukgt', - 'md5': 'c2bd75a368e82980e7257bf500c00637', - 'info_dict': { - 'id': 'j2f2sfiiukgt', - 'ext': 'flv', - 'title': 'youtube-dl_test_video.mp4', - }, - 'skip': 'Removed for copyright infringement.', # oh wow - } - - def _real_extract(self, url): - video_id = self._match_id(url) - orig_webpage = self._download_webpage(url, video_id) - - m_error = re.search( - r'(?s)Reason for deletion:.*?]*>(?P[^<]+)', orig_webpage) - if m_error: - raise ExtractorError(m_error.group('msg'), expected=True) - - data = self._hidden_inputs(orig_webpage) - - self._sleep(2, video_id) - - post = urlencode_postdata(data) - headers = { - b'Content-Type': b'application/x-www-form-urlencoded', - } - req = sanitized_Request(url, post, headers) - webpage = self._download_webpage( - req, video_id, note='Downloading video page ...') - - title = os.path.splitext(data['fname'])[0] - - video_url = self._search_regex( - r'file: "?(.+?)",', webpage, 'video URL') - - return { - 'id': video_id, - 'title': title, - 'url': video_url, - } diff --git a/youtube_dl/extractor/plays.py b/youtube_dl/extractor/plays.py index c3c38cf4a..ddfc6f148 100644 --- a/youtube_dl/extractor/plays.py +++ b/youtube_dl/extractor/plays.py @@ -8,30 +8,31 @@ from ..utils import int_or_none class PlaysTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?plays\.tv/video/(?P[0-9a-f]{18})' - _TEST = { - 'url': 'http://plays.tv/video/56af17f56c95335490/when-you-outplay-the-azir-wall', + _VALID_URL = r'https?://(?:www\.)?plays\.tv/(?:video|embeds)/(?P[0-9a-f]{18})' + _TESTS = [{ + 'url': 'https://plays.tv/video/56af17f56c95335490/when-you-outplay-the-azir-wall', 'md5': 'dfeac1198506652b5257a62762cec7bc', 'info_dict': { 'id': '56af17f56c95335490', 'ext': 'mp4', - 'title': 'When you outplay the Azir wall', + 'title': 'Bjergsen - When you outplay the Azir wall', 'description': 'Posted by Bjergsen', } - } + }, { + 'url': 'https://plays.tv/embeds/56af17f56c95335490', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + webpage = self._download_webpage( + 'https://plays.tv/video/%s' % video_id, video_id) + + info = self._search_json_ld(webpage, video_id,) - title = self._og_search_title(webpage) - content = self._parse_json( - self._search_regex( - r'R\.bindContent\(({.+?})\);', webpage, - 'content'), video_id)['content'] mpd_url, sources = re.search( r'(?s)]+data-mpd="([^"]+)"[^>]*>(.+?)', - content).groups() + webpage).groups() formats = self._extract_mpd_formats( self._proto_relative_url(mpd_url), video_id, mpd_id='DASH') for format_id, height, format_url in re.findall(r'.+?)(?:#|$)' + _VALID_URL = r'https?://(?:www\.)?playvid\.com/watch(\?v=|/)(?P.+?)(?:#|$)' _TESTS = [{ 'url': 'http://www.playvid.com/watch/RnmBNgtrrJu', 'md5': 'ffa2f6b2119af359f544388d8c01eb6c', @@ -34,7 +34,7 @@ class PlayvidIE(InfoExtractor): 'ext': 'mp4', 'title': 'Ellen Euro Cutie Blond Takes a Sexy Survey Get Facial in The Park', 'age_limit': 18, - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', }, }] diff --git a/youtube_dl/extractor/playwire.py b/youtube_dl/extractor/playwire.py index 0bc743118..4d96a10a7 100644 --- a/youtube_dl/extractor/playwire.py +++ b/youtube_dl/extractor/playwire.py @@ -18,7 +18,7 @@ class PlaywireIE(InfoExtractor): 'id': '3353705', 'ext': 'mp4', 'title': 'S04_RM_UCL_Rus', - 'thumbnail': 're:^https?://.*\.png$', + 'thumbnail': r're:^https?://.*\.png$', 'duration': 145.94, }, }, { diff --git a/youtube_dl/extractor/pluralsight.py b/youtube_dl/extractor/pluralsight.py index 9aab77645..5c798e874 100644 --- a/youtube_dl/extractor/pluralsight.py +++ b/youtube_dl/extractor/pluralsight.py @@ -1,9 +1,9 @@ from __future__ import unicode_literals -import re -import json -import random import collections +import json +import os +import random from .common import InfoExtractor from ..compat import ( @@ -11,22 +11,24 @@ from ..compat import ( compat_urlparse, ) from ..utils import ( + dict_get, ExtractorError, + float_or_none, int_or_none, parse_duration, qualities, - sanitized_Request, + srt_subtitles_timecode, urlencode_postdata, ) class PluralsightBaseIE(InfoExtractor): - _API_BASE = 'http://app.pluralsight.com' + _API_BASE = 'https://app.pluralsight.com' class PluralsightIE(PluralsightBaseIE): IE_NAME = 'pluralsight' - _VALID_URL = r'https?://(?:(?:www|app)\.)?pluralsight\.com/training/player\?' + _VALID_URL = r'https?://(?:(?:www|app)\.)?pluralsight\.com/(?:training/)?player\?' _LOGIN_URL = 'https://app.pluralsight.com/id/' _NETRC_MACHINE = 'pluralsight' @@ -48,6 +50,9 @@ class PluralsightIE(PluralsightBaseIE): # available without pluralsight account 'url': 'http://app.pluralsight.com/training/player?author=scott-allen&name=angularjs-get-started-m1-introduction&mode=live&clip=0&course=angularjs-get-started', 'only_matching': True, + }, { + 'url': 'https://app.pluralsight.com/player?course=ccna-intro-networking&author=ross-bagurdes&name=ccna-intro-networking-m06&clip=0', + 'only_matching': True, }] def _real_initialize(self): @@ -75,12 +80,10 @@ class PluralsightIE(PluralsightBaseIE): if not post_url.startswith('http'): post_url = compat_urlparse.urljoin(self._LOGIN_URL, post_url) - request = sanitized_Request( - post_url, urlencode_postdata(login_form)) - request.add_header('Content-Type', 'application/x-www-form-urlencoded') - response = self._download_webpage( - request, None, 'Logging in as %s' % username) + post_url, None, 'Logging in as %s' % username, + data=urlencode_postdata(login_form), + headers={'Content-Type': 'application/x-www-form-urlencoded'}) error = self._search_regex( r']+class="field-validation-error"[^>]*>([^<]+)', @@ -91,34 +94,75 @@ class PluralsightIE(PluralsightBaseIE): if all(p not in response for p in ('__INITIAL_STATE__', '"currentUser"')): raise ExtractorError('Unable to log in') + def _get_subtitles(self, author, clip_id, lang, name, duration, video_id): + captions_post = { + 'a': author, + 'cn': clip_id, + 'lc': lang, + 'm': name, + } + captions = self._download_json( + '%s/player/retrieve-captions' % self._API_BASE, video_id, + 'Downloading captions JSON', 'Unable to download captions JSON', + fatal=False, data=json.dumps(captions_post).encode('utf-8'), + headers={'Content-Type': 'application/json;charset=utf-8'}) + if captions: + return { + lang: [{ + 'ext': 'json', + 'data': json.dumps(captions), + }, { + 'ext': 'srt', + 'data': self._convert_subtitles(duration, captions), + }] + } + + @staticmethod + def _convert_subtitles(duration, subs): + srt = '' + TIME_OFFSET_KEYS = ('displayTimeOffset', 'DisplayTimeOffset') + TEXT_KEYS = ('text', 'Text') + for num, current in enumerate(subs): + current = subs[num] + start, text = ( + float_or_none(dict_get(current, TIME_OFFSET_KEYS)), + dict_get(current, TEXT_KEYS)) + if start is None or text is None: + continue + end = duration if num == len(subs) - 1 else float_or_none( + dict_get(subs[num + 1], TIME_OFFSET_KEYS)) + if end is None: + continue + srt += os.linesep.join( + ( + '%d' % num, + '%s --> %s' % ( + srt_subtitles_timecode(start), + srt_subtitles_timecode(end)), + text, + os.linesep, + )) + return srt + def _real_extract(self, url): qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) author = qs.get('author', [None])[0] name = qs.get('name', [None])[0] clip_id = qs.get('clip', [None])[0] - course = qs.get('course', [None])[0] + course_name = qs.get('course', [None])[0] - if any(not f for f in (author, name, clip_id, course,)): + if any(not f for f in (author, name, clip_id, course_name,)): raise ExtractorError('Invalid URL', expected=True) display_id = '%s-%s' % (name, clip_id) - webpage = self._download_webpage(url, display_id) + course = self._download_json( + 'https://app.pluralsight.com/player/user/api/v1/player/payload', + display_id, data=urlencode_postdata({'courseId': course_name}), + headers={'Referer': url}) - modules = self._search_regex( - r'moduleCollection\s*:\s*new\s+ModuleCollection\((\[.+?\])\s*,\s*\$rootScope\)', - webpage, 'modules', default=None) - - if modules: - collection = self._parse_json(modules, display_id) - else: - # Webpage may be served in different layout (see - # https://github.com/rg3/youtube-dl/issues/7607) - collection = self._parse_json( - self._search_regex( - r'var\s+initialState\s*=\s*({.+?});\n', webpage, 'initial state'), - display_id)['course']['modules'] + collection = course['modules'] module, clip = None, None @@ -138,6 +182,8 @@ class PluralsightIE(PluralsightBaseIE): if not clip: raise ExtractorError('Unable to resolve clip') + title = '%s - %s' % (module['title'], clip['title']) + QUALITIES = { 'low': {'width': 640, 'height': 480}, 'medium': {'width': 848, 'height': 640}, @@ -157,8 +203,7 @@ class PluralsightIE(PluralsightBaseIE): # Some courses also offer widescreen resolution for high quality (see # https://github.com/rg3/youtube-dl/issues/7766) - widescreen = True if re.search( - r'courseSupportsWidescreenVideoFormats\s*:\s*true', webpage) else False + widescreen = course.get('supportsWideScreenVideoFormats') is True best_quality = 'high-widescreen' if widescreen else 'high' if widescreen: for allowed_quality in ALLOWED_QUALITIES: @@ -187,22 +232,21 @@ class PluralsightIE(PluralsightBaseIE): for quality in qualities_: f = QUALITIES[quality].copy() clip_post = { - 'a': author, - 'cap': 'false', - 'cn': clip_id, - 'course': course, - 'lc': 'en', - 'm': name, - 'mt': ext, - 'q': '%dx%d' % (f['width'], f['height']), + 'author': author, + 'includeCaptions': False, + 'clipIndex': int(clip_id), + 'courseName': course_name, + 'locale': 'en', + 'moduleName': name, + 'mediaType': ext, + 'quality': '%dx%d' % (f['width'], f['height']), } - request = sanitized_Request( - '%s/training/Player/ViewClip' % self._API_BASE, - json.dumps(clip_post).encode('utf-8')) - request.add_header('Content-Type', 'application/json;charset=utf-8') format_id = '%s-%s' % (ext, quality) - clip_url = self._download_webpage( - request, display_id, 'Downloading %s URL' % format_id, fatal=False) + viewclip = self._download_json( + '%s/video/clips/viewclip' % self._API_BASE, display_id, + 'Downloading %s viewclip JSON' % format_id, fatal=False, + data=json.dumps(clip_post).encode('utf-8'), + headers={'Content-Type': 'application/json;charset=utf-8'}) # Pluralsight tracks multiple sequential calls to ViewClip API and start # to return 429 HTTP errors after some time (see @@ -214,29 +258,44 @@ class PluralsightIE(PluralsightBaseIE): random.randint(2, 5), display_id, '%(video_id)s: Waiting for %(timeout)s seconds to avoid throttling') - if not clip_url: + if not viewclip: continue - f.update({ - 'url': clip_url, - 'ext': ext, - 'format_id': format_id, - 'quality': quality_key(quality), - }) - formats.append(f) + + clip_urls = viewclip.get('urls') + if not isinstance(clip_urls, list): + continue + + for clip_url_data in clip_urls: + clip_url = clip_url_data.get('url') + if not clip_url: + continue + cdn = clip_url_data.get('cdn') + clip_f = f.copy() + clip_f.update({ + 'url': clip_url, + 'ext': ext, + 'format_id': '%s-%s' % (format_id, cdn) if cdn else format_id, + 'quality': quality_key(quality), + 'source_preference': int_or_none(clip_url_data.get('rank')), + }) + formats.append(clip_f) + self._sort_formats(formats) - # TODO: captions - # http://www.pluralsight.com/training/Player/ViewClip + cap = true - # or - # http://www.pluralsight.com/training/Player/Captions - # { a = author, cn = clip_id, lc = end, m = name } + duration = int_or_none( + clip.get('duration')) or parse_duration(clip.get('formattedDuration')) + + # TODO: other languages? + subtitles = self.extract_subtitles( + author, clip_id, 'en', name, duration, display_id) return { 'id': clip.get('clipName') or clip['name'], - 'title': '%s - %s' % (module['title'], clip['title']), - 'duration': int_or_none(clip.get('duration')) or parse_duration(clip.get('formattedDuration')), + 'title': title, + 'duration': duration, 'creator': author, - 'formats': formats + 'formats': formats, + 'subtitles': subtitles, } diff --git a/youtube_dl/extractor/polskieradio.py b/youtube_dl/extractor/polskieradio.py index f559b899f..2ac1fcb0b 100644 --- a/youtube_dl/extractor/polskieradio.py +++ b/youtube_dl/extractor/polskieradio.py @@ -1,14 +1,17 @@ # coding: utf-8 from __future__ import unicode_literals +import itertools import re from .common import InfoExtractor from ..compat import ( compat_str, compat_urllib_parse_unquote, + compat_urlparse ) from ..utils import ( + extract_attributes, int_or_none, strip_or_none, unified_timestamp, @@ -33,7 +36,7 @@ class PolskieRadioIE(InfoExtractor): 'timestamp': 1456594200, 'upload_date': '20160227', 'duration': 2364, - 'thumbnail': 're:^https?://static\.prsa\.pl/images/.*\.jpg$' + 'thumbnail': r're:^https?://static\.prsa\.pl/images/.*\.jpg$' }, }], }, { @@ -97,3 +100,81 @@ class PolskieRadioIE(InfoExtractor): description = strip_or_none(self._og_search_description(webpage)) return self.playlist_result(entries, playlist_id, title, description) + + +class PolskieRadioCategoryIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?polskieradio\.pl/\d+(?:,[^/]+)?/(?P\d+)' + _TESTS = [{ + 'url': 'http://www.polskieradio.pl/7/5102,HISTORIA-ZYWA', + 'info_dict': { + 'id': '5102', + 'title': 'HISTORIA ŻYWA', + }, + 'playlist_mincount': 38, + }, { + 'url': 'http://www.polskieradio.pl/7/4807', + 'info_dict': { + 'id': '4807', + 'title': 'Vademecum 1050. rocznicy Chrztu Polski' + }, + 'playlist_mincount': 5 + }, { + 'url': 'http://www.polskieradio.pl/7/129,Sygnaly-dnia?ref=source', + 'only_matching': True + }, { + 'url': 'http://www.polskieradio.pl/37,RedakcjaKatolicka/4143,Kierunek-Krakow', + 'info_dict': { + 'id': '4143', + 'title': 'Kierunek Kraków', + }, + 'playlist_mincount': 61 + }, { + 'url': 'http://www.polskieradio.pl/10,czworka/214,muzyka', + 'info_dict': { + 'id': '214', + 'title': 'Muzyka', + }, + 'playlist_mincount': 61 + }, { + 'url': 'http://www.polskieradio.pl/7,Jedynka/5102,HISTORIA-ZYWA', + 'only_matching': True, + }, { + 'url': 'http://www.polskieradio.pl/8,Dwojka/196,Publicystyka', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if PolskieRadioIE.suitable(url) else super(PolskieRadioCategoryIE, cls).suitable(url) + + def _entries(self, url, page, category_id): + content = page + for page_num in itertools.count(2): + for a_entry, entry_id in re.findall( + r'(?s)]+>.*?(]+href=["\']/\d+/\d+/Artykul/(\d+)[^>]+>).*?', + content): + entry = extract_attributes(a_entry) + href = entry.get('href') + if not href: + continue + yield self.url_result( + compat_urlparse.urljoin(url, href), PolskieRadioIE.ie_key(), + entry_id, entry.get('title')) + mobj = re.search( + r']+class=["\']next["\'][^>]*>\s*]+href=(["\'])(?P(?:(?!\1).)+)\1', + content) + if not mobj: + break + next_url = compat_urlparse.urljoin(url, mobj.group('url')) + content = self._download_webpage( + next_url, category_id, 'Downloading page %s' % page_num) + + def _real_extract(self, url): + category_id = self._match_id(url) + webpage = self._download_webpage(url, category_id) + title = self._html_search_regex( + r'([^<]+) - [^<]+ - [^<]+', + webpage, 'title', fatal=False) + return self.playlist_result( + self._entries(url, webpage, category_id), + category_id, title) diff --git a/youtube_dl/extractor/porn91.py b/youtube_dl/extractor/porn91.py index 9894f3262..073fc3e21 100644 --- a/youtube_dl/extractor/porn91.py +++ b/youtube_dl/extractor/porn91.py @@ -1,4 +1,4 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals from ..compat import ( diff --git a/youtube_dl/extractor/porncom.py b/youtube_dl/extractor/porncom.py new file mode 100644 index 000000000..8218c7d3b --- /dev/null +++ b/youtube_dl/extractor/porncom.py @@ -0,0 +1,100 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_urlparse +from ..utils import ( + int_or_none, + js_to_json, + parse_filesize, + str_to_int, +) + + +class PornComIE(InfoExtractor): + _VALID_URL = r'https?://(?:[a-zA-Z]+\.)?porn\.com/videos/(?:(?P[^/]+)-)?(?P\d+)' + _TESTS = [{ + 'url': 'http://www.porn.com/videos/teen-grabs-a-dildo-and-fucks-her-pussy-live-on-1hottie-i-rec-2603339', + 'md5': '3f30ce76267533cd12ba999263156de7', + 'info_dict': { + 'id': '2603339', + 'display_id': 'teen-grabs-a-dildo-and-fucks-her-pussy-live-on-1hottie-i-rec', + 'ext': 'mp4', + 'title': 'Teen grabs a dildo and fucks her pussy live on 1hottie, I rec', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 551, + 'view_count': int, + 'age_limit': 18, + 'categories': list, + 'tags': list, + }, + }, { + 'url': 'http://se.porn.com/videos/marsha-may-rides-seth-on-top-of-his-thick-cock-2658067', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + display_id = mobj.group('display_id') or video_id + + webpage = self._download_webpage(url, display_id) + + config = self._parse_json( + self._search_regex( + r'=\s*({.+?})\s*,\s*[\da-zA-Z_]+\s*=', + webpage, 'config', default='{}'), + display_id, transform_source=js_to_json, fatal=False) + + if config: + title = config['title'] + formats = [{ + 'url': stream['url'], + 'format_id': stream.get('id'), + 'height': int_or_none(self._search_regex( + r'^(\d+)[pP]', stream.get('id') or '', 'height', default=None)) + } for stream in config['streams'] if stream.get('url')] + thumbnail = (compat_urlparse.urljoin( + config['thumbCDN'], config['poster']) + if config.get('thumbCDN') and config.get('poster') else None) + duration = int_or_none(config.get('length')) + else: + title = self._search_regex( + (r'([^<]+)', r']*>([^<]+)'), + webpage, 'title') + formats = [{ + 'url': compat_urlparse.urljoin(url, format_url), + 'format_id': '%sp' % height, + 'height': int(height), + 'filesize_approx': parse_filesize(filesize), + } for format_url, height, filesize in re.findall( + r']+href="(/download/[^"]+)">MPEG4 (\d+)p]*>(\d+\s+[a-zA-Z]+)<', + webpage)] + thumbnail = None + duration = None + + self._sort_formats(formats) + + view_count = str_to_int(self._search_regex( + r'class=["\']views["\'][^>]*>

([\d,.]+)', webpage, + 'view count', fatal=False)) + + def extract_list(kind): + s = self._search_regex( + r'(?s)]*>%s:(.+?)

' % kind.capitalize(), + webpage, kind, fatal=False) + return re.findall(r']+>([^<]+)', s or '') + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'thumbnail': thumbnail, + 'duration': duration, + 'view_count': view_count, + 'formats': formats, + 'age_limit': 18, + 'categories': extract_list('categories'), + 'tags': extract_list('tags'), + } diff --git a/youtube_dl/extractor/pornflip.py b/youtube_dl/extractor/pornflip.py new file mode 100644 index 000000000..a4a5d390e --- /dev/null +++ b/youtube_dl/extractor/pornflip.py @@ -0,0 +1,92 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import ( + compat_parse_qs, + compat_str, +) +from ..utils import ( + int_or_none, + try_get, + unified_timestamp, +) + + +class PornFlipIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?pornflip\.com/(?:v|embed)/(?P[0-9A-Za-z]{11})' + _TESTS = [{ + 'url': 'https://www.pornflip.com/v/wz7DfNhMmep', + 'md5': '98c46639849145ae1fd77af532a9278c', + 'info_dict': { + 'id': 'wz7DfNhMmep', + 'ext': 'mp4', + 'title': '2 Amateurs swallow make his dream cumshots true', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 112, + 'timestamp': 1481655502, + 'upload_date': '20161213', + 'uploader_id': '106786', + 'uploader': 'figifoto', + 'view_count': int, + 'age_limit': 18, + } + }, { + 'url': 'https://www.pornflip.com/embed/wz7DfNhMmep', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage( + 'https://www.pornflip.com/v/%s' % video_id, video_id) + + flashvars = compat_parse_qs(self._search_regex( + r']+flashvars=(["\'])(?P(?:(?!\1).)+)\1', + webpage, 'flashvars', group='flashvars')) + + title = flashvars['video_vars[title]'][0] + + def flashvar(kind): + return try_get( + flashvars, lambda x: x['video_vars[%s]' % kind][0], compat_str) + + formats = [] + for key, value in flashvars.items(): + if not (value and isinstance(value, list)): + continue + format_url = value[0] + if key == 'video_vars[hds_manifest]': + formats.extend(self._extract_mpd_formats( + format_url, video_id, mpd_id='dash', fatal=False)) + continue + height = self._search_regex( + r'video_vars\[video_urls\]\[(\d+)', key, 'height', default=None) + if not height: + continue + formats.append({ + 'url': format_url, + 'format_id': 'http-%s' % height, + 'height': int_or_none(height), + }) + self._sort_formats(formats) + + uploader = self._html_search_regex( + (r']+class="name"[^>]*>\s*]+>\s*(?P[^<]+)', + r']+content=(["\'])[^>]*\buploaded by (?P.+?)\1'), + webpage, 'uploader', fatal=False, group='uploader') + + return { + 'id': video_id, + 'formats': formats, + 'title': title, + 'thumbnail': flashvar('big_thumb'), + 'duration': int_or_none(flashvar('duration')), + 'timestamp': unified_timestamp(self._html_search_meta( + 'uploadDate', webpage, 'timestamp')), + 'uploader_id': flashvar('author_id'), + 'uploader': uploader, + 'view_count': int_or_none(flashvar('views')), + 'age_limit': 18, + } diff --git a/youtube_dl/extractor/pornhd.py b/youtube_dl/extractor/pornhd.py index 8df12eec0..842317e6c 100644 --- a/youtube_dl/extractor/pornhd.py +++ b/youtube_dl/extractor/pornhd.py @@ -21,7 +21,7 @@ class PornHdIE(InfoExtractor): 'ext': 'mp4', 'title': 'Restroom selfie masturbation', 'description': 'md5:3748420395e03e31ac96857a8f125b2b', - 'thumbnail': 're:^https?://.*\.jpg', + 'thumbnail': r're:^https?://.*\.jpg', 'view_count': int, 'age_limit': 18, } @@ -35,7 +35,7 @@ class PornHdIE(InfoExtractor): 'ext': 'mp4', 'title': 'Sierra loves doing laundry', 'description': 'md5:8ff0523848ac2b8f9b065ba781ccf294', - 'thumbnail': 're:^https?://.*\.jpg', + 'thumbnail': r're:^https?://.*\.jpg', 'view_count': int, 'age_limit': 18, }, diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 20976c101..3eaf56973 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -15,6 +15,7 @@ from ..compat import ( from ..utils import ( ExtractorError, int_or_none, + js_to_json, orderedSet, sanitized_Request, str_to_int, @@ -32,7 +33,7 @@ class PornHubIE(InfoExtractor): (?:[a-z]+\.)?pornhub\.com/(?:view_video\.php\?viewkey=|embed/)| (?:www\.)?thumbzilla\.com/video/ ) - (?P[0-9a-z]+) + (?P[\da-z]+) ''' _TESTS = [{ 'url': 'http://www.pornhub.com/view_video.php?viewkey=648719015', @@ -48,6 +49,8 @@ class PornHubIE(InfoExtractor): 'dislike_count': int, 'comment_count': int, 'age_limit': 18, + 'tags': list, + 'categories': list, }, }, { # non-ASCII title @@ -63,6 +66,8 @@ class PornHubIE(InfoExtractor): 'dislike_count': int, 'comment_count': int, 'age_limit': 18, + 'tags': list, + 'categories': list, }, 'params': { 'skip_download': True, @@ -91,12 +96,11 @@ class PornHubIE(InfoExtractor): 'only_matching': True, }] - @classmethod - def _extract_url(cls, webpage): - mobj = re.search( - r']+?src=(["\'])(?P(?:https?:)?//(?:www\.)?pornhub\.com/embed/\d+)\1', webpage) - if mobj: - return mobj.group('url') + @staticmethod + def _extract_urls(webpage): + return re.findall( + r']+?src=["\'](?P(?:https?:)?//(?:www\.)?pornhub\.com/embed/[\da-z]+)', + webpage) def _extract_count(self, pattern, webpage, name): return str_to_int(self._search_regex( @@ -183,6 +187,15 @@ class PornHubIE(InfoExtractor): }) self._sort_formats(formats) + page_params = self._parse_json(self._search_regex( + r'page_params\.zoneDetails\[([\'"])[^\'"]+\1\]\s*=\s*(?P{[^}]+})', + webpage, 'page parameters', group='data', default='{}'), + video_id, transform_source=js_to_json, fatal=False) + tags = categories = None + if page_params: + tags = page_params.get('tags', '').split(',') + categories = page_params.get('categories', '').split(',') + return { 'id': video_id, 'uploader': video_uploader, @@ -195,6 +208,8 @@ class PornHubIE(InfoExtractor): 'comment_count': comment_count, 'formats': formats, 'age_limit': 18, + 'tags': tags, + 'categories': categories, } @@ -214,7 +229,14 @@ class PornHubPlaylistBaseIE(InfoExtractor): webpage = self._download_webpage(url, playlist_id) - entries = self._extract_entries(webpage) + # Only process container div with main playlist content skipping + # drop-down menu that uses similar pattern for videos (see + # https://github.com/rg3/youtube-dl/issues/11594). + container = self._search_regex( + r'(?s)(]+class=["\']container.+)', webpage, + 'container', default=webpage) + + entries = self._extract_entries(container) playlist = self._parse_json( self._search_regex( @@ -228,12 +250,12 @@ class PornHubPlaylistBaseIE(InfoExtractor): class PornHubPlaylistIE(PornHubPlaylistBaseIE): _VALID_URL = r'https?://(?:www\.)?pornhub\.com/playlist/(?P\d+)' _TESTS = [{ - 'url': 'http://www.pornhub.com/playlist/6201671', + 'url': 'http://www.pornhub.com/playlist/4667351', 'info_dict': { - 'id': '6201671', - 'title': 'P0p4', + 'id': '4667351', + 'title': 'Nataly Hot', }, - 'playlist_mincount': 35, + 'playlist_mincount': 2, }] diff --git a/youtube_dl/extractor/pornotube.py b/youtube_dl/extractor/pornotube.py index 5398e708b..1b5b9a320 100644 --- a/youtube_dl/extractor/pornotube.py +++ b/youtube_dl/extractor/pornotube.py @@ -3,10 +3,7 @@ from __future__ import unicode_literals import json from .common import InfoExtractor -from ..utils import ( - int_or_none, - sanitized_Request, -) +from ..utils import int_or_none class PornotubeIE(InfoExtractor): @@ -22,7 +19,7 @@ class PornotubeIE(InfoExtractor): 'description': 'md5:a8304bef7ef06cb4ab476ca6029b01b0', 'categories': ['Adult Humor', 'Blondes'], 'uploader': 'Alpha Blue Archives', - 'thumbnail': 're:^https?://.*\\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'timestamp': 1417582800, 'age_limit': 18, } @@ -31,59 +28,55 @@ class PornotubeIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - # Fetch origin token - js_config = self._download_webpage( - 'http://www.pornotube.com/assets/src/app/config.js', video_id, - note='Download JS config') - originAuthenticationSpaceKey = self._search_regex( - r"constant\('originAuthenticationSpaceKey',\s*'([^']+)'", - js_config, 'originAuthenticationSpaceKey') + token = self._download_json( + 'https://api.aebn.net/auth/v2/origins/authenticate', + video_id, note='Downloading token', + data=json.dumps({'credentials': 'Clip Application'}).encode('utf-8'), + headers={ + 'Content-Type': 'application/json', + 'Origin': 'http://www.pornotube.com', + })['tokenKey'] - # Fetch actual token - token_req_data = { - 'authenticationSpaceKey': originAuthenticationSpaceKey, - 'credentials': 'Clip Application', - } - token_req = sanitized_Request( - 'https://api.aebn.net/auth/v1/token/primal', - data=json.dumps(token_req_data).encode('utf-8')) - token_req.add_header('Content-Type', 'application/json') - token_req.add_header('Origin', 'http://www.pornotube.com') - token_answer = self._download_json( - token_req, video_id, note='Requesting primal token') - token = token_answer['tokenKey'] + video_url = self._download_json( + 'https://api.aebn.net/delivery/v1/clips/%s/MP4' % video_id, + video_id, note='Downloading delivery information', + headers={'Authorization': token})['mediaUrl'] - # Get video URL - delivery_req = sanitized_Request( - 'https://api.aebn.net/delivery/v1/clips/%s/MP4' % video_id) - delivery_req.add_header('Authorization', token) - delivery_info = self._download_json( - delivery_req, video_id, note='Downloading delivery information') - video_url = delivery_info['mediaUrl'] + FIELDS = ( + 'title', 'description', 'startSecond', 'endSecond', 'publishDate', + 'studios{name}', 'categories{name}', 'movieId', 'primaryImageNumber' + ) - # Get additional info (title etc.) - info_req = sanitized_Request( - 'https://api.aebn.net/content/v1/clips/%s?expand=' - 'title,description,primaryImageNumber,startSecond,endSecond,' - 'movie.title,movie.MovieId,movie.boxCoverFront,movie.stars,' - 'movie.studios,stars.name,studios.name,categories.name,' - 'clipActive,movieActive,publishDate,orientations' % video_id) - info_req.add_header('Authorization', token) info = self._download_json( - info_req, video_id, note='Downloading metadata') + 'https://api.aebn.net/content/v2/clips/%s?fields=%s' + % (video_id, ','.join(FIELDS)), video_id, + note='Downloading metadata', + headers={'Authorization': token}) + + if isinstance(info, list): + info = info[0] + + title = info['title'] timestamp = int_or_none(info.get('publishDate'), scale=1000) uploader = info.get('studios', [{}])[0].get('name') - movie_id = info['movie']['movieId'] - thumbnail = 'http://pic.aebn.net/dis/t/%s/%s_%08d.jpg' % ( - movie_id, movie_id, info['primaryImageNumber']) - categories = [c['name'] for c in info.get('categories')] + movie_id = info.get('movieId') + primary_image_number = info.get('primaryImageNumber') + thumbnail = None + if movie_id and primary_image_number: + thumbnail = 'http://pic.aebn.net/dis/t/%s/%s_%08d.jpg' % ( + movie_id, movie_id, primary_image_number) + start = int_or_none(info.get('startSecond')) + end = int_or_none(info.get('endSecond')) + duration = end - start if start and end else None + categories = [c['name'] for c in info.get('categories', []) if c.get('name')] return { 'id': video_id, 'url': video_url, - 'title': info['title'], + 'title': title, 'description': info.get('description'), + 'duration': duration, 'timestamp': timestamp, 'uploader': uploader, 'thumbnail': thumbnail, diff --git a/youtube_dl/extractor/pornovoisines.py b/youtube_dl/extractor/pornovoisines.py index 6b51e5c54..b6b71069d 100644 --- a/youtube_dl/extractor/pornovoisines.py +++ b/youtube_dl/extractor/pornovoisines.py @@ -2,7 +2,6 @@ from __future__ import unicode_literals import re -import random from .common import InfoExtractor from ..utils import ( @@ -13,61 +12,69 @@ from ..utils import ( class PornoVoisinesIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?pornovoisines\.com/showvideo/(?P\d+)/(?P[^/]+)' - - _VIDEO_URL_TEMPLATE = 'http://stream%d.pornovoisines.com' \ - '/static/media/video/transcoded/%s-640x360-1000-trscded.mp4' - - _SERVER_NUMBERS = (1, 2) + _VALID_URL = r'https?://(?:www\.)?pornovoisines\.com/videos/show/(?P\d+)/(?P[^/.]+)' _TEST = { - 'url': 'http://www.pornovoisines.com/showvideo/1285/recherche-appartement/', - 'md5': '5ac670803bc12e9e7f9f662ce64cf1d1', + 'url': 'http://www.pornovoisines.com/videos/show/919/recherche-appartement.html', + 'md5': '6f8aca6a058592ab49fe701c8ba8317b', 'info_dict': { - 'id': '1285', + 'id': '919', 'display_id': 'recherche-appartement', 'ext': 'mp4', 'title': 'Recherche appartement', - 'description': 'md5:819ea0b785e2a04667a1a01cdc89594e', - 'thumbnail': 're:^https?://.*\.jpg$', + 'description': 'md5:fe10cb92ae2dd3ed94bb4080d11ff493', + 'thumbnail': r're:^https?://.*\.jpg$', 'upload_date': '20140925', 'duration': 120, 'view_count': int, 'average_rating': float, - 'categories': ['Débutantes', 'Scénario', 'Sodomie'], + 'categories': ['Débutante', 'Débutantes', 'Scénario', 'Sodomie'], 'age_limit': 18, + 'subtitles': { + 'fr': [{ + 'ext': 'vtt', + }] + }, } } - @classmethod - def build_video_url(cls, num): - return cls._VIDEO_URL_TEMPLATE % (random.choice(cls._SERVER_NUMBERS), num) - def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') display_id = mobj.group('display_id') + settings_url = self._download_json( + 'http://www.pornovoisines.com/api/video/%s/getsettingsurl/' % video_id, + video_id, note='Getting settings URL')['video_settings_url'] + settings = self._download_json(settings_url, video_id)['data'] + + formats = [] + for kind, data in settings['variants'].items(): + if kind == 'HLS': + formats.extend(self._extract_m3u8_formats( + data, video_id, ext='mp4', entry_protocol='m3u8_native', m3u8_id='hls')) + elif kind == 'MP4': + for item in data: + formats.append({ + 'url': item['url'], + 'height': item.get('height'), + 'bitrate': item.get('bitrate'), + }) + self._sort_formats(formats) + webpage = self._download_webpage(url, video_id) - video_url = self.build_video_url(video_id) + title = self._og_search_title(webpage) + description = self._og_search_description(webpage) - title = self._html_search_regex( - r'

(.+?)

', webpage, 'title', flags=re.DOTALL) - description = self._html_search_regex( - r'
(.+?)
', - webpage, 'description', fatal=False, flags=re.DOTALL) - - thumbnail = self._search_regex( - r'
\s*]+class=([\'"])thumb\1[^>]*src=([\'"])(?P[^"]+)\2', + webpage, 'thumbnail', fatal=False, group='url') upload_date = unified_strdate(self._search_regex( - r'Publié le ([\d-]+)', webpage, 'upload date', fatal=False)) - duration = int_or_none(self._search_regex( - 'Durée (\d+)', webpage, 'duration', fatal=False)) + r'Le\s*([\d/]+)', webpage, 'upload date', fatal=False)) + duration = settings.get('main', {}).get('duration') view_count = int_or_none(self._search_regex( r'(\d+) vues', webpage, 'view count', fatal=False)) average_rating = self._search_regex( @@ -75,15 +82,19 @@ class PornoVoisinesIE(InfoExtractor): if average_rating: average_rating = float_or_none(average_rating.replace(',', '.')) - categories = self._html_search_meta( - 'keywords', webpage, 'categories', fatal=False) + categories = self._html_search_regex( + r'(?s)Catégories\s*:\s*(.+?)', webpage, 'categories', fatal=False) if categories: categories = [category.strip() for category in categories.split(',')] + subtitles = {'fr': [{ + 'url': subtitle, + } for subtitle in settings.get('main', {}).get('vtt_tracks', {}).values()]} + return { 'id': video_id, 'display_id': display_id, - 'url': video_url, + 'formats': formats, 'title': title, 'description': description, 'thumbnail': thumbnail, @@ -93,4 +104,5 @@ class PornoVoisinesIE(InfoExtractor): 'average_rating': average_rating, 'categories': categories, 'age_limit': 18, + 'subtitles': subtitles, } diff --git a/youtube_dl/extractor/pornoxo.py b/youtube_dl/extractor/pornoxo.py index 202f58673..1a0cce7e0 100644 --- a/youtube_dl/extractor/pornoxo.py +++ b/youtube_dl/extractor/pornoxo.py @@ -2,13 +2,13 @@ from __future__ import unicode_literals import re -from .common import InfoExtractor +from .jwplatform import JWPlatformBaseIE from ..utils import ( str_to_int, ) -class PornoXOIE(InfoExtractor): +class PornoXOIE(JWPlatformBaseIE): _VALID_URL = r'https?://(?:www\.)?pornoxo\.com/videos/(?P\d+)/(?P[^/]+)\.html' _TEST = { 'url': 'http://www.pornoxo.com/videos/7564/striptease-from-sexy-secretary.html', @@ -17,32 +17,24 @@ class PornoXOIE(InfoExtractor): 'id': '7564', 'ext': 'flv', 'title': 'Striptease From Sexy Secretary!', - 'description': 'Striptease From Sexy Secretary!', + 'display_id': 'striptease-from-sexy-secretary', + 'description': 'md5:0ee35252b685b3883f4a1d38332f9980', 'categories': list, # NSFW - 'thumbnail': 're:https?://.*\.jpg$', + 'thumbnail': r're:https?://.*\.jpg$', 'age_limit': 18, } } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id, display_id = mobj.groups() webpage = self._download_webpage(url, video_id) - - video_url = self._html_search_regex( - r'\'file\'\s*:\s*"([^"]+)"', webpage, 'video_url') + video_data = self._extract_jwplayer_data(webpage, video_id, require_title=False) title = self._html_search_regex( r'([^<]+)\s*-\s*PornoXO', webpage, 'title') - description = self._html_search_regex( - r'<meta name="description" content="([^"]+)\s*featuring', - webpage, 'description', fatal=False) - - thumbnail = self._html_search_regex( - r'\'image\'\s*:\s*"([^"]+)"', webpage, 'thumbnail', fatal=False) - view_count = str_to_int(self._html_search_regex( r'[vV]iews:\s*([0-9,]+)', webpage, 'view count', fatal=False)) @@ -53,13 +45,14 @@ class PornoXOIE(InfoExtractor): None if categories_str is None else categories_str.split(',')) - return { + video_data.update({ 'id': video_id, - 'url': video_url, 'title': title, - 'description': description, - 'thumbnail': thumbnail, + 'display_id': display_id, + 'description': self._html_search_meta('description', webpage), 'categories': categories, 'view_count': view_count, 'age_limit': 18, - } + }) + + return video_data diff --git a/youtube_dl/extractor/presstv.py b/youtube_dl/extractor/presstv.py index 2da93ed34..b5c279203 100644 --- a/youtube_dl/extractor/presstv.py +++ b/youtube_dl/extractor/presstv.py @@ -19,7 +19,7 @@ class PressTVIE(InfoExtractor): 'ext': 'mp4', 'title': 'Organic mattresses used to clean waste water', 'upload_date': '20160409', - 'thumbnail': 're:^https?://.*\.jpg', + 'thumbnail': r're:^https?://.*\.jpg', 'description': 'md5:20002e654bbafb6908395a5c0cfcd125' } } diff --git a/youtube_dl/extractor/promptfile.py b/youtube_dl/extractor/promptfile.py index f93bd19ff..23ac93d7e 100644 --- a/youtube_dl/extractor/promptfile.py +++ b/youtube_dl/extractor/promptfile.py @@ -7,7 +7,6 @@ from .common import InfoExtractor from ..utils import ( determine_ext, ExtractorError, - sanitized_Request, urlencode_postdata, ) @@ -15,13 +14,13 @@ from ..utils import ( class PromptFileIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?promptfile\.com/l/(?P<id>[0-9A-Z\-]+)' _TEST = { - 'url': 'http://www.promptfile.com/l/D21B4746E9-F01462F0FF', - 'md5': 'd1451b6302da7215485837aaea882c4c', + 'url': 'http://www.promptfile.com/l/86D1CE8462-576CAAE416', + 'md5': '5a7e285a26e0d66d9a263fae91bc92ce', 'info_dict': { - 'id': 'D21B4746E9-F01462F0FF', + 'id': '86D1CE8462-576CAAE416', 'ext': 'mp4', - 'title': 'Birds.mp4', - 'thumbnail': 're:^https?://.*\.jpg$', + 'title': 'oceans.mp4', + 'thumbnail': r're:^https?://.*\.jpg$', } } @@ -33,14 +32,23 @@ class PromptFileIE(InfoExtractor): raise ExtractorError('Video %s does not exist' % video_id, expected=True) + chash = self._search_regex( + r'val\("([^"]*)"\s*\+\s*\$\("#chash"\)', webpage, 'chash') fields = self._hidden_inputs(webpage) - post = urlencode_postdata(fields) - req = sanitized_Request(url, post) - req.add_header('Content-type', 'application/x-www-form-urlencoded') - webpage = self._download_webpage( - req, video_id, 'Downloading video page') + keys = list(fields.keys()) + chash_key = keys[0] if len(keys) == 1 else next( + key for key in keys if key.startswith('cha')) + fields[chash_key] = chash + fields[chash_key] - url = self._html_search_regex(r'url:\s*\'([^\']+)\'', webpage, 'URL') + webpage = self._download_webpage( + url, video_id, 'Downloading video page', + data=urlencode_postdata(fields), + headers={'Content-type': 'application/x-www-form-urlencoded'}) + + video_url = self._search_regex( + (r'<a[^>]+href=(["\'])(?P<url>(?:(?!\1).)+)\1[^>]*>\s*Download File', + r'<a[^>]+href=(["\'])(?P<url>https?://(?:www\.)?promptfile\.com/file/(?:(?!\1).)+)\1'), + webpage, 'video url', group='url') title = self._html_search_regex( r'<span.+title="([^"]+)">', webpage, 'title') thumbnail = self._html_search_regex( @@ -49,7 +57,7 @@ class PromptFileIE(InfoExtractor): formats = [{ 'format_id': 'sd', - 'url': url, + 'url': video_url, 'ext': determine_ext(title), }] self._sort_formats(formats) diff --git a/youtube_dl/extractor/prosiebensat1.py b/youtube_dl/extractor/prosiebensat1.py index c6eee3b72..5091d8456 100644 --- a/youtube_dl/extractor/prosiebensat1.py +++ b/youtube_dl/extractor/prosiebensat1.py @@ -1,4 +1,4 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals import re @@ -15,10 +15,127 @@ from ..utils import ( ) -class ProSiebenSat1IE(InfoExtractor): +class ProSiebenSat1BaseIE(InfoExtractor): + def _extract_video_info(self, url, clip_id): + client_location = url + + video = self._download_json( + 'http://vas.sim-technik.de/vas/live/v2/videos', + clip_id, 'Downloading videos JSON', query={ + 'access_token': self._TOKEN, + 'client_location': client_location, + 'client_name': self._CLIENT_NAME, + 'ids': clip_id, + })[0] + + if video.get('is_protected') is True: + raise ExtractorError('This video is DRM protected.', expected=True) + + duration = float_or_none(video.get('duration')) + source_ids = [compat_str(source['id']) for source in video['sources']] + + client_id = self._SALT[:2] + sha1(''.join([clip_id, self._SALT, self._TOKEN, client_location, self._SALT, self._CLIENT_NAME]).encode('utf-8')).hexdigest() + + sources = self._download_json( + 'http://vas.sim-technik.de/vas/live/v2/videos/%s/sources' % clip_id, + clip_id, 'Downloading sources JSON', query={ + 'access_token': self._TOKEN, + 'client_id': client_id, + 'client_location': client_location, + 'client_name': self._CLIENT_NAME, + }) + server_id = sources['server_id'] + + def fix_bitrate(bitrate): + bitrate = int_or_none(bitrate) + if not bitrate: + return None + return (bitrate // 1000) if bitrate % 1000 == 0 else bitrate + + formats = [] + for source_id in source_ids: + client_id = self._SALT[:2] + sha1(''.join([self._SALT, clip_id, self._TOKEN, server_id, client_location, source_id, self._SALT, self._CLIENT_NAME]).encode('utf-8')).hexdigest() + urls = self._download_json( + 'http://vas.sim-technik.de/vas/live/v2/videos/%s/sources/url' % clip_id, + clip_id, 'Downloading urls JSON', fatal=False, query={ + 'access_token': self._TOKEN, + 'client_id': client_id, + 'client_location': client_location, + 'client_name': self._CLIENT_NAME, + 'server_id': server_id, + 'source_ids': source_id, + }) + if not urls: + continue + if urls.get('status_code') != 0: + raise ExtractorError('This video is unavailable', expected=True) + urls_sources = urls['sources'] + if isinstance(urls_sources, dict): + urls_sources = urls_sources.values() + for source in urls_sources: + source_url = source.get('url') + if not source_url: + continue + protocol = source.get('protocol') + mimetype = source.get('mimetype') + if mimetype == 'application/f4m+xml' or 'f4mgenerator' in source_url or determine_ext(source_url) == 'f4m': + formats.extend(self._extract_f4m_formats( + source_url, clip_id, f4m_id='hds', fatal=False)) + elif mimetype == 'application/x-mpegURL': + formats.extend(self._extract_m3u8_formats( + source_url, clip_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + elif mimetype == 'application/dash+xml': + formats.extend(self._extract_mpd_formats( + source_url, clip_id, mpd_id='dash', fatal=False)) + else: + tbr = fix_bitrate(source['bitrate']) + if protocol in ('rtmp', 'rtmpe'): + mobj = re.search(r'^(?P<url>rtmpe?://[^/]+)/(?P<path>.+)$', source_url) + if not mobj: + continue + path = mobj.group('path') + mp4colon_index = path.rfind('mp4:') + app = path[:mp4colon_index] + play_path = path[mp4colon_index:] + formats.append({ + 'url': '%s/%s' % (mobj.group('url'), app), + 'app': app, + 'play_path': play_path, + 'player_url': 'http://livepassdl.conviva.com/hf/ver/2.79.0.17083/LivePassModuleMain.swf', + 'page_url': 'http://www.prosieben.de', + 'tbr': tbr, + 'ext': 'flv', + 'format_id': 'rtmp%s' % ('-%d' % tbr if tbr else ''), + }) + else: + formats.append({ + 'url': source_url, + 'tbr': tbr, + 'format_id': 'http%s' % ('-%d' % tbr if tbr else ''), + }) + self._sort_formats(formats) + + return { + 'duration': duration, + 'formats': formats, + } + + +class ProSiebenSat1IE(ProSiebenSat1BaseIE): IE_NAME = 'prosiebensat1' IE_DESC = 'ProSiebenSat.1 Digital' - _VALID_URL = r'https?://(?:www\.)?(?:(?:prosieben|prosiebenmaxx|sixx|sat1|kabeleins|the-voice-of-germany|7tv)\.(?:de|at|ch)|ran\.de|fem\.com)/(?P<id>.+)' + _VALID_URL = r'''(?x) + https?:// + (?:www\.)? + (?: + (?: + prosieben(?:maxx)?|sixx|sat1(?:gold)?|kabeleins(?:doku)?|the-voice-of-germany|7tv|advopedia + )\.(?:de|at|ch)| + ran\.de|fem\.com|advopedia\.de + ) + /(?P<id>.+) + ''' _TESTS = [ { @@ -30,16 +147,12 @@ class ProSiebenSat1IE(InfoExtractor): 'url': 'http://www.prosieben.de/tv/circus-halligalli/videos/218-staffel-2-episode-18-jahresrueckblick-ganze-folge', 'info_dict': { 'id': '2104602', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Episode 18 - Staffel 2', 'description': 'md5:8733c81b702ea472e069bc48bb658fc1', 'upload_date': '20131231', 'duration': 5845.04, }, - 'params': { - # rtmp download - 'skip_download': True, - }, }, { 'url': 'http://www.prosieben.de/videokatalog/Gesellschaft/Leben/Trends/video-Lady-Umstyling-f%C3%BCr-Audrina-Rebekka-Audrina-Fergen-billig-aussehen-Battal-Modica-700544.html', @@ -141,7 +254,7 @@ class ProSiebenSat1IE(InfoExtractor): 'url': 'http://www.the-voice-of-germany.de/video/31-andreas-kuemmert-rocket-man-clip', 'info_dict': { 'id': '2572814', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Andreas Kümmert: Rocket Man', 'description': 'md5:6ddb02b0781c6adf778afea606652e38', 'upload_date': '20131017', @@ -155,7 +268,7 @@ class ProSiebenSat1IE(InfoExtractor): 'url': 'http://www.fem.com/wellness/videos/wellness-video-clip-kurztripps-zum-valentinstag.html', 'info_dict': { 'id': '2156342', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Kurztrips zum Valentinstag', 'description': 'Romantischer Kurztrip zum Valentinstag? Nina Heinemann verrät, was sich hier wirklich lohnt.', 'duration': 307.24, @@ -172,12 +285,13 @@ class ProSiebenSat1IE(InfoExtractor): 'description': 'md5:63b8963e71f481782aeea877658dec84', }, 'playlist_count': 2, + 'skip': 'This video is unavailable', }, { 'url': 'http://www.7tv.de/circus-halligalli/615-best-of-circus-halligalli-ganze-folge', 'info_dict': { 'id': '4187506', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Best of Circus HalliGalli', 'description': 'md5:8849752efd90b9772c9db6fdf87fb9e9', 'upload_date': '20151229', @@ -186,8 +300,29 @@ class ProSiebenSat1IE(InfoExtractor): 'skip_download': True, }, }, + { + # geo restricted to Germany + 'url': 'http://www.kabeleinsdoku.de/tv/mayday-alarm-im-cockpit/video/102-notlandung-im-hudson-river-ganze-folge', + 'only_matching': True, + }, + { + # geo restricted to Germany + 'url': 'http://www.sat1gold.de/tv/edel-starck/video/11-staffel-1-episode-1-partner-wider-willen-ganze-folge', + 'only_matching': True, + }, + { + 'url': 'http://www.sat1gold.de/tv/edel-starck/playlist/die-gesamte-1-staffel', + 'only_matching': True, + }, + { + 'url': 'http://www.advopedia.de/videos/lenssen-klaert-auf/lenssen-klaert-auf-folge-8-staffel-3-feiertage-und-freie-tage', + 'only_matching': True, + }, ] + _TOKEN = 'prosieben' + _SALT = '01!8d8F_)r9]4s[qeuXfP%' + _CLIENT_NAME = 'kolibri-2.0.19-splec4' _CLIPID_REGEXES = [ r'"clip_id"\s*:\s+"(\d+)"', r'clipid: "(\d+)"', @@ -234,140 +369,50 @@ class ProSiebenSat1IE(InfoExtractor): def _extract_clip(self, url, webpage): clip_id = self._html_search_regex( self._CLIPID_REGEXES, webpage, 'clip id') - - access_token = 'prosieben' - client_name = 'kolibri-2.0.19-splec4' - client_location = url - - video = self._download_json( - 'http://vas.sim-technik.de/vas/live/v2/videos', - clip_id, 'Downloading videos JSON', query={ - 'access_token': access_token, - 'client_location': client_location, - 'client_name': client_name, - 'ids': clip_id, - })[0] - - if video.get('is_protected') is True: - raise ExtractorError('This video is DRM protected.', expected=True) - - duration = float_or_none(video.get('duration')) - source_ids = [compat_str(source['id']) for source in video['sources']] - - g = '01!8d8F_)r9]4s[qeuXfP%' - client_id = g[:2] + sha1(''.join([clip_id, g, access_token, client_location, g, client_name]).encode('utf-8')).hexdigest() - - sources = self._download_json( - 'http://vas.sim-technik.de/vas/live/v2/videos/%s/sources' % clip_id, - clip_id, 'Downloading sources JSON', query={ - 'access_token': access_token, - 'client_id': client_id, - 'client_location': client_location, - 'client_name': client_name, - }) - server_id = sources['server_id'] - title = self._html_search_regex(self._TITLE_REGEXES, webpage, 'title') - - def fix_bitrate(bitrate): - bitrate = int_or_none(bitrate) - if not bitrate: - return None - return (bitrate // 1000) if bitrate % 1000 == 0 else bitrate - - formats = [] - for source_id in source_ids: - client_id = g[:2] + sha1(''.join([g, clip_id, access_token, server_id, client_location, source_id, g, client_name]).encode('utf-8')).hexdigest() - urls = self._download_json( - 'http://vas.sim-technik.de/vas/live/v2/videos/%s/sources/url' % clip_id, - clip_id, 'Downloading urls JSON', fatal=False, query={ - 'access_token': access_token, - 'client_id': client_id, - 'client_location': client_location, - 'client_name': client_name, - 'server_id': server_id, - 'source_ids': source_id, - }) - if not urls: - continue - if urls.get('status_code') != 0: - raise ExtractorError('This video is unavailable', expected=True) - urls_sources = urls['sources'] - if isinstance(urls_sources, dict): - urls_sources = urls_sources.values() - for source in urls_sources: - source_url = source.get('url') - if not source_url: - continue - protocol = source.get('protocol') - mimetype = source.get('mimetype') - if mimetype == 'application/f4m+xml' or 'f4mgenerator' in source_url or determine_ext(source_url) == 'f4m': - formats.extend(self._extract_f4m_formats( - source_url, clip_id, f4m_id='hds', fatal=False)) - elif mimetype == 'application/x-mpegURL': - formats.extend(self._extract_m3u8_formats( - source_url, clip_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) - else: - tbr = fix_bitrate(source['bitrate']) - if protocol in ('rtmp', 'rtmpe'): - mobj = re.search(r'^(?P<url>rtmpe?://[^/]+)/(?P<path>.+)$', source_url) - if not mobj: - continue - path = mobj.group('path') - mp4colon_index = path.rfind('mp4:') - app = path[:mp4colon_index] - play_path = path[mp4colon_index:] - formats.append({ - 'url': '%s/%s' % (mobj.group('url'), app), - 'app': app, - 'play_path': play_path, - 'player_url': 'http://livepassdl.conviva.com/hf/ver/2.79.0.17083/LivePassModuleMain.swf', - 'page_url': 'http://www.prosieben.de', - 'tbr': tbr, - 'ext': 'flv', - 'format_id': 'rtmp%s' % ('-%d' % tbr if tbr else ''), - }) - else: - formats.append({ - 'url': source_url, - 'tbr': tbr, - 'format_id': 'http%s' % ('-%d' % tbr if tbr else ''), - }) - self._sort_formats(formats) - + info = self._extract_video_info(url, clip_id) description = self._html_search_regex( - self._DESCRIPTION_REGEXES, webpage, 'description', fatal=False) + self._DESCRIPTION_REGEXES, webpage, 'description', default=None) + if description is None: + description = self._og_search_description(webpage) thumbnail = self._og_search_thumbnail(webpage) upload_date = unified_strdate(self._html_search_regex( self._UPLOAD_DATE_REGEXES, webpage, 'upload date', default=None)) - return { + info.update({ 'id': clip_id, 'title': title, 'description': description, 'thumbnail': thumbnail, 'upload_date': upload_date, - 'duration': duration, - 'formats': formats, - } + }) + return info def _extract_playlist(self, url, webpage): playlist_id = self._html_search_regex( self._PLAYLIST_ID_REGEXES, webpage, 'playlist id') - for regex in self._PLAYLIST_CLIP_REGEXES: - playlist_clips = re.findall(regex, webpage) - if playlist_clips: - title = self._html_search_regex( - self._TITLE_REGEXES, webpage, 'title') - description = self._html_search_regex( - self._DESCRIPTION_REGEXES, webpage, 'description', fatal=False) - entries = [ - self.url_result( - re.match('(.+?//.+?)/', url).group(1) + clip_path, - 'ProSiebenSat1') - for clip_path in playlist_clips] - return self.playlist_result(entries, playlist_id, title, description) + playlist = self._parse_json( + self._search_regex( + r'var\s+contentResources\s*=\s*(\[.+?\]);\s*</script', + webpage, 'playlist'), + playlist_id) + entries = [] + for item in playlist: + clip_id = item.get('id') or item.get('upc') + if not clip_id: + continue + info = self._extract_video_info(url, clip_id) + info.update({ + 'id': clip_id, + 'title': item.get('title') or item.get('teaser', {}).get('headline'), + 'description': item.get('teaser', {}).get('description'), + 'thumbnail': item.get('poster'), + 'duration': float_or_none(item.get('duration')), + 'series': item.get('tvShowTitle'), + 'uploader': item.get('broadcastPublisher'), + }) + entries.append(info) + return self.playlist_result(entries, playlist_id) def _real_extract(self, url): video_id = self._match_id(url) diff --git a/youtube_dl/extractor/puls4.py b/youtube_dl/extractor/puls4.py index fca30e1aa..80091b85f 100644 --- a/youtube_dl/extractor/puls4.py +++ b/youtube_dl/extractor/puls4.py @@ -1,88 +1,57 @@ -# -*- coding: utf-8 -*- +# coding: utf-8 from __future__ import unicode_literals -from .common import InfoExtractor +from .prosiebensat1 import ProSiebenSat1BaseIE from ..utils import ( - ExtractorError, unified_strdate, - int_or_none, + parse_duration, + compat_str, ) -class Puls4IE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?puls4\.com/video/[^/]+/play/(?P<id>[0-9]+)' +class Puls4IE(ProSiebenSat1BaseIE): + _VALID_URL = r'https?://(?:www\.)?puls4\.com/(?P<id>[^?#&]+)' _TESTS = [{ - 'url': 'http://www.puls4.com/video/pro-und-contra/play/2716816', - 'md5': '49f6a6629747eeec43cef6a46b5df81d', + 'url': 'http://www.puls4.com/2-minuten-2-millionen/staffel-3/videos/2min2miotalk/Tobias-Homberger-von-myclubs-im-2min2miotalk-118118', + 'md5': 'fd3c6b0903ac72c9d004f04bc6bb3e03', 'info_dict': { - 'id': '2716816', - 'ext': 'mp4', - 'title': 'Pro und Contra vom 23.02.2015', - 'description': 'md5:293e44634d9477a67122489994675db6', - 'duration': 2989, - 'upload_date': '20150224', + 'id': '118118', + 'ext': 'flv', + 'title': 'Tobias Homberger von myclubs im #2min2miotalk', + 'description': 'md5:f9def7c5e8745d6026d8885487d91955', + 'upload_date': '20160830', 'uploader': 'PULS_4', }, - 'skip': 'Only works from Germany', }, { - 'url': 'http://www.puls4.com/video/kult-spielfilme/play/1298106', - 'md5': '6a48316c8903ece8dab9b9a7bf7a59ec', - 'info_dict': { - 'id': '1298106', - 'ext': 'mp4', - 'title': 'Lucky Fritz', - }, - 'skip': 'Only works from Germany', + 'url': 'http://www.puls4.com/pro-und-contra/wer-wird-prasident/Ganze-Folgen/Wer-wird-Praesident.-Norbert-Hofer', + 'only_matching': True, + }, { + 'url': 'http://www.puls4.com/pro-und-contra/wer-wird-prasident/Ganze-Folgen/Wer-wird-Praesident-Analyse-des-Interviews-mit-Norbert-Hofer-416598', + 'only_matching': True, }] + _TOKEN = 'puls4' + _SALT = '01!kaNgaiNgah1Ie4AeSha' + _CLIENT_NAME = '' def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - error_message = self._html_search_regex( - r'<div[^>]+class="message-error"[^>]*>(.+?)</div>', - webpage, 'error message', default=None) - if error_message: - raise ExtractorError( - '%s returned error: %s' % (self.IE_NAME, error_message), expected=True) - - real_url = self._html_search_regex( - r'\"fsk-button\".+?href=\"([^"]+)', - webpage, 'fsk_button', default=None) - if real_url: - webpage = self._download_webpage(real_url, video_id) - - player = self._search_regex( - r'p4_video_player(?:_iframe)?\("video_\d+_container"\s*,(.+?)\);\s*\}', - webpage, 'player') - - player_json = self._parse_json( - '[%s]' % player, video_id, - transform_source=lambda s: s.replace('undefined,', '')) - - formats = None - result = None - - for v in player_json: - if isinstance(v, list) and not formats: - formats = [{ - 'url': f['url'], - 'format': 'hd' if f.get('hd') else 'sd', - 'width': int_or_none(f.get('size_x')), - 'height': int_or_none(f.get('size_y')), - 'tbr': int_or_none(f.get('bitrate')), - } for f in v] - self._sort_formats(formats) - elif isinstance(v, dict) and not result: - result = { - 'id': video_id, - 'title': v['videopartname'].strip(), - 'description': v.get('videotitle'), - 'duration': int_or_none(v.get('videoduration') or v.get('episodeduration')), - 'upload_date': unified_strdate(v.get('clipreleasetime')), - 'uploader': v.get('channel'), - } - - result['formats'] = formats - - return result + path = self._match_id(url) + content_path = self._download_json( + 'http://www.puls4.com/api/json-fe/page/' + path, path)['content'][0]['url'] + media = self._download_json( + 'http://www.puls4.com' + content_path, + content_path)['mediaCurrent'] + player_content = media['playerContent'] + info = self._extract_video_info(url, player_content['id']) + info.update({ + 'id': compat_str(media['objectId']), + 'title': player_content['title'], + 'description': media.get('description'), + 'thumbnail': media.get('previewLink'), + 'upload_date': unified_strdate(media.get('date')), + 'duration': parse_duration(player_content.get('duration')), + 'episode': player_content.get('episodePartName'), + 'show': media.get('channel'), + 'season_id': player_content.get('seasonId'), + 'uploader': player_content.get('sourceCompany'), + }) + return info diff --git a/youtube_dl/extractor/pyvideo.py b/youtube_dl/extractor/pyvideo.py index cc0416cb8..b8ac93a62 100644 --- a/youtube_dl/extractor/pyvideo.py +++ b/youtube_dl/extractor/pyvideo.py @@ -1,59 +1,72 @@ from __future__ import unicode_literals import re -import os from .common import InfoExtractor +from ..compat import compat_str +from ..utils import int_or_none class PyvideoIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?pyvideo\.org/video/(?P<id>\d+)/(.*)' + _VALID_URL = r'https?://(?:www\.)?pyvideo\.org/(?P<category>[^/]+)/(?P<id>[^/?#&.]+)' - _TESTS = [ - { - 'url': 'http://pyvideo.org/video/1737/become-a-logging-expert-in-30-minutes', - 'md5': '520915673e53a5c5d487c36e0c4d85b5', - 'info_dict': { - 'id': '24_4WWkSmNo', - 'ext': 'webm', - 'title': 'Become a logging expert in 30 minutes', - 'description': 'md5:9665350d466c67fb5b1598de379021f7', - 'upload_date': '20130320', - 'uploader': 'Next Day Video', - 'uploader_id': 'NextDayVideo', - }, - 'add_ie': ['Youtube'], + _TESTS = [{ + 'url': 'http://pyvideo.org/pycon-us-2013/become-a-logging-expert-in-30-minutes.html', + 'info_dict': { + 'id': 'become-a-logging-expert-in-30-minutes', }, - { - 'url': 'http://pyvideo.org/video/2542/gloriajw-spotifywitherikbernhardsson182m4v', - 'md5': '5fe1c7e0a8aa5570330784c847ff6d12', - 'info_dict': { - 'id': '2542', - 'ext': 'm4v', - 'title': 'Gloriajw-SpotifyWithErikBernhardsson182', - }, + 'playlist_count': 2, + }, { + 'url': 'http://pyvideo.org/pygotham-2012/gloriajw-spotifywitherikbernhardsson182m4v.html', + 'md5': '5fe1c7e0a8aa5570330784c847ff6d12', + 'info_dict': { + 'id': '2542', + 'ext': 'm4v', + 'title': 'Gloriajw-SpotifyWithErikBernhardsson182.m4v', }, - ] + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) + category = mobj.group('category') video_id = mobj.group('id') - webpage = self._download_webpage(url, video_id) + entries = [] - m_youtube = re.search(r'(https?://www\.youtube\.com/watch\?v=.*)', webpage) - if m_youtube is not None: - return self.url_result(m_youtube.group(1), 'Youtube') + data = self._download_json( + 'https://raw.githubusercontent.com/pyvideo/data/master/%s/videos/%s.json' + % (category, video_id), video_id, fatal=False) - title = self._html_search_regex( - r'<div class="section">\s*<h3(?:\s+class="[^"]*"[^>]*)?>([^>]+?)</h3>', - webpage, 'title', flags=re.DOTALL) - video_url = self._search_regex( - [r'<source src="(.*?)"', r'<dt>Download</dt>.*?<a href="(.+?)"'], - webpage, 'video url', flags=re.DOTALL) + if data: + for video in data['videos']: + video_url = video.get('url') + if video_url: + if video.get('type') == 'youtube': + entries.append(self.url_result(video_url, 'Youtube')) + else: + entries.append({ + 'id': compat_str(data.get('id') or video_id), + 'url': video_url, + 'title': data['title'], + 'description': data.get('description') or data.get('summary'), + 'thumbnail': data.get('thumbnail_url'), + 'duration': int_or_none(data.get('duration')), + }) + else: + webpage = self._download_webpage(url, video_id) + title = self._og_search_title(webpage) + media_urls = self._search_regex( + r'(?s)Media URL:(.+?)</li>', webpage, 'media urls') + for m in re.finditer( + r'<a[^>]+href=(["\'])(?P<url>http.+?)\1', media_urls): + media_url = m.group('url') + if re.match(r'https?://www\.youtube\.com/watch\?v=.*', media_url): + entries.append(self.url_result(media_url, 'Youtube')) + else: + entries.append({ + 'id': video_id, + 'url': media_url, + 'title': title, + }) - return { - 'id': video_id, - 'title': os.path.splitext(title)[0], - 'url': video_url, - } + return self.playlist_result(entries, video_id) diff --git a/youtube_dl/extractor/qqmusic.py b/youtube_dl/extractor/qqmusic.py index ff0af9543..17c27da46 100644 --- a/youtube_dl/extractor/qqmusic.py +++ b/youtube_dl/extractor/qqmusic.py @@ -18,7 +18,7 @@ from ..utils import ( class QQMusicIE(InfoExtractor): IE_NAME = 'qqmusic' IE_DESC = 'QQ音乐' - _VALID_URL = r'https?://y.qq.com/#type=song&mid=(?P<id>[0-9A-Za-z]+)' + _VALID_URL = r'https?://y\.qq\.com/#type=song&mid=(?P<id>[0-9A-Za-z]+)' _TESTS = [{ 'url': 'http://y.qq.com/#type=song&mid=004295Et37taLD', 'md5': '9ce1c1c8445f561506d2e3cfb0255705', @@ -29,7 +29,7 @@ class QQMusicIE(InfoExtractor): 'release_date': '20141227', 'creator': '林俊杰', 'description': 'md5:d327722d0361576fde558f1ac68a7065', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', } }, { 'note': 'There is no mp3-320 version of this song.', @@ -42,7 +42,7 @@ class QQMusicIE(InfoExtractor): 'release_date': '20050626', 'creator': '李季美', 'description': 'md5:46857d5ed62bc4ba84607a805dccf437', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', } }, { 'note': 'lyrics not in .lrc format', @@ -54,7 +54,7 @@ class QQMusicIE(InfoExtractor): 'release_date': '19970225', 'creator': 'Dark Funeral', 'description': 'md5:ed14d5bd7ecec19609108052c25b2c11', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', }, 'params': { 'skip_download': True, @@ -172,7 +172,7 @@ class QQPlaylistBaseIE(InfoExtractor): class QQMusicSingerIE(QQPlaylistBaseIE): IE_NAME = 'qqmusic:singer' IE_DESC = 'QQ音乐 - 歌手' - _VALID_URL = r'https?://y.qq.com/#type=singer&mid=(?P<id>[0-9A-Za-z]+)' + _VALID_URL = r'https?://y\.qq\.com/#type=singer&mid=(?P<id>[0-9A-Za-z]+)' _TEST = { 'url': 'http://y.qq.com/#type=singer&mid=001BLpXF2DyJe2', 'info_dict': { @@ -217,7 +217,7 @@ class QQMusicSingerIE(QQPlaylistBaseIE): class QQMusicAlbumIE(QQPlaylistBaseIE): IE_NAME = 'qqmusic:album' IE_DESC = 'QQ音乐 - 专辑' - _VALID_URL = r'https?://y.qq.com/#type=album&mid=(?P<id>[0-9A-Za-z]+)' + _VALID_URL = r'https?://y\.qq\.com/#type=album&mid=(?P<id>[0-9A-Za-z]+)' _TESTS = [{ 'url': 'http://y.qq.com/#type=album&mid=000gXCTb2AhRR1', diff --git a/youtube_dl/extractor/r7.py b/youtube_dl/extractor/r7.py index 069dbfaed..ed38c77eb 100644 --- a/youtube_dl/extractor/r7.py +++ b/youtube_dl/extractor/r7.py @@ -23,7 +23,7 @@ class R7IE(InfoExtractor): 'ext': 'mp4', 'title': 'Policiais humilham suspeito à beira da morte: "Morre com dignidade"', 'description': 'md5:01812008664be76a6479aa58ec865b72', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 98, 'like_count': int, 'view_count': int, diff --git a/youtube_dl/extractor/radiobremen.py b/youtube_dl/extractor/radiobremen.py index 0cbb15f08..2c35f9845 100644 --- a/youtube_dl/extractor/radiobremen.py +++ b/youtube_dl/extractor/radiobremen.py @@ -1,4 +1,4 @@ -# -*- coding: utf-8 -*- +# coding: utf-8 from __future__ import unicode_literals @@ -13,15 +13,15 @@ class RadioBremenIE(InfoExtractor): IE_NAME = 'radiobremen' _TEST = { - 'url': 'http://www.radiobremen.de/mediathek/index.html?id=114720', + 'url': 'http://www.radiobremen.de/mediathek/?id=141876', 'info_dict': { - 'id': '114720', + 'id': '141876', 'ext': 'mp4', - 'duration': 1685, + 'duration': 178, 'width': 512, - 'title': 'buten un binnen vom 22. Dezember', - 'thumbnail': 're:https?://.*\.jpg$', - 'description': 'Unter anderem mit diesen Themen: 45 Flüchtlinge sind in Worpswede angekommen +++ Freies Internet für alle: Bremer arbeiten an einem flächendeckenden W-Lan-Netzwerk +++ Aktivisten kämpfen für das Unibad +++ So war das Wetter 2014 +++', + 'title': 'Druck auf Patrick Öztürk', + 'thumbnail': r're:https?://.*\.jpg$', + 'description': 'Gegen den SPD-Bürgerschaftsabgeordneten Patrick Öztürk wird wegen Beihilfe zum gewerbsmäßigen Betrug ermittelt. Am Donnerstagabend sollte er dem Vorstand des SPD-Unterbezirks Bremerhaven dazu Rede und Antwort stehen.', }, } diff --git a/youtube_dl/extractor/radiocanada.py b/youtube_dl/extractor/radiocanada.py index 8ec402646..321917ad0 100644 --- a/youtube_dl/extractor/radiocanada.py +++ b/youtube_dl/extractor/radiocanada.py @@ -13,6 +13,7 @@ from ..utils import ( xpath_element, ExtractorError, determine_protocol, + unsmuggle_url, ) @@ -35,28 +36,51 @@ class RadioCanadaIE(InfoExtractor): } def _real_extract(self, url): + url, smuggled_data = unsmuggle_url(url, {}) app_code, video_id = re.match(self._VALID_URL, url).groups() - device_types = ['ipad', 'android'] + metadata = self._download_xml( + 'http://api.radio-canada.ca/metaMedia/v1/index.ashx', + video_id, note='Downloading metadata XML', query={ + 'appCode': app_code, + 'idMedia': video_id, + }) + + def get_meta(name): + el = find_xpath_attr(metadata, './/Meta', 'name', name) + return el.text if el is not None else None + + if get_meta('protectionType'): + raise ExtractorError('This video is DRM protected.', expected=True) + + device_types = ['ipad'] if app_code != 'toutv': device_types.append('flash') + if not smuggled_data: + device_types.append('android') formats = [] # TODO: extract f4m formats # f4m formats can be extracted using flashhd device_type but they produce unplayable file for device_type in device_types: - v_data = self._download_xml( - 'http://api.radio-canada.ca/validationMedia/v1/Validation.ashx', - video_id, note='Downloading %s XML' % device_type, query={ - 'appCode': app_code, - 'idMedia': video_id, - 'connectionType': 'broadband', - 'multibitrate': 'true', - 'deviceType': device_type, + validation_url = 'http://api.radio-canada.ca/validationMedia/v1/Validation.ashx' + query = { + 'appCode': app_code, + 'idMedia': video_id, + 'connectionType': 'broadband', + 'multibitrate': 'true', + 'deviceType': device_type, + } + if smuggled_data: + validation_url = 'https://services.radio-canada.ca/media/validation/v2/' + query.update(smuggled_data) + else: + query.update({ # paysJ391wsHjbOJwvCs26toz and bypasslock are used to bypass geo-restriction 'paysJ391wsHjbOJwvCs26toz': 'CA', 'bypasslock': 'NZt5K62gRqfc', - }, fatal=False) + }) + v_data = self._download_xml(validation_url, video_id, note='Downloading %s XML' % device_type, query=query, fatal=False) v_url = xpath_text(v_data, 'url') if not v_url: continue @@ -101,16 +125,13 @@ class RadioCanadaIE(InfoExtractor): f4m_id='hds', fatal=False)) self._sort_formats(formats) - metadata = self._download_xml( - 'http://api.radio-canada.ca/metaMedia/v1/index.ashx', - video_id, note='Downloading metadata XML', query={ - 'appCode': app_code, - 'idMedia': video_id, - }) - - def get_meta(name): - el = find_xpath_attr(metadata, './/Meta', 'name', name) - return el.text if el is not None else None + subtitles = {} + closed_caption_url = get_meta('closedCaption') or get_meta('closedCaptionHTML5') + if closed_caption_url: + subtitles['fr'] = [{ + 'url': closed_caption_url, + 'ext': determine_ext(closed_caption_url, 'vtt'), + }] return { 'id': video_id, @@ -122,6 +143,7 @@ class RadioCanadaIE(InfoExtractor): 'season_number': int_or_none('SrcSaison'), 'episode_number': int_or_none('SrcEpisode'), 'upload_date': unified_strdate(get_meta('Date')), + 'subtitles': subtitles, 'formats': formats, } diff --git a/youtube_dl/extractor/radiode.py b/youtube_dl/extractor/radiode.py index aa5f6f8ad..2c06c8b1e 100644 --- a/youtube_dl/extractor/radiode.py +++ b/youtube_dl/extractor/radiode.py @@ -13,7 +13,7 @@ class RadioDeIE(InfoExtractor): 'ext': 'mp3', 'title': 're:^NDR 2 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', 'description': 'md5:591c49c702db1a33751625ebfb67f273', - 'thumbnail': 're:^https?://.*\.png', + 'thumbnail': r're:^https?://.*\.png', 'is_live': True, }, 'params': { diff --git a/youtube_dl/extractor/radiojavan.py b/youtube_dl/extractor/radiojavan.py index ec4fa6e60..a53ad97a5 100644 --- a/youtube_dl/extractor/radiojavan.py +++ b/youtube_dl/extractor/radiojavan.py @@ -18,7 +18,7 @@ class RadioJavanIE(InfoExtractor): 'id': 'chaartaar-ashoobam', 'ext': 'mp4', 'title': 'Chaartaar - Ashoobam', - 'thumbnail': 're:^https?://.*\.jpe?g$', + 'thumbnail': r're:^https?://.*\.jpe?g$', 'upload_date': '20150215', 'view_count': int, 'like_count': int, diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py index dc640b1bc..41afbd9af 100644 --- a/youtube_dl/extractor/rai.py +++ b/youtube_dl/extractor/rai.py @@ -120,7 +120,7 @@ class RaiTVIE(RaiBaseIE): 'description': 'md5:f27c544694cacb46a078db84ec35d2d9', 'upload_date': '20140407', 'duration': 6160, - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', } }, { @@ -133,7 +133,7 @@ class RaiTVIE(RaiBaseIE): 'title': 'TG PRIMO TEMPO', 'upload_date': '20140612', 'duration': 1758, - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', }, 'skip': 'Geo-restricted to Italy', }, @@ -169,7 +169,7 @@ class RaiTVIE(RaiBaseIE): 'description': 'md5:364b604f7db50594678f483353164fb8', 'upload_date': '20140923', 'duration': 386, - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', } }, ] diff --git a/youtube_dl/extractor/rbmaradio.py b/youtube_dl/extractor/rbmaradio.py index 7932af6ef..53b82fba3 100644 --- a/youtube_dl/extractor/rbmaradio.py +++ b/youtube_dl/extractor/rbmaradio.py @@ -1,55 +1,71 @@ -# encoding: utf-8 from __future__ import unicode_literals -import json import re from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( - ExtractorError, + clean_html, + int_or_none, + unified_timestamp, + update_url_query, ) class RBMARadioIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$' + _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<show_id>[^/]+)/episodes/(?P<id>[^/?#&]+)' _TEST = { - 'url': 'http://www.rbmaradio.com/shows/ford-lopatin-live-at-primavera-sound-2011', + 'url': 'https://www.rbmaradio.com/shows/main-stage/episodes/ford-lopatin-live-at-primavera-sound-2011', 'md5': '6bc6f9bcb18994b4c983bc3bf4384d95', 'info_dict': { 'id': 'ford-lopatin-live-at-primavera-sound-2011', 'ext': 'mp3', - 'uploader_id': 'ford-lopatin', - 'location': 'Spain', - 'description': 'Joel Ford and Daniel ’Oneohtrix Point Never’ Lopatin fly their midified pop extravaganza to Spain. Live at Primavera Sound 2011.', - 'uploader': 'Ford & Lopatin', - 'title': 'Live at Primavera Sound 2011', + 'title': 'Main Stage - Ford & Lopatin', + 'description': 'md5:4f340fb48426423530af5a9d87bd7b91', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 2452, + 'timestamp': 1307103164, + 'upload_date': '20110603', }, } def _real_extract(self, url): - m = re.match(self._VALID_URL, url) - video_id = m.group('videoID') + mobj = re.match(self._VALID_URL, url) + show_id = mobj.group('show_id') + episode_id = mobj.group('id') - webpage = self._download_webpage(url, video_id) + webpage = self._download_webpage(url, episode_id) - json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$', - webpage, 'json data', flags=re.MULTILINE) + episode = self._parse_json( + self._search_regex( + r'__INITIAL_STATE__\s*=\s*({.+?})\s*</script>', + webpage, 'json data'), + episode_id)['episodes'][show_id][episode_id] - try: - data = json.loads(json_data) - except ValueError as e: - raise ExtractorError('Invalid JSON: ' + str(e)) + title = episode['title'] - video_url = data['akamai_url'] + '&cbr=256' + show_title = episode.get('showTitle') + if show_title: + title = '%s - %s' % (show_title, title) + + formats = [{ + 'url': update_url_query(episode['audioURL'], query={'cbr': abr}), + 'format_id': compat_str(abr), + 'abr': abr, + 'vcodec': 'none', + } for abr in (96, 128, 256)] + + description = clean_html(episode.get('longTeaser')) + thumbnail = self._proto_relative_url(episode.get('imageURL', {}).get('landscape')) + duration = int_or_none(episode.get('duration')) + timestamp = unified_timestamp(episode.get('publishedAt')) return { - 'id': video_id, - 'url': video_url, - 'title': data['title'], - 'description': data.get('teaser_text'), - 'location': data.get('country_of_origin'), - 'uploader': data.get('host', {}).get('name'), - 'uploader_id': data.get('host', {}).get('slug'), - 'thumbnail': data.get('image', {}).get('large_url_2x'), - 'duration': data.get('duration'), + 'id': episode_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'timestamp': timestamp, + 'formats': formats, } diff --git a/youtube_dl/extractor/redtube.py b/youtube_dl/extractor/redtube.py index 721fc3a9e..c367a6ae7 100644 --- a/youtube_dl/extractor/redtube.py +++ b/youtube_dl/extractor/redtube.py @@ -1,5 +1,7 @@ from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..utils import ( ExtractorError, @@ -10,8 +12,8 @@ from ..utils import ( class RedTubeIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?redtube\.com/(?P<id>[0-9]+)' - _TEST = { + _VALID_URL = r'https?://(?:(?:www\.)?redtube\.com/|embed\.redtube\.com/\?.*?\bid=)(?P<id>[0-9]+)' + _TESTS = [{ 'url': 'http://www.redtube.com/66418', 'md5': '7b8c22b5e7098a3e1c09709df1126d2d', 'info_dict': { @@ -23,11 +25,21 @@ class RedTubeIE(InfoExtractor): 'view_count': int, 'age_limit': 18, } - } + }, { + 'url': 'http://embed.redtube.com/?bgcolor=000000&id=1443286', + 'only_matching': True, + }] + + @staticmethod + def _extract_urls(webpage): + return re.findall( + r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//embed\.redtube\.com/\?.*?\bid=\d+)', + webpage) def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + webpage = self._download_webpage( + 'http://www.redtube.com/%s' % video_id, video_id) if any(s in webpage for s in ['video-deleted-info', '>This video has been removed']): raise ExtractorError('Video %s has been removed' % video_id, expected=True) diff --git a/youtube_dl/extractor/rentv.py b/youtube_dl/extractor/rentv.py new file mode 100644 index 000000000..422c02cff --- /dev/null +++ b/youtube_dl/extractor/rentv.py @@ -0,0 +1,76 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from .jwplatform import JWPlatformBaseIE +from ..compat import compat_str + + +class RENTVIE(JWPlatformBaseIE): + _VALID_URL = r'(?:rentv:|https?://(?:www\.)?ren\.tv/(?:player|video/epizod)/)(?P<id>\d+)' + _TESTS = [{ + 'url': 'http://ren.tv/video/epizod/118577', + 'md5': 'd91851bf9af73c0ad9b2cdf76c127fbb', + 'info_dict': { + 'id': '118577', + 'ext': 'mp4', + 'title': 'Документальный спецпроект: "Промывка мозгов. Технологии XXI века"' + } + }, { + 'url': 'http://ren.tv/player/118577', + 'only_matching': True, + }, { + 'url': 'rentv:118577', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage('http://ren.tv/player/' + video_id, video_id) + jw_config = self._parse_json(self._search_regex( + r'config\s*=\s*({.+});', webpage, 'jw config'), video_id) + return self._parse_jwplayer_data(jw_config, video_id, m3u8_id='hls') + + +class RENTVArticleIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?ren\.tv/novosti/\d{4}-\d{2}-\d{2}/(?P<id>[^/?#]+)' + _TESTS = [{ + 'url': 'http://ren.tv/novosti/2016-10-26/video-mikroavtobus-popavshiy-v-dtp-s-gruzovikami-v-podmoskove-prevratilsya-v', + 'md5': 'ebd63c4680b167693745ab91343df1d6', + 'info_dict': { + 'id': '136472', + 'ext': 'mp4', + 'title': 'Видео: микроавтобус, попавший в ДТП с грузовиками в Подмосковье, превратился в груду металла', + 'description': 'Жертвами столкновения двух фур и микроавтобуса, по последним данным, стали семь человек.', + } + }, { + # TODO: invalid m3u8 + 'url': 'http://ren.tv/novosti/2015-09-25/sluchaynyy-prohozhiy-poymal-avtougonshchika-v-murmanske-video', + 'info_dict': { + 'id': 'playlist', + 'ext': 'mp4', + 'title': 'Случайный прохожий поймал автоугонщика в Мурманске. ВИДЕО | РЕН ТВ', + 'uploader': 'ren.tv', + }, + 'params': { + # m3u8 downloads + 'skip_download': True, + }, + 'skip': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + drupal_settings = self._parse_json(self._search_regex( + r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);', + webpage, 'drupal settings'), display_id) + + entries = [] + for config_profile in drupal_settings.get('ren_jwplayer', {}).values(): + media_id = config_profile.get('mediaid') + if not media_id: + continue + media_id = compat_str(media_id) + entries.append(self.url_result('rentv:' + media_id, 'RENTV', media_id)) + return self.playlist_result(entries, display_id) diff --git a/youtube_dl/extractor/reuters.py b/youtube_dl/extractor/reuters.py index 961d504eb..9dc482d21 100644 --- a/youtube_dl/extractor/reuters.py +++ b/youtube_dl/extractor/reuters.py @@ -32,7 +32,7 @@ class ReutersIE(InfoExtractor): webpage, 'video data')) def get_json_value(key, fatal=False): - return self._search_regex('"%s"\s*:\s*"([^"]+)"' % key, video_data, key, fatal=fatal) + return self._search_regex(r'"%s"\s*:\s*"([^"]+)"' % key, video_data, key, fatal=fatal) title = unescapeHTML(get_json_value('title', fatal=True)) mmid, fid = re.search(r',/(\d+)\?f=(\d+)', get_json_value('flv', fatal=True)).groups() diff --git a/youtube_dl/extractor/reverbnation.py b/youtube_dl/extractor/reverbnation.py index 3c6725aeb..4cb99c244 100644 --- a/youtube_dl/extractor/reverbnation.py +++ b/youtube_dl/extractor/reverbnation.py @@ -1,29 +1,29 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor -from ..utils import str_or_none +from ..utils import ( + qualities, + str_or_none, +) class ReverbNationIE(InfoExtractor): _VALID_URL = r'^https?://(?:www\.)?reverbnation\.com/.*?/song/(?P<id>\d+).*?$' _TESTS = [{ 'url': 'http://www.reverbnation.com/alkilados/song/16965047-mona-lisa', - 'md5': '3da12ebca28c67c111a7f8b262d3f7a7', + 'md5': 'c0aaf339bcee189495fdf5a8c8ba8645', 'info_dict': { 'id': '16965047', 'ext': 'mp3', 'title': 'MONA LISA', 'uploader': 'ALKILADOS', 'uploader_id': '216429', - 'thumbnail': 're:^https://gp1\.wac\.edgecastcdn\.net/.*?\.jpg$' + 'thumbnail': r're:^https?://.*\.jpg', }, }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - song_id = mobj.group('id') + song_id = self._match_id(url) api_res = self._download_json( 'https://api.reverbnation.com/song/%s' % song_id, @@ -31,14 +31,23 @@ class ReverbNationIE(InfoExtractor): note='Downloading information of song %s' % song_id ) + THUMBNAILS = ('thumbnail', 'image') + quality = qualities(THUMBNAILS) + thumbnails = [] + for thumb_key in THUMBNAILS: + if api_res.get(thumb_key): + thumbnails.append({ + 'url': api_res[thumb_key], + 'preference': quality(thumb_key) + }) + return { 'id': song_id, - 'title': api_res.get('name'), - 'url': api_res.get('url'), + 'title': api_res['name'], + 'url': api_res['url'], 'uploader': api_res.get('artist', {}).get('name'), 'uploader_id': str_or_none(api_res.get('artist', {}).get('id')), - 'thumbnail': self._proto_relative_url( - api_res.get('image', api_res.get('thumbnail'))), + 'thumbnails': thumbnails, 'ext': 'mp3', 'vcodec': 'none', } diff --git a/youtube_dl/extractor/rmcdecouverte.py b/youtube_dl/extractor/rmcdecouverte.py new file mode 100644 index 000000000..2340dae53 --- /dev/null +++ b/youtube_dl/extractor/rmcdecouverte.py @@ -0,0 +1,39 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from .brightcove import BrightcoveLegacyIE +from ..compat import ( + compat_parse_qs, + compat_urlparse, +) + + +class RMCDecouverteIE(InfoExtractor): + _VALID_URL = r'https?://rmcdecouverte\.bfmtv\.com/mediaplayer-replay.*?\bid=(?P<id>\d+)' + + _TEST = { + 'url': 'http://rmcdecouverte.bfmtv.com/mediaplayer-replay/?id=1430&title=LES%20HEROS%20DU%2088e%20ETAGE', + 'info_dict': { + 'id': '5111223049001', + 'ext': 'mp4', + 'title': ': LES HEROS DU 88e ETAGE', + 'description': 'Découvrez comment la bravoure de deux hommes dans la Tour Nord du World Trade Center a sauvé la vie d\'innombrables personnes le 11 septembre 2001.', + 'uploader_id': '1969646226001', + 'upload_date': '20160904', + 'timestamp': 1472951103, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + 'skip': 'Only works from France', + } + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1969646226001/default_default/index.html?videoId=%s' + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + brightcove_legacy_url = BrightcoveLegacyIE._extract_brightcove_url(webpage) + brightcove_id = compat_parse_qs(compat_urlparse.urlparse(brightcove_legacy_url).query)['@videoPlayer'][0] + return self.url_result(self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, 'BrightcoveNew', brightcove_id) diff --git a/youtube_dl/extractor/ro220.py b/youtube_dl/extractor/ro220.py index 962b524e9..69934ef2b 100644 --- a/youtube_dl/extractor/ro220.py +++ b/youtube_dl/extractor/ro220.py @@ -14,7 +14,7 @@ class Ro220IE(InfoExtractor): 'id': 'LYV6doKo7f', 'ext': 'mp4', 'title': 'Luati-le Banii sez 4 ep 1', - 'description': 're:^Iata-ne reveniti dupa o binemeritata vacanta\. +Va astept si pe Facebook cu pareri si comentarii.$', + 'description': r're:^Iata-ne reveniti dupa o binemeritata vacanta\. +Va astept si pe Facebook cu pareri si comentarii.$', } } diff --git a/youtube_dl/extractor/rockstargames.py b/youtube_dl/extractor/rockstargames.py index 48128e219..cd6904bc9 100644 --- a/youtube_dl/extractor/rockstargames.py +++ b/youtube_dl/extractor/rockstargames.py @@ -18,7 +18,7 @@ class RockstarGamesIE(InfoExtractor): 'ext': 'mp4', 'title': 'Further Adventures in Finance and Felony Trailer', 'description': 'md5:6d31f55f30cb101b5476c4a379e324a3', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'timestamp': 1464876000, 'upload_date': '20160602', } diff --git a/youtube_dl/extractor/roosterteeth.py b/youtube_dl/extractor/roosterteeth.py index f5b2f560c..46dfc78f5 100644 --- a/youtube_dl/extractor/roosterteeth.py +++ b/youtube_dl/extractor/roosterteeth.py @@ -26,7 +26,7 @@ class RoosterTeethIE(InfoExtractor): 'ext': 'mp4', 'title': 'Million Dollars, But...: Million Dollars, But... The Game Announcement', 'description': 'md5:0cc3b21986d54ed815f5faeccd9a9ca5', - 'thumbnail': 're:^https?://.*\.png$', + 'thumbnail': r're:^https?://.*\.png$', 'series': 'Million Dollars, But...', 'episode': 'Million Dollars, But... The Game Announcement', 'comment_count': int, diff --git a/youtube_dl/extractor/rottentomatoes.py b/youtube_dl/extractor/rottentomatoes.py index f9cd48790..14c8e8236 100644 --- a/youtube_dl/extractor/rottentomatoes.py +++ b/youtube_dl/extractor/rottentomatoes.py @@ -1,31 +1,32 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import compat_urlparse from .internetvideoarchive import InternetVideoArchiveIE class RottenTomatoesIE(InfoExtractor): - _VALID_URL = r'https?://www\.rottentomatoes\.com/m/[^/]+/trailers/(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?rottentomatoes\.com/m/[^/]+/trailers/(?P<id>\d+)' _TEST = { 'url': 'http://www.rottentomatoes.com/m/toy_story_3/trailers/11028566/', 'info_dict': { - 'id': '613340', + 'id': '11028566', 'ext': 'mp4', 'title': 'Toy Story 3', + 'description': 'From the creators of the beloved TOY STORY films, comes a story that will reunite the gang in a whole new way.', + 'thumbnail': r're:^https?://.*\.jpg$', }, } def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - og_video = self._og_search_video_url(webpage) - query = compat_urlparse.urlparse(og_video).query + iva_id = self._search_regex(r'publishedid=(\d+)', webpage, 'internet video archive id') return { '_type': 'url_transparent', - 'url': InternetVideoArchiveIE._build_xml_url(query), + 'url': 'http://video.internetvideoarchive.net/player/6/configuration.ashx?domain=www.videodetective.com&customerid=69249&playerid=641&publishedid=' + iva_id, 'ie_key': InternetVideoArchiveIE.ie_key(), + 'id': video_id, 'title': self._og_search_title(webpage), } diff --git a/youtube_dl/extractor/roxwel.py b/youtube_dl/extractor/roxwel.py index 41638c1d0..65284643b 100644 --- a/youtube_dl/extractor/roxwel.py +++ b/youtube_dl/extractor/roxwel.py @@ -7,7 +7,7 @@ from ..utils import unified_strdate, determine_ext class RoxwelIE(InfoExtractor): - _VALID_URL = r'https?://www\.roxwel\.com/player/(?P<filename>.+?)(\.|\?|$)' + _VALID_URL = r'https?://(?:www\.)?roxwel\.com/player/(?P<filename>.+?)(\.|\?|$)' _TEST = { 'url': 'http://www.roxwel.com/player/passionpittakeawalklive.html', diff --git a/youtube_dl/extractor/rozhlas.py b/youtube_dl/extractor/rozhlas.py new file mode 100644 index 000000000..f8eda8dea --- /dev/null +++ b/youtube_dl/extractor/rozhlas.py @@ -0,0 +1,50 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + remove_start, +) + + +class RozhlasIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?prehravac\.rozhlas\.cz/audio/(?P<id>[0-9]+)' + _TESTS = [{ + 'url': 'http://prehravac.rozhlas.cz/audio/3421320', + 'md5': '504c902dbc9e9a1fd50326eccf02a7e2', + 'info_dict': { + 'id': '3421320', + 'ext': 'mp3', + 'title': 'Echo Pavla Klusáka (30.06.2015 21:00)', + 'description': 'Osmdesátiny Terryho Rileyho jsou skvělou příležitostí proletět se elektronickými i akustickými díly zakladatatele minimalismu, který je aktivní už přes padesát let' + } + }, { + 'url': 'http://prehravac.rozhlas.cz/audio/3421320/embed', + 'skip_download': True, + }] + + def _real_extract(self, url): + audio_id = self._match_id(url) + + webpage = self._download_webpage( + 'http://prehravac.rozhlas.cz/audio/%s' % audio_id, audio_id) + + title = self._html_search_regex( + r'<h3>(.+?)</h3>\s*<p[^>]*>.*?</p>\s*<div[^>]+id=["\']player-track', + webpage, 'title', default=None) or remove_start( + self._og_search_title(webpage), 'Radio Wave - ') + description = self._html_search_regex( + r'<p[^>]+title=(["\'])(?P<url>(?:(?!\1).)+)\1[^>]*>.*?</p>\s*<div[^>]+id=["\']player-track', + webpage, 'description', fatal=False, group='url') + duration = int_or_none(self._search_regex( + r'data-duration=["\'](\d+)', webpage, 'duration', default=None)) + + return { + 'id': audio_id, + 'url': 'http://media.rozhlas.cz/_audio/%s.mp3' % audio_id, + 'title': title, + 'description': description, + 'duration': duration, + 'vcodec': 'none', + } diff --git a/youtube_dl/extractor/rte.py b/youtube_dl/extractor/rte.py index ebe563ebb..a6fac6c35 100644 --- a/youtube_dl/extractor/rte.py +++ b/youtube_dl/extractor/rte.py @@ -4,118 +4,31 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_HTTPError from ..utils import ( float_or_none, parse_iso8601, unescapeHTML, + ExtractorError, ) -class RteIE(InfoExtractor): - IE_NAME = 'rte' - IE_DESC = 'Raidió Teilifís Éireann TV' - _VALID_URL = r'https?://(?:www\.)?rte\.ie/player/[^/]{2,3}/show/[^/]+/(?P<id>[0-9]+)' - _TEST = { - 'url': 'http://www.rte.ie/player/ie/show/iwitness-862/10478715/', - 'info_dict': { - 'id': '10478715', - 'ext': 'flv', - 'title': 'Watch iWitness online', - 'thumbnail': 're:^https?://.*\.jpg$', - 'description': 'iWitness : The spirit of Ireland, one voice and one minute at a time.', - 'duration': 60.046, - }, - 'params': { - 'skip_download': 'f4m fails with --test atm' - } - } - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - title = self._og_search_title(webpage) - description = self._html_search_meta('description', webpage, 'description') - duration = float_or_none(self._html_search_meta( - 'duration', webpage, 'duration', fatal=False), 1000) - - thumbnail = None - thumbnail_meta = self._html_search_meta('thumbnail', webpage) - if thumbnail_meta: - thumbnail_id = self._search_regex( - r'uri:irus:(.+)', thumbnail_meta, - 'thumbnail id', fatal=False) - if thumbnail_id: - thumbnail = 'http://img.rasset.ie/%s.jpg' % thumbnail_id - - feeds_url = self._html_search_meta('feeds-prefix', webpage, 'feeds url') + video_id - json_string = self._download_json(feeds_url, video_id) - - # f4m_url = server + relative_url - f4m_url = json_string['shows'][0]['media:group'][0]['rte:server'] + json_string['shows'][0]['media:group'][0]['url'] - f4m_formats = self._extract_f4m_formats(f4m_url, video_id) - self._sort_formats(f4m_formats) - - return { - 'id': video_id, - 'title': title, - 'formats': f4m_formats, - 'description': description, - 'thumbnail': thumbnail, - 'duration': duration, - } - - -class RteRadioIE(InfoExtractor): - IE_NAME = 'rte:radio' - IE_DESC = 'Raidió Teilifís Éireann radio' - # Radioplayer URLs have two distinct specifier formats, - # the old format #!rii=<channel_id>:<id>:<playable_item_id>:<date>: - # the new format #!rii=b<channel_id>_<id>_<playable_item_id>_<date>_ - # where the IDs are int/empty, the date is DD-MM-YYYY, and the specifier may be truncated. - # An <id> uniquely defines an individual recording, and is the only part we require. - _VALID_URL = r'https?://(?:www\.)?rte\.ie/radio/utils/radioplayer/rteradioweb\.html#!rii=(?:b?[0-9]*)(?:%3A|:|%5F|_)(?P<id>[0-9]+)' - - _TESTS = [{ - # Old-style player URL; HLS and RTMPE formats - 'url': 'http://www.rte.ie/radio/utils/radioplayer/rteradioweb.html#!rii=16:10507902:2414:27-12-2015:', - 'info_dict': { - 'id': '10507902', - 'ext': 'mp4', - 'title': 'Gloria', - 'thumbnail': 're:^https?://.*\.jpg$', - 'description': 'md5:9ce124a7fb41559ec68f06387cabddf0', - 'timestamp': 1451203200, - 'upload_date': '20151227', - 'duration': 7230.0, - }, - 'params': { - 'skip_download': 'f4m fails with --test atm' - } - }, { - # New-style player URL; RTMPE formats only - 'url': 'http://rte.ie/radio/utils/radioplayer/rteradioweb.html#!rii=b16_3250678_8861_06-04-2012_', - 'info_dict': { - 'id': '3250678', - 'ext': 'flv', - 'title': 'The Lyric Concert with Paul Herriott', - 'thumbnail': 're:^https?://.*\.jpg$', - 'description': '', - 'timestamp': 1333742400, - 'upload_date': '20120406', - 'duration': 7199.016, - }, - 'params': { - 'skip_download': 'f4m fails with --test atm' - } - }] - +class RteBaseIE(InfoExtractor): def _real_extract(self, url): item_id = self._match_id(url) - json_string = self._download_json( - 'http://www.rte.ie/rteavgen/getplaylist/?type=web&format=json&id=' + item_id, - item_id) + try: + json_string = self._download_json( + 'http://www.rte.ie/rteavgen/getplaylist/?type=web&format=json&id=' + item_id, + item_id) + except ExtractorError as ee: + if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404: + error_info = self._parse_json(ee.cause.read().decode(), item_id, fatal=False) + if error_info: + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, error_info['message']), + expected=True) + raise # NB the string values in the JSON are stored using XML escaping(!) show = json_string['shows'][0] @@ -163,3 +76,67 @@ class RteRadioIE(InfoExtractor): 'duration': duration, 'formats': formats, } + + +class RteIE(RteBaseIE): + IE_NAME = 'rte' + IE_DESC = 'Raidió Teilifís Éireann TV' + _VALID_URL = r'https?://(?:www\.)?rte\.ie/player/[^/]{2,3}/show/[^/]+/(?P<id>[0-9]+)' + _TEST = { + 'url': 'http://www.rte.ie/player/ie/show/iwitness-862/10478715/', + 'md5': '4a76eb3396d98f697e6e8110563d2604', + 'info_dict': { + 'id': '10478715', + 'ext': 'mp4', + 'title': 'iWitness', + 'thumbnail': r're:^https?://.*\.jpg$', + 'description': 'The spirit of Ireland, one voice and one minute at a time.', + 'duration': 60.046, + 'upload_date': '20151012', + 'timestamp': 1444694160, + }, + } + + +class RteRadioIE(RteBaseIE): + IE_NAME = 'rte:radio' + IE_DESC = 'Raidió Teilifís Éireann radio' + # Radioplayer URLs have two distinct specifier formats, + # the old format #!rii=<channel_id>:<id>:<playable_item_id>:<date>: + # the new format #!rii=b<channel_id>_<id>_<playable_item_id>_<date>_ + # where the IDs are int/empty, the date is DD-MM-YYYY, and the specifier may be truncated. + # An <id> uniquely defines an individual recording, and is the only part we require. + _VALID_URL = r'https?://(?:www\.)?rte\.ie/radio/utils/radioplayer/rteradioweb\.html#!rii=(?:b?[0-9]*)(?:%3A|:|%5F|_)(?P<id>[0-9]+)' + + _TESTS = [{ + # Old-style player URL; HLS and RTMPE formats + 'url': 'http://www.rte.ie/radio/utils/radioplayer/rteradioweb.html#!rii=16:10507902:2414:27-12-2015:', + 'md5': 'c79ccb2c195998440065456b69760411', + 'info_dict': { + 'id': '10507902', + 'ext': 'mp4', + 'title': 'Gloria', + 'thumbnail': r're:^https?://.*\.jpg$', + 'description': 'md5:9ce124a7fb41559ec68f06387cabddf0', + 'timestamp': 1451203200, + 'upload_date': '20151227', + 'duration': 7230.0, + }, + }, { + # New-style player URL; RTMPE formats only + 'url': 'http://rte.ie/radio/utils/radioplayer/rteradioweb.html#!rii=b16_3250678_8861_06-04-2012_', + 'info_dict': { + 'id': '3250678', + 'ext': 'flv', + 'title': 'The Lyric Concert with Paul Herriott', + 'thumbnail': r're:^https?://.*\.jpg$', + 'description': '', + 'timestamp': 1333742400, + 'upload_date': '20120406', + 'duration': 7199.016, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + }] diff --git a/youtube_dl/extractor/rtl2.py b/youtube_dl/extractor/rtl2.py index de004671d..721ee733c 100644 --- a/youtube_dl/extractor/rtl2.py +++ b/youtube_dl/extractor/rtl2.py @@ -1,8 +1,10 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals import re + from .common import InfoExtractor +from ..utils import int_or_none class RTL2IE(InfoExtractor): @@ -13,7 +15,7 @@ class RTL2IE(InfoExtractor): 'id': 'folge-203-0', 'ext': 'f4v', 'title': 'GRIP sucht den Sommerkönig', - 'description': 'Matthias, Det und Helge treten gegeneinander an.' + 'description': 'md5:e3adbb940fd3c6e76fa341b8748b562f' }, 'params': { # rtmp download @@ -25,7 +27,7 @@ class RTL2IE(InfoExtractor): 'id': '21040-anna-erwischt-alex', 'ext': 'mp4', 'title': 'Anna erwischt Alex!', - 'description': 'Anna ist Alex\' Tochter bei Köln 50667.' + 'description': 'Anna nimmt ihrem Vater nicht ab, dass er nicht spielt. Und tatsächlich erwischt sie ihn auf frischer Tat.' }, 'params': { # rtmp download @@ -52,34 +54,47 @@ class RTL2IE(InfoExtractor): r'vico_id\s*:\s*([0-9]+)', webpage, 'vico_id') vivi_id = self._html_search_regex( r'vivi_id\s*:\s*([0-9]+)', webpage, 'vivi_id') - info_url = 'http://www.rtl2.de/video/php/get_video.php?vico_id=' + vico_id + '&vivi_id=' + vivi_id - info = self._download_json(info_url, video_id) + info = self._download_json( + 'http://www.rtl2.de/sites/default/modules/rtl2/mediathek/php/get_video_jw.php', + video_id, query={ + 'vico_id': vico_id, + 'vivi_id': vivi_id, + }) video_info = info['video'] title = video_info['titel'] - description = video_info.get('beschreibung') - thumbnail = video_info.get('image') - download_url = video_info['streamurl'] - download_url = download_url.replace('\\', '') - stream_url = 'mp4:' + self._html_search_regex(r'ondemand/(.*)', download_url, 'stream URL') - rtmp_conn = ['S:connect', 'O:1', 'NS:pageUrl:' + url, 'NB:fpad:0', 'NN:videoFunction:1', 'O:0'] + formats = [] + + rtmp_url = video_info.get('streamurl') + if rtmp_url: + rtmp_url = rtmp_url.replace('\\', '') + stream_url = 'mp4:' + self._html_search_regex(r'/ondemand/(.+)', rtmp_url, 'stream URL') + rtmp_conn = ['S:connect', 'O:1', 'NS:pageUrl:' + url, 'NB:fpad:0', 'NN:videoFunction:1', 'O:0'] + + formats.append({ + 'format_id': 'rtmp', + 'url': rtmp_url, + 'play_path': stream_url, + 'player_url': 'http://www.rtl2.de/flashplayer/vipo_player.swf', + 'page_url': url, + 'flash_version': 'LNX 11,2,202,429', + 'rtmp_conn': rtmp_conn, + 'no_resume': True, + 'preference': 1, + }) + + m3u8_url = video_info.get('streamurl_hls') + if m3u8_url: + formats.extend(self._extract_akamai_formats(m3u8_url, video_id)) - formats = [{ - 'url': download_url, - 'play_path': stream_url, - 'player_url': 'http://www.rtl2.de/flashplayer/vipo_player.swf', - 'page_url': url, - 'flash_version': 'LNX 11,2,202,429', - 'rtmp_conn': rtmp_conn, - 'no_resume': True, - }] self._sort_formats(formats) return { 'id': video_id, 'title': title, - 'thumbnail': thumbnail, - 'description': description, + 'thumbnail': video_info.get('image'), + 'description': video_info.get('beschreibung'), + 'duration': int_or_none(video_info.get('duration')), 'formats': formats, } diff --git a/youtube_dl/extractor/rtlnl.py b/youtube_dl/extractor/rtlnl.py index 4d612b5e3..54076de28 100644 --- a/youtube_dl/extractor/rtlnl.py +++ b/youtube_dl/extractor/rtlnl.py @@ -14,7 +14,7 @@ class RtlNlIE(InfoExtractor): _VALID_URL = r'''(?x) https?://(?:www\.)? (?: - rtlxl\.nl/\#!/[^/]+/| + rtlxl\.nl/[^\#]*\#!/[^/]+/| rtl\.nl/system/videoplayer/(?:[^/]+/)+(?:video_)?embed\.html\b.+?\buuid= ) (?P<id>[0-9a-f-]+)''' @@ -40,7 +40,7 @@ class RtlNlIE(InfoExtractor): 'ext': 'mp4', 'timestamp': 1424039400, 'title': 'RTL Nieuws - Nieuwe beelden Kopenhagen: chaos direct na aanslag', - 'thumbnail': 're:^https?://screenshots\.rtl\.nl/(?:[^/]+/)*sz=[0-9]+x[0-9]+/uuid=84ae5571-ac25-4225-ae0c-ef8d9efb2aed$', + 'thumbnail': r're:^https?://screenshots\.rtl\.nl/(?:[^/]+/)*sz=[0-9]+x[0-9]+/uuid=84ae5571-ac25-4225-ae0c-ef8d9efb2aed$', 'upload_date': '20150215', 'description': 'Er zijn nieuwe beelden vrijgegeven die vlak na de aanslag in Kopenhagen zijn gemaakt. Op de video is goed te zien hoe omstanders zich bekommeren om één van de slachtoffers, terwijl de eerste agenten ter plaatse komen.', } @@ -52,7 +52,7 @@ class RtlNlIE(InfoExtractor): 'id': 'f536aac0-1dc3-4314-920e-3bd1c5b3811a', 'ext': 'mp4', 'title': 'RTL Nieuws - Meer beelden van overval juwelier', - 'thumbnail': 're:^https?://screenshots\.rtl\.nl/(?:[^/]+/)*sz=[0-9]+x[0-9]+/uuid=f536aac0-1dc3-4314-920e-3bd1c5b3811a$', + 'thumbnail': r're:^https?://screenshots\.rtl\.nl/(?:[^/]+/)*sz=[0-9]+x[0-9]+/uuid=f536aac0-1dc3-4314-920e-3bd1c5b3811a$', 'timestamp': 1437233400, 'upload_date': '20150718', 'duration': 30.474, @@ -67,6 +67,9 @@ class RtlNlIE(InfoExtractor): }, { 'url': 'http://www.rtl.nl/system/videoplayer/derden/embed.html#!/uuid=bb0353b0-d6a4-1dad-90e9-18fe75b8d1f0', 'only_matching': True, + }, { + 'url': 'http://rtlxl.nl/?_ga=1.204735956.572365465.1466978370#!/rtl-nieuws-132237/3c487912-023b-49ac-903e-2c5d79f8410f', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/rtp.py b/youtube_dl/extractor/rtp.py index 82b323cdd..533ee27cb 100644 --- a/youtube_dl/extractor/rtp.py +++ b/youtube_dl/extractor/rtp.py @@ -16,7 +16,7 @@ class RTPIE(InfoExtractor): 'ext': 'mp3', 'title': 'Paixões Cruzadas', 'description': 'As paixões musicais de António Cartaxo e António Macedo', - 'thumbnail': 're:^https?://.*\.jpg', + 'thumbnail': r're:^https?://.*\.jpg', }, 'params': { # rtmp download diff --git a/youtube_dl/extractor/rts.py b/youtube_dl/extractor/rts.py index 3cc32847b..48f17b828 100644 --- a/youtube_dl/extractor/rts.py +++ b/youtube_dl/extractor/rts.py @@ -4,27 +4,24 @@ from __future__ import unicode_literals import re from .srgssr import SRGSSRIE -from ..compat import ( - compat_str, - compat_urllib_parse_urlparse, -) +from ..compat import compat_str from ..utils import ( int_or_none, parse_duration, parse_iso8601, unescapeHTML, - xpath_text, + determine_ext, ) class RTSIE(SRGSSRIE): IE_DESC = 'RTS.ch' - _VALID_URL = r'rts:(?P<rts_id>\d+)|https?://(?:www\.)?rts\.ch/(?:[^/]+/){2,}(?P<id>[0-9]+)-(?P<display_id>.+?)\.html' + _VALID_URL = r'rts:(?P<rts_id>\d+)|https?://(?:.+?\.)?rts\.ch/(?:[^/]+/){2,}(?P<id>[0-9]+)-(?P<display_id>.+?)\.html' _TESTS = [ { 'url': 'http://www.rts.ch/archives/tv/divers/3449373-les-enfants-terribles.html', - 'md5': 'f254c4b26fb1d3c183793d52bc40d3e7', + 'md5': 'ff7f8450a90cf58dacb64e29707b4a8e', 'info_dict': { 'id': '3449373', 'display_id': 'les-enfants-terribles', @@ -35,38 +32,20 @@ class RTSIE(SRGSSRIE): 'uploader': 'Divers', 'upload_date': '19680921', 'timestamp': -40280400, - 'thumbnail': 're:^https?://.*\.image', + 'thumbnail': r're:^https?://.*\.image', 'view_count': int, }, - 'params': { - # m3u8 download - 'skip_download': True, - } }, { 'url': 'http://www.rts.ch/emissions/passe-moi-les-jumelles/5624067-entre-ciel-et-mer.html', - 'md5': 'f1077ac5af686c76528dc8d7c5df29ba', 'info_dict': { - 'id': '5742494', - 'display_id': '5742494', - 'ext': 'mp4', - 'duration': 3720, - 'title': 'Les yeux dans les cieux - Mon homard au Canada', - 'description': 'md5:d22ee46f5cc5bac0912e5a0c6d44a9f7', - 'uploader': 'Passe-moi les jumelles', - 'upload_date': '20140404', - 'timestamp': 1396635300, - 'thumbnail': 're:^https?://.*\.image', - 'view_count': int, + 'id': '5624065', + 'title': 'Passe-moi les jumelles', }, - 'params': { - # m3u8 download - 'skip_download': True, - } + 'playlist_mincount': 4, }, { 'url': 'http://www.rts.ch/video/sport/hockey/5745975-1-2-kloten-fribourg-5-2-second-but-pour-gotteron-par-kwiatowski.html', - 'md5': 'b4326fecd3eb64a458ba73c73e91299d', 'info_dict': { 'id': '5745975', 'display_id': '1-2-kloten-fribourg-5-2-second-but-pour-gotteron-par-kwiatowski', @@ -77,14 +56,18 @@ class RTSIE(SRGSSRIE): 'uploader': 'Hockey', 'upload_date': '20140403', 'timestamp': 1396556882, - 'thumbnail': 're:^https?://.*\.image', + 'thumbnail': r're:^https?://.*\.image', 'view_count': int, }, + 'params': { + # m3u8 download + 'skip_download': True, + }, 'skip': 'Blocked outside Switzerland', }, { 'url': 'http://www.rts.ch/video/info/journal-continu/5745356-londres-cachee-par-un-epais-smog.html', - 'md5': '9f713382f15322181bb366cc8c3a4ff0', + 'md5': '1bae984fe7b1f78e94abc74e802ed99f', 'info_dict': { 'id': '5745356', 'display_id': 'londres-cachee-par-un-epais-smog', @@ -92,16 +75,12 @@ class RTSIE(SRGSSRIE): 'duration': 33, 'title': 'Londres cachée par un épais smog', 'description': 'Un important voile de smog recouvre Londres depuis mercredi, provoqué par la pollution et du sable du Sahara.', - 'uploader': 'Le Journal en continu', + 'uploader': 'L\'actu en vidéo', 'upload_date': '20140403', 'timestamp': 1396537322, - 'thumbnail': 're:^https?://.*\.image', + 'thumbnail': r're:^https?://.*\.image', 'view_count': int, }, - 'params': { - # m3u8 download - 'skip_download': True, - } }, { 'url': 'http://www.rts.ch/audio/couleur3/programmes/la-belle-video-de-stephane-laurenceau/5706148-urban-hippie-de-damien-krisl-03-04-2014.html', @@ -125,6 +104,10 @@ class RTSIE(SRGSSRIE): 'title': 'Hockey: Davos décroche son 31e titre de champion de Suisse', }, 'playlist_mincount': 5, + }, + { + 'url': 'http://pages.rts.ch/emissions/passe-moi-les-jumelles/5624065-entre-ciel-et-mer.html', + 'only_matching': True, } ] @@ -142,19 +125,32 @@ class RTSIE(SRGSSRIE): # media_id extracted out of URL is not always a real id if 'video' not in all_info and 'audio' not in all_info: - page = self._download_webpage(url, display_id) + entries = [] - # article with videos on rhs - videos = re.findall( - r'<article[^>]+class="content-item"[^>]*>\s*<a[^>]+data-video-urn="urn:([^"]+)"', - page) - if not videos: + for item in all_info.get('items', []): + item_url = item.get('url') + if not item_url: + continue + entries.append(self.url_result(item_url, 'RTS')) + + if not entries: + page, urlh = self._download_webpage_handle(url, display_id) + if re.match(self._VALID_URL, urlh.geturl()).group('id') != media_id: + return self.url_result(urlh.geturl(), 'RTS') + + # article with videos on rhs videos = re.findall( - r'(?s)<iframe[^>]+class="srg-player"[^>]+src="[^"]+urn:([^"]+)"', + r'<article[^>]+class="content-item"[^>]*>\s*<a[^>]+data-video-urn="urn:([^"]+)"', page) - if videos: - entries = [self.url_result('srgssr:%s' % video_urn, 'SRGSSR') for video_urn in videos] - return self.playlist_result(entries, media_id, self._og_search_title(page)) + if not videos: + videos = re.findall( + r'(?s)<iframe[^>]+class="srg-player"[^>]+src="[^"]+urn:([^"]+)"', + page) + if videos: + entries = [self.url_result('srgssr:%s' % video_urn, 'SRGSSR') for video_urn in videos] + + if entries: + return self.playlist_result(entries, media_id, all_info.get('title')) internal_id = self._html_search_regex( r'<(?:video|audio) data-id="([0-9]+)"', page, @@ -168,36 +164,29 @@ class RTSIE(SRGSSRIE): info = all_info['video']['JSONinfo'] if 'video' in all_info else all_info['audio'] - upload_timestamp = parse_iso8601(info.get('broadcast_date')) - duration = info.get('duration') or info.get('cutout') or info.get('cutduration') - if isinstance(duration, compat_str): - duration = parse_duration(duration) - view_count = info.get('plays') - thumbnail = unescapeHTML(info.get('preview_image_url')) + title = info['title'] def extract_bitrate(url): return int_or_none(self._search_regex( r'-([0-9]+)k\.', url, 'bitrate', default=None)) formats = [] - for format_id, format_url in info['streams'].items(): - if format_id == 'hds_sd' and 'hds' in info['streams']: + streams = info.get('streams', {}) + for format_id, format_url in streams.items(): + if format_id == 'hds_sd' and 'hds' in streams: continue - if format_id == 'hls_sd' and 'hls' in info['streams']: + if format_id == 'hls_sd' and 'hls' in streams: continue - if format_url.endswith('.f4m'): - token = self._download_xml( - 'http://tp.srgssr.ch/token/akahd.xml?stream=%s/*' % compat_urllib_parse_urlparse(format_url).path, - media_id, 'Downloading %s token' % format_id) - auth_params = xpath_text(token, './/authparams', 'auth params') - if not auth_params: - continue - formats.extend(self._extract_f4m_formats( - '%s?%s&hdcore=3.4.0&plugin=aasp-3.4.0.132.66' % (format_url, auth_params), - media_id, f4m_id=format_id, fatal=False)) - elif format_url.endswith('.m3u8'): - formats.extend(self._extract_m3u8_formats( - format_url, media_id, 'mp4', 'm3u8_native', m3u8_id=format_id, fatal=False)) + ext = determine_ext(format_url) + if ext in ('m3u8', 'f4m'): + format_url = self._get_tokenized_src(format_url, media_id, format_id) + if ext == 'f4m': + formats.extend(self._extract_f4m_formats( + format_url + ('?' if '?' not in format_url else '&') + 'hdcore=3.4.0', + media_id, f4m_id=format_id, fatal=False)) + else: + formats.extend(self._extract_m3u8_formats( + format_url, media_id, 'mp4', 'm3u8_native', m3u8_id=format_id, fatal=False)) else: formats.append({ 'format_id': format_id, @@ -205,25 +194,37 @@ class RTSIE(SRGSSRIE): 'tbr': extract_bitrate(format_url), }) - if 'media' in info: - formats.extend([{ - 'format_id': '%s-%sk' % (media['ext'], media['rate']), - 'url': 'http://download-video.rts.ch/%s' % media['url'], - 'tbr': media['rate'] or extract_bitrate(media['url']), - } for media in info['media'] if media.get('rate')]) + for media in info.get('media', []): + media_url = media.get('url') + if not media_url or re.match(r'https?://', media_url): + continue + rate = media.get('rate') + ext = media.get('ext') or determine_ext(media_url, 'mp4') + format_id = ext + if rate: + format_id += '-%dk' % rate + formats.append({ + 'format_id': format_id, + 'url': 'http://download-video.rts.ch/' + media_url, + 'tbr': rate or extract_bitrate(media_url), + }) self._check_formats(formats, media_id) self._sort_formats(formats) + duration = info.get('duration') or info.get('cutout') or info.get('cutduration') + if isinstance(duration, compat_str): + duration = parse_duration(duration) + return { 'id': media_id, 'display_id': display_id, 'formats': formats, - 'title': info['title'], + 'title': title, 'description': info.get('intro'), 'duration': duration, - 'view_count': view_count, + 'view_count': int_or_none(info.get('plays')), 'uploader': info.get('programName'), - 'timestamp': upload_timestamp, - 'thumbnail': thumbnail, + 'timestamp': parse_iso8601(info.get('broadcast_date')), + 'thumbnail': unescapeHTML(info.get('preview_image_url')), } diff --git a/youtube_dl/extractor/rtve.py b/youtube_dl/extractor/rtve.py index 34f9c4a99..746677a24 100644 --- a/youtube_dl/extractor/rtve.py +++ b/youtube_dl/extractor/rtve.py @@ -1,4 +1,4 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals import base64 @@ -64,7 +64,7 @@ def _decrypt_url(png): class RTVEALaCartaIE(InfoExtractor): IE_NAME = 'rtve.es:alacarta' IE_DESC = 'RTVE a la carta' - _VALID_URL = r'https?://www\.rtve\.es/(m/)?(alacarta/videos|filmoteca)/[^/]+/[^/]+/(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?rtve\.es/(m/)?(alacarta/videos|filmoteca)/[^/]+/[^/]+/(?P<id>\d+)' _TESTS = [{ 'url': 'http://www.rtve.es/alacarta/videos/balonmano/o-swiss-cup-masculina-final-espana-suecia/2491869/', @@ -184,7 +184,7 @@ class RTVEInfantilIE(InfoExtractor): class RTVELiveIE(InfoExtractor): IE_NAME = 'rtve.es:live' IE_DESC = 'RTVE.es live streams' - _VALID_URL = r'https?://www\.rtve\.es/directo/(?P<id>[a-zA-Z0-9-]+)' + _VALID_URL = r'https?://(?:www\.)?rtve\.es/directo/(?P<id>[a-zA-Z0-9-]+)' _TESTS = [{ 'url': 'http://www.rtve.es/directo/la-1/', @@ -209,7 +209,10 @@ class RTVELiveIE(InfoExtractor): title += ' ' + time.strftime('%Y-%m-%dZ%H%M%S', start_time) vidplayer_id = self._search_regex( - r'playerId=player([0-9]+)', webpage, 'internal video ID') + (r'playerId=player([0-9]+)', + r'class=["\'].*?\blive_mod\b.*?["\'][^>]+data-assetid=["\'](\d+)', + r'data-id=["\'](\d+)'), + webpage, 'internal video ID') png_url = 'http://www.rtve.es/ztnr/movil/thumbnail/amonet/videos/%s.png' % vidplayer_id png = self._download_webpage(png_url, video_id, 'Downloading url information') m3u8_url = _decrypt_url(png) @@ -226,7 +229,7 @@ class RTVELiveIE(InfoExtractor): class RTVETelevisionIE(InfoExtractor): IE_NAME = 'rtve.es:television' - _VALID_URL = r'https?://www\.rtve\.es/television/[^/]+/[^/]+/(?P<id>\d+).shtml' + _VALID_URL = r'https?://(?:www\.)?rtve\.es/television/[^/]+/[^/]+/(?P<id>\d+).shtml' _TEST = { 'url': 'http://www.rtve.es/television/20160628/revolucion-del-movil/1364141.shtml', diff --git a/youtube_dl/extractor/rtvnh.py b/youtube_dl/extractor/rtvnh.py index f6454c6b0..6a00f7007 100644 --- a/youtube_dl/extractor/rtvnh.py +++ b/youtube_dl/extractor/rtvnh.py @@ -14,7 +14,7 @@ class RTVNHIE(InfoExtractor): 'id': '131946', 'ext': 'mp4', 'title': 'Grote zoektocht in zee bij Zandvoort naar vermiste vrouw', - 'thumbnail': 're:^https?:.*\.jpg$' + 'thumbnail': r're:^https?:.*\.jpg$' } } diff --git a/youtube_dl/extractor/rudo.py b/youtube_dl/extractor/rudo.py index 38366b784..3bfe934d8 100644 --- a/youtube_dl/extractor/rudo.py +++ b/youtube_dl/extractor/rudo.py @@ -28,7 +28,7 @@ class RudoIE(JWPlatformBaseIE): @classmethod def _extract_url(self, webpage): mobj = re.search( - '<iframe[^>]+src=(?P<q1>[\'"])(?P<url>(?:https?:)?//rudo\.video/vod/[0-9a-zA-Z]+)(?P=q1)', + r'<iframe[^>]+src=(?P<q1>[\'"])(?P<url>(?:https?:)?//rudo\.video/vod/[0-9a-zA-Z]+)(?P=q1)', webpage) if mobj: return mobj.group('url') @@ -43,7 +43,7 @@ class RudoIE(JWPlatformBaseIE): transform_source=lambda s: js_to_json(re.sub(r'encodeURI\([^)]+\)', '""', s))) info_dict = self._parse_jwplayer_data( - jwplayer_data, video_id, require_title=False, m3u8_id='hls') + jwplayer_data, video_id, require_title=False, m3u8_id='hls', mpd_id='dash') info_dict.update({ 'title': self._og_search_title(webpage), diff --git a/youtube_dl/extractor/ruhd.py b/youtube_dl/extractor/ruhd.py index 1f7c26299..2b830cf47 100644 --- a/youtube_dl/extractor/ruhd.py +++ b/youtube_dl/extractor/ruhd.py @@ -1,4 +1,4 @@ -# -*- coding: utf-8 -*- +# coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor @@ -14,7 +14,7 @@ class RUHDIE(InfoExtractor): 'ext': 'divx', 'title': 'КОТ бааааам', 'description': 'классный кот)', - 'thumbnail': 're:^http://.*\.jpg$', + 'thumbnail': r're:^http://.*\.jpg$', } } diff --git a/youtube_dl/extractor/rutube.py b/youtube_dl/extractor/rutube.py index 9ca4ae147..fd1df925b 100644 --- a/youtube_dl/extractor/rutube.py +++ b/youtube_dl/extractor/rutube.py @@ -1,4 +1,4 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals import re @@ -88,7 +88,7 @@ class RutubeIE(InfoExtractor): class RutubeEmbedIE(InfoExtractor): IE_NAME = 'rutube:embed' IE_DESC = 'Rutube embedded videos' - _VALID_URL = 'https?://rutube\.ru/(?:video|play)/embed/(?P<id>[0-9]+)' + _VALID_URL = r'https?://rutube\.ru/(?:video|play)/embed/(?P<id>[0-9]+)' _TESTS = [{ 'url': 'http://rutube.ru/video/embed/6722881?vk_puid37=&vk_puid38=', diff --git a/youtube_dl/extractor/rutv.py b/youtube_dl/extractor/rutv.py index a2379eb04..a5e672c0a 100644 --- a/youtube_dl/extractor/rutv.py +++ b/youtube_dl/extractor/rutv.py @@ -1,4 +1,4 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals import re diff --git a/youtube_dl/extractor/ruutu.py b/youtube_dl/extractor/ruutu.py index ffea438cc..20d01754a 100644 --- a/youtube_dl/extractor/ruutu.py +++ b/youtube_dl/extractor/ruutu.py @@ -5,6 +5,7 @@ from .common import InfoExtractor from ..compat import compat_urllib_parse_urlparse from ..utils import ( determine_ext, + ExtractorError, int_or_none, xpath_attr, xpath_text, @@ -12,7 +13,7 @@ from ..utils import ( class RuutuIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?ruutu\.fi/video/(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?(?:ruutu|supla)\.fi/(?:video|supla)/(?P<id>\d+)' _TESTS = [ { 'url': 'http://www.ruutu.fi/video/2058907', @@ -22,7 +23,7 @@ class RuutuIE(InfoExtractor): 'ext': 'mp4', 'title': 'Oletko aina halunnut tietää mitä tapahtuu vain hetki ennen lähetystä? - Nyt se selvisi!', 'description': 'md5:cfc6ccf0e57a814360df464a91ff67d6', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 114, 'age_limit': 0, }, @@ -34,12 +35,24 @@ class RuutuIE(InfoExtractor): 'id': '2057306', 'ext': 'mp4', 'title': 'Superpesis: katso koko kausi Ruudussa', - 'description': 'md5:da2736052fef3b2bd5e0005e63c25eac', - 'thumbnail': 're:^https?://.*\.jpg$', + 'description': 'md5:bfb7336df2a12dc21d18fa696c9f8f23', + 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 40, 'age_limit': 0, }, }, + { + 'url': 'http://www.supla.fi/supla/2231370', + 'md5': 'df14e782d49a2c0df03d3be2a54ef949', + 'info_dict': { + 'id': '2231370', + 'ext': 'mp4', + 'title': 'Osa 1: Mikael Jungner', + 'description': 'md5:7d90f358c47542e3072ff65d7b1bcffe', + 'thumbnail': r're:^https?://.*\.jpg$', + 'age_limit': 0, + }, + }, ] def _real_extract(self, url): @@ -68,6 +81,9 @@ class RuutuIE(InfoExtractor): elif ext == 'f4m': formats.extend(self._extract_f4m_formats( video_url, video_id, f4m_id='hds', fatal=False)) + elif ext == 'mpd': + formats.extend(self._extract_mpd_formats( + video_url, video_id, mpd_id='dash', fatal=False)) else: proto = compat_urllib_parse_urlparse(video_url).scheme if not child.tag.startswith('HTTP') and proto != 'rtmp': @@ -89,6 +105,11 @@ class RuutuIE(InfoExtractor): }) extract_formats(video_xml.find('./Clip')) + + drm = xpath_text(video_xml, './Clip/DRM', default=None) + if not formats and drm: + raise ExtractorError('This video is DRM protected.', expected=True) + self._sort_formats(formats) return { diff --git a/youtube_dl/extractor/safari.py b/youtube_dl/extractor/safari.py index 08ddbe3c4..c3aec1edd 100644 --- a/youtube_dl/extractor/safari.py +++ b/youtube_dl/extractor/safari.py @@ -1,4 +1,4 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals import re @@ -103,13 +103,13 @@ class SafariIE(SafariBaseIE): webpage = self._download_webpage(url, video_id) reference_id = self._search_regex( - r'data-reference-id=(["\'])(?P<id>.+?)\1', + r'data-reference-id=(["\'])(?P<id>(?:(?!\1).)+)\1', webpage, 'kaltura reference id', group='id') partner_id = self._search_regex( - r'data-partner-id=(["\'])(?P<id>.+?)\1', + r'data-partner-id=(["\'])(?P<id>(?:(?!\1).)+)\1', webpage, 'kaltura widget id', group='id') ui_id = self._search_regex( - r'data-ui-id=(["\'])(?P<id>.+?)\1', + r'data-ui-id=(["\'])(?P<id>(?:(?!\1).)+)\1', webpage, 'kaltura uiconf id', group='id') query = { @@ -157,7 +157,14 @@ class SafariCourseIE(SafariBaseIE): IE_NAME = 'safari:course' IE_DESC = 'safaribooksonline.com online courses' - _VALID_URL = r'https?://(?:www\.)?safaribooksonline\.com/(?:library/view/[^/]+|api/v1/book)/(?P<id>[^/]+)/?(?:[#?]|$)' + _VALID_URL = r'''(?x) + https?:// + (?: + (?:www\.)?safaribooksonline\.com/(?:library/view/[^/]+|api/v1/book)| + techbus\.safaribooksonline\.com + ) + /(?P<id>[^/]+)/?(?:[#?]|$) + ''' _TESTS = [{ 'url': 'https://www.safaribooksonline.com/library/view/hadoop-fundamentals-livelessons/9780133392838/', @@ -170,6 +177,9 @@ class SafariCourseIE(SafariBaseIE): }, { 'url': 'https://www.safaribooksonline.com/api/v1/book/9781449396459/?override_format=json', 'only_matching': True, + }, { + 'url': 'http://techbus.safaribooksonline.com/9780134426365', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/sapo.py b/youtube_dl/extractor/sapo.py index 172cc1275..49a9b313a 100644 --- a/youtube_dl/extractor/sapo.py +++ b/youtube_dl/extractor/sapo.py @@ -1,4 +1,4 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals import re diff --git a/youtube_dl/extractor/savefrom.py b/youtube_dl/extractor/savefrom.py index 5b7367b94..30f9cf824 100644 --- a/youtube_dl/extractor/savefrom.py +++ b/youtube_dl/extractor/savefrom.py @@ -20,7 +20,7 @@ class SaveFromIE(InfoExtractor): 'upload_date': '20120816', 'uploader': 'Howcast', 'uploader_id': 'Howcast', - 'description': 're:(?s).* Hi, my name is Rene Dreifuss\. And I\'m here to show you some MMA.*', + 'description': r're:(?s).* Hi, my name is Rene Dreifuss\. And I\'m here to show you some MMA.*', }, 'params': { 'skip_download': True diff --git a/youtube_dl/extractor/sbs.py b/youtube_dl/extractor/sbs.py index 96472fbc4..845712a76 100644 --- a/youtube_dl/extractor/sbs.py +++ b/youtube_dl/extractor/sbs.py @@ -1,4 +1,4 @@ -# -*- coding: utf-8 -*- +# coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor @@ -22,7 +22,7 @@ class SBSIE(InfoExtractor): 'ext': 'mp4', 'title': 'Dingo Conservation (The Feed)', 'description': 'md5:f250a9856fca50d22dec0b5b8015f8a5', - 'thumbnail': 're:http://.*\.jpg', + 'thumbnail': r're:http://.*\.jpg', 'duration': 308, 'timestamp': 1408613220, 'upload_date': '20140821', diff --git a/youtube_dl/extractor/screencast.py b/youtube_dl/extractor/screencast.py index 356631700..62a6a8337 100644 --- a/youtube_dl/extractor/screencast.py +++ b/youtube_dl/extractor/screencast.py @@ -1,4 +1,4 @@ -# -*- coding: utf-8 -*- +# coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor @@ -21,7 +21,7 @@ class ScreencastIE(InfoExtractor): 'ext': 'm4v', 'title': 'Color Measurement with Ocean Optics Spectrometers', 'description': 'md5:240369cde69d8bed61349a199c5fb153', - 'thumbnail': 're:^https?://.*\.(?:gif|jpg)$', + 'thumbnail': r're:^https?://.*\.(?:gif|jpg)$', } }, { 'url': 'http://www.screencast.com/t/V2uXehPJa1ZI', @@ -31,7 +31,7 @@ class ScreencastIE(InfoExtractor): 'ext': 'mov', 'title': 'The Amadeus Spectrometer', 'description': 're:^In this video, our friends at.*To learn more about Amadeus, visit', - 'thumbnail': 're:^https?://.*\.(?:gif|jpg)$', + 'thumbnail': r're:^https?://.*\.(?:gif|jpg)$', } }, { 'url': 'http://www.screencast.com/t/aAB3iowa', @@ -41,7 +41,7 @@ class ScreencastIE(InfoExtractor): 'ext': 'mp4', 'title': 'Google Earth Export', 'description': 'Provides a demo of a CommunityViz export to Google Earth, one of the 3D viewing options.', - 'thumbnail': 're:^https?://.*\.(?:gif|jpg)$', + 'thumbnail': r're:^https?://.*\.(?:gif|jpg)$', } }, { 'url': 'http://www.screencast.com/t/X3ddTrYh', @@ -51,7 +51,7 @@ class ScreencastIE(InfoExtractor): 'ext': 'wmv', 'title': 'Toolkit 6 User Group Webinar (2014-03-04) - Default Judgment and First Impression', 'description': 'md5:7b9f393bc92af02326a5c5889639eab0', - 'thumbnail': 're:^https?://.*\.(?:gif|jpg)$', + 'thumbnail': r're:^https?://.*\.(?:gif|jpg)$', } }, { 'url': 'http://screencast.com/t/aAB3iowa', diff --git a/youtube_dl/extractor/screencastomatic.py b/youtube_dl/extractor/screencastomatic.py index 7a88a42cd..94a2a37d2 100644 --- a/youtube_dl/extractor/screencastomatic.py +++ b/youtube_dl/extractor/screencastomatic.py @@ -14,7 +14,7 @@ class ScreencastOMaticIE(JWPlatformBaseIE): 'id': 'c2lD3BeOPl', 'ext': 'mp4', 'title': 'Welcome to 3-4 Philosophy @ DECV!', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'description': 'as the title says! also: some general info re 1) VCE philosophy and 2) distance learning.', 'duration': 369.163, } diff --git a/youtube_dl/extractor/screenjunkies.py b/youtube_dl/extractor/screenjunkies.py deleted file mode 100644 index dd0a6ba19..000000000 --- a/youtube_dl/extractor/screenjunkies.py +++ /dev/null @@ -1,138 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import compat_str -from ..utils import ( - int_or_none, - parse_age_limit, -) - - -class ScreenJunkiesIE(InfoExtractor): - _VALID_URL = r'https?://www.screenjunkies.com/video/(?P<display_id>[^/]+?)(?:-(?P<id>\d+))?(?:[/?#&]|$)' - _TESTS = [{ - 'url': 'http://www.screenjunkies.com/video/best-quentin-tarantino-movie-2841915', - 'md5': '5c2b686bec3d43de42bde9ec047536b0', - 'info_dict': { - 'id': '2841915', - 'display_id': 'best-quentin-tarantino-movie', - 'ext': 'mp4', - 'title': 'Best Quentin Tarantino Movie', - 'thumbnail': 're:^https?://.*\.jpg', - 'duration': 3671, - 'age_limit': 13, - 'tags': list, - }, - }, { - 'url': 'http://www.screenjunkies.com/video/honest-trailers-the-dark-knight', - 'info_dict': { - 'id': '2348808', - 'display_id': 'honest-trailers-the-dark-knight', - 'ext': 'mp4', - 'title': "Honest Trailers: 'The Dark Knight'", - 'thumbnail': 're:^https?://.*\.jpg', - 'age_limit': 10, - 'tags': list, - }, - }, { - # requires subscription but worked around - 'url': 'http://www.screenjunkies.com/video/knocking-dead-ep-1-the-show-so-far-3003285', - 'info_dict': { - 'id': '3003285', - 'display_id': 'knocking-dead-ep-1-the-show-so-far', - 'ext': 'mp4', - 'title': 'Knocking Dead Ep 1: State of The Dead Recap', - 'thumbnail': 're:^https?://.*\.jpg', - 'duration': 3307, - 'age_limit': 13, - 'tags': list, - }, - }] - - _DEFAULT_BITRATES = (48, 150, 496, 864, 2240) - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - display_id = mobj.group('display_id') - - if not video_id: - webpage = self._download_webpage(url, display_id) - video_id = self._search_regex( - (r'src=["\']/embed/(\d+)', r'data-video-content-id=["\'](\d+)'), - webpage, 'video id') - - webpage = self._download_webpage( - 'http://www.screenjunkies.com/embed/%s' % video_id, - display_id, 'Downloading video embed page') - embed_vars = self._parse_json( - self._search_regex( - r'(?s)embedVars\s*=\s*({.+?})\s*</script>', webpage, 'embed vars'), - display_id) - - title = embed_vars['contentName'] - - formats = [] - bitrates = [] - for f in embed_vars.get('media', []): - if not f.get('uri') or f.get('mediaPurpose') != 'play': - continue - bitrate = int_or_none(f.get('bitRate')) - if bitrate: - bitrates.append(bitrate) - formats.append({ - 'url': f['uri'], - 'format_id': 'http-%d' % bitrate if bitrate else 'http', - 'width': int_or_none(f.get('width')), - 'height': int_or_none(f.get('height')), - 'tbr': bitrate, - 'format': 'mp4', - }) - - if not bitrates: - # When subscriptionLevel > 0, i.e. plus subscription is required - # media list will be empty. However, hds and hls uris are still - # available. We can grab them assuming bitrates to be default. - bitrates = self._DEFAULT_BITRATES - - auth_token = embed_vars.get('AuthToken') - - def construct_manifest_url(base_url, ext): - pieces = [base_url] - pieces.extend([compat_str(b) for b in bitrates]) - pieces.append('_kbps.mp4.%s?%s' % (ext, auth_token)) - return ','.join(pieces) - - if bitrates and auth_token: - hds_url = embed_vars.get('hdsUri') - if hds_url: - f4m_formats = self._extract_f4m_formats( - construct_manifest_url(hds_url, 'f4m'), - display_id, f4m_id='hds', fatal=False) - if len(f4m_formats) == len(bitrates): - for f, bitrate in zip(f4m_formats, bitrates): - if not f.get('tbr'): - f['format_id'] = 'hds-%d' % bitrate - f['tbr'] = bitrate - # TODO: fix f4m downloader to handle manifests without bitrates if possible - # formats.extend(f4m_formats) - - hls_url = embed_vars.get('hlsUri') - if hls_url: - formats.extend(self._extract_m3u8_formats( - construct_manifest_url(hls_url, 'm3u8'), - display_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) - self._sort_formats(formats) - - return { - 'id': video_id, - 'display_id': display_id, - 'title': title, - 'thumbnail': embed_vars.get('thumbUri'), - 'duration': int_or_none(embed_vars.get('videoLengthInSeconds')) or None, - 'age_limit': parse_age_limit(embed_vars.get('audienceRating')), - 'tags': embed_vars.get('tags', '').split(','), - 'formats': formats, - } diff --git a/youtube_dl/extractor/screenwavemedia.py b/youtube_dl/extractor/screenwavemedia.py deleted file mode 100644 index 40333c825..000000000 --- a/youtube_dl/extractor/screenwavemedia.py +++ /dev/null @@ -1,146 +0,0 @@ -# encoding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - int_or_none, - unified_strdate, - js_to_json, -) - - -class ScreenwaveMediaIE(InfoExtractor): - _VALID_URL = r'(?:https?:)?//player\d?\.screenwavemedia\.com/(?:play/)?[a-zA-Z]+\.php\?.*\bid=(?P<id>[A-Za-z0-9-]+)' - EMBED_PATTERN = r'src=(["\'])(?P<url>(?:https?:)?//player\d?\.screenwavemedia\.com/(?:play/)?[a-zA-Z]+\.php\?.*\bid=.+?)\1' - _TESTS = [{ - 'url': 'http://player.screenwavemedia.com/play/play.php?playerdiv=videoarea&companiondiv=squareAd&id=Cinemassacre-19911', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - playerdata = self._download_webpage( - 'http://player.screenwavemedia.com/player.php?id=%s' % video_id, - video_id, 'Downloading player webpage') - - vidtitle = self._search_regex( - r'\'vidtitle\'\s*:\s*"([^"]+)"', playerdata, 'vidtitle').replace('\\/', '/') - - playerconfig = self._download_webpage( - 'http://player.screenwavemedia.com/player.js', - video_id, 'Downloading playerconfig webpage') - - videoserver = self._search_regex(r'SWMServer\s*=\s*"([\d\.]+)"', playerdata, 'videoserver') - - sources = self._parse_json( - js_to_json( - re.sub( - r'(?s)/\*.*?\*/', '', - self._search_regex( - r'sources\s*:\s*(\[[^\]]+?\])', playerconfig, - 'sources', - ).replace( - "' + thisObj.options.videoserver + '", - videoserver - ).replace( - "' + playerVidId + '", - video_id - ) - ) - ), - video_id, fatal=False - ) - - # Fallback to hardcoded sources if JS changes again - if not sources: - self.report_warning('Falling back to a hardcoded list of streams') - sources = [{ - 'file': 'http://%s/vod/%s_%s.mp4' % (videoserver, video_id, format_id), - 'type': 'mp4', - 'label': format_label, - } for format_id, format_label in ( - ('low', '144p Low'), ('med', '160p Med'), ('high', '360p High'), ('hd1', '720p HD1'))] - sources.append({ - 'file': 'http://%s/vod/smil:%s.smil/playlist.m3u8' % (videoserver, video_id), - 'type': 'hls', - }) - - formats = [] - for source in sources: - file_ = source.get('file') - if not file_: - continue - if source.get('type') == 'hls': - formats.extend(self._extract_m3u8_formats(file_, video_id, ext='mp4')) - else: - format_id = self._search_regex( - r'_(.+?)\.[^.]+$', file_, 'format id', default=None) - if not self._is_valid_url(file_, video_id, format_id or 'video'): - continue - format_label = source.get('label') - height = int_or_none(self._search_regex( - r'^(\d+)[pP]', format_label, 'height', default=None)) - formats.append({ - 'url': file_, - 'format_id': format_id, - 'format': format_label, - 'ext': source.get('type'), - 'height': height, - }) - self._sort_formats(formats, field_preference=('height', 'width', 'tbr', 'format_id')) - - return { - 'id': video_id, - 'title': vidtitle, - 'formats': formats, - } - - -class TeamFourIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?teamfourstar\.com/video/(?P<id>[a-z0-9\-]+)/?' - _TEST = { - 'url': 'http://teamfourstar.com/video/a-moment-with-tfs-episode-4/', - 'info_dict': { - 'id': 'TeamFourStar-5292a02f20bfa', - 'ext': 'mp4', - 'upload_date': '20130401', - 'description': 'Check out this and more on our website: http://teamfourstar.com\nTFS Store: http://sharkrobot.com/team-four-star\nFollow on Twitter: http://twitter.com/teamfourstar\nLike on FB: http://facebook.com/teamfourstar', - 'title': 'A Moment With TFS Episode 4', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - } - - def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - - playerdata_url = self._search_regex( - r'src="(http://player\d?\.screenwavemedia\.com/(?:play/)?[a-zA-Z]+\.php\?[^"]*\bid=.+?)"', - webpage, 'player data URL') - - video_title = self._html_search_regex( - r'<div class="heroheadingtitle">(?P<title>.+?)</div>', - webpage, 'title') - video_date = unified_strdate(self._html_search_regex( - r'<div class="heroheadingdate">(?P<date>.+?)</div>', - webpage, 'date', fatal=False)) - video_description = self._html_search_regex( - r'(?s)<div class="postcontent">(?P<description>.+?)</div>', - webpage, 'description', fatal=False) - video_thumbnail = self._og_search_thumbnail(webpage) - - return { - '_type': 'url_transparent', - 'display_id': display_id, - 'title': video_title, - 'description': video_description, - 'upload_date': video_date, - 'thumbnail': video_thumbnail, - 'url': playerdata_url, - } diff --git a/youtube_dl/extractor/senateisvp.py b/youtube_dl/extractor/senateisvp.py index c5f474dd1..387a4f7f6 100644 --- a/youtube_dl/extractor/senateisvp.py +++ b/youtube_dl/extractor/senateisvp.py @@ -48,14 +48,14 @@ class SenateISVPIE(InfoExtractor): ['arch', '', 'http://ussenate-f.akamaihd.net/'] ] _IE_NAME = 'senate.gov' - _VALID_URL = r'https?://www\.senate\.gov/isvp/?\?(?P<qs>.+)' + _VALID_URL = r'https?://(?:www\.)?senate\.gov/isvp/?\?(?P<qs>.+)' _TESTS = [{ 'url': 'http://www.senate.gov/isvp/?comm=judiciary&type=live&stt=&filename=judiciary031715&auto_play=false&wmode=transparent&poster=http%3A%2F%2Fwww.judiciary.senate.gov%2Fthemes%2Fjudiciary%2Fimages%2Fvideo-poster-flash-fit.png', 'info_dict': { 'id': 'judiciary031715', 'ext': 'mp4', 'title': 'Integrated Senate Video Player', - 'thumbnail': 're:^https?://.*\.(?:jpg|png)$', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)$', }, 'params': { # m3u8 download diff --git a/youtube_dl/extractor/sendtonews.py b/youtube_dl/extractor/sendtonews.py index 1c636f672..9880a5a78 100644 --- a/youtube_dl/extractor/sendtonews.py +++ b/youtube_dl/extractor/sendtonews.py @@ -4,33 +4,46 @@ from __future__ import unicode_literals import re from .jwplatform import JWPlatformBaseIE -from ..compat import compat_parse_qs from ..utils import ( - ExtractorError, - parse_duration, + float_or_none, + parse_iso8601, + update_url_query, + int_or_none, + determine_protocol, + unescapeHTML, ) class SendtoNewsIE(JWPlatformBaseIE): - _VALID_URL = r'https?://embed\.sendtonews\.com/player/embed\.php\?(?P<query>[^#]+)' + _VALID_URL = r'https?://embed\.sendtonews\.com/player2/embedplayer\.php\?.*\bSC=(?P<id>[0-9A-Za-z-]+)' _TEST = { # From http://cleveland.cbslocal.com/2016/05/16/indians-score-season-high-15-runs-in-blowout-win-over-reds-rapid-reaction/ - 'url': 'http://embed.sendtonews.com/player/embed.php?SK=GxfCe0Zo7D&MK=175909&PK=5588&autoplay=on&sound=yes', + 'url': 'http://embed.sendtonews.com/player2/embedplayer.php?SC=GxfCe0Zo7D-175909-5588&type=single&autoplay=on&sound=YES', 'info_dict': { - 'id': 'GxfCe0Zo7D-175909-5588', - 'ext': 'mp4', - 'title': 'Recap: CLE 15, CIN 6', - 'description': '5/16/16: Indians\' bats explode for 15 runs in a win', - 'duration': 49, + 'id': 'GxfCe0Zo7D-175909-5588' }, + 'playlist_count': 8, + # test the first video only to prevent lengthy tests + 'playlist': [{ + 'info_dict': { + 'id': '240385', + 'ext': 'mp4', + 'title': 'Indians introduce Encarnacion', + 'description': 'Indians president of baseball operations Chris Antonetti and Edwin Encarnacion discuss the slugger\'s three-year contract with Cleveland', + 'duration': 137.898, + 'thumbnail': r're:https?://.*\.jpg$', + 'upload_date': '20170105', + 'timestamp': 1483649762, + }, + }], 'params': { # m3u8 download 'skip_download': True, }, } - _URL_TEMPLATE = '//embed.sendtonews.com/player/embed.php?SK=%s&MK=%s&PK=%s' + _URL_TEMPLATE = '//embed.sendtonews.com/player2/embedplayer.php?SC=%s' @classmethod def _extract_url(cls, webpage): @@ -39,48 +52,54 @@ class SendtoNewsIE(JWPlatformBaseIE): .*\bSC=(?P<SC>[0-9a-zA-Z-]+).* \1>''', webpage) if mobj: - sk, mk, pk = mobj.group('SC').split('-') - return cls._URL_TEMPLATE % (sk, mk, pk) + sc = mobj.group('SC') + return cls._URL_TEMPLATE % sc def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - params = compat_parse_qs(mobj.group('query')) + playlist_id = self._match_id(url) - if 'SK' not in params or 'MK' not in params or 'PK' not in params: - raise ExtractorError('Invalid URL', expected=True) + data_url = update_url_query( + url.replace('embedplayer.php', 'data_read.php'), + {'cmd': 'loadInitial'}) + playlist_data = self._download_json(data_url, playlist_id) - video_id = '-'.join([params['SK'][0], params['MK'][0], params['PK'][0]]) + entries = [] + for video in playlist_data['playlistData'][0]: + info_dict = self._parse_jwplayer_data( + video['jwconfiguration'], + require_title=False, m3u8_id='hls', rtmp_params={'no_resume': True}) - webpage = self._download_webpage(url, video_id) + for f in info_dict['formats']: + if f.get('tbr'): + continue + tbr = int_or_none(self._search_regex( + r'/(\d+)k/', f['url'], 'bitrate', default=None)) + if not tbr: + continue + f.update({ + 'format_id': '%s-%d' % (determine_protocol(f), tbr), + 'tbr': tbr, + }) + self._sort_formats(info_dict['formats'], ('tbr', 'height', 'width', 'format_id')) - jwplayer_data_str = self._search_regex( - r'jwplayer\("[^"]+"\)\.setup\((.+?)\);', webpage, 'JWPlayer data') - js_vars = { - 'w': 1024, - 'h': 768, - 'modeVar': 'html5', - } - for name, val in js_vars.items(): - js_val = '%d' % val if isinstance(val, int) else '"%s"' % val - jwplayer_data_str = jwplayer_data_str.replace(':%s,' % name, ':%s,' % js_val) + thumbnails = [] + if video.get('thumbnailUrl'): + thumbnails.append({ + 'id': 'normal', + 'url': video['thumbnailUrl'], + }) + if video.get('smThumbnailUrl'): + thumbnails.append({ + 'id': 'small', + 'url': video['smThumbnailUrl'], + }) + info_dict.update({ + 'title': video['S_headLine'].strip(), + 'description': unescapeHTML(video.get('S_fullStory')), + 'thumbnails': thumbnails, + 'duration': float_or_none(video.get('SM_length')), + 'timestamp': parse_iso8601(video.get('S_sysDate'), delimiter=' '), + }) + entries.append(info_dict) - info_dict = self._parse_jwplayer_data( - self._parse_json(jwplayer_data_str, video_id), - video_id, require_title=False, rtmp_params={'no_resume': True}) - - title = self._html_search_regex( - r'<div[^>]+class="embedTitle">([^<]+)</div>', webpage, 'title') - description = self._html_search_regex( - r'<div[^>]+class="embedSubTitle">([^<]+)</div>', webpage, - 'description', fatal=False) - duration = parse_duration(self._html_search_regex( - r'<div[^>]+class="embedDetails">([0-9:]+)', webpage, - 'duration', fatal=False)) - - info_dict.update({ - 'title': title, - 'description': description, - 'duration': duration, - }) - - return info_dict + return self.playlist_result(entries, playlist_id) diff --git a/youtube_dl/extractor/sexu.py b/youtube_dl/extractor/sexu.py index a99b2a8e7..5e22ea730 100644 --- a/youtube_dl/extractor/sexu.py +++ b/youtube_dl/extractor/sexu.py @@ -14,7 +14,7 @@ class SexuIE(InfoExtractor): 'title': 'md5:4d05a19a5fc049a63dbbaf05fb71d91b', 'description': 'md5:2b75327061310a3afb3fbd7d09e2e403', 'categories': list, # NSFW - 'thumbnail': 're:https?://.*\.jpg$', + 'thumbnail': r're:https?://.*\.jpg$', 'age_limit': 18, } } diff --git a/youtube_dl/extractor/shahid.py b/youtube_dl/extractor/shahid.py index ca286abb1..62d41e88a 100644 --- a/youtube_dl/extractor/shahid.py +++ b/youtube_dl/extractor/shahid.py @@ -1,17 +1,24 @@ # coding: utf-8 from __future__ import unicode_literals +import re +import json + from .common import InfoExtractor +from ..compat import compat_HTTPError from ..utils import ( ExtractorError, int_or_none, parse_iso8601, str_or_none, + urlencode_postdata, + clean_html, ) class ShahidIE(InfoExtractor): - _VALID_URL = r'https?://shahid\.mbc\.net/ar/episode/(?P<id>\d+)/?' + _NETRC_MACHINE = 'shahid' + _VALID_URL = r'https?://shahid\.mbc\.net/ar/(?P<type>episode|movie)/(?P<id>\d+)' _TESTS = [{ 'url': 'https://shahid.mbc.net/ar/episode/90574/%D8%A7%D9%84%D9%85%D9%84%D9%83-%D8%B9%D8%A8%D8%AF%D8%A7%D9%84%D9%84%D9%87-%D8%A7%D9%84%D8%A5%D9%86%D8%B3%D8%A7%D9%86-%D8%A7%D9%84%D9%85%D9%88%D8%B3%D9%85-1-%D9%83%D9%84%D9%8A%D8%A8-3.html', 'info_dict': { @@ -27,18 +34,54 @@ class ShahidIE(InfoExtractor): # m3u8 download 'skip_download': True, } + }, { + 'url': 'https://shahid.mbc.net/ar/movie/151746/%D8%A7%D9%84%D9%82%D9%86%D8%A7%D8%B5%D8%A9.html', + 'only_matching': True }, { # shahid plus subscriber only 'url': 'https://shahid.mbc.net/ar/episode/90511/%D9%85%D8%B1%D8%A7%D9%8A%D8%A7-2011-%D8%A7%D9%84%D9%85%D9%88%D8%B3%D9%85-1-%D8%A7%D9%84%D8%AD%D9%84%D9%82%D8%A9-1.html', 'only_matching': True }] - def _call_api(self, path, video_id, note): - data = self._download_json( - 'http://api.shahid.net/api/v1_1/' + path, video_id, note, query={ - 'apiKey': 'sh@hid0nlin3', - 'hash': 'b2wMCTHpSmyxGqQjJFOycRmLSex+BpTK/ooxy6vHaqs=', - }).get('data', {}) + def _real_initialize(self): + email, password = self._get_login_info() + if email is None: + return + + try: + user_data = self._download_json( + 'https://shahid.mbc.net/wd/service/users/login', + None, 'Logging in', data=json.dumps({ + 'email': email, + 'password': password, + 'basic': 'false', + }).encode('utf-8'), headers={ + 'Content-Type': 'application/json; charset=UTF-8', + })['user'] + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError): + fail_data = self._parse_json( + e.cause.read().decode('utf-8'), None, fatal=False) + if fail_data: + faults = fail_data.get('faults', []) + faults_message = ', '.join([clean_html(fault['userMessage']) for fault in faults if fault.get('userMessage')]) + if faults_message: + raise ExtractorError(faults_message, expected=True) + raise + + self._download_webpage( + 'https://shahid.mbc.net/populateContext', + None, 'Populate Context', data=urlencode_postdata({ + 'firstName': user_data['firstName'], + 'lastName': user_data['lastName'], + 'userName': user_data['email'], + 'csg_user_name': user_data['email'], + 'subscriberId': user_data['id'], + 'sessionId': user_data['sessionId'], + })) + + def _get_api_data(self, response): + data = response.get('data', {}) error = data.get('error') if error: @@ -49,11 +92,11 @@ class ShahidIE(InfoExtractor): return data def _real_extract(self, url): - video_id = self._match_id(url) + page_type, video_id = re.match(self._VALID_URL, url).groups() - player = self._call_api( - 'Content/Episode/%s' % video_id, - video_id, 'Downloading player JSON') + player = self._get_api_data(self._download_json( + 'https://shahid.mbc.net/arContent/getPlayerContent-param-.id-%s.type-player.html' % video_id, + video_id, 'Downloading player JSON')) if player.get('drm'): raise ExtractorError('This video is DRM protected.', expected=True) @@ -61,9 +104,12 @@ class ShahidIE(InfoExtractor): formats = self._extract_m3u8_formats(player['url'], video_id, 'mp4') self._sort_formats(formats) - video = self._call_api( - 'episode/%s' % video_id, video_id, - 'Downloading video JSON')['episode'] + video = self._get_api_data(self._download_json( + 'http://api.shahid.net/api/v1_1/%s/%s' % (page_type, video_id), + video_id, 'Downloading video JSON', query={ + 'apiKey': 'sh@hid0nlin3', + 'hash': 'b2wMCTHpSmyxGqQjJFOycRmLSex+BpTK/ooxy6vHaqs=', + }))[page_type] title = video['title'] categories = [ diff --git a/youtube_dl/extractor/shared.py b/youtube_dl/extractor/shared.py index d592dfeb8..89e19e927 100644 --- a/youtube_dl/extractor/shared.py +++ b/youtube_dl/extractor/shared.py @@ -10,11 +10,38 @@ from ..utils import ( ) -class SharedIE(InfoExtractor): - IE_DESC = 'shared.sx and vivo.sx' - _VALID_URL = r'https?://(?:shared|vivo)\.sx/(?P<id>[\da-z]{10})' +class SharedBaseIE(InfoExtractor): + def _real_extract(self, url): + video_id = self._match_id(url) - _TESTS = [{ + webpage, urlh = self._download_webpage_handle(url, video_id) + + if self._FILE_NOT_FOUND in webpage: + raise ExtractorError( + 'Video %s does not exist' % video_id, expected=True) + + video_url = self._extract_video_url(webpage, video_id, url) + + title = base64.b64decode(self._html_search_meta( + 'full:title', webpage, 'title').encode('utf-8')).decode('utf-8') + filesize = int_or_none(self._html_search_meta( + 'full:size', webpage, 'file size', fatal=False)) + + return { + 'id': video_id, + 'url': video_url, + 'ext': 'mp4', + 'filesize': filesize, + 'title': title, + } + + +class SharedIE(SharedBaseIE): + IE_DESC = 'shared.sx' + _VALID_URL = r'https?://shared\.sx/(?P<id>[\da-z]{10})' + _FILE_NOT_FOUND = '>File does not exist<' + + _TEST = { 'url': 'http://shared.sx/0060718775', 'md5': '106fefed92a8a2adb8c98e6a0652f49b', 'info_dict': { @@ -23,7 +50,32 @@ class SharedIE(InfoExtractor): 'title': 'Bmp4', 'filesize': 1720110, }, - }, { + } + + def _extract_video_url(self, webpage, video_id, url): + download_form = self._hidden_inputs(webpage) + + video_page = self._download_webpage( + url, video_id, 'Downloading video page', + data=urlencode_postdata(download_form), + headers={ + 'Content-Type': 'application/x-www-form-urlencoded', + 'Referer': url, + }) + + video_url = self._html_search_regex( + r'data-url=(["\'])(?P<url>(?:(?!\1).)+)\1', + video_page, 'video URL', group='url') + + return video_url + + +class VivoIE(SharedBaseIE): + IE_DESC = 'vivo.sx' + _VALID_URL = r'https?://vivo\.sx/(?P<id>[\da-z]{10})' + _FILE_NOT_FOUND = '>The file you have requested does not exists or has been removed' + + _TEST = { 'url': 'http://vivo.sx/d7ddda0e78', 'md5': '15b3af41be0b4fe01f4df075c2678b2c', 'info_dict': { @@ -32,43 +84,13 @@ class SharedIE(InfoExtractor): 'title': 'Chicken', 'filesize': 528031, }, - }] + } - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage, urlh = self._download_webpage_handle(url, video_id) - - if '>File does not exist<' in webpage: - raise ExtractorError( - 'Video %s does not exist' % video_id, expected=True) - - download_form = self._hidden_inputs(webpage) - - video_page = self._download_webpage( - urlh.geturl(), video_id, 'Downloading video page', - data=urlencode_postdata(download_form), - headers={ - 'Content-Type': 'application/x-www-form-urlencoded', - 'Referer': urlh.geturl(), - }) - - video_url = self._html_search_regex( - r'data-url=(["\'])(?P<url>(?:(?!\1).)+)\1', - video_page, 'video URL', group='url') - title = base64.b64decode(self._html_search_meta( - 'full:title', webpage, 'title').encode('utf-8')).decode('utf-8') - filesize = int_or_none(self._html_search_meta( - 'full:size', webpage, 'file size', fatal=False)) - thumbnail = self._html_search_regex( - r'data-poster=(["\'])(?P<url>(?:(?!\1).)+)\1', - video_page, 'thumbnail', default=None, group='url') - - return { - 'id': video_id, - 'url': video_url, - 'ext': 'mp4', - 'filesize': filesize, - 'title': title, - 'thumbnail': thumbnail, - } + def _extract_video_url(self, webpage, video_id, *args): + return self._parse_json( + self._search_regex( + r'InitializeStream\s*\(\s*(["\'])(?P<url>(?:(?!\1).)+)\1', + webpage, 'stream', group='url'), + video_id, + transform_source=lambda x: base64.b64decode( + x.encode('ascii')).decode('utf-8'))[0] diff --git a/youtube_dl/extractor/sharesix.py b/youtube_dl/extractor/sharesix.py deleted file mode 100644 index 9cce5ceb4..000000000 --- a/youtube_dl/extractor/sharesix.py +++ /dev/null @@ -1,91 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - parse_duration, - sanitized_Request, - urlencode_postdata, -) - - -class ShareSixIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?sharesix\.com/(?:f/)?(?P<id>[0-9a-zA-Z]+)' - _TESTS = [ - { - 'url': 'http://sharesix.com/f/OXjQ7Y6', - 'md5': '9e8e95d8823942815a7d7c773110cc93', - 'info_dict': { - 'id': 'OXjQ7Y6', - 'ext': 'mp4', - 'title': 'big_buck_bunny_480p_surround-fix.avi', - 'duration': 596, - 'width': 854, - 'height': 480, - }, - }, - { - 'url': 'http://sharesix.com/lfrwoxp35zdd', - 'md5': 'dd19f1435b7cec2d7912c64beeee8185', - 'info_dict': { - 'id': 'lfrwoxp35zdd', - 'ext': 'flv', - 'title': 'WhiteBoard___a_Mac_vs_PC_Parody_Cartoon.mp4.flv', - 'duration': 65, - 'width': 1280, - 'height': 720, - }, - } - ] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - - fields = { - 'method_free': 'Free' - } - post = urlencode_postdata(fields) - req = sanitized_Request(url, post) - req.add_header('Content-type', 'application/x-www-form-urlencoded') - - webpage = self._download_webpage(req, video_id, - 'Downloading video page') - - video_url = self._search_regex( - r"var\slnk1\s=\s'([^']+)'", webpage, 'video URL') - title = self._html_search_regex( - r'(?s)<dt>Filename:</dt>.+?<dd>(.+?)</dd>', webpage, 'title') - duration = parse_duration( - self._search_regex( - r'(?s)<dt>Length:</dt>.+?<dd>(.+?)</dd>', - webpage, - 'duration', - fatal=False - ) - ) - - m = re.search( - r'''(?xs)<dt>Width\sx\sHeight</dt>.+? - <dd>(?P<width>\d+)\sx\s(?P<height>\d+)</dd>''', - webpage - ) - width = height = None - if m: - width, height = int(m.group('width')), int(m.group('height')) - - formats = [{ - 'format_id': 'sd', - 'url': video_url, - 'width': width, - 'height': height, - }] - - return { - 'id': video_id, - 'title': title, - 'duration': duration, - 'formats': formats, - } diff --git a/youtube_dl/extractor/showroomlive.py b/youtube_dl/extractor/showroomlive.py new file mode 100644 index 000000000..efd9d561f --- /dev/null +++ b/youtube_dl/extractor/showroomlive.py @@ -0,0 +1,84 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + ExtractorError, + int_or_none, + urljoin, +) + + +class ShowRoomLiveIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?showroom-live\.com/(?!onlive|timetable|event|campaign|news|ranking|room)(?P<id>[^/?#&]+)' + _TEST = { + 'url': 'https://www.showroom-live.com/48_Nana_Okada', + 'only_matching': True, + } + + def _real_extract(self, url): + broadcaster_id = self._match_id(url) + + webpage = self._download_webpage(url, broadcaster_id) + + room_id = self._search_regex( + (r'SrGlobal\.roomId\s*=\s*(\d+)', + r'(?:profile|room)\?room_id\=(\d+)'), webpage, 'room_id') + + room = self._download_json( + urljoin(url, '/api/room/profile?room_id=%s' % room_id), + broadcaster_id) + + is_live = room.get('is_onlive') + if is_live is not True: + raise ExtractorError('%s is offline' % broadcaster_id, expected=True) + + uploader = room.get('performer_name') or broadcaster_id + title = room.get('room_name') or room.get('main_name') or uploader + + streaming_url_list = self._download_json( + urljoin(url, '/api/live/streaming_url?room_id=%s' % room_id), + broadcaster_id)['streaming_url_list'] + + formats = [] + for stream in streaming_url_list: + stream_url = stream.get('url') + if not stream_url: + continue + stream_type = stream.get('type') + if stream_type == 'hls': + m3u8_formats = self._extract_m3u8_formats( + stream_url, broadcaster_id, ext='mp4', m3u8_id='hls', + live=True) + for f in m3u8_formats: + f['quality'] = int_or_none(stream.get('quality', 100)) + formats.extend(m3u8_formats) + elif stream_type == 'rtmp': + stream_name = stream.get('stream_name') + if not stream_name: + continue + formats.append({ + 'url': stream_url, + 'play_path': stream_name, + 'page_url': url, + 'player_url': 'https://www.showroom-live.com/assets/swf/v3/ShowRoomLive.swf', + 'rtmp_live': True, + 'ext': 'flv', + 'format_id': 'rtmp', + 'format_note': stream.get('label'), + 'quality': int_or_none(stream.get('quality', 100)), + }) + self._sort_formats(formats) + + return { + 'id': compat_str(room.get('live_id') or broadcaster_id), + 'title': self._live_title(title), + 'description': room.get('description'), + 'timestamp': int_or_none(room.get('current_live_started_at')), + 'uploader': uploader, + 'uploader_id': broadcaster_id, + 'view_count': int_or_none(room.get('view_num')), + 'formats': formats, + 'is_live': True, + } diff --git a/youtube_dl/extractor/skysports.py b/youtube_dl/extractor/skysports.py index 9dc78c7d2..4ca9f6b3c 100644 --- a/youtube_dl/extractor/skysports.py +++ b/youtube_dl/extractor/skysports.py @@ -2,18 +2,19 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..utils import strip_or_none class SkySportsIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?skysports\.com/watch/video/(?P<id>[0-9]+)' _TEST = { 'url': 'http://www.skysports.com/watch/video/10328419/bale-its-our-time-to-shine', - 'md5': 'c44a1db29f27daf9a0003e010af82100', + 'md5': '77d59166cddc8d3cb7b13e35eaf0f5ec', 'info_dict': { 'id': '10328419', - 'ext': 'flv', - 'title': 'Bale: Its our time to shine', - 'description': 'md5:9fd1de3614d525f5addda32ac3c482c9', + 'ext': 'mp4', + 'title': 'Bale: It\'s our time to shine', + 'description': 'md5:e88bda94ae15f7720c5cb467e777bb6d', }, 'add_ie': ['Ooyala'], } @@ -28,6 +29,6 @@ class SkySportsIE(InfoExtractor): 'url': 'ooyala:%s' % self._search_regex( r'data-video-id="([^"]+)"', webpage, 'ooyala id'), 'title': self._og_search_title(webpage), - 'description': self._og_search_description(webpage), + 'description': strip_or_none(self._og_search_description(webpage)), 'ie_key': 'Ooyala', } diff --git a/youtube_dl/extractor/slideshare.py b/youtube_dl/extractor/slideshare.py index 4967c1b77..74a1dc672 100644 --- a/youtube_dl/extractor/slideshare.py +++ b/youtube_dl/extractor/slideshare.py @@ -14,7 +14,7 @@ from ..utils import ( class SlideshareIE(InfoExtractor): - _VALID_URL = r'https?://www\.slideshare\.net/[^/]+?/(?P<title>.+?)($|\?)' + _VALID_URL = r'https?://(?:www\.)?slideshare\.net/[^/]+?/(?P<title>.+?)($|\?)' _TEST = { 'url': 'http://www.slideshare.net/Dataversity/keynote-presentation-managing-scale-and-complexity', diff --git a/youtube_dl/extractor/slutload.py b/youtube_dl/extractor/slutload.py index 7efb29f65..7145d285a 100644 --- a/youtube_dl/extractor/slutload.py +++ b/youtube_dl/extractor/slutload.py @@ -1,7 +1,5 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor @@ -9,20 +7,18 @@ class SlutloadIE(InfoExtractor): _VALID_URL = r'^https?://(?:\w+\.)?slutload\.com/video/[^/]+/(?P<id>[^/]+)/?$' _TEST = { 'url': 'http://www.slutload.com/video/virginie-baisee-en-cam/TD73btpBqSxc/', - 'md5': '0cf531ae8006b530bd9df947a6a0df77', + 'md5': '868309628ba00fd488cf516a113fd717', 'info_dict': { 'id': 'TD73btpBqSxc', 'ext': 'mp4', 'title': 'virginie baisee en cam', 'age_limit': 18, - 'thumbnail': 're:https?://.*?\.jpg' + 'thumbnail': r're:https?://.*?\.jpg' } } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - + video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) video_title = self._html_search_regex(r'<h1><strong>([^<]+)</strong>', diff --git a/youtube_dl/extractor/smotri.py b/youtube_dl/extractor/smotri.py index 114358786..370fa8879 100644 --- a/youtube_dl/extractor/smotri.py +++ b/youtube_dl/extractor/smotri.py @@ -1,4 +1,4 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals import re @@ -81,7 +81,7 @@ class SmotriIE(InfoExtractor): 'uploader': 'psavari1', 'uploader_id': 'psavari1', 'upload_date': '20081103', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', }, 'params': { 'videopassword': '223322', @@ -117,7 +117,7 @@ class SmotriIE(InfoExtractor): 'uploader': 'вАся', 'uploader_id': 'asya_prosto', 'upload_date': '20081218', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'age_limit': 18, }, 'params': { diff --git a/youtube_dl/extractor/snotr.py b/youtube_dl/extractor/snotr.py index 0d1ab07f8..f77354748 100644 --- a/youtube_dl/extractor/snotr.py +++ b/youtube_dl/extractor/snotr.py @@ -5,9 +5,9 @@ import re from .common import InfoExtractor from ..utils import ( - float_or_none, - str_to_int, parse_duration, + parse_filesize, + str_to_int, ) @@ -17,21 +17,24 @@ class SnotrIE(InfoExtractor): 'url': 'http://www.snotr.com/video/13708/Drone_flying_through_fireworks', 'info_dict': { 'id': '13708', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Drone flying through fireworks!', - 'duration': 247, - 'filesize_approx': 98566144, + 'duration': 248, + 'filesize_approx': 40700000, 'description': 'A drone flying through Fourth of July Fireworks', - } + 'thumbnail': r're:^https?://.*\.jpg$', + }, + 'expected_warnings': ['description'], }, { 'url': 'http://www.snotr.com/video/530/David_Letteman_-_George_W_Bush_Top_10', 'info_dict': { 'id': '530', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'David Letteman - George W. Bush Top 10', 'duration': 126, - 'filesize_approx': 8912896, + 'filesize_approx': 8500000, 'description': 'The top 10 George W. Bush moments, brought to you by David Letterman!', + 'thumbnail': r're:^https?://.*\.jpg$', } }] @@ -43,26 +46,28 @@ class SnotrIE(InfoExtractor): title = self._og_search_title(webpage) description = self._og_search_description(webpage) - video_url = 'http://cdn.videos.snotr.com/%s.flv' % video_id + info_dict = self._parse_html5_media_entries( + url, webpage, video_id, m3u8_entry_protocol='m3u8_native')[0] view_count = str_to_int(self._html_search_regex( - r'<p>\n<strong>Views:</strong>\n([\d,\.]+)</p>', + r'<p[^>]*>\s*<strong[^>]*>Views:</strong>\s*<span[^>]*>([\d,\.]+)', webpage, 'view count', fatal=False)) duration = parse_duration(self._html_search_regex( - r'<p>\n<strong>Length:</strong>\n\s*([0-9:]+).*?</p>', + r'<p[^>]*>\s*<strong[^>]*>Length:</strong>\s*<span[^>]*>([\d:]+)', webpage, 'duration', fatal=False)) - filesize_approx = float_or_none(self._html_search_regex( - r'<p>\n<strong>Filesize:</strong>\n\s*([0-9.]+)\s*megabyte</p>', - webpage, 'filesize', fatal=False), invscale=1024 * 1024) + filesize_approx = parse_filesize(self._html_search_regex( + r'<p[^>]*>\s*<strong[^>]*>Filesize:</strong>\s*<span[^>]*>([^<]+)', + webpage, 'filesize', fatal=False)) - return { + info_dict.update({ 'id': video_id, 'description': description, 'title': title, - 'url': video_url, 'view_count': view_count, 'duration': duration, 'filesize_approx': filesize_approx, - } + }) + + return info_dict diff --git a/youtube_dl/extractor/sohu.py b/youtube_dl/extractor/sohu.py index 72fe66142..30760ca06 100644 --- a/youtube_dl/extractor/sohu.py +++ b/youtube_dl/extractor/sohu.py @@ -1,4 +1,4 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals import re @@ -14,10 +14,10 @@ from ..utils import ExtractorError class SohuIE(InfoExtractor): _VALID_URL = r'https?://(?P<mytv>my\.)?tv\.sohu\.com/.+?/(?(mytv)|n)(?P<id>\d+)\.shtml.*?' + # Sohu videos give different MD5 sums on Travis CI and my machine _TESTS = [{ 'note': 'This video is available only in Mainland China', 'url': 'http://tv.sohu.com/20130724/n382479172.shtml#super', - 'md5': '29175c8cadd8b5cc4055001e85d6b372', 'info_dict': { 'id': '382479172', 'ext': 'mp4', @@ -26,7 +26,6 @@ class SohuIE(InfoExtractor): 'skip': 'On available in China', }, { 'url': 'http://tv.sohu.com/20150305/n409385080.shtml', - 'md5': '699060e75cf58858dd47fb9c03c42cfb', 'info_dict': { 'id': '409385080', 'ext': 'mp4', @@ -34,7 +33,6 @@ class SohuIE(InfoExtractor): } }, { 'url': 'http://my.tv.sohu.com/us/232799889/78693464.shtml', - 'md5': '9bf34be48f2f4dadcb226c74127e203c', 'info_dict': { 'id': '78693464', 'ext': 'mp4', @@ -48,7 +46,6 @@ class SohuIE(InfoExtractor): 'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆', }, 'playlist': [{ - 'md5': 'bdbfb8f39924725e6589c146bc1883ad', 'info_dict': { 'id': '78910339_part1', 'ext': 'mp4', @@ -56,7 +53,6 @@ class SohuIE(InfoExtractor): 'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆', } }, { - 'md5': '3e1f46aaeb95354fd10e7fca9fc1804e', 'info_dict': { 'id': '78910339_part2', 'ext': 'mp4', @@ -64,7 +60,6 @@ class SohuIE(InfoExtractor): 'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆', } }, { - 'md5': '8407e634175fdac706766481b9443450', 'info_dict': { 'id': '78910339_part3', 'ext': 'mp4', diff --git a/youtube_dl/extractor/sonyliv.py b/youtube_dl/extractor/sonyliv.py new file mode 100644 index 000000000..accd112aa --- /dev/null +++ b/youtube_dl/extractor/sonyliv.py @@ -0,0 +1,34 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class SonyLIVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?sonyliv\.com/details/[^/]+/(?P<id>\d+)' + _TESTS = [{ + 'url': "http://www.sonyliv.com/details/episodes/5024612095001/Ep.-1---Achaari-Cheese-Toast---Bachelor's-Delight", + 'info_dict': { + 'title': "Ep. 1 - Achaari Cheese Toast - Bachelor's Delight", + 'id': '5024612095001', + 'ext': 'mp4', + 'upload_date': '20160707', + 'description': 'md5:7f28509a148d5be9d0782b4d5106410d', + 'uploader_id': '4338955589001', + 'timestamp': 1467870968, + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': ['BrightcoveNew'], + }, { + 'url': 'http://www.sonyliv.com/details/full%20movie/4951168986001/Sei-Raat-(Bangla)', + 'only_matching': True, + }] + + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/4338955589001/default_default/index.html?videoId=%s' + + def _real_extract(self, url): + brightcove_id = self._match_id(url) + return self.url_result( + self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, 'BrightcoveNew', brightcove_id) diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index aeae931a2..b3aa4ce26 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -1,4 +1,4 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals import re @@ -32,7 +32,7 @@ class SoundcloudIE(InfoExtractor): _VALID_URL = r'''(?x)^(?:https?://)? (?:(?:(?:www\.|m\.)?soundcloud\.com/ (?P<uploader>[\w\d-]+)/ - (?!(?:tracks|sets(?:/[^/?#]+)?|reposts|likes|spotlight)/?(?:$|[?#])) + (?!(?:tracks|sets(?:/.+?)?|reposts|likes|spotlight)/?(?:$|[?#])) (?P<title>[\w\d-]+)/? (?P<token>[^?]+?)?(?:[?].*)?$) |(?:api\.soundcloud\.com/tracks/(?P<track_id>\d+) @@ -53,6 +53,7 @@ class SoundcloudIE(InfoExtractor): 'uploader': 'E.T. ExTerrestrial Music', 'title': 'Lostin Powers - She so Heavy (SneakPreview) Adrian Ackers Blueprint 1', 'duration': 143, + 'license': 'all-rights-reserved', } }, # not streamable song @@ -66,6 +67,7 @@ class SoundcloudIE(InfoExtractor): 'uploader': 'The Royal Concept', 'upload_date': '20120521', 'duration': 227, + 'license': 'all-rights-reserved', }, 'params': { # rtmp @@ -84,6 +86,7 @@ class SoundcloudIE(InfoExtractor): 'description': 'test chars: \"\'/\\ä↭', 'upload_date': '20131209', 'duration': 9, + 'license': 'all-rights-reserved', }, }, # private link (alt format) @@ -98,6 +101,7 @@ class SoundcloudIE(InfoExtractor): 'description': 'test chars: \"\'/\\ä↭', 'upload_date': '20131209', 'duration': 9, + 'license': 'all-rights-reserved', }, }, # downloadable song @@ -112,11 +116,12 @@ class SoundcloudIE(InfoExtractor): 'uploader': 'oddsamples', 'upload_date': '20140109', 'duration': 17, + 'license': 'cc-by-sa', }, }, ] - _CLIENT_ID = '02gUJC0hH2ct1EGOcYXQIzRFU91c72Ea' + _CLIENT_ID = 'fDoItMDbsbZz8dY16ZzARCZmzgHBPotA' _IPHONE_CLIENT_ID = '376f225bf427445fc4bfb6b99b72e0bf' @staticmethod @@ -138,20 +143,20 @@ class SoundcloudIE(InfoExtractor): name = full_title or track_id if quiet: self.report_extraction(name) - - thumbnail = info['artwork_url'] - if thumbnail is not None: + thumbnail = info.get('artwork_url') + if isinstance(thumbnail, compat_str): thumbnail = thumbnail.replace('-large', '-t500x500') ext = 'mp3' result = { 'id': track_id, - 'uploader': info['user']['username'], - 'upload_date': unified_strdate(info['created_at']), + 'uploader': info.get('user', {}).get('username'), + 'upload_date': unified_strdate(info.get('created_at')), 'title': info['title'], - 'description': info['description'], + 'description': info.get('description'), 'thumbnail': thumbnail, 'duration': int_or_none(info.get('duration'), 1000), 'webpage_url': info.get('permalink_url'), + 'license': info.get('license'), } formats = [] if info.get('downloadable', False): @@ -168,46 +173,54 @@ class SoundcloudIE(InfoExtractor): }) # We have to retrieve the url - streams_url = ('http://api.soundcloud.com/i1/tracks/{0}/streams?' - 'client_id={1}&secret_token={2}'.format(track_id, self._IPHONE_CLIENT_ID, secret_token)) format_dict = self._download_json( - streams_url, - track_id, 'Downloading track url') + 'http://api.soundcloud.com/i1/tracks/%s/streams' % track_id, + track_id, 'Downloading track url', query={ + 'client_id': self._CLIENT_ID, + 'secret_token': secret_token, + }) for key, stream_url in format_dict.items(): + abr = int_or_none(self._search_regex( + r'_(\d+)_url', key, 'audio bitrate', default=None)) if key.startswith('http'): - formats.append({ + stream_formats = [{ 'format_id': key, 'ext': ext, 'url': stream_url, - 'vcodec': 'none', - }) + }] elif key.startswith('rtmp'): # The url doesn't have an rtmp app, we have to extract the playpath url, path = stream_url.split('mp3:', 1) - formats.append({ + stream_formats = [{ 'format_id': key, 'url': url, 'play_path': 'mp3:' + path, 'ext': 'flv', - 'vcodec': 'none', - }) + }] + elif key.startswith('hls'): + stream_formats = self._extract_m3u8_formats( + stream_url, track_id, 'mp3', entry_protocol='m3u8_native', + m3u8_id=key, fatal=False) + else: + continue - if not formats: - # We fallback to the stream_url in the original info, this - # cannot be always used, sometimes it can give an HTTP 404 error - formats.append({ - 'format_id': 'fallback', - 'url': info['stream_url'] + '?client_id=' + self._CLIENT_ID, - 'ext': ext, - 'vcodec': 'none', - }) + for f in stream_formats: + f['abr'] = abr - for f in formats: - if f['format_id'].startswith('http'): - f['protocol'] = 'http' - if f['format_id'].startswith('rtmp'): - f['protocol'] = 'rtmp' + formats.extend(stream_formats) + + if not formats: + # We fallback to the stream_url in the original info, this + # cannot be always used, sometimes it can give an HTTP 404 error + formats.append({ + 'format_id': 'fallback', + 'url': info['stream_url'] + '?client_id=' + self._CLIENT_ID, + 'ext': ext, + }) + + for f in formats: + f['vcodec'] = 'none' self._check_formats(formats, track_id) self._sort_formats(formats) @@ -221,7 +234,7 @@ class SoundcloudIE(InfoExtractor): raise ExtractorError('Invalid URL: %s' % url) track_id = mobj.group('track_id') - token = None + if track_id is not None: info_json_url = 'http://api.soundcloud.com/tracks/' + track_id + '.json?client_id=' + self._CLIENT_ID full_title = track_id @@ -255,7 +268,20 @@ class SoundcloudIE(InfoExtractor): return self._extract_info_dict(info, full_title, secret_token=token) -class SoundcloudSetIE(SoundcloudIE): +class SoundcloudPlaylistBaseIE(SoundcloudIE): + @staticmethod + def _extract_id(e): + return compat_str(e['id']) if e.get('id') else None + + def _extract_track_entries(self, tracks): + return [ + self.url_result( + track['permalink_url'], SoundcloudIE.ie_key(), + video_id=self._extract_id(track)) + for track in tracks if track.get('permalink_url')] + + +class SoundcloudSetIE(SoundcloudPlaylistBaseIE): _VALID_URL = r'https?://(?:(?:www|m)\.)?soundcloud\.com/(?P<uploader>[\w\d-]+)/sets/(?P<slug_title>[\w\d-]+)(?:/(?P<token>[^?/]+))?' IE_NAME = 'soundcloud:set' _TESTS = [{ @@ -265,6 +291,9 @@ class SoundcloudSetIE(SoundcloudIE): 'title': 'The Royal Concept EP', }, 'playlist_mincount': 6, + }, { + 'url': 'https://soundcloud.com/the-concept-band/sets/the-royal-concept-ep/token', + 'only_matching': True, }] def _real_extract(self, url): @@ -291,7 +320,7 @@ class SoundcloudSetIE(SoundcloudIE): msgs = (compat_str(err['error_message']) for err in info['errors']) raise ExtractorError('unable to download video webpage: %s' % ','.join(msgs)) - entries = [self.url_result(track['permalink_url'], 'Soundcloud') for track in info['tracks']] + entries = self._extract_track_entries(info['tracks']) return { '_type': 'playlist', @@ -301,7 +330,7 @@ class SoundcloudSetIE(SoundcloudIE): } -class SoundcloudUserIE(SoundcloudIE): +class SoundcloudUserIE(SoundcloudPlaylistBaseIE): _VALID_URL = r'''(?x) https?:// (?:(?:www|m)\.)?soundcloud\.com/ @@ -318,21 +347,21 @@ class SoundcloudUserIE(SoundcloudIE): 'id': '114582580', 'title': 'The Akashic Chronicler (All)', }, - 'playlist_mincount': 111, + 'playlist_mincount': 74, }, { 'url': 'https://soundcloud.com/the-akashic-chronicler/tracks', 'info_dict': { 'id': '114582580', 'title': 'The Akashic Chronicler (Tracks)', }, - 'playlist_mincount': 50, + 'playlist_mincount': 37, }, { 'url': 'https://soundcloud.com/the-akashic-chronicler/sets', 'info_dict': { 'id': '114582580', 'title': 'The Akashic Chronicler (Playlists)', }, - 'playlist_mincount': 3, + 'playlist_mincount': 2, }, { 'url': 'https://soundcloud.com/the-akashic-chronicler/reposts', 'info_dict': { @@ -351,7 +380,7 @@ class SoundcloudUserIE(SoundcloudIE): 'url': 'https://soundcloud.com/grynpyret/spotlight', 'info_dict': { 'id': '7098329', - 'title': 'Grynpyret (Spotlight)', + 'title': 'GRYNPYRET (Spotlight)', }, 'playlist_mincount': 1, }] @@ -413,13 +442,14 @@ class SoundcloudUserIE(SoundcloudIE): for cand in candidates: if isinstance(cand, dict): permalink_url = cand.get('permalink_url') + entry_id = self._extract_id(cand) if permalink_url and permalink_url.startswith('http'): - return permalink_url + return permalink_url, entry_id for e in collection: - permalink_url = resolve_permalink_url((e, e.get('track'), e.get('playlist'))) + permalink_url, entry_id = resolve_permalink_url((e, e.get('track'), e.get('playlist'))) if permalink_url: - entries.append(self.url_result(permalink_url)) + entries.append(self.url_result(permalink_url, video_id=entry_id)) next_href = response.get('next_href') if not next_href: @@ -439,7 +469,7 @@ class SoundcloudUserIE(SoundcloudIE): } -class SoundcloudPlaylistIE(SoundcloudIE): +class SoundcloudPlaylistIE(SoundcloudPlaylistBaseIE): _VALID_URL = r'https?://api\.soundcloud\.com/playlists/(?P<id>[0-9]+)(?:/?\?secret_token=(?P<token>[^&]+?))?$' IE_NAME = 'soundcloud:playlist' _TESTS = [{ @@ -469,7 +499,7 @@ class SoundcloudPlaylistIE(SoundcloudIE): data = self._download_json( base_url + data, playlist_id, 'Downloading playlist') - entries = [self.url_result(track['permalink_url'], 'Soundcloud') for track in data['tracks']] + entries = self._extract_track_entries(data['tracks']) return { '_type': 'playlist', diff --git a/youtube_dl/extractor/soundgasm.py b/youtube_dl/extractor/soundgasm.py index 3a4ddf57e..e004e2c5a 100644 --- a/youtube_dl/extractor/soundgasm.py +++ b/youtube_dl/extractor/soundgasm.py @@ -27,7 +27,7 @@ class SoundgasmIE(InfoExtractor): webpage = self._download_webpage(url, display_id) audio_url = self._html_search_regex( r'(?s)m4a\:\s"([^"]+)"', webpage, 'audio URL') - audio_id = re.split('\/|\.', audio_url)[-2] + audio_id = re.split(r'\/|\.', audio_url)[-2] description = self._html_search_regex( r'(?s)<li>Description:\s(.*?)<\/li>', webpage, 'description', fatal=False) diff --git a/youtube_dl/extractor/southpark.py b/youtube_dl/extractor/southpark.py index a147f7db1..d8ce416fc 100644 --- a/youtube_dl/extractor/southpark.py +++ b/youtube_dl/extractor/southpark.py @@ -1,4 +1,4 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals from .mtv import MTVServicesInfoExtractor @@ -6,7 +6,7 @@ from .mtv import MTVServicesInfoExtractor class SouthParkIE(MTVServicesInfoExtractor): IE_NAME = 'southpark.cc.com' - _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark\.cc\.com/(?:clips|full-episodes)/(?P<id>.+?)(\?|#|$))' + _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark\.cc\.com/(?:clips|(?:full-)?episodes)/(?P<id>.+?)(\?|#|$))' _FEED_URL = 'http://www.southparkstudios.com/feeds/video-player/mrss' @@ -35,6 +35,7 @@ class SouthParkEsIE(SouthParkIE): 'description': 'Cartman Consigue Una Sonda Anal', }, 'playlist_count': 4, + 'skip': 'Geo-restricted', }] @@ -74,7 +75,7 @@ class SouthParkDeIE(SouthParkIE): class SouthParkNlIE(SouthParkIE): IE_NAME = 'southpark.nl' - _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark\.nl/(?:clips|full-episodes)/(?P<id>.+?)(\?|#|$))' + _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark\.nl/(?:clips|(?:full-)?episodes)/(?P<id>.+?)(\?|#|$))' _FEED_URL = 'http://www.southpark.nl/feeds/video-player/mrss/' _TESTS = [{ diff --git a/youtube_dl/extractor/spankbang.py b/youtube_dl/extractor/spankbang.py index 50433d0f6..123c33ac3 100644 --- a/youtube_dl/extractor/spankbang.py +++ b/youtube_dl/extractor/spankbang.py @@ -14,8 +14,8 @@ class SpankBangIE(InfoExtractor): 'id': '3vvn', 'ext': 'mp4', 'title': 'fantasy solo', - 'description': 'dillion harper masturbates on a bed', - 'thumbnail': 're:^https?://.*\.jpg$', + 'description': 'Watch fantasy solo free HD porn video - 05 minutes - dillion harper masturbates on a bed free adult movies.', + 'thumbnail': r're:^https?://.*\.jpg$', 'uploader': 'silly2587', 'age_limit': 18, } @@ -44,12 +44,10 @@ class SpankBangIE(InfoExtractor): title = self._html_search_regex( r'(?s)<h1[^>]*>(.+?)</h1>', webpage, 'title') - description = self._search_regex( - r'class="desc"[^>]*>([^<]+)', - webpage, 'description', default=None) + description = self._og_search_description(webpage) thumbnail = self._og_search_thumbnail(webpage) uploader = self._search_regex( - r'class="user"[^>]*>([^<]+)', + r'class="user"[^>]*><img[^>]+>([^<]+)', webpage, 'uploader', fatal=False) age_limit = self._rta_search(webpage) diff --git a/youtube_dl/extractor/spankwire.py b/youtube_dl/extractor/spankwire.py index 92a7120a3..44d8fa52f 100644 --- a/youtube_dl/extractor/spankwire.py +++ b/youtube_dl/extractor/spankwire.py @@ -85,7 +85,7 @@ class SpankwireIE(InfoExtractor): r'playerData\.cdnPath([0-9]{3,})\s*=\s*(?:encodeURIComponent\()?["\']([^"\']+)["\']', webpage) heights = [int(video[0]) for video in videos] video_urls = list(map(compat_urllib_parse_unquote, [video[1] for video in videos])) - if webpage.find('flashvars\.encrypted = "true"') != -1: + if webpage.find(r'flashvars\.encrypted = "true"') != -1: password = self._search_regex( r'flashvars\.video_title = "([^"]+)', webpage, 'password').replace('+', ' ') diff --git a/youtube_dl/extractor/spiegel.py b/youtube_dl/extractor/spiegel.py index 3c552807e..ec1b60388 100644 --- a/youtube_dl/extractor/spiegel.py +++ b/youtube_dl/extractor/spiegel.py @@ -1,4 +1,4 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals import re @@ -103,7 +103,7 @@ class SpiegelIE(InfoExtractor): class SpiegelArticleIE(InfoExtractor): - _VALID_URL = 'https?://www\.spiegel\.de/(?!video/)[^?#]*?-(?P<id>[0-9]+)\.html' + _VALID_URL = r'https?://(?:www\.)?spiegel\.de/(?!video/)[^?#]*?-(?P<id>[0-9]+)\.html' IE_NAME = 'Spiegel:Article' IE_DESC = 'Articles on spiegel.de' _TESTS = [{ diff --git a/youtube_dl/extractor/spiegeltv.py b/youtube_dl/extractor/spiegeltv.py index 034bd47ff..e1cfb8698 100644 --- a/youtube_dl/extractor/spiegeltv.py +++ b/youtube_dl/extractor/spiegeltv.py @@ -18,7 +18,7 @@ class SpiegeltvIE(InfoExtractor): 'ext': 'm4v', 'title': 'Flug MH370', 'description': 'Das Rätsel um die Boeing 777 der Malaysia-Airlines', - 'thumbnail': 're:http://.*\.jpg$', + 'thumbnail': r're:http://.*\.jpg$', }, 'params': { # m3u8 download diff --git a/youtube_dl/extractor/spike.py b/youtube_dl/extractor/spike.py index 218785ee4..c59896a17 100644 --- a/youtube_dl/extractor/spike.py +++ b/youtube_dl/extractor/spike.py @@ -1,5 +1,7 @@ from __future__ import unicode_literals +import re + from .mtv import MTVServicesInfoExtractor @@ -16,6 +18,15 @@ class SpikeIE(MTVServicesInfoExtractor): 'timestamp': 1388120400, 'upload_date': '20131227', }, + }, { + 'url': 'http://www.spike.com/full-episodes/j830qm/lip-sync-battle-joel-mchale-vs-jim-rash-season-2-ep-209', + 'md5': 'b25c6f16418aefb9ad5a6cae2559321f', + 'info_dict': { + 'id': '37ace3a8-1df6-48be-85b8-38df8229e241', + 'ext': 'mp4', + 'title': 'Lip Sync Battle|April 28, 2016|2|209|Joel McHale Vs. Jim Rash|Act 1', + 'description': 'md5:a739ca8f978a7802f67f8016d27ce114', + }, }, { 'url': 'http://www.spike.com/video-clips/lhtu8m/', 'only_matching': True, @@ -32,3 +43,12 @@ class SpikeIE(MTVServicesInfoExtractor): _FEED_URL = 'http://www.spike.com/feeds/mrss/' _MOBILE_TEMPLATE = 'http://m.spike.com/videos/video.rbml?id=%s' + _CUSTOM_URL_REGEX = re.compile(r'spikenetworkapp://([^/]+/[-a-fA-F0-9]+)') + + def _extract_mgid(self, webpage): + mgid = super(SpikeIE, self)._extract_mgid(webpage) + if mgid is None: + url_parts = self._search_regex(self._CUSTOM_URL_REGEX, webpage, 'episode_id') + video_type, episode_id = url_parts.split('/', 1) + mgid = 'mgid:arc:{0}:spike.com:{1}'.format(video_type, episode_id) + return mgid diff --git a/youtube_dl/extractor/sport5.py b/youtube_dl/extractor/sport5.py index 7e6783306..a417b5a4e 100644 --- a/youtube_dl/extractor/sport5.py +++ b/youtube_dl/extractor/sport5.py @@ -41,7 +41,7 @@ class Sport5IE(InfoExtractor): webpage = self._download_webpage(url, media_id) - video_id = self._html_search_regex('clipId=([\w-]+)', webpage, 'video id') + video_id = self._html_search_regex(r'clipId=([\w-]+)', webpage, 'video id') metadata = self._download_xml( 'http://sport5-metadata-rr-d.nsacdn.com/vod/vod/%s/HDS/metadata.xml' % video_id, diff --git a/youtube_dl/extractor/sportbox.py b/youtube_dl/extractor/sportbox.py index e5c28ae89..05a0b5a80 100644 --- a/youtube_dl/extractor/sportbox.py +++ b/youtube_dl/extractor/sportbox.py @@ -11,60 +11,6 @@ from ..utils import ( ) -class SportBoxIE(InfoExtractor): - _VALID_URL = r'https?://news\.sportbox\.ru/(?:[^/]+/)+spbvideo_NI\d+_(?P<display_id>.+)' - _TESTS = [{ - 'url': 'http://news.sportbox.ru/Vidy_sporta/Avtosport/Rossijskij/spbvideo_NI483529_Gonka-2-zaezd-Obyedinenniy-2000-klassi-Turing-i-S', - 'md5': 'ff56a598c2cf411a9a38a69709e97079', - 'info_dict': { - 'id': '80822', - 'ext': 'mp4', - 'title': 'Гонка 2 заезд ««Объединенный 2000»: классы Туринг и Супер-продакшн', - 'description': 'md5:3d72dc4a006ab6805d82f037fdc637ad', - 'thumbnail': 're:^https?://.*\.jpg$', - 'upload_date': '20140928', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, { - 'url': 'http://news.sportbox.ru/Vidy_sporta/billiard/spbvideo_NI486287_CHempionat-mira-po-dinamichnoy-piramide-4', - 'only_matching': True, - }, { - 'url': 'http://news.sportbox.ru/video/no_ads/spbvideo_NI536574_V_Novorossijske_proshel_detskij_turnir_Pole_slavy_bojevoj?ci=211355', - 'only_matching': True, - }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - display_id = mobj.group('display_id') - - webpage = self._download_webpage(url, display_id) - - player = self._search_regex( - r'src="/?(vdl/player/[^"]+)"', webpage, 'player') - - title = self._html_search_regex( - [r'"nodetitle"\s*:\s*"([^"]+)"', r'class="node-header_{1,2}title">([^<]+)'], - webpage, 'title') - description = self._og_search_description(webpage) or self._html_search_meta( - 'description', webpage, 'description') - thumbnail = self._og_search_thumbnail(webpage) - upload_date = unified_strdate(self._html_search_meta( - 'dateCreated', webpage, 'upload date')) - - return { - '_type': 'url_transparent', - 'url': compat_urlparse.urljoin(url, '/%s' % player), - 'display_id': display_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'upload_date': upload_date, - } - - class SportBoxEmbedIE(InfoExtractor): _VALID_URL = r'https?://news\.sportbox\.ru/vdl/player(?:/[^/]+/|\?.*?\bn?id=)(?P<id>\d+)' _TESTS = [{ @@ -73,7 +19,7 @@ class SportBoxEmbedIE(InfoExtractor): 'id': '211355', 'ext': 'mp4', 'title': 'В Новороссийске прошел детский турнир «Поле славы боевой»', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', }, 'params': { # m3u8 download diff --git a/youtube_dl/extractor/sportdeutschland.py b/youtube_dl/extractor/sportdeutschland.py index a9927f6e2..a3c35a899 100644 --- a/youtube_dl/extractor/sportdeutschland.py +++ b/youtube_dl/extractor/sportdeutschland.py @@ -20,8 +20,8 @@ class SportDeutschlandIE(InfoExtractor): 'title': 're:Li-Ning Badminton Weltmeisterschaft 2014 Kopenhagen', 'categories': ['Badminton'], 'view_count': int, - 'thumbnail': 're:^https?://.*\.jpg$', - 'description': 're:Die Badminton-WM 2014 aus Kopenhagen bei Sportdeutschland\.TV', + 'thumbnail': r're:^https?://.*\.jpg$', + 'description': r're:Die Badminton-WM 2014 aus Kopenhagen bei Sportdeutschland\.TV', 'timestamp': int, 'upload_date': 're:^201408[23][0-9]$', }, @@ -38,7 +38,7 @@ class SportDeutschlandIE(InfoExtractor): 'timestamp': 1408976060, 'duration': 2732, 'title': 'Li-Ning Badminton Weltmeisterschaft 2014 Kopenhagen: Herren Einzel, Wei Lee vs. Keun Lee', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'view_count': int, 'categories': ['Li-Ning Badminton WM 2014'], diff --git a/youtube_dl/extractor/srgssr.py b/youtube_dl/extractor/srgssr.py index 246970c4d..319a48a7a 100644 --- a/youtube_dl/extractor/srgssr.py +++ b/youtube_dl/extractor/srgssr.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_urllib_parse_urlparse from ..utils import ( ExtractorError, parse_iso8601, @@ -23,6 +24,16 @@ class SRGSSRIE(InfoExtractor): 'STARTDATE': 'This video is not yet available. Please try again later.', } + def _get_tokenized_src(self, url, video_id, format_id): + sp = compat_urllib_parse_urlparse(url).path.split('/') + token = self._download_json( + 'http://tp.srgssr.ch/akahd/token?acl=/%s/%s/*' % (sp[1], sp[2]), + video_id, 'Downloading %s token' % format_id, fatal=False) or {} + auth_params = token.get('token', {}).get('authparams') + if auth_params: + url += '?' + auth_params + return url + def get_media_data(self, bu, media_type, media_id): media_data = self._download_json( 'http://il.srgssr.ch/integrationlayer/1.0/ue/%s/%s/play/%s.json' % (bu, media_type, media_id), @@ -37,9 +48,6 @@ class SRGSSRIE(InfoExtractor): def _real_extract(self, url): bu, media_type, media_id = re.match(self._VALID_URL, url).groups() - if bu == 'rts': - return self.url_result('rts:%s' % media_id, 'RTS') - media_data = self.get_media_data(bu, media_type, media_id) metadata = media_data['AssetMetadatas']['AssetMetadata'][0] @@ -61,14 +69,16 @@ class SRGSSRIE(InfoExtractor): asset_url = asset['text'] quality = asset['@quality'] format_id = '%s-%s' % (protocol, quality) - if protocol == 'HTTP-HDS': - formats.extend(self._extract_f4m_formats( - asset_url + '?hdcore=3.4.0', media_id, - f4m_id=format_id, fatal=False)) - elif protocol == 'HTTP-HLS': - formats.extend(self._extract_m3u8_formats( - asset_url, media_id, 'mp4', 'm3u8_native', - m3u8_id=format_id, fatal=False)) + if protocol.startswith('HTTP-HDS') or protocol.startswith('HTTP-HLS'): + asset_url = self._get_tokenized_src(asset_url, media_id, format_id) + if protocol.startswith('HTTP-HDS'): + formats.extend(self._extract_f4m_formats( + asset_url + ('?' if '?' not in asset_url else '&') + 'hdcore=3.4.0', + media_id, f4m_id=format_id, fatal=False)) + elif protocol.startswith('HTTP-HLS'): + formats.extend(self._extract_m3u8_formats( + asset_url, media_id, 'mp4', 'm3u8_native', + m3u8_id=format_id, fatal=False)) else: formats.append({ 'format_id': format_id, @@ -94,10 +104,10 @@ class SRGSSRPlayIE(InfoExtractor): _TESTS = [{ 'url': 'http://www.srf.ch/play/tv/10vor10/video/snowden-beantragt-asyl-in-russland?id=28e1a57d-5b76-4399-8ab3-9097f071e6c5', - 'md5': '4cd93523723beff51bb4bee974ee238d', + 'md5': 'da6b5b3ac9fa4761a942331cef20fcb3', 'info_dict': { 'id': '28e1a57d-5b76-4399-8ab3-9097f071e6c5', - 'ext': 'm4v', + 'ext': 'mp4', 'upload_date': '20130701', 'title': 'Snowden beantragt Asyl in Russland', 'timestamp': 1372713995, @@ -140,7 +150,7 @@ class SRGSSRPlayIE(InfoExtractor): 'uploader': '19h30', 'upload_date': '20141201', 'timestamp': 1417458600, - 'thumbnail': 're:^https?://.*\.image', + 'thumbnail': r're:^https?://.*\.image', 'view_count': int, }, 'params': { diff --git a/youtube_dl/extractor/srmediathek.py b/youtube_dl/extractor/srmediathek.py index 409d50304..28baf901c 100644 --- a/youtube_dl/extractor/srmediathek.py +++ b/youtube_dl/extractor/srmediathek.py @@ -1,4 +1,4 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals from .ard import ARDMediathekIE @@ -20,7 +20,7 @@ class SRMediathekIE(ARDMediathekIE): 'ext': 'mp4', 'title': 'sportarena (26.10.2014)', 'description': 'Ringen: KSV Köllerbach gegen Aachen-Walheim; Frauen-Fußball: 1. FC Saarbrücken gegen Sindelfingen; Motorsport: Rallye in Losheim; dazu: Interview mit Timo Bernhard; Turnen: TG Saar; Reitsport: Deutscher Voltigier-Pokal; Badminton: Interview mit Michael Fuchs ', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', }, 'skip': 'no longer available', }, { diff --git a/youtube_dl/extractor/stanfordoc.py b/youtube_dl/extractor/stanfordoc.py index 4a3d8bb8f..cce65fb10 100644 --- a/youtube_dl/extractor/stanfordoc.py +++ b/youtube_dl/extractor/stanfordoc.py @@ -66,7 +66,7 @@ class StanfordOpenClassroomIE(InfoExtractor): r'(?s)<description>([^<]+)</description>', coursepage, 'description', fatal=False) - links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage)) + links = orderedSet(re.findall(r'<a href="(VideoPage.php\?[^"]+)">', coursepage)) info['entries'] = [self.url_result( 'http://openclassroom.stanford.edu/MainFolder/%s' % unescapeHTML(l) ) for l in links] @@ -84,7 +84,7 @@ class StanfordOpenClassroomIE(InfoExtractor): rootpage = self._download_webpage(rootURL, info['id'], errnote='Unable to download course info page') - links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage)) + links = orderedSet(re.findall(r'<a href="(CoursePage.php\?[^"]+)">', rootpage)) info['entries'] = [self.url_result( 'http://openclassroom.stanford.edu/MainFolder/%s' % unescapeHTML(l) ) for l in links] diff --git a/youtube_dl/extractor/stitcher.py b/youtube_dl/extractor/stitcher.py index 0f8782d03..97d1ff681 100644 --- a/youtube_dl/extractor/stitcher.py +++ b/youtube_dl/extractor/stitcher.py @@ -22,7 +22,7 @@ class StitcherIE(InfoExtractor): 'title': 'Machine Learning Mastery and Cancer Clusters', 'description': 'md5:55163197a44e915a14a1ac3a1de0f2d3', 'duration': 1604, - 'thumbnail': 're:^https?://.*\.jpg', + 'thumbnail': r're:^https?://.*\.jpg', }, }, { 'url': 'http://www.stitcher.com/podcast/panoply/vulture-tv/e/the-rare-hourlong-comedy-plus-40846275?autoplay=true', @@ -33,7 +33,7 @@ class StitcherIE(InfoExtractor): 'title': "The CW's 'Crazy Ex-Girlfriend'", 'description': 'md5:04f1e2f98eb3f5cbb094cea0f9e19b17', 'duration': 2235, - 'thumbnail': 're:^https?://.*\.jpg', + 'thumbnail': r're:^https?://.*\.jpg', }, 'params': { 'skip_download': True, diff --git a/youtube_dl/extractor/streamable.py b/youtube_dl/extractor/streamable.py index 1c61437a4..e973c867c 100644 --- a/youtube_dl/extractor/streamable.py +++ b/youtube_dl/extractor/streamable.py @@ -1,6 +1,8 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..utils import ( ExtractorError, @@ -19,7 +21,7 @@ class StreamableIE(InfoExtractor): 'id': 'dnd1', 'ext': 'mp4', 'title': 'Mikel Oiarzabal scores to make it 0-3 for La Real against Espanyol', - 'thumbnail': 're:https?://.*\.jpg$', + 'thumbnail': r're:https?://.*\.jpg$', 'uploader': 'teabaker', 'timestamp': 1454964157.35115, 'upload_date': '20160208', @@ -35,7 +37,7 @@ class StreamableIE(InfoExtractor): 'id': 'moo', 'ext': 'mp4', 'title': '"Please don\'t eat me!"', - 'thumbnail': 're:https?://.*\.jpg$', + 'thumbnail': r're:https?://.*\.jpg$', 'timestamp': 1426115495, 'upload_date': '20150311', 'duration': 12, @@ -48,6 +50,14 @@ class StreamableIE(InfoExtractor): } ] + @staticmethod + def _extract_url(webpage): + mobj = re.search( + r'<iframe[^>]+src=(?P<q1>[\'"])(?P<src>(?:https?:)?//streamable\.com/(?:(?!\1).+))(?P=q1)', + webpage) + if mobj: + return mobj.group('src') + def _real_extract(self, url): video_id = self._match_id(url) diff --git a/youtube_dl/extractor/streamcz.py b/youtube_dl/extractor/streamcz.py index d3d2b7eb7..9e533103c 100644 --- a/youtube_dl/extractor/streamcz.py +++ b/youtube_dl/extractor/streamcz.py @@ -1,4 +1,4 @@ -# -*- coding: utf-8 -*- +# coding: utf-8 from __future__ import unicode_literals import hashlib diff --git a/youtube_dl/extractor/streetvoice.py b/youtube_dl/extractor/streetvoice.py index e529051d1..91612c7f2 100644 --- a/youtube_dl/extractor/streetvoice.py +++ b/youtube_dl/extractor/streetvoice.py @@ -16,7 +16,7 @@ class StreetVoiceIE(InfoExtractor): 'ext': 'mp3', 'title': '輸', 'description': 'Crispy脆樂團 - 輸', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 260, 'upload_date': '20091018', 'uploader': 'Crispy脆樂團', diff --git a/youtube_dl/extractor/sunporno.py b/youtube_dl/extractor/sunporno.py index e527aa971..68051169b 100644 --- a/youtube_dl/extractor/sunporno.py +++ b/youtube_dl/extractor/sunporno.py @@ -12,25 +12,29 @@ from ..utils import ( class SunPornoIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?sunporno\.com/videos/(?P<id>\d+)' - _TEST = { + _VALID_URL = r'https?://(?:(?:www\.)?sunporno\.com/videos|embeds\.sunporno\.com/embed)/(?P<id>\d+)' + _TESTS = [{ 'url': 'http://www.sunporno.com/videos/807778/', - 'md5': '6457d3c165fd6de062b99ef6c2ff4c86', + 'md5': '507887e29033502f29dba69affeebfc9', 'info_dict': { 'id': '807778', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'md5:0a400058e8105d39e35c35e7c5184164', 'description': 'md5:a31241990e1bd3a64e72ae99afb325fb', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 302, 'age_limit': 18, } - } + }, { + 'url': 'http://embeds.sunporno.com/embed/807778', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + webpage = self._download_webpage( + 'http://www.sunporno.com/videos/%s' % video_id, video_id) title = self._html_search_regex( r'<title>([^<]+)', webpage, 'title') @@ -40,7 +44,8 @@ class SunPornoIE(InfoExtractor): r'poster="([^"]+)"', webpage, 'thumbnail', fatal=False) duration = parse_duration(self._search_regex( - r'itemprop="duration">\s*(\d+:\d+)\s*<', + (r'itemprop="duration"[^>]*>\s*(\d+:\d+)\s*<', + r'>Duration:\s*]+>\s*(\d+:\d+)\s*<'), webpage, 'duration', fatal=False)) view_count = int_or_none(self._html_search_regex( @@ -48,7 +53,7 @@ class SunPornoIE(InfoExtractor): webpage, 'view count', fatal=False)) comment_count = int_or_none(self._html_search_regex( r'(\d+) Comments?', - webpage, 'comment count', fatal=False)) + webpage, 'comment count', fatal=False, default=None)) formats = [] quality = qualities(['mp4', 'flv']) diff --git a/youtube_dl/extractor/svt.py b/youtube_dl/extractor/svt.py index 1c04dfb7b..10cf80885 100644 --- a/youtube_dl/extractor/svt.py +++ b/youtube_dl/extractor/svt.py @@ -16,7 +16,7 @@ class SVTBaseIE(InfoExtractor): def _extract_video(self, video_info, video_id): formats = [] for vr in video_info['videoReferences']: - player_type = vr.get('playerType') + player_type = vr.get('playerType') or vr.get('format') vurl = vr['url'] ext = determine_ext(vurl) if ext == 'm3u8': @@ -129,7 +129,7 @@ class SVTPlayIE(SVTBaseIE): 'ext': 'mp4', 'title': 'Flygplan till Haile Selassie', 'duration': 3527, - 'thumbnail': 're:^https?://.*[\.-]jpg$', + 'thumbnail': r're:^https?://.*[\.-]jpg$', 'age_limit': 0, 'subtitles': { 'sv': [{ diff --git a/youtube_dl/extractor/swrmediathek.py b/youtube_dl/extractor/swrmediathek.py index 58073eefe..0f615979e 100644 --- a/youtube_dl/extractor/swrmediathek.py +++ b/youtube_dl/extractor/swrmediathek.py @@ -1,10 +1,12 @@ -# -*- coding: utf-8 -*- +# coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor -from ..utils import parse_duration +from ..utils import ( + parse_duration, + int_or_none, + determine_protocol, +) class SWRMediathekIE(InfoExtractor): @@ -18,7 +20,7 @@ class SWRMediathekIE(InfoExtractor): 'ext': 'mp4', 'title': 'SWR odysso', 'description': 'md5:2012e31baad36162e97ce9eb3f157b8a', - 'thumbnail': 're:^http:.*\.jpg$', + 'thumbnail': r're:^http:.*\.jpg$', 'duration': 2602, 'upload_date': '20140515', 'uploader': 'SWR Fernsehen', @@ -32,12 +34,13 @@ class SWRMediathekIE(InfoExtractor): 'ext': 'mp4', 'title': 'Nachtcafé - Alltagsdroge Alkohol - zwischen Sektempfang und Komasaufen', 'description': 'md5:e0a3adc17e47db2c23aab9ebc36dbee2', - 'thumbnail': 're:http://.*\.jpg', + 'thumbnail': r're:http://.*\.jpg', 'duration': 5305, 'upload_date': '20140516', 'uploader': 'SWR Fernsehen', 'uploader_id': '990030', }, + 'skip': 'redirect to http://swrmediathek.de/index.htm?hinweis=swrlink', }, { 'url': 'http://swrmediathek.de/player.htm?show=bba23e10-cb93-11e3-bf7f-0026b975f2e6', 'md5': '4382e4ef2c9d7ce6852535fa867a0dd3', @@ -46,59 +49,67 @@ class SWRMediathekIE(InfoExtractor): 'ext': 'mp3', 'title': 'Saša Stanišic: Vor dem Fest', 'description': 'md5:5b792387dc3fbb171eb709060654e8c9', - 'thumbnail': 're:http://.*\.jpg', + 'thumbnail': r're:http://.*\.jpg', 'duration': 3366, 'upload_date': '20140520', 'uploader': 'SWR 2', 'uploader_id': '284670', - } + }, + 'skip': 'redirect to http://swrmediathek.de/index.htm?hinweis=swrlink', }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) video = self._download_json( - 'http://swrmediathek.de/AjaxEntry?ekey=%s' % video_id, video_id, 'Downloading video JSON') + 'http://swrmediathek.de/AjaxEntry?ekey=%s' % video_id, + video_id, 'Downloading video JSON') attr = video['attr'] - media_type = attr['entry_etype'] + title = attr['entry_title'] + media_type = attr.get('entry_etype') formats = [] - for entry in video['sub']: - if entry['name'] != 'entry_media': + for entry in video.get('sub', []): + if entry.get('name') != 'entry_media': continue - entry_attr = entry['attr'] - codec = entry_attr['val0'] - quality = int(entry_attr['val1']) - - fmt = { - 'url': entry_attr['val2'], - 'quality': quality, - } - - if media_type == 'Video': - fmt.update({ - 'format_note': ['144p', '288p', '544p', '720p'][quality - 1], - 'vcodec': codec, + entry_attr = entry.get('attr', {}) + f_url = entry_attr.get('val2') + if not f_url: + continue + codec = entry_attr.get('val0') + if codec == 'm3u8': + formats.extend(self._extract_m3u8_formats( + f_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + elif codec == 'f4m': + formats.extend(self._extract_f4m_formats( + f_url + '?hdcore=3.7.0', video_id, + f4m_id='hds', fatal=False)) + else: + formats.append({ + 'format_id': determine_protocol({'url': f_url}), + 'url': f_url, + 'quality': int_or_none(entry_attr.get('val1')), + 'vcodec': codec if media_type == 'Video' else 'none', + 'acodec': codec if media_type == 'Audio' else None, }) - elif media_type == 'Audio': - fmt.update({ - 'acodec': codec, - }) - formats.append(fmt) - self._sort_formats(formats) + upload_date = None + entry_pdatet = attr.get('entry_pdatet') + if entry_pdatet: + upload_date = entry_pdatet[:-4] + return { 'id': video_id, - 'title': attr['entry_title'], - 'description': attr['entry_descl'], - 'thumbnail': attr['entry_image_16_9'], - 'duration': parse_duration(attr['entry_durat']), - 'upload_date': attr['entry_pdatet'][:-4], - 'uploader': attr['channel_title'], - 'uploader_id': attr['channel_idkey'], + 'title': title, + 'description': attr.get('entry_descl'), + 'thumbnail': attr.get('entry_image_16_9'), + 'duration': parse_duration(attr.get('entry_durat')), + 'upload_date': upload_date, + 'uploader': attr.get('channel_title'), + 'uploader_id': attr.get('channel_idkey'), 'formats': formats, } diff --git a/youtube_dl/extractor/syfy.py b/youtube_dl/extractor/syfy.py index 53723b66e..def7e5a2c 100644 --- a/youtube_dl/extractor/syfy.py +++ b/youtube_dl/extractor/syfy.py @@ -1,14 +1,14 @@ from __future__ import unicode_literals -from .theplatform import ThePlatformIE +from .adobepass import AdobePassIE from ..utils import ( update_url_query, smuggle_url, ) -class SyfyIE(ThePlatformIE): - _VALID_URL = r'https?://www\.syfy\.com/(?:[^/]+/)?videos/(?P[^/?#]+)' +class SyfyIE(AdobePassIE): + _VALID_URL = r'https?://(?:www\.)?syfy\.com/(?:[^/]+/)?videos/(?P[^/?#]+)' _TESTS = [{ 'url': 'http://www.syfy.com/theinternetruinedmylife/videos/the-internet-ruined-my-life-season-1-trailer', 'info_dict': { @@ -31,7 +31,7 @@ class SyfyIE(ThePlatformIE): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) syfy_mpx = list(self._parse_json(self._search_regex( - r'jQuery\.extend\([^,]+,\s*({.+})\);', webpage, 'drupal settings'), + r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);', webpage, 'drupal settings'), display_id)['syfy']['syfy_mpx'].values())[0] video_id = syfy_mpx['mpxGUID'] title = syfy_mpx['episodeTitle'] @@ -40,7 +40,9 @@ class SyfyIE(ThePlatformIE): 'manifest': 'm3u', } if syfy_mpx.get('entitlement') == 'auth': - resource = 'syfy<![CDATA[%s]]>%s%s' % (title, video_id, syfy_mpx.get('mpxRating', 'TV-14')) + resource = self._get_mvpd_resource( + 'syfy', title, video_id, + syfy_mpx.get('mpxRating', 'TV-14')) query['auth'] = self._extract_mvpd_auth( url, video_id, 'syfy', resource) diff --git a/youtube_dl/extractor/sztvhu.py b/youtube_dl/extractor/sztvhu.py index f562aa6d3..cfad33146 100644 --- a/youtube_dl/extractor/sztvhu.py +++ b/youtube_dl/extractor/sztvhu.py @@ -1,4 +1,4 @@ -# -*- coding: utf-8 -*- +# coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor diff --git a/youtube_dl/extractor/tagesschau.py b/youtube_dl/extractor/tagesschau.py index 136e18f96..c351b7545 100644 --- a/youtube_dl/extractor/tagesschau.py +++ b/youtube_dl/extractor/tagesschau.py @@ -1,4 +1,4 @@ -# -*- coding: utf-8 -*- +# coding: utf-8 from __future__ import unicode_literals import re @@ -23,7 +23,7 @@ class TagesschauPlayerIE(InfoExtractor): 'id': '179517', 'ext': 'mp4', 'title': 'Marie Kristin Boese, ARD Berlin, über den zukünftigen Kurs der AfD', - 'thumbnail': 're:^https?:.*\.jpg$', + 'thumbnail': r're:^https?:.*\.jpg$', 'formats': 'mincount:6', }, }, { @@ -33,7 +33,7 @@ class TagesschauPlayerIE(InfoExtractor): 'id': '29417', 'ext': 'mp3', 'title': 'Trabi - Bye, bye Rennpappe', - 'thumbnail': 're:^https?:.*\.jpg$', + 'thumbnail': r're:^https?:.*\.jpg$', 'formats': 'mincount:2', }, }, { @@ -135,7 +135,7 @@ class TagesschauIE(InfoExtractor): 'ext': 'mp4', 'title': 'Regierungsumbildung in Athen: Neue Minister in Griechenland vereidigt', 'description': '18.07.2015 20:10 Uhr', - 'thumbnail': 're:^https?:.*\.jpg$', + 'thumbnail': r're:^https?:.*\.jpg$', }, }, { 'url': 'http://www.tagesschau.de/multimedia/sendung/ts-5727.html', @@ -145,7 +145,7 @@ class TagesschauIE(InfoExtractor): 'ext': 'mp4', 'title': 'Sendung: tagesschau \t04.12.2014 20:00 Uhr', 'description': 'md5:695c01bfd98b7e313c501386327aea59', - 'thumbnail': 're:^https?:.*\.jpg$', + 'thumbnail': r're:^https?:.*\.jpg$', }, }, { # exclusive audio @@ -156,7 +156,7 @@ class TagesschauIE(InfoExtractor): 'ext': 'mp3', 'title': 'Trabi - Bye, bye Rennpappe', 'description': 'md5:8687dda862cbbe2cfb2df09b56341317', - 'thumbnail': 're:^https?:.*\.jpg$', + 'thumbnail': r're:^https?:.*\.jpg$', }, }, { # audio in article @@ -167,7 +167,7 @@ class TagesschauIE(InfoExtractor): 'ext': 'mp3', 'title': 'Viele Baustellen für neuen BND-Chef', 'description': 'md5:1e69a54be3e1255b2b07cdbce5bcd8b4', - 'thumbnail': 're:^https?:.*\.jpg$', + 'thumbnail': r're:^https?:.*\.jpg$', }, }, { 'url': 'http://www.tagesschau.de/inland/afd-parteitag-135.html', diff --git a/youtube_dl/extractor/tapely.py b/youtube_dl/extractor/tapely.py deleted file mode 100644 index ed560bd24..000000000 --- a/youtube_dl/extractor/tapely.py +++ /dev/null @@ -1,109 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - clean_html, - ExtractorError, - float_or_none, - parse_iso8601, - sanitized_Request, -) - - -class TapelyIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?:tape\.ly|tapely\.com)/(?P[A-Za-z0-9\-_]+)(?:/(?P\d+))?' - _API_URL = 'http://tape.ly/showtape?id={0:}' - _S3_SONG_URL = 'http://mytape.s3.amazonaws.com/{0:}' - _SOUNDCLOUD_SONG_URL = 'http://api.soundcloud.com{0:}' - _TESTS = [ - { - 'url': 'http://tape.ly/my-grief-as-told-by-water', - 'info_dict': { - 'id': 23952, - 'title': 'my grief as told by water', - 'thumbnail': 're:^https?://.*\.png$', - 'uploader_id': 16484, - 'timestamp': 1411848286, - 'description': 'For Robin and Ponkers, whom the tides of life have taken out to sea.', - }, - 'playlist_count': 13, - }, - { - 'url': 'http://tape.ly/my-grief-as-told-by-water/1', - 'md5': '79031f459fdec6530663b854cbc5715c', - 'info_dict': { - 'id': 258464, - 'title': 'Dreaming Awake (My Brightest Diamond)', - 'ext': 'm4a', - }, - }, - { - 'url': 'https://tapely.com/my-grief-as-told-by-water', - 'only_matching': True, - }, - ] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - display_id = mobj.group('id') - - playlist_url = self._API_URL.format(display_id) - request = sanitized_Request(playlist_url) - request.add_header('X-Requested-With', 'XMLHttpRequest') - request.add_header('Accept', 'application/json') - request.add_header('Referer', url) - - playlist = self._download_json(request, display_id) - - tape = playlist['tape'] - - entries = [] - for s in tape['songs']: - song = s['song'] - entry = { - 'id': song['id'], - 'duration': float_or_none(song.get('songduration'), 1000), - 'title': song['title'], - } - if song['source'] == 'S3': - entry.update({ - 'url': self._S3_SONG_URL.format(song['filename']), - }) - entries.append(entry) - elif song['source'] == 'YT': - self.to_screen('YouTube video detected') - yt_id = song['filename'].replace('/youtube/', '') - entry.update(self.url_result(yt_id, 'Youtube', video_id=yt_id)) - entries.append(entry) - elif song['source'] == 'SC': - self.to_screen('SoundCloud song detected') - sc_url = self._SOUNDCLOUD_SONG_URL.format(song['filename']) - entry.update(self.url_result(sc_url, 'Soundcloud')) - entries.append(entry) - else: - self.report_warning('Unknown song source: %s' % song['source']) - - if mobj.group('songnr'): - songnr = int(mobj.group('songnr')) - 1 - try: - return entries[songnr] - except IndexError: - raise ExtractorError( - 'No song with index: %s' % mobj.group('songnr'), - expected=True) - - return { - '_type': 'playlist', - 'id': tape['id'], - 'display_id': display_id, - 'title': tape['name'], - 'entries': entries, - 'thumbnail': tape.get('image_url'), - 'description': clean_html(tape.get('subtext')), - 'like_count': tape.get('likescount'), - 'uploader_id': tape.get('user_id'), - 'timestamp': parse_iso8601(tape.get('published_at')), - } diff --git a/youtube_dl/extractor/tass.py b/youtube_dl/extractor/tass.py index c4ef70778..6d336da78 100644 --- a/youtube_dl/extractor/tass.py +++ b/youtube_dl/extractor/tass.py @@ -1,4 +1,4 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals import json @@ -21,7 +21,7 @@ class TassIE(InfoExtractor): 'ext': 'mp4', 'title': 'Посетителям московского зоопарка показали красную панду', 'description': 'Приехавшую из Дублина Зейну можно увидеть в павильоне "Кошки тропиков"', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', }, }, { diff --git a/youtube_dl/extractor/tbs.py b/youtube_dl/extractor/tbs.py new file mode 100644 index 000000000..bf93eb868 --- /dev/null +++ b/youtube_dl/extractor/tbs.py @@ -0,0 +1,56 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .turner import TurnerBaseIE +from ..utils import extract_attributes + + +class TBSIE(TurnerBaseIE): + _VALID_URL = r'https?://(?:www\.)?(?Ptbs|tntdrama)\.com/videos/(?:[^/]+/)+(?P[^/?#]+)\.html' + _TESTS = [{ + 'url': 'http://www.tbs.com/videos/people-of-earth/season-1/extras/2007318/theatrical-trailer.html', + 'md5': '9e61d680e2285066ade7199e6408b2ee', + 'info_dict': { + 'id': '2007318', + 'ext': 'mp4', + 'title': 'Theatrical Trailer', + 'description': 'Catch the latest comedy from TBS, People of Earth, premiering Halloween night--Monday, October 31, at 9/8c.', + } + }, { + 'url': 'http://www.tntdrama.com/videos/good-behavior/season-1/extras/1538823/you-better-run.html', + 'md5': 'ce53c6ead5e9f3280b4ad2031a6fab56', + 'info_dict': { + 'id': '1538823', + 'ext': 'mp4', + 'title': 'You Better Run', + 'description': 'Letty Raines must figure out what she\'s running toward while running away from her past. Good Behavior premieres November 15 at 9/8c.', + } + }] + + def _real_extract(self, url): + domain, display_id = re.match(self._VALID_URL, url).groups() + site = domain[:3] + webpage = self._download_webpage(url, display_id) + video_params = extract_attributes(self._search_regex(r'(<[^>]+id="page-video"[^>]*>)', webpage, 'video params')) + query = None + clip_id = video_params.get('clipid') + if clip_id: + query = 'id=' + clip_id + else: + query = 'titleId=' + video_params['titleid'] + return self._extract_cvp_info( + 'http://www.%s.com/service/cvpXml?%s' % (domain, query), display_id, { + 'default': { + 'media_src': 'http://ht.cdn.turner.com/%s/big' % site, + }, + 'secure': { + 'media_src': 'http://androidhls-secure.cdn.turner.com/%s/big' % site, + 'tokenizer_src': 'http://www.%s.com/video/processors/services/token_ipadAdobe.do' % domain, + }, + }, { + 'url': url, + 'site_name': site.upper(), + 'auth_required': video_params.get('isAuthRequired') != 'false', + }) diff --git a/youtube_dl/extractor/tdslifeway.py b/youtube_dl/extractor/tdslifeway.py index 4d1f5c801..101c6ee31 100644 --- a/youtube_dl/extractor/tdslifeway.py +++ b/youtube_dl/extractor/tdslifeway.py @@ -13,7 +13,7 @@ class TDSLifewayIE(InfoExtractor): 'id': '3453494717001', 'ext': 'mp4', 'title': 'The Gospel by Numbers', - 'thumbnail': 're:^https?://.*\.jpg', + 'thumbnail': r're:^https?://.*\.jpg', 'upload_date': '20140410', 'description': 'Coming soon from T4G 2014!', 'uploader_id': '2034960640001', diff --git a/youtube_dl/extractor/teachertube.py b/youtube_dl/extractor/teachertube.py index 82675431f..f14713a78 100644 --- a/youtube_dl/extractor/teachertube.py +++ b/youtube_dl/extractor/teachertube.py @@ -1,4 +1,4 @@ -# -*- coding: utf-8 -*- +# coding: utf-8 from __future__ import unicode_literals import re @@ -24,7 +24,7 @@ class TeacherTubeIE(InfoExtractor): 'ext': 'mp4', 'title': 'Measures of dispersion from a frequency table', 'description': 'Measures of dispersion from a frequency table', - 'thumbnail': 're:http://.*\.jpg', + 'thumbnail': r're:http://.*\.jpg', }, }, { 'url': 'http://www.teachertube.com/viewVideo.php?video_id=340064', @@ -34,7 +34,7 @@ class TeacherTubeIE(InfoExtractor): 'ext': 'mp4', 'title': 'How to Make Paper Dolls _ Paper Art Projects', 'description': 'Learn how to make paper dolls in this simple', - 'thumbnail': 're:http://.*\.jpg', + 'thumbnail': r're:http://.*\.jpg', }, }, { 'url': 'http://www.teachertube.com/music.php?music_id=8805', diff --git a/youtube_dl/extractor/teachingchannel.py b/youtube_dl/extractor/teachingchannel.py index d14d93e3a..e89759714 100644 --- a/youtube_dl/extractor/teachingchannel.py +++ b/youtube_dl/extractor/teachingchannel.py @@ -7,7 +7,7 @@ from .ooyala import OoyalaIE class TeachingChannelIE(InfoExtractor): - _VALID_URL = r'https?://www\.teachingchannel\.org/videos/(?P.+)' + _VALID_URL = r'https?://(?:www\.)?teachingchannel\.org/videos/(?P<title>.+)' _TEST = { 'url': 'https://www.teachingchannel.org/videos/teacher-teaming-evolution', diff --git a/youtube_dl/extractor/teamcoco.py b/youtube_dl/extractor/teamcoco.py index 79a778920..75346393b 100644 --- a/youtube_dl/extractor/teamcoco.py +++ b/youtube_dl/extractor/teamcoco.py @@ -1,4 +1,4 @@ -# -*- coding: utf-8 -*- +# coding: utf-8 from __future__ import unicode_literals import base64 diff --git a/youtube_dl/extractor/teamfourstar.py b/youtube_dl/extractor/teamfourstar.py new file mode 100644 index 000000000..a8c6ed7be --- /dev/null +++ b/youtube_dl/extractor/teamfourstar.py @@ -0,0 +1,48 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from .jwplatform import JWPlatformIE +from ..utils import unified_strdate + + +class TeamFourStarIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?teamfourstar\.com/(?P<id>[a-z0-9\-]+)' + _TEST = { + 'url': 'http://teamfourstar.com/tfs-abridged-parody-episode-1-2/', + 'info_dict': { + 'id': '0WdZO31W', + 'title': 'TFS Abridged Parody Episode 1', + 'description': 'md5:d60bc389588ebab2ee7ad432bda953ae', + 'ext': 'mp4', + 'timestamp': 1394168400, + 'upload_date': '20080508', + }, + } + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + jwplatform_url = JWPlatformIE._extract_url(webpage) + + video_title = self._html_search_regex( + r'<h1[^>]+class="entry-title"[^>]*>(?P<title>.+?)</h1>', + webpage, 'title') + video_date = unified_strdate(self._html_search_regex( + r'<span[^>]+class="meta-date date updated"[^>]*>(?P<date>.+?)</span>', + webpage, 'date', fatal=False)) + video_description = self._html_search_regex( + r'(?s)<div[^>]+class="content-inner"[^>]*>.*?(?P<description><p>.+?)</div>', + webpage, 'description', fatal=False) + video_thumbnail = self._og_search_thumbnail(webpage) + + return { + '_type': 'url_transparent', + 'display_id': display_id, + 'title': video_title, + 'description': video_description, + 'upload_date': video_date, + 'thumbnail': video_thumbnail, + 'url': jwplatform_url, + } diff --git a/youtube_dl/extractor/techtalks.py b/youtube_dl/extractor/techtalks.py index 16e945d8e..a5b62c717 100644 --- a/youtube_dl/extractor/techtalks.py +++ b/youtube_dl/extractor/techtalks.py @@ -10,9 +10,9 @@ from ..utils import ( class TechTalksIE(InfoExtractor): - _VALID_URL = r'https?://techtalks\.tv/talks/[^/]*/(?P<id>\d+)/' + _VALID_URL = r'https?://techtalks\.tv/talks/(?:[^/]+/)?(?P<id>\d+)' - _TEST = { + _TESTS = [{ 'url': 'http://techtalks.tv/talks/learning-topic-models-going-beyond-svd/57758/', 'info_dict': { 'id': '57758', @@ -38,7 +38,10 @@ class TechTalksIE(InfoExtractor): # rtmp download 'skip_download': True, }, - } + }, { + 'url': 'http://techtalks.tv/talks/57758', + 'only_matching': True, + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) diff --git a/youtube_dl/extractor/ted.py b/youtube_dl/extractor/ted.py index 451cde76d..1b1afab32 100644 --- a/youtube_dl/extractor/ted.py +++ b/youtube_dl/extractor/ted.py @@ -47,7 +47,7 @@ class TEDIE(InfoExtractor): 'id': 'tSVI8ta_P4w', 'ext': 'mp4', 'title': 'Vishal Sikka: The beauty and power of algorithms', - 'thumbnail': 're:^https?://.+\.jpg', + 'thumbnail': r're:^https?://.+\.jpg', 'description': 'md5:6261fdfe3e02f4f579cbbfc00aff73f4', 'upload_date': '20140122', 'uploader_id': 'TEDInstitute', @@ -189,7 +189,7 @@ class TEDIE(InfoExtractor): 'format_id': '%s-%sk' % (format_id, bitrate), 'tbr': bitrate, }) - if re.search('\d+k', h264_url): + if re.search(r'\d+k', h264_url): http_url = h264_url elif format_id == 'rtmp': streamer = talk_info.get('streamer') diff --git a/youtube_dl/extractor/telebruxelles.py b/youtube_dl/extractor/telebruxelles.py index eefecc490..5886e9c1b 100644 --- a/youtube_dl/extractor/telebruxelles.py +++ b/youtube_dl/extractor/telebruxelles.py @@ -7,33 +7,30 @@ from .common import InfoExtractor class TeleBruxellesIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?:telebruxelles|bx1)\.be/(news|sport|dernier-jt)/?(?P<id>[^/#?]+)' + _VALID_URL = r'https?://(?:www\.)?(?:telebruxelles|bx1)\.be/(news|sport|dernier-jt|emission)/?(?P<id>[^/#?]+)' _TESTS = [{ - 'url': 'http://www.telebruxelles.be/news/auditions-devant-parlement-francken-galant-tres-attendus/', - 'md5': '59439e568c9ee42fb77588b2096b214f', + 'url': 'http://bx1.be/news/que-risque-lauteur-dune-fausse-alerte-a-la-bombe/', + 'md5': 'a2a67a5b1c3e8c9d33109b902f474fd9', 'info_dict': { - 'id': '11942', - 'display_id': 'auditions-devant-parlement-francken-galant-tres-attendus', - 'ext': 'flv', - 'title': 'Parlement : Francken et Galant répondent aux interpellations de l’opposition', - 'description': 're:Les auditions des ministres se poursuivent*' - }, - 'params': { - 'skip_download': 'requires rtmpdump' + 'id': '158856', + 'display_id': 'que-risque-lauteur-dune-fausse-alerte-a-la-bombe', + 'ext': 'mp4', + 'title': 'Que risque l’auteur d’une fausse alerte à la bombe ?', + 'description': 'md5:3cf8df235d44ebc5426373050840e466', }, }, { - 'url': 'http://www.telebruxelles.be/sport/basket-brussels-bat-mons-80-74/', - 'md5': '181d3fbdcf20b909309e5aef5c6c6047', + 'url': 'http://bx1.be/sport/futsal-schaerbeek-sincline-5-3-a-thulin/', + 'md5': 'dfe07ecc9c153ceba8582ac912687675', 'info_dict': { - 'id': '10091', - 'display_id': 'basket-brussels-bat-mons-80-74', - 'ext': 'flv', - 'title': 'Basket : le Brussels bat Mons 80-74', - 'description': 're:^Ils l\u2019on fait ! En basket, le B*', - }, - 'params': { - 'skip_download': 'requires rtmpdump' + 'id': '158433', + 'display_id': 'futsal-schaerbeek-sincline-5-3-a-thulin', + 'ext': 'mp4', + 'title': 'Futsal : Schaerbeek s’incline 5-3 à Thulin', + 'description': 'md5:fd013f1488d5e2dceb9cebe39e2d569b', }, + }, { + 'url': 'http://bx1.be/emission/bxenf1-gastronomie/', + 'only_matching': True, }] def _real_extract(self, url): @@ -50,13 +47,13 @@ class TeleBruxellesIE(InfoExtractor): r'file\s*:\s*"(rtmp://[^/]+/vod/mp4:"\s*\+\s*"[^"]+"\s*\+\s*".mp4)"', webpage, 'RTMP url') rtmp_url = re.sub(r'"\s*\+\s*"', '', rtmp_url) + formats = self._extract_wowza_formats(rtmp_url, article_id or display_id) + self._sort_formats(formats) return { 'id': article_id or display_id, 'display_id': display_id, 'title': title, 'description': description, - 'url': rtmp_url, - 'ext': 'flv', - 'rtmp_live': True # if rtmpdump is not called with "--live" argument, the download is blocked and can be completed + 'formats': formats, } diff --git a/youtube_dl/extractor/telecinco.py b/youtube_dl/extractor/telecinco.py index 2ecfd0405..d5abfc9e4 100644 --- a/youtube_dl/extractor/telecinco.py +++ b/youtube_dl/extractor/telecinco.py @@ -6,7 +6,7 @@ from .mitele import MiTeleBaseIE class TelecincoIE(MiTeleBaseIE): IE_DESC = 'telecinco.es, cuatro.com and mediaset.es' - _VALID_URL = r'https?://www\.(?:telecinco\.es|cuatro\.com|mediaset\.es)/(?:[^/]+/)+(?P<id>.+?)\.html' + _VALID_URL = r'https?://(?:www\.)?(?:telecinco\.es|cuatro\.com|mediaset\.es)/(?:[^/]+/)+(?P<id>.+?)\.html' _TESTS = [{ 'url': 'http://www.telecinco.es/robinfood/temporada-01/t01xp14/Bacalao-cocochas-pil-pil_0_1876350223.html', diff --git a/youtube_dl/extractor/telegraaf.py b/youtube_dl/extractor/telegraaf.py index 58078c531..0f576c1ab 100644 --- a/youtube_dl/extractor/telegraaf.py +++ b/youtube_dl/extractor/telegraaf.py @@ -17,7 +17,7 @@ class TelegraafIE(InfoExtractor): 'ext': 'mp4', 'title': 'Tikibad ontruimd wegens brand', 'description': 'md5:05ca046ff47b931f9b04855015e163a4', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 33, }, 'params': { diff --git a/youtube_dl/extractor/telemb.py b/youtube_dl/extractor/telemb.py index 1bbd0e7bd..9bcac4ec0 100644 --- a/youtube_dl/extractor/telemb.py +++ b/youtube_dl/extractor/telemb.py @@ -19,7 +19,7 @@ class TeleMBIE(InfoExtractor): 'ext': 'mp4', 'title': 'Mons - Cook with Danielle : des cours de cuisine en anglais ! - Les reportages', 'description': 'md5:bc5225f47b17c309761c856ad4776265', - 'thumbnail': 're:^http://.*\.(?:jpg|png)$', + 'thumbnail': r're:^http://.*\.(?:jpg|png)$', } }, { @@ -32,7 +32,7 @@ class TeleMBIE(InfoExtractor): 'ext': 'mp4', 'title': 'Havré - Incendie mortel - Les reportages', 'description': 'md5:5e54cb449acb029c2b7734e2d946bd4a', - 'thumbnail': 're:^http://.*\.(?:jpg|png)$', + 'thumbnail': r're:^http://.*\.(?:jpg|png)$', } }, ] diff --git a/youtube_dl/extractor/telequebec.py b/youtube_dl/extractor/telequebec.py new file mode 100644 index 000000000..4043fcb92 --- /dev/null +++ b/youtube_dl/extractor/telequebec.py @@ -0,0 +1,36 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import int_or_none + + +class TeleQuebecIE(InfoExtractor): + _VALID_URL = r'https?://zonevideo\.telequebec\.tv/media/(?P<id>\d+)' + _TEST = { + 'url': 'http://zonevideo.telequebec.tv/media/20984/le-couronnement-de-new-york/couronnement-de-new-york', + 'md5': 'fe95a0957e5707b1b01f5013e725c90f', + 'info_dict': { + 'id': '20984', + 'ext': 'mp4', + 'title': 'Le couronnement de New York', + 'description': 'md5:f5b3d27a689ec6c1486132b2d687d432', + 'upload_date': '20160220', + 'timestamp': 1455965438, + } + } + + def _real_extract(self, url): + media_id = self._match_id(url) + media_data = self._download_json( + 'https://mnmedias.api.telequebec.tv/api/v2/media/' + media_id, + media_id)['media'] + return { + '_type': 'url_transparent', + 'id': media_id, + 'url': 'limelight:media:' + media_data['streamInfo']['sourceId'], + 'title': media_data['title'], + 'description': media_data.get('descriptions', [{'text': None}])[0].get('text'), + 'duration': int_or_none(media_data.get('durationInMilliseconds'), 1000), + 'ie_key': 'LimelightMedia', + } diff --git a/youtube_dl/extractor/telewebion.py b/youtube_dl/extractor/telewebion.py index 77916c601..1207b1a1b 100644 --- a/youtube_dl/extractor/telewebion.py +++ b/youtube_dl/extractor/telewebion.py @@ -5,7 +5,7 @@ from .common import InfoExtractor class TelewebionIE(InfoExtractor): - _VALID_URL = r'https?://www\.telewebion\.com/#!/episode/(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?telewebion\.com/#!/episode/(?P<id>\d+)' _TEST = { 'url': 'http://www.telewebion.com/#!/episode/1263668/', @@ -13,7 +13,7 @@ class TelewebionIE(InfoExtractor): 'id': '1263668', 'ext': 'mp4', 'title': 'قرعه\u200cکشی لیگ قهرمانان اروپا', - 'thumbnail': 're:^https?://.*\.jpg', + 'thumbnail': r're:^https?://.*\.jpg', 'view_count': int, }, 'params': { diff --git a/youtube_dl/extractor/tfo.py b/youtube_dl/extractor/tfo.py new file mode 100644 index 000000000..6f1eeac57 --- /dev/null +++ b/youtube_dl/extractor/tfo.py @@ -0,0 +1,53 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json + +from .common import InfoExtractor +from ..utils import ( + HEADRequest, + ExtractorError, + int_or_none, +) + + +class TFOIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?tfo\.org/(?:en|fr)/(?:[^/]+/){2}(?P<id>\d+)' + _TEST = { + 'url': 'http://www.tfo.org/en/universe/tfo-247/100463871/video-game-hackathon', + 'md5': '47c987d0515561114cf03d1226a9d4c7', + 'info_dict': { + 'id': '100463871', + 'ext': 'mp4', + 'title': 'Video Game Hackathon', + 'description': 'md5:558afeba217c6c8d96c60e5421795c07', + 'upload_date': '20160212', + 'timestamp': 1455310233, + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + self._request_webpage(HEADRequest('http://www.tfo.org/'), video_id) + infos = self._download_json( + 'http://www.tfo.org/api/web/video/get_infos', video_id, data=json.dumps({ + 'product_id': video_id, + }).encode(), headers={ + 'X-tfo-session': self._get_cookies('http://www.tfo.org/')['tfo-session'].value, + }) + if infos.get('success') == 0: + raise ExtractorError('%s said: %s' % (self.IE_NAME, infos['msg']), expected=True) + video_data = infos['data'] + + return { + '_type': 'url_transparent', + 'id': video_id, + 'url': 'limelight:media:' + video_data['llid'], + 'title': video_data['title'], + 'description': video_data.get('description'), + 'series': video_data.get('collection'), + 'season_number': int_or_none(video_data.get('season')), + 'episode_number': int_or_none(video_data.get('episode')), + 'duration': int_or_none(video_data.get('duration')), + 'ie_key': 'LimelightMedia', + } diff --git a/youtube_dl/extractor/theintercept.py b/youtube_dl/extractor/theintercept.py index 8cb3c3669..f23b58713 100644 --- a/youtube_dl/extractor/theintercept.py +++ b/youtube_dl/extractor/theintercept.py @@ -1,4 +1,4 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor @@ -11,7 +11,7 @@ from ..utils import ( class TheInterceptIE(InfoExtractor): - _VALID_URL = r'https://theintercept.com/fieldofvision/(?P<id>[^/?#]+)' + _VALID_URL = r'https?://theintercept\.com/fieldofvision/(?P<id>[^/?#]+)' _TESTS = [{ 'url': 'https://theintercept.com/fieldofvision/thisisacoup-episode-four-surrender-or-die/', 'md5': '145f28b41d44aab2f87c0a4ac8ec95bd', diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index bb3efc4ea..192d8fa29 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -1,4 +1,4 @@ -# -*- coding: utf-8 -*- +# coding: utf-8 from __future__ import unicode_literals import re @@ -6,10 +6,10 @@ import time import hmac import binascii import hashlib -import netrc from .once import OnceIE +from .adobepass import AdobePassIE from ..compat import ( compat_parse_qs, compat_urllib_parse_urlparse, @@ -25,9 +25,6 @@ from ..utils import ( xpath_with_ns, mimetype2ext, find_xpath_attr, - unescapeHTML, - urlencode_postdata, - unified_timestamp, ) default_ns = 'http://www.w3.org/2005/SMIL21/Language' @@ -36,7 +33,9 @@ _x = lambda p: xpath_with_ns(p, {'smil': default_ns}) class ThePlatformBaseIE(OnceIE): def _extract_theplatform_smil(self, smil_url, video_id, note='Downloading SMIL data'): - meta = self._download_xml(smil_url, video_id, note=note, query={'format': 'SMIL'}) + meta = self._download_xml( + smil_url, video_id, note=note, query={'format': 'SMIL'}, + headers=self.geo_verification_headers()) error_element = find_xpath_attr(meta, _x('.//smil:ref'), 'src') if error_element is not None and error_element.attrib['src'].startswith( 'http://link.theplatform.com/s/errorFiles/Unavailable.'): @@ -76,10 +75,10 @@ class ThePlatformBaseIE(OnceIE): if isinstance(captions, list): for caption in captions: lang, src, mime = caption.get('lang', 'en'), caption.get('src'), caption.get('type') - subtitles[lang] = [{ + subtitles.setdefault(lang, []).append({ 'ext': mimetype2ext(mime), 'url': src, - }] + }) return { 'title': info['title'], @@ -96,10 +95,10 @@ class ThePlatformBaseIE(OnceIE): return self._parse_theplatform_metadata(info) -class ThePlatformIE(ThePlatformBaseIE): +class ThePlatformIE(ThePlatformBaseIE, AdobePassIE): _VALID_URL = r'''(?x) (?:https?://(?:link|player)\.theplatform\.com/[sp]/(?P<provider_id>[^/]+)/ - (?:(?:(?:[^/]+/)+select/)?(?P<media>media/(?:guid/\d+/)?)|(?P<config>(?:[^/\?]+/(?:swf|config)|onsite)/select/))? + (?:(?:(?:[^/]+/)+select/)?(?P<media>media/(?:guid/\d+/)?)?|(?P<config>(?:[^/\?]+/(?:swf|config)|onsite)/select/))? |theplatform:)(?P<id>[^/\?&]+)''' _TESTS = [{ @@ -119,6 +118,7 @@ class ThePlatformIE(ThePlatformBaseIE): # rtmp download 'skip_download': True, }, + 'skip': '404 Not Found', }, { # from http://www.cnet.com/videos/tesla-model-s-a-second-step-towards-a-cleaner-motoring-future/ 'url': 'http://link.theplatform.com/s/kYEXFC/22d_qsQ6MIRT', @@ -156,7 +156,7 @@ class ThePlatformIE(ThePlatformBaseIE): 'title': 'iPhone Siri’s sassy response to a math question has people talking', 'description': 'md5:a565d1deadd5086f3331d57298ec6333', 'duration': 83.0, - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'timestamp': 1435752600, 'upload_date': '20150701', 'uploader': 'NBCU-NEWS', @@ -167,7 +167,6 @@ class ThePlatformIE(ThePlatformBaseIE): 'url': 'http://player.theplatform.com/p/NnzsPC/onsite_universal/select/media/guid/2410887629/2928790?fwsitesection=nbc_the_blacklist_video_library&autoPlay=true&carouselID=137781', 'only_matching': True, }] - _SERVICE_PROVIDER_TEMPLATE = 'https://sp.auth.adobe.com/adobe-services/%s' @classmethod def _extract_urls(cls, webpage): @@ -202,96 +201,6 @@ class ThePlatformIE(ThePlatformBaseIE): sig = flags + expiration_date + checksum + str_to_hex(sig_secret) return '%s&sig=%s' % (url, sig) - def _extract_mvpd_auth(self, url, video_id, requestor_id, resource): - def xml_text(xml_str, tag): - return self._search_regex( - '<%s>(.+?)</%s>' % (tag, tag), xml_str, tag) - - mvpd_headers = { - 'ap_42': 'anonymous', - 'ap_11': 'Linux i686', - 'ap_z': 'Mozilla/5.0 (X11; Linux i686; rv:47.0) Gecko/20100101 Firefox/47.0', - 'User-Agent': 'Mozilla/5.0 (X11; Linux i686; rv:47.0) Gecko/20100101 Firefox/47.0', - } - - guid = xml_text(resource, 'guid') - requestor_info = self._downloader.cache.load('mvpd', requestor_id) or {} - authn_token = requestor_info.get('authn_token') - if authn_token: - token_expires = unified_timestamp(xml_text(authn_token, 'simpleTokenExpires').replace('_GMT', '')) - if token_expires and token_expires >= time.time(): - authn_token = None - if not authn_token: - # TODO add support for other TV Providers - mso_id = 'DTV' - login_info = netrc.netrc().authenticators(mso_id) - if not login_info: - return None - - def post_form(form_page, note, data={}): - post_url = self._html_search_regex(r'<form[^>]+action=(["\'])(?P<url>.+?)\1', form_page, 'post url', group='url') - return self._download_webpage( - post_url, video_id, note, data=urlencode_postdata(data or self._hidden_inputs(form_page)), headers={ - 'Content-Type': 'application/x-www-form-urlencoded', - }) - - provider_redirect_page = self._download_webpage( - self._SERVICE_PROVIDER_TEMPLATE % 'authenticate/saml', video_id, - 'Downloading Provider Redirect Page', query={ - 'noflash': 'true', - 'mso_id': mso_id, - 'requestor_id': requestor_id, - 'no_iframe': 'false', - 'domain_name': 'adobe.com', - 'redirect_url': url, - }) - provider_login_page = post_form( - provider_redirect_page, 'Downloading Provider Login Page') - mvpd_confirm_page = post_form(provider_login_page, 'Logging in', { - 'username': login_info[0], - 'password': login_info[2], - }) - post_form(mvpd_confirm_page, 'Confirming Login') - - session = self._download_webpage( - self._SERVICE_PROVIDER_TEMPLATE % 'session', video_id, - 'Retrieving Session', data=urlencode_postdata({ - '_method': 'GET', - 'requestor_id': requestor_id, - }), headers=mvpd_headers) - authn_token = unescapeHTML(xml_text(session, 'authnToken')) - requestor_info['authn_token'] = authn_token - self._downloader.cache.store('mvpd', requestor_id, requestor_info) - - authz_token = requestor_info.get(guid) - if not authz_token: - authorize = self._download_webpage( - self._SERVICE_PROVIDER_TEMPLATE % 'authorize', video_id, - 'Retrieving Authorization Token', data=urlencode_postdata({ - 'resource_id': resource, - 'requestor_id': requestor_id, - 'authentication_token': authn_token, - 'mso_id': xml_text(authn_token, 'simpleTokenMsoID'), - 'userMeta': '1', - }), headers=mvpd_headers) - authz_token = unescapeHTML(xml_text(authorize, 'authzToken')) - requestor_info[guid] = authz_token - self._downloader.cache.store('mvpd', requestor_id, requestor_info) - - mvpd_headers.update({ - 'ap_19': xml_text(authn_token, 'simpleSamlNameID'), - 'ap_23': xml_text(authn_token, 'simpleSamlSessionIndex'), - }) - - return self._download_webpage( - self._SERVICE_PROVIDER_TEMPLATE % 'shortAuthorize', - video_id, 'Retrieving Media Token', data=urlencode_postdata({ - 'authz_token': authz_token, - 'requestor_id': requestor_id, - 'session_guid': xml_text(authn_token, 'simpleTokenAuthenticationGuid'), - 'hashed_guid': 'false', - }), headers=mvpd_headers) - def _real_extract(self, url): url, smuggled_data = unsmuggle_url(url, {}) @@ -388,7 +297,7 @@ class ThePlatformFeedIE(ThePlatformBaseIE): 'ext': 'mp4', 'title': 'The Biden factor: will Joe run in 2016?', 'description': 'Could Vice President Joe Biden be preparing a 2016 campaign? Mark Halperin and Sam Stein weigh in.', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'upload_date': '20140208', 'timestamp': 1391824260, 'duration': 467.0, diff --git a/youtube_dl/extractor/thescene.py b/youtube_dl/extractor/thescene.py index 3e4e14031..ce1326c03 100644 --- a/youtube_dl/extractor/thescene.py +++ b/youtube_dl/extractor/thescene.py @@ -7,7 +7,7 @@ from ..utils import qualities class TheSceneIE(InfoExtractor): - _VALID_URL = r'https://thescene\.com/watch/[^/]+/(?P<id>[^/#?]+)' + _VALID_URL = r'https?://thescene\.com/watch/[^/]+/(?P<id>[^/#?]+)' _TEST = { 'url': 'https://thescene.com/watch/vogue/narciso-rodriguez-spring-2013-ready-to-wear', diff --git a/youtube_dl/extractor/thestar.py b/youtube_dl/extractor/thestar.py index ba1380abc..c3f118894 100644 --- a/youtube_dl/extractor/thestar.py +++ b/youtube_dl/extractor/thestar.py @@ -2,8 +2,6 @@ from __future__ import unicode_literals from .common import InfoExtractor -from .brightcove import BrightcoveLegacyIE -from ..compat import compat_parse_qs class TheStarIE(InfoExtractor): @@ -30,6 +28,9 @@ class TheStarIE(InfoExtractor): def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - brightcove_legacy_url = BrightcoveLegacyIE._extract_brightcove_url(webpage) - brightcove_id = compat_parse_qs(brightcove_legacy_url)['@videoPlayer'][0] - return self.url_result(self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, 'BrightcoveNew', brightcove_id) + brightcove_id = self._search_regex( + r'mainartBrightcoveVideoId["\']?\s*:\s*["\']?(\d+)', + webpage, 'brightcove id') + return self.url_result( + self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, + 'BrightcoveNew', brightcove_id) diff --git a/youtube_dl/extractor/theweatherchannel.py b/youtube_dl/extractor/theweatherchannel.py new file mode 100644 index 000000000..c34a49d03 --- /dev/null +++ b/youtube_dl/extractor/theweatherchannel.py @@ -0,0 +1,79 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .theplatform import ThePlatformIE +from ..utils import ( + determine_ext, + parse_duration, +) + + +class TheWeatherChannelIE(ThePlatformIE): + _VALID_URL = r'https?://(?:www\.)?weather\.com/(?:[^/]+/)*video/(?P<id>[^/?#]+)' + _TESTS = [{ + 'url': 'https://weather.com/series/great-outdoors/video/ice-climber-is-in-for-a-shock', + 'md5': 'ab924ac9574e79689c24c6b95e957def', + 'info_dict': { + 'id': 'cc82397e-cc3f-4d11-9390-a785add090e8', + 'ext': 'mp4', + 'title': 'Ice Climber Is In For A Shock', + 'description': 'md5:55606ce1378d4c72e6545e160c9d9695', + 'uploader': 'TWC - Digital (No Distro)', + 'uploader_id': '6ccd5455-16bb-46f2-9c57-ff858bb9f62c', + } + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + drupal_settings = self._parse_json(self._search_regex( + r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);', + webpage, 'drupal settings'), display_id) + video_id = drupal_settings['twc']['contexts']['node']['uuid'] + video_data = self._download_json( + 'https://dsx.weather.com/cms/v4/asset-collection/en_US/' + video_id, video_id) + seo_meta = video_data.get('seometa', {}) + title = video_data.get('title') or seo_meta['title'] + + urls = [] + thumbnails = [] + formats = [] + for variant_id, variant_url in video_data.get('variants', []).items(): + variant_url = variant_url.strip() + if not variant_url or variant_url in urls: + continue + urls.append(variant_url) + ext = determine_ext(variant_url) + if ext == 'jpg': + thumbnails.append({ + 'url': variant_url, + 'id': variant_id, + }) + elif ThePlatformIE.suitable(variant_url): + tp_formats, _ = self._extract_theplatform_smil(variant_url, video_id) + formats.extend(tp_formats) + elif ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + variant_url, video_id, 'mp4', 'm3u8_native', + m3u8_id=variant_id, fatal=False)) + elif ext == 'f4m': + formats.extend(self._extract_f4m_formats( + variant_url, video_id, f4m_id=variant_id, fatal=False)) + else: + formats.append({ + 'url': variant_url, + 'format_id': variant_id, + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': video_data.get('description') or seo_meta.get('description') or seo_meta.get('og:description'), + 'duration': parse_duration(video_data.get('duration')), + 'uploader': video_data.get('providername'), + 'uploader_id': video_data.get('providerid'), + 'thumbnails': thumbnails, + 'formats': formats, + } diff --git a/youtube_dl/extractor/thisamericanlife.py b/youtube_dl/extractor/thisamericanlife.py index 36493a5de..91e45f2c3 100644 --- a/youtube_dl/extractor/thisamericanlife.py +++ b/youtube_dl/extractor/thisamericanlife.py @@ -13,7 +13,7 @@ class ThisAmericanLifeIE(InfoExtractor): 'ext': 'm4a', 'title': '487: Harper High School, Part One', 'description': 'md5:ee40bdf3fb96174a9027f76dbecea655', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', }, }, { 'url': 'http://www.thisamericanlife.org/play_full.php?play=487', diff --git a/youtube_dl/extractor/thisav.py b/youtube_dl/extractor/thisav.py index 7f323c938..4473a3c77 100644 --- a/youtube_dl/extractor/thisav.py +++ b/youtube_dl/extractor/thisav.py @@ -3,13 +3,13 @@ from __future__ import unicode_literals import re -from .common import InfoExtractor -from ..utils import determine_ext +from .jwplatform import JWPlatformBaseIE +from ..utils import remove_end -class ThisAVIE(InfoExtractor): +class ThisAVIE(JWPlatformBaseIE): _VALID_URL = r'https?://(?:www\.)?thisav\.com/video/(?P<id>[0-9]+)/.*' - _TEST = { + _TESTS = [{ 'url': 'http://www.thisav.com/video/47734/%98%26sup1%3B%83%9E%83%82---just-fit.html', 'md5': '0480f1ef3932d901f0e0e719f188f19b', 'info_dict': { @@ -19,29 +19,49 @@ class ThisAVIE(InfoExtractor): 'uploader': 'dj7970', 'uploader_id': 'dj7970' } - } + }, { + 'url': 'http://www.thisav.com/video/242352/nerdy-18yo-big-ass-tattoos-and-glasses.html', + 'md5': 'ba90c076bd0f80203679e5b60bf523ee', + 'info_dict': { + 'id': '242352', + 'ext': 'mp4', + 'title': 'Nerdy 18yo Big Ass Tattoos and Glasses', + 'uploader': 'cybersluts', + 'uploader_id': 'cybersluts', + }, + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) - title = self._html_search_regex(r'<h1>([^<]*)</h1>', webpage, 'title') + title = remove_end(self._html_search_regex( + r'<title>([^<]+)', webpage, 'title'), + ' - 視頻 - ThisAV.com-世界第一中文成人娛樂網站') video_url = self._html_search_regex( - r"addVariable\('file','([^']+)'\);", webpage, 'video url') + r"addVariable\('file','([^']+)'\);", webpage, 'video url', default=None) + if video_url: + info_dict = { + 'formats': [{ + 'url': video_url, + }], + } + else: + info_dict = self._extract_jwplayer_data( + webpage, video_id, require_title=False) uploader = self._html_search_regex( r': ([^<]+)', webpage, 'uploader name', fatal=False) uploader_id = self._html_search_regex( r': (?:[^<]+)', webpage, 'uploader id', fatal=False) - ext = determine_ext(video_url) - return { + info_dict.update({ 'id': video_id, - 'url': video_url, 'uploader': uploader, 'uploader_id': uploader_id, 'title': title, - 'ext': ext, - } + }) + + return info_dict diff --git a/youtube_dl/extractor/thisoldhouse.py b/youtube_dl/extractor/thisoldhouse.py new file mode 100644 index 000000000..197258df1 --- /dev/null +++ b/youtube_dl/extractor/thisoldhouse.py @@ -0,0 +1,35 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class ThisOldHouseIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?thisoldhouse\.com/(?:watch|how-to|tv-episode)/(?P[^/?#]+)' + _TESTS = [{ + 'url': 'https://www.thisoldhouse.com/how-to/how-to-build-storage-bench', + 'md5': '946f05bbaa12a33f9ae35580d2dfcfe3', + 'info_dict': { + 'id': '2REGtUDQ', + 'ext': 'mp4', + 'title': 'How to Build a Storage Bench', + 'description': 'In the workshop, Tom Silva and Kevin O\'Connor build a storage bench for an entryway.', + 'timestamp': 1442548800, + 'upload_date': '20150918', + } + }, { + 'url': 'https://www.thisoldhouse.com/watch/arlington-arts-crafts-arts-and-crafts-class-begins', + 'only_matching': True, + }, { + 'url': 'https://www.thisoldhouse.com/tv-episode/ask-toh-shelf-rough-electric', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + drupal_settings = self._parse_json(self._search_regex( + r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);', + webpage, 'drupal settings'), display_id) + video_id = drupal_settings['jwplatform']['video_id'] + return self.url_result('jwplatform:' + video_id, 'JWPlatform', video_id) diff --git a/youtube_dl/extractor/thvideo.py b/youtube_dl/extractor/thvideo.py deleted file mode 100644 index 406f4a826..000000000 --- a/youtube_dl/extractor/thvideo.py +++ /dev/null @@ -1,84 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - unified_strdate -) - - -class THVideoIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?thvideo\.tv/(?:v/th|mobile\.php\?cid=)(?P[0-9]+)' - _TEST = { - 'url': 'http://thvideo.tv/v/th1987/', - 'md5': 'fa107b1f73817e325e9433505a70db50', - 'info_dict': { - 'id': '1987', - 'ext': 'mp4', - 'title': '【动画】秘封活动记录 ~ The Sealed Esoteric History.分镜稿预览', - 'display_id': 'th1987', - 'thumbnail': 'http://thvideo.tv/uploadfile/2014/0722/20140722013459856.jpg', - 'description': '社团京都幻想剧团的第一个东方二次同人动画作品「秘封活动记录 ~ The Sealed Esoteric History.」 本视频是该动画第一期的分镜草稿...', - 'upload_date': '20140722' - } - } - - def _real_extract(self, url): - video_id = self._match_id(url) - - # extract download link from mobile player page - webpage_player = self._download_webpage( - 'http://thvideo.tv/mobile.php?cid=%s-0' % (video_id), - video_id, note='Downloading video source page') - video_url = self._html_search_regex( - r'', webpage, - 'upload date', fatal=False)) - - return { - 'id': video_id, - 'ext': 'mp4', - 'url': video_url, - 'title': title, - 'display_id': display_id, - 'thumbnail': thumbnail, - 'description': description, - 'upload_date': upload_date - } - - -class THVideoPlaylistIE(InfoExtractor): - _VALID_URL = r'http?://(?:www\.)?thvideo\.tv/mylist(?P[0-9]+)' - _TEST = { - 'url': 'http://thvideo.tv/mylist2', - 'info_dict': { - 'id': '2', - 'title': '幻想万華鏡', - }, - 'playlist_mincount': 23, - } - - def _real_extract(self, url): - playlist_id = self._match_id(url) - - webpage = self._download_webpage(url, playlist_id) - list_title = self._html_search_regex( - r'

(.*?)[\da-z]+)"\);\n' - '\s+fo\.addVariable\("s",\s"(?P\d+)"\);', webpage) + r'\s+fo\.addVariable\("s",\s"(?P\d+)"\);', webpage) if mobj is None: raise ExtractorError('Video %s does not exist' % video_id, expected=True) diff --git a/youtube_dl/extractor/tlc.py b/youtube_dl/extractor/tlc.py index abad3ff64..fd145ba42 100644 --- a/youtube_dl/extractor/tlc.py +++ b/youtube_dl/extractor/tlc.py @@ -1,15 +1,19 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals + import re from .common import InfoExtractor from .brightcove import BrightcoveLegacyIE -from ..compat import compat_parse_qs +from ..compat import ( + compat_parse_qs, + compat_urlparse, +) class TlcDeIE(InfoExtractor): IE_NAME = 'tlc.de' - _VALID_URL = r'https?://www\.tlc\.de/(?:[^/]+/)*videos/(?P[^/?#]+)?(?:.*#(?P<id>\d+))?' + _VALID_URL = r'https?://(?:www\.)?tlc\.de/(?:[^/]+/)*videos/(?P<title>[^/?#]+)?(?:.*#(?P<id>\d+))?' _TEST = { 'url': 'http://www.tlc.de/sendungen/breaking-amish/videos/#3235167922001', @@ -35,5 +39,5 @@ class TlcDeIE(InfoExtractor): title = mobj.group('title') webpage = self._download_webpage(url, title) brightcove_legacy_url = BrightcoveLegacyIE._extract_brightcove_url(webpage) - brightcove_id = compat_parse_qs(brightcove_legacy_url)['@videoPlayer'][0] + brightcove_id = compat_parse_qs(compat_urlparse.urlparse(brightcove_legacy_url).query)['@videoPlayer'][0] return self.url_result(self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, 'BrightcoveNew', brightcove_id) diff --git a/youtube_dl/extractor/tmz.py b/youtube_dl/extractor/tmz.py index 979856e9a..419f9d92e 100644 --- a/youtube_dl/extractor/tmz.py +++ b/youtube_dl/extractor/tmz.py @@ -32,12 +32,15 @@ class TMZArticleIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?tmz\.com/\d{4}/\d{2}/\d{2}/(?P<id>[^/]+)/?' _TEST = { 'url': 'http://www.tmz.com/2015/04/19/bobby-brown-bobbi-kristina-awake-video-concert', - 'md5': 'e482a414a38db73087450e3a6ce69d00', + 'md5': '3316ff838ae5bb7f642537825e1e90d2', 'info_dict': { 'id': '0_6snoelag', - 'ext': 'mp4', + 'ext': 'mov', 'title': 'Bobby Brown Tells Crowd ... Bobbi Kristina is Awake', 'description': 'Bobby Brown stunned his audience during a concert Saturday night, when he told the crowd, "Bobbi is awake. She\'s watching me."', + 'timestamp': 1429467813, + 'upload_date': '20150419', + 'uploader_id': 'batchUser', } } @@ -45,12 +48,9 @@ class TMZArticleIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - embedded_video_info_str = self._html_search_regex( - r'tmzVideoEmbedV2\("([^)]+)"\);', webpage, 'embedded video info') - - embedded_video_info = self._parse_json( - embedded_video_info_str, video_id, - transform_source=lambda s: s.replace('\\', '')) + embedded_video_info = self._parse_json(self._html_search_regex( + r'tmzVideoEmbed\(({.+?})\);', webpage, 'embedded video info'), + video_id) return self.url_result( 'http://www.tmz.com/videos/%s/' % embedded_video_info['id']) diff --git a/youtube_dl/extractor/tnaflix.py b/youtube_dl/extractor/tnaflix.py index 78174178e..7e6ec3430 100644 --- a/youtube_dl/extractor/tnaflix.py +++ b/youtube_dl/extractor/tnaflix.py @@ -10,6 +10,7 @@ from ..utils import ( int_or_none, parse_duration, str_to_int, + unescapeHTML, xpath_text, ) @@ -80,7 +81,8 @@ class TNAFlixNetworkBaseIE(InfoExtractor): if not cfg_url: inputs = self._hidden_inputs(webpage) - cfg_url = 'https://cdn-fck.tnaflix.com/tnaflix/%s.fid?key=%s' % (inputs['vkey'], inputs['nkey']) + cfg_url = ('https://cdn-fck.tnaflix.com/tnaflix/%s.fid?key=%s&VID=%s&premium=1&vip=1&alpha' + % (inputs['vkey'], inputs['nkey'], video_id)) cfg_xml = self._download_xml( cfg_url, display_id, 'Downloading metadata', @@ -89,7 +91,7 @@ class TNAFlixNetworkBaseIE(InfoExtractor): formats = [] def extract_video_url(vl): - return re.sub('speed=\d+', 'speed=', vl.text) + return re.sub(r'speed=\d+', 'speed=', unescapeHTML(vl.text)) video_link = cfg_xml.find('./videoLink') if video_link is not None: @@ -118,8 +120,12 @@ class TNAFlixNetworkBaseIE(InfoExtractor): xpath_text(cfg_xml, './startThumb', 'thumbnail'), 'http:') thumbnails = self._extract_thumbnails(cfg_xml) - title = self._html_search_regex( - self._TITLE_REGEX, webpage, 'title') if self._TITLE_REGEX else self._og_search_title(webpage) + title = None + if self._TITLE_REGEX: + title = self._html_search_regex( + self._TITLE_REGEX, webpage, 'title', default=None) + if not title: + title = self._og_search_title(webpage) age_limit = self._rta_search(webpage) or 18 @@ -168,7 +174,7 @@ class TNAFlixNetworkEmbedIE(TNAFlixNetworkBaseIE): 'display_id': '6538', 'ext': 'mp4', 'title': 'Educational xxx video', - 'thumbnail': 're:https?://.*\.jpg$', + 'thumbnail': r're:https?://.*\.jpg$', 'age_limit': 18, }, 'params': { @@ -189,21 +195,21 @@ class TNAFlixNetworkEmbedIE(TNAFlixNetworkBaseIE): class TNAFlixIE(TNAFlixNetworkBaseIE): _VALID_URL = r'https?://(?:www\.)?tnaflix\.com/[^/]+/(?P<display_id>[^/]+)/video(?P<id>\d+)' - _TITLE_REGEX = r'<title>(.+?) - TNAFlix Porn Videos' - _DESCRIPTION_REGEX = r']+name="description"[^>]+content="([^"]+)"' - _UPLOADER_REGEX = r'\s*Verified Member\s*\s*

(.+?)

' + _TITLE_REGEX = r'(.+?) - (?:TNAFlix Porn Videos|TNAFlix\.com)' + _DESCRIPTION_REGEX = r'(?s)>Description:]+>(.+?)<' + _UPLOADER_REGEX = r'\s*Verified Member\s*\s*(.+?)<' _CATEGORIES_REGEX = r'(?s)]*>Categories:(.+?)

' _TESTS = [{ # anonymous uploader, no categories 'url': 'http://www.tnaflix.com/porn-stars/Carmella-Decesare-striptease/video553878', - 'md5': '7e569419fe6d69543d01e6be22f5f7c4', + 'md5': 'ecf3498417d09216374fc5907f9c6ec0', 'info_dict': { 'id': '553878', 'display_id': 'Carmella-Decesare-striptease', 'ext': 'mp4', 'title': 'Carmella Decesare - striptease', - 'thumbnail': 're:https?://.*\.jpg$', + 'thumbnail': r're:https?://.*\.jpg$', 'duration': 91, 'age_limit': 18, 'categories': ['Porn Stars'], @@ -211,14 +217,14 @@ class TNAFlixIE(TNAFlixNetworkBaseIE): }, { # non-anonymous uploader, categories 'url': 'https://www.tnaflix.com/teen-porn/Educational-xxx-video/video6538', - 'md5': 'fcba2636572895aba116171a899a5658', + 'md5': '0f5d4d490dbfd117b8607054248a07c0', 'info_dict': { 'id': '6538', 'display_id': 'Educational-xxx-video', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Educational xxx video', 'description': 'md5:b4fab8f88a8621c8fabd361a173fe5b8', - 'thumbnail': 're:https?://.*\.jpg$', + 'thumbnail': r're:https?://.*\.jpg$', 'duration': 164, 'age_limit': 18, 'uploader': 'bobwhite39', @@ -244,7 +250,7 @@ class EMPFlixIE(TNAFlixNetworkBaseIE): 'ext': 'mp4', 'title': 'Amateur Finger Fuck', 'description': 'Amateur solo finger fucking.', - 'thumbnail': 're:https?://.*\.jpg$', + 'thumbnail': r're:https?://.*\.jpg$', 'duration': 83, 'age_limit': 18, 'uploader': 'cwbike', @@ -274,7 +280,7 @@ class MovieFapIE(TNAFlixNetworkBaseIE): 'ext': 'mp4', 'title': 'Experienced MILF Amazing Handjob', 'description': 'Experienced MILF giving an Amazing Handjob', - 'thumbnail': 're:https?://.*\.jpg$', + 'thumbnail': r're:https?://.*\.jpg$', 'age_limit': 18, 'uploader': 'darvinfred06', 'view_count': int, @@ -292,7 +298,7 @@ class MovieFapIE(TNAFlixNetworkBaseIE): 'ext': 'flv', 'title': 'Jeune Couple Russe', 'description': 'Amateur', - 'thumbnail': 're:https?://.*\.jpg$', + 'thumbnail': r're:https?://.*\.jpg$', 'age_limit': 18, 'uploader': 'whiskeyjar', 'view_count': int, diff --git a/youtube_dl/extractor/tonline.py b/youtube_dl/extractor/tonline.py new file mode 100644 index 000000000..cc11eae2a --- /dev/null +++ b/youtube_dl/extractor/tonline.py @@ -0,0 +1,59 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import int_or_none + + +class TOnlineIE(InfoExtractor): + IE_NAME = 't-online.de' + _VALID_URL = r'https?://(?:www\.)?t-online\.de/tv/(?:[^/]+/)*id_(?P\d+)' + _TEST = { + 'url': 'http://www.t-online.de/tv/sport/fussball/id_79166266/drittes-remis-zidane-es-muss-etwas-passieren-.html', + 'md5': '7d94dbdde5f9d77c5accc73c39632c29', + 'info_dict': { + 'id': '79166266', + 'ext': 'mp4', + 'title': 'Drittes Remis! Zidane: "Es muss etwas passieren"', + 'description': 'Es läuft nicht rund bei Real Madrid. Das 1:1 gegen den SD Eibar war das dritte Unentschieden in Folge in der Liga.', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + video_data = self._download_json( + 'http://www.t-online.de/tv/id_%s/tid_json_video' % video_id, video_id) + title = video_data['subtitle'] + + formats = [] + for asset in video_data.get('assets', []): + asset_source = asset.get('source') or asset.get('source2') + if not asset_source: + continue + formats_id = [] + for field_key in ('type', 'profile'): + field_value = asset.get(field_key) + if field_value: + formats_id.append(field_value) + formats.append({ + 'format_id': '-'.join(formats_id), + 'url': asset_source, + }) + + thumbnails = [] + for image in video_data.get('images', []): + image_source = image.get('source') + if not image_source: + continue + thumbnails.append({ + 'url': image_source, + }) + + return { + 'id': video_id, + 'title': title, + 'description': video_data.get('description'), + 'duration': int_or_none(video_data.get('duration')), + 'thumbnails': thumbnails, + 'formats': formats, + } diff --git a/youtube_dl/extractor/toutv.py b/youtube_dl/extractor/toutv.py index 54c2d0aa6..26d770992 100644 --- a/youtube_dl/extractor/toutv.py +++ b/youtube_dl/extractor/toutv.py @@ -2,14 +2,24 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import int_or_none +from ..utils import ( + int_or_none, + js_to_json, + ExtractorError, + urlencode_postdata, + extract_attributes, + smuggle_url, +) class TouTvIE(InfoExtractor): + _NETRC_MACHINE = 'toutv' IE_NAME = 'tou.tv' - _VALID_URL = r'https?://ici\.tou\.tv/(?P[a-zA-Z0-9_-]+/S[0-9]+E[0-9]+)' + _VALID_URL = r'https?://ici\.tou\.tv/(?P[a-zA-Z0-9_-]+(?:/S[0-9]+E[0-9]+)?)' + _access_token = None + _claims = None - _TEST = { + _TESTS = [{ 'url': 'http://ici.tou.tv/garfield-tout-court/S2015E17', 'info_dict': { 'id': '122017', @@ -22,18 +32,67 @@ class TouTvIE(InfoExtractor): # m3u8 download 'skip_download': True, }, - } + 'skip': '404 Not Found', + }, { + 'url': 'http://ici.tou.tv/hackers', + 'only_matching': True, + }] + + def _real_initialize(self): + email, password = self._get_login_info() + if email is None: + return + state = 'http://ici.tou.tv//' + webpage = self._download_webpage(state, None, 'Downloading homepage') + toutvlogin = self._parse_json(self._search_regex( + r'(?s)toutvlogin\s*=\s*({.+?});', webpage, 'toutvlogin'), None, js_to_json) + authorize_url = toutvlogin['host'] + '/auth/oauth/v2/authorize' + login_webpage = self._download_webpage( + authorize_url, None, 'Downloading login page', query={ + 'client_id': toutvlogin['clientId'], + 'redirect_uri': 'https://ici.tou.tv/login/loginCallback', + 'response_type': 'token', + 'scope': 'media-drmt openid profile email id.write media-validation.read.privileged', + 'state': state, + }) + login_form = self._search_regex( + r'(?s)(]+(?:id|name)="Form-login".+?)', login_webpage, 'login form') + form_data = self._hidden_inputs(login_form) + form_data.update({ + 'login-email': email, + 'login-password': password, + }) + post_url = extract_attributes(login_form).get('action') or authorize_url + _, urlh = self._download_webpage_handle( + post_url, None, 'Logging in', data=urlencode_postdata(form_data)) + self._access_token = self._search_regex( + r'access_token=([\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})', + urlh.geturl(), 'access token') + self._claims = self._download_json( + 'https://services.radio-canada.ca/media/validation/v2/getClaims', + None, 'Extracting Claims', query={ + 'token': self._access_token, + 'access_token': self._access_token, + })['claims'] def _real_extract(self, url): path = self._match_id(url) metadata = self._download_json('http://ici.tou.tv/presentation/%s' % path, path) + if metadata.get('IsDrm'): + raise ExtractorError('This video is DRM protected.', expected=True) video_id = metadata['IdMedia'] details = metadata['Details'] title = details['OriginalTitle'] + video_url = 'radiocanada:%s:%s' % (metadata.get('AppCode', 'toutv'), video_id) + if self._access_token and self._claims: + video_url = smuggle_url(video_url, { + 'access_token': self._access_token, + 'claims': self._claims, + }) return { '_type': 'url_transparent', - 'url': 'radiocanada:%s:%s' % (metadata.get('AppCode', 'toutv'), video_id), + 'url': video_url, 'id': video_id, 'title': title, 'thumbnail': details.get('ImageUrl'), diff --git a/youtube_dl/extractor/toypics.py b/youtube_dl/extractor/toypics.py index 2579ba8c6..938e05076 100644 --- a/youtube_dl/extractor/toypics.py +++ b/youtube_dl/extractor/toypics.py @@ -1,4 +1,4 @@ -# -*- coding:utf-8 -*- +# coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor diff --git a/youtube_dl/extractor/trollvids.py b/youtube_dl/extractor/trollvids.py deleted file mode 100644 index 657705623..000000000 --- a/youtube_dl/extractor/trollvids.py +++ /dev/null @@ -1,36 +0,0 @@ -# encoding: utf-8 -from __future__ import unicode_literals - -import re - -from .nuevo import NuevoBaseIE - - -class TrollvidsIE(NuevoBaseIE): - _VALID_URL = r'https?://(?:www\.)?trollvids\.com/video/(?P\d+)/(?P[^/?#&]+)' - IE_NAME = 'trollvids' - _TEST = { - 'url': 'http://trollvids.com/video/2349002/%E3%80%90MMD-R-18%E3%80%91%E3%82%AC%E3%83%BC%E3%83%AB%E3%83%95%E3%83%AC%E3%83%B3%E3%83%89-carrymeoff', - 'md5': '1d53866b2c514b23ed69e4352fdc9839', - 'info_dict': { - 'id': '2349002', - 'ext': 'mp4', - 'title': '【MMD R-18】ガールフレンド carry_me_off', - 'age_limit': 18, - 'duration': 216.78, - }, - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - display_id = mobj.group('display_id') - - info = self._extract_nuevo( - 'http://trollvids.com/nuevo/player/config.php?v=%s' % video_id, - video_id) - info.update({ - 'display_id': display_id, - 'age_limit': 18 - }) - return info diff --git a/youtube_dl/extractor/trutube.py b/youtube_dl/extractor/trutube.py deleted file mode 100644 index d55e0c563..000000000 --- a/youtube_dl/extractor/trutube.py +++ /dev/null @@ -1,26 +0,0 @@ -from __future__ import unicode_literals - -from .nuevo import NuevoBaseIE - - -class TruTubeIE(NuevoBaseIE): - _VALID_URL = r'https?://(?:www\.)?trutube\.tv/(?:video/|nuevo/player/embed\.php\?v=)(?P\d+)' - _TESTS = [{ - 'url': 'http://trutube.tv/video/14880/Ramses-II-Proven-To-Be-A-Red-Headed-Caucasoid-', - 'md5': 'c5b6e301b0a2040b074746cbeaa26ca1', - 'info_dict': { - 'id': '14880', - 'ext': 'flv', - 'title': 'Ramses II - Proven To Be A Red Headed Caucasoid', - 'thumbnail': 're:^http:.*\.jpg$', - } - }, { - 'url': 'https://trutube.tv/nuevo/player/embed.php?v=14880', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - return self._extract_nuevo( - 'https://trutube.tv/nuevo/player/config.php?v=%s' % video_id, - video_id) diff --git a/youtube_dl/extractor/trutv.py b/youtube_dl/extractor/trutv.py new file mode 100644 index 000000000..3a5782525 --- /dev/null +++ b/youtube_dl/extractor/trutv.py @@ -0,0 +1,47 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .turner import TurnerBaseIE + + +class TruTVIE(TurnerBaseIE): + _VALID_URL = r'https?://(?:www\.)?trutv\.com(?:(?P/shows/[^/]+/videos/[^/?#]+?)\.html|/full-episodes/[^/]+/(?P\d+))' + _TEST = { + 'url': 'http://www.trutv.com/shows/10-things/videos/you-wont-believe-these-sports-bets.html', + 'md5': '2cdc844f317579fed1a7251b087ff417', + 'info_dict': { + 'id': '/shows/10-things/videos/you-wont-believe-these-sports-bets', + 'ext': 'mp4', + 'title': 'You Won\'t Believe These Sports Bets', + 'description': 'Jamie Lee sits down with a bookie to discuss the bizarre world of illegal sports betting.', + 'upload_date': '20130305', + } + } + + def _real_extract(self, url): + path, video_id = re.match(self._VALID_URL, url).groups() + auth_required = False + if path: + data_src = 'http://www.trutv.com/video/cvp/v2/xml/content.xml?id=%s.xml' % path + else: + webpage = self._download_webpage(url, video_id) + video_id = self._search_regex( + r"TTV\.TVE\.episodeId\s*=\s*'([^']+)';", + webpage, 'video id', default=video_id) + auth_required = self._search_regex( + r'TTV\.TVE\.authRequired\s*=\s*(true|false);', + webpage, 'auth required', default='false') == 'true' + data_src = 'http://www.trutv.com/tveverywhere/services/cvpXML.do?titleId=' + video_id + return self._extract_cvp_info( + data_src, path, { + 'secure': { + 'media_src': 'http://androidhls-secure.cdn.turner.com/trutv/big', + 'tokenizer_src': 'http://www.trutv.com/tveverywhere/processors/services/token_ipadAdobe.do', + }, + }, { + 'url': url, + 'site_name': 'truTV', + 'auth_required': auth_required, + }) diff --git a/youtube_dl/extractor/tube8.py b/youtube_dl/extractor/tube8.py index 1d9271d1e..1853a1104 100644 --- a/youtube_dl/extractor/tube8.py +++ b/youtube_dl/extractor/tube8.py @@ -2,17 +2,14 @@ from __future__ import unicode_literals import re -from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( int_or_none, - sanitized_Request, str_to_int, ) -from ..aes import aes_decrypt_text +from .keezmovies import KeezMoviesIE -class Tube8IE(InfoExtractor): +class Tube8IE(KeezMoviesIE): _VALID_URL = r'https?://(?:www\.)?tube8\.com/(?:[^/]+/)+(?P[^/]+)/(?P\d+)' _TESTS = [{ 'url': 'http://www.tube8.com/teen/kasia-music-video/229795/', @@ -26,54 +23,26 @@ class Tube8IE(InfoExtractor): 'title': 'Kasia music video', 'age_limit': 18, 'duration': 230, - } + 'categories': ['Teen'], + 'tags': ['dancing'], + }, }, { 'url': 'http://www.tube8.com/shemale/teen/blonde-cd-gets-kidnapped-by-two-blacks-and-punished-for-being-a-slutty-girl/19569151/', 'only_matching': True, }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - display_id = mobj.group('display_id') + webpage, info = self._extract_info(url) - req = sanitized_Request(url) - req.add_header('Cookie', 'age_verified=1') - webpage = self._download_webpage(req, display_id) + if not info['title']: + info['title'] = self._html_search_regex( + r'videoTitle\s*=\s*"([^"]+)', webpage, 'title') - flashvars = self._parse_json( - self._search_regex( - r'flashvars\s*=\s*({.+?});\r?\n', webpage, 'flashvars'), - video_id) - - formats = [] - for key, video_url in flashvars.items(): - if not isinstance(video_url, compat_str) or not video_url.startswith('http'): - continue - height = self._search_regex( - r'quality_(\d+)[pP]', key, 'height', default=None) - if not height: - continue - if flashvars.get('encrypted') is True: - video_url = aes_decrypt_text( - video_url, flashvars['video_title'], 32).decode('utf-8') - formats.append({ - 'url': video_url, - 'format_id': '%sp' % height, - 'height': int(height), - }) - self._sort_formats(formats) - - thumbnail = flashvars.get('image_url') - - title = self._html_search_regex( - r'videoTitle\s*=\s*"([^"]+)', webpage, 'title') description = self._html_search_regex( r'>Description:
\s*(.+?)\s*<', webpage, 'description', fatal=False) uploader = self._html_search_regex( r'\s*(.+?)\s*<', webpage, 'uploader', fatal=False) - duration = int_or_none(flashvars.get('video_duration')) like_count = int_or_none(self._search_regex( r'rupVar\s*=\s*"(\d+)"', webpage, 'like count', fatal=False)) @@ -86,18 +55,26 @@ class Tube8IE(InfoExtractor): r'(\d+)', webpage, 'comment count', fatal=False)) - return { - 'id': video_id, - 'display_id': display_id, - 'title': title, + category = self._search_regex( + r'Category:\s*
\s*]+href=[^>]+>([^<]+)', + webpage, 'category', fatal=False) + categories = [category] if category else None + + tags_str = self._search_regex( + r'(?s)Tags:\s*(.+?)]+href=[^>]+>([^<]+)', tags_str)] if tags_str else None + + info.update({ 'description': description, - 'thumbnail': thumbnail, 'uploader': uploader, - 'duration': duration, 'view_count': view_count, 'like_count': like_count, 'dislike_count': dislike_count, 'comment_count': comment_count, - 'age_limit': 18, - 'formats': formats, - } + 'categories': categories, + 'tags': tags, + }) + + return info diff --git a/youtube_dl/extractor/tubitv.py b/youtube_dl/extractor/tubitv.py index c6572defb..3a37df2e8 100644 --- a/youtube_dl/extractor/tubitv.py +++ b/youtube_dl/extractor/tubitv.py @@ -9,7 +9,6 @@ from ..utils import ( int_or_none, sanitized_Request, urlencode_postdata, - parse_iso8601, ) @@ -19,17 +18,13 @@ class TubiTvIE(InfoExtractor): _NETRC_MACHINE = 'tubitv' _TEST = { 'url': 'http://tubitv.com/video/283829/the_comedian_at_the_friday', + 'md5': '43ac06be9326f41912dc64ccf7a80320', 'info_dict': { 'id': '283829', 'ext': 'mp4', 'title': 'The Comedian at The Friday', 'description': 'A stand up comedian is forced to look at the decisions in his life while on a one week trip to the west coast.', - 'uploader': 'Indie Rights Films', - 'upload_date': '20160111', - 'timestamp': 1452555979, - }, - 'params': { - 'skip_download': 'HLS download', + 'uploader_id': 'bc168bee0d18dd1cb3b86c68706ab434', }, } @@ -58,19 +53,28 @@ class TubiTvIE(InfoExtractor): video_id = self._match_id(url) video_data = self._download_json( 'http://tubitv.com/oz/videos/%s/content' % video_id, video_id) - title = video_data['n'] + title = video_data['title'] formats = self._extract_m3u8_formats( - video_data['mh'], video_id, 'mp4', 'm3u8_native') + self._proto_relative_url(video_data['url']), + video_id, 'mp4', 'm3u8_native') self._sort_formats(formats) + thumbnails = [] + for thumbnail_url in video_data.get('thumbnails', []): + if not thumbnail_url: + continue + thumbnails.append({ + 'url': self._proto_relative_url(thumbnail_url), + }) + subtitles = {} - for sub in video_data.get('sb', []): - sub_url = sub.get('u') + for sub in video_data.get('subtitles', []): + sub_url = sub.get('url') if not sub_url: continue - subtitles.setdefault(sub.get('l', 'en'), []).append({ - 'url': sub_url, + subtitles.setdefault(sub.get('lang', 'English'), []).append({ + 'url': self._proto_relative_url(sub_url), }) return { @@ -78,9 +82,8 @@ class TubiTvIE(InfoExtractor): 'title': title, 'formats': formats, 'subtitles': subtitles, - 'thumbnail': video_data.get('ph'), - 'description': video_data.get('d'), - 'duration': int_or_none(video_data.get('s')), - 'timestamp': parse_iso8601(video_data.get('u')), - 'uploader': video_data.get('on'), + 'thumbnails': thumbnails, + 'description': video_data.get('description'), + 'duration': int_or_none(video_data.get('duration')), + 'uploader_id': video_data.get('publisher_id'), } diff --git a/youtube_dl/extractor/tudou.py b/youtube_dl/extractor/tudou.py index bb8b8e234..2aae55e7e 100644 --- a/youtube_dl/extractor/tudou.py +++ b/youtube_dl/extractor/tudou.py @@ -23,7 +23,7 @@ class TudouIE(InfoExtractor): 'id': '159448201', 'ext': 'f4v', 'title': '卡马乔国足开大脚长传冲吊集锦', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'timestamp': 1372113489000, 'description': '卡马乔卡家军,开大脚先进战术不完全集锦!', 'duration': 289.04, @@ -36,7 +36,7 @@ class TudouIE(InfoExtractor): 'id': '117049447', 'ext': 'f4v', 'title': 'La Sylphide-Bolshoi-Ekaterina Krysanova & Vyacheslav Lopatin 2012', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'timestamp': 1349207518000, 'description': 'md5:294612423894260f2dcd5c6c04fe248b', 'duration': 5478.33, diff --git a/youtube_dl/extractor/tumblr.py b/youtube_dl/extractor/tumblr.py index 4d8b57111..786143525 100644 --- a/youtube_dl/extractor/tumblr.py +++ b/youtube_dl/extractor/tumblr.py @@ -1,4 +1,4 @@ -# -*- coding: utf-8 -*- +# coding: utf-8 from __future__ import unicode_literals import re @@ -17,7 +17,7 @@ class TumblrIE(InfoExtractor): 'ext': 'mp4', 'title': 'tatiana maslany news, Orphan Black || DVD extra - behind the scenes ↳...', 'description': 'md5:37db8211e40b50c7c44e95da14f630b7', - 'thumbnail': 're:http://.*\.jpg', + 'thumbnail': r're:http://.*\.jpg', } }, { 'url': 'http://5sostrum.tumblr.com/post/90208453769/yall-forgetting-the-greatest-keek-of-them-all', @@ -27,7 +27,7 @@ class TumblrIE(InfoExtractor): 'ext': 'mp4', 'title': '5SOS STRUM ;]', 'description': 'md5:dba62ac8639482759c8eb10ce474586a', - 'thumbnail': 're:http://.*\.jpg', + 'thumbnail': r're:http://.*\.jpg', } }, { 'url': 'http://hdvideotest.tumblr.com/post/130323439814/test-description-for-my-hd-video', @@ -37,7 +37,7 @@ class TumblrIE(InfoExtractor): 'ext': 'mp4', 'title': 'HD Video Testing \u2014 Test description for my HD video', 'description': 'md5:97cc3ab5fcd27ee4af6356701541319c', - 'thumbnail': 're:http://.*\.jpg', + 'thumbnail': r're:http://.*\.jpg', }, 'params': { 'format': 'hd', @@ -92,7 +92,7 @@ class TumblrIE(InfoExtractor): 'title': 'Video by victoriassecret', 'description': 'Invisibility or flight…which superpower would YOU choose? #VSFashionShow #ThisOrThat', 'uploader_id': 'victoriassecret', - 'thumbnail': 're:^https?://.*\.jpg' + 'thumbnail': r're:^https?://.*\.jpg' }, 'add_ie': ['Instagram'], }] diff --git a/youtube_dl/extractor/tunein.py b/youtube_dl/extractor/tunein.py index ae4cfaec2..7e51de89e 100644 --- a/youtube_dl/extractor/tunein.py +++ b/youtube_dl/extractor/tunein.py @@ -11,6 +11,12 @@ from ..compat import compat_urlparse class TuneInBaseIE(InfoExtractor): _API_BASE_URL = 'http://tunein.com/tuner/tune/' + @staticmethod + def _extract_urls(webpage): + return re.findall( + r']+src=["\'](?P(?:https?://)?tunein\.com/embed/player/[pst]\d+)', + webpage) + def _real_extract(self, url): content_id = self._match_id(url) @@ -69,82 +75,83 @@ class TuneInClipIE(TuneInBaseIE): _VALID_URL = r'https?://(?:www\.)?tunein\.com/station/.*?audioClipId\=(?P\d+)' _API_URL_QUERY = '?tuneType=AudioClip&audioclipId=%s' - _TESTS = [ - { - 'url': 'http://tunein.com/station/?stationId=246119&audioClipId=816', - 'md5': '99f00d772db70efc804385c6b47f4e77', - 'info_dict': { - 'id': '816', - 'title': '32m', - 'ext': 'mp3', - }, + _TESTS = [{ + 'url': 'http://tunein.com/station/?stationId=246119&audioClipId=816', + 'md5': '99f00d772db70efc804385c6b47f4e77', + 'info_dict': { + 'id': '816', + 'title': '32m', + 'ext': 'mp3', }, - ] + }] class TuneInStationIE(TuneInBaseIE): IE_NAME = 'tunein:station' - _VALID_URL = r'https?://(?:www\.)?tunein\.com/(?:radio/.*?-s|station/.*?StationId\=)(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?tunein\.com/(?:radio/.*?-s|station/.*?StationId=|embed/player/s)(?P\d+)' _API_URL_QUERY = '?tuneType=Station&stationId=%s' @classmethod def suitable(cls, url): return False if TuneInClipIE.suitable(url) else super(TuneInStationIE, cls).suitable(url) - _TESTS = [ - { - 'url': 'http://tunein.com/radio/Jazz24-885-s34682/', - 'info_dict': { - 'id': '34682', - 'title': 'Jazz 24 on 88.5 Jazz24 - KPLU-HD2', - 'ext': 'mp3', - 'location': 'Tacoma, WA', - }, - 'params': { - 'skip_download': True, # live stream - }, + _TESTS = [{ + 'url': 'http://tunein.com/radio/Jazz24-885-s34682/', + 'info_dict': { + 'id': '34682', + 'title': 'Jazz 24 on 88.5 Jazz24 - KPLU-HD2', + 'ext': 'mp3', + 'location': 'Tacoma, WA', }, - ] + 'params': { + 'skip_download': True, # live stream + }, + }, { + 'url': 'http://tunein.com/embed/player/s6404/', + 'only_matching': True, + }] class TuneInProgramIE(TuneInBaseIE): IE_NAME = 'tunein:program' - _VALID_URL = r'https?://(?:www\.)?tunein\.com/(?:radio/.*?-p|program/.*?ProgramId\=)(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?tunein\.com/(?:radio/.*?-p|program/.*?ProgramId=|embed/player/p)(?P\d+)' _API_URL_QUERY = '?tuneType=Program&programId=%s' - _TESTS = [ - { - 'url': 'http://tunein.com/radio/Jazz-24-p2506/', - 'info_dict': { - 'id': '2506', - 'title': 'Jazz 24 on 91.3 WUKY-HD3', - 'ext': 'mp3', - 'location': 'Lexington, KY', - }, - 'params': { - 'skip_download': True, # live stream - }, + _TESTS = [{ + 'url': 'http://tunein.com/radio/Jazz-24-p2506/', + 'info_dict': { + 'id': '2506', + 'title': 'Jazz 24 on 91.3 WUKY-HD3', + 'ext': 'mp3', + 'location': 'Lexington, KY', }, - ] + 'params': { + 'skip_download': True, # live stream + }, + }, { + 'url': 'http://tunein.com/embed/player/p191660/', + 'only_matching': True, + }] class TuneInTopicIE(TuneInBaseIE): IE_NAME = 'tunein:topic' - _VALID_URL = r'https?://(?:www\.)?tunein\.com/topic/.*?TopicId\=(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?tunein\.com/(?:topic/.*?TopicId=|embed/player/t)(?P\d+)' _API_URL_QUERY = '?tuneType=Topic&topicId=%s' - _TESTS = [ - { - 'url': 'http://tunein.com/topic/?TopicId=101830576', - 'md5': 'c31a39e6f988d188252eae7af0ef09c9', - 'info_dict': { - 'id': '101830576', - 'title': 'Votez pour moi du 29 octobre 2015 (29/10/15)', - 'ext': 'mp3', - 'location': 'Belgium', - }, + _TESTS = [{ + 'url': 'http://tunein.com/topic/?TopicId=101830576', + 'md5': 'c31a39e6f988d188252eae7af0ef09c9', + 'info_dict': { + 'id': '101830576', + 'title': 'Votez pour moi du 29 octobre 2015 (29/10/15)', + 'ext': 'mp3', + 'location': 'Belgium', }, - ] + }, { + 'url': 'http://tunein.com/embed/player/t101830576/', + 'only_matching': True, + }] class TuneInShortenerIE(InfoExtractor): diff --git a/youtube_dl/extractor/turbo.py b/youtube_dl/extractor/turbo.py index 7ae63a499..25aa9c58e 100644 --- a/youtube_dl/extractor/turbo.py +++ b/youtube_dl/extractor/turbo.py @@ -24,7 +24,7 @@ class TurboIE(InfoExtractor): 'duration': 3715, 'title': 'Turbo du 07/09/2014 : Renault Twingo 3, Bentley Continental GT Speed, CES, Guide Achat Dacia... ', 'description': 'Turbo du 07/09/2014 : Renault Twingo 3, Bentley Continental GT Speed, CES, Guide Achat Dacia...', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', } } diff --git a/youtube_dl/extractor/turner.py b/youtube_dl/extractor/turner.py new file mode 100644 index 000000000..57ffedb87 --- /dev/null +++ b/youtube_dl/extractor/turner.py @@ -0,0 +1,175 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .adobepass import AdobePassIE +from ..compat import compat_str +from ..utils import ( + xpath_text, + int_or_none, + determine_ext, + parse_duration, + xpath_attr, + update_url_query, + ExtractorError, +) + + +class TurnerBaseIE(AdobePassIE): + def _extract_timestamp(self, video_data): + return int_or_none(xpath_attr(video_data, 'dateCreated', 'uts')) + + def _extract_cvp_info(self, data_src, video_id, path_data={}, ap_data={}): + video_data = self._download_xml(data_src, video_id) + video_id = video_data.attrib['id'] + title = xpath_text(video_data, 'headline', fatal=True) + content_id = xpath_text(video_data, 'contentId') or video_id + # rtmp_src = xpath_text(video_data, 'akamai/src') + # if rtmp_src: + # splited_rtmp_src = rtmp_src.split(',') + # if len(splited_rtmp_src) == 2: + # rtmp_src = splited_rtmp_src[1] + # aifp = xpath_text(video_data, 'akamai/aifp', default='') + + tokens = {} + urls = [] + formats = [] + rex = re.compile( + r'(?P[0-9]+)x(?P[0-9]+)(?:_(?P[0-9]+))?') + # Possible formats locations: files/file, files/groupFiles/files + # and maybe others + for video_file in video_data.findall('.//file'): + video_url = video_file.text.strip() + if not video_url: + continue + ext = determine_ext(video_url) + if video_url.startswith('/mp4:protected/'): + continue + # TODO Correct extraction for these files + # protected_path_data = path_data.get('protected') + # if not protected_path_data or not rtmp_src: + # continue + # protected_path = self._search_regex( + # r'/mp4:(.+)\.[a-z0-9]', video_url, 'secure path') + # auth = self._download_webpage( + # protected_path_data['tokenizer_src'], query={ + # 'path': protected_path, + # 'videoId': content_id, + # 'aifp': aifp, + # }) + # token = xpath_text(auth, 'token') + # if not token: + # continue + # video_url = rtmp_src + video_url + '?' + token + elif video_url.startswith('/secure/'): + secure_path_data = path_data.get('secure') + if not secure_path_data: + continue + video_url = secure_path_data['media_src'] + video_url + secure_path = self._search_regex(r'https?://[^/]+(.+/)', video_url, 'secure path') + '*' + token = tokens.get(secure_path) + if not token: + query = { + 'path': secure_path, + 'videoId': content_id, + } + if ap_data.get('auth_required'): + query['accessToken'] = self._extract_mvpd_auth(ap_data['url'], video_id, ap_data['site_name'], ap_data['site_name']) + auth = self._download_xml( + secure_path_data['tokenizer_src'], video_id, query=query) + error_msg = xpath_text(auth, 'error/msg') + if error_msg: + raise ExtractorError(error_msg, expected=True) + token = xpath_text(auth, 'token') + if not token: + continue + tokens[secure_path] = token + video_url = video_url + '?hdnea=' + token + elif not re.match('https?://', video_url): + base_path_data = path_data.get(ext, path_data.get('default', {})) + media_src = base_path_data.get('media_src') + if not media_src: + continue + video_url = media_src + video_url + if video_url in urls: + continue + urls.append(video_url) + format_id = video_file.get('bitrate') + if ext == 'smil': + formats.extend(self._extract_smil_formats( + video_url, video_id, fatal=False)) + elif ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + video_url, video_id, 'mp4', + m3u8_id=format_id or 'hls', fatal=False)) + elif ext == 'f4m': + formats.extend(self._extract_f4m_formats( + update_url_query(video_url, {'hdcore': '3.7.0'}), + video_id, f4m_id=format_id or 'hds', fatal=False)) + else: + f = { + 'format_id': format_id, + 'url': video_url, + 'ext': ext, + } + mobj = rex.search(format_id + video_url) + if mobj: + f.update({ + 'width': int(mobj.group('width')), + 'height': int(mobj.group('height')), + 'tbr': int_or_none(mobj.group('bitrate')), + }) + elif isinstance(format_id, compat_str): + if format_id.isdigit(): + f['tbr'] = int(format_id) + else: + mobj = re.match(r'ios_(audio|[0-9]+)$', format_id) + if mobj: + if mobj.group(1) == 'audio': + f.update({ + 'vcodec': 'none', + 'ext': 'm4a', + }) + else: + f['tbr'] = int(mobj.group(1)) + formats.append(f) + self._sort_formats(formats) + + subtitles = {} + for source in video_data.findall('closedCaptions/source'): + for track in source.findall('track'): + track_url = track.get('url') + if not isinstance(track_url, compat_str) or track_url.endswith('/big'): + continue + lang = track.get('lang') or track.get('label') or 'en' + subtitles.setdefault(lang, []).append({ + 'url': track_url, + 'ext': { + 'scc': 'scc', + 'webvtt': 'vtt', + 'smptett': 'tt', + }.get(source.get('format')) + }) + + thumbnails = [{ + 'id': image.get('cut'), + 'url': image.text, + 'width': int_or_none(image.get('width')), + 'height': int_or_none(image.get('height')), + } for image in video_data.findall('images/image')] + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'subtitles': subtitles, + 'thumbnails': thumbnails, + 'description': xpath_text(video_data, 'description'), + 'duration': parse_duration(xpath_text(video_data, 'length') or xpath_text(video_data, 'trt')), + 'timestamp': self._extract_timestamp(video_data), + 'upload_date': xpath_attr(video_data, 'metas', 'version'), + 'series': xpath_text(video_data, 'showTitle'), + 'season_number': int_or_none(xpath_text(video_data, 'seasonNumber')), + 'episode_number': int_or_none(xpath_text(video_data, 'episodeNumber')), + } diff --git a/youtube_dl/extractor/tv2.py b/youtube_dl/extractor/tv2.py index f225ec684..d5071e8a5 100644 --- a/youtube_dl/extractor/tv2.py +++ b/youtube_dl/extractor/tv2.py @@ -1,4 +1,4 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals import re @@ -126,7 +126,7 @@ class TV2ArticleIE(InfoExtractor): if not assets: # New embed pattern - for v in re.findall('TV2ContentboxVideo\(({.+?})\)', webpage): + for v in re.findall(r'TV2ContentboxVideo\(({.+?})\)', webpage): video = self._parse_json( v, playlist_id, transform_source=js_to_json, fatal=False) if not video: diff --git a/youtube_dl/extractor/tv4.py b/youtube_dl/extractor/tv4.py index 343edf206..ad79db92b 100644 --- a/youtube_dl/extractor/tv4.py +++ b/youtube_dl/extractor/tv4.py @@ -2,9 +2,12 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( - ExtractorError, + int_or_none, parse_iso8601, + try_get, + determine_ext, ) @@ -24,24 +27,24 @@ class TV4IE(InfoExtractor): _TESTS = [ { 'url': 'http://www.tv4.se/kalla-fakta/klipp/kalla-fakta-5-english-subtitles-2491650', - 'md5': '909d6454b87b10a25aa04c4bdd416a9b', + 'md5': 'cb837212f342d77cec06e6dad190e96d', 'info_dict': { 'id': '2491650', 'ext': 'mp4', 'title': 'Kalla Fakta 5 (english subtitles)', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'timestamp': int, 'upload_date': '20131125', }, }, { 'url': 'http://www.tv4play.se/iframe/video/3054113', - 'md5': '77f851c55139ffe0ebd41b6a5552489b', + 'md5': 'cb837212f342d77cec06e6dad190e96d', 'info_dict': { 'id': '3054113', 'ext': 'mp4', 'title': 'Så här jobbar ficktjuvarna - se avslöjande bilder', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'description': 'Unika bilder avslöjar hur turisternas fickor vittjas mitt på Stockholms central. Två experter på ficktjuvarna avslöjar knepen du ska se upp för.', 'timestamp': int, 'upload_date': '20150130', @@ -65,36 +68,61 @@ class TV4IE(InfoExtractor): video_id = self._match_id(url) info = self._download_json( - 'http://www.tv4play.se/player/assets/%s.json' % video_id, video_id, 'Downloading video info JSON') + 'http://www.tv4play.se/player/assets/%s.json' % video_id, + video_id, 'Downloading video info JSON') # If is_geo_restricted is true, it doesn't necessarily mean we can't download it - if info['is_geo_restricted']: + if info.get('is_geo_restricted'): self.report_warning('This content might not be available in your country due to licensing restrictions.') - if info['requires_subscription']: - raise ExtractorError('This content requires subscription.', expected=True) - sources_data = self._download_json( - 'https://prima.tv4play.se/api/web/asset/%s/play.json?protocol=http&videoFormat=MP4' % video_id, video_id, 'Downloading sources JSON') - sources = sources_data['playback'] + title = info['title'] + subtitles = {} formats = [] - for item in sources.get('items', {}).get('item', []): - ext, bitrate = item['mediaFormat'], item['bitrate'] - formats.append({ - 'format_id': '%s_%s' % (ext, bitrate), - 'tbr': bitrate, - 'ext': ext, - 'url': item['url'], - }) + # http formats are linked with unresolvable host + for kind in ('hls', ''): + data = self._download_json( + 'https://prima.tv4play.se/api/web/asset/%s/play.json' % video_id, + video_id, 'Downloading sources JSON', query={ + 'protocol': kind, + 'videoFormat': 'MP4+WEBVTT', + }) + items = try_get(data, lambda x: x['playback']['items']['item']) + if not items: + continue + if isinstance(items, dict): + items = [items] + for item in items: + manifest_url = item.get('url') + if not isinstance(manifest_url, compat_str): + continue + ext = determine_ext(manifest_url) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + manifest_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id=kind, fatal=False)) + elif ext == 'f4m': + formats.extend(self._extract_akamai_formats( + manifest_url, video_id, { + 'hls': 'tv4play-i.akamaihd.net', + })) + elif ext == 'webvtt': + subtitles = self._merge_subtitles( + subtitles, { + 'sv': [{ + 'url': manifest_url, + 'ext': 'vtt', + }]}) self._sort_formats(formats) return { 'id': video_id, - 'title': info['title'], + 'title': title, 'formats': formats, + 'subtitles': subtitles, 'description': info.get('description'), 'timestamp': parse_iso8601(info.get('broadcast_date_time')), - 'duration': info.get('duration'), + 'duration': int_or_none(info.get('duration')), 'thumbnail': info.get('image'), - 'is_live': sources.get('live'), + 'is_live': info.get('is_live') is True, } diff --git a/youtube_dl/extractor/tva.py b/youtube_dl/extractor/tva.py new file mode 100644 index 000000000..3ced098f9 --- /dev/null +++ b/youtube_dl/extractor/tva.py @@ -0,0 +1,54 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_iso8601, + smuggle_url, +) + + +class TVAIE(InfoExtractor): + _VALID_URL = r'https?://videos\.tva\.ca/episode/(?P\d+)' + _TEST = { + 'url': 'http://videos.tva.ca/episode/85538', + 'info_dict': { + 'id': '85538', + 'ext': 'mp4', + 'title': 'Épisode du 25 janvier 2017', + 'description': 'md5:e9e7fb5532ab37984d2dc87229cadf98', + 'upload_date': '20170126', + 'timestamp': 1485442329, + }, + 'params': { + # m3u8 download + 'skip_download': True, + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + video_data = self._download_json( + "https://d18jmrhziuoi7p.cloudfront.net/isl/api/v1/dataservice/Items('%s')" % video_id, + video_id, query={ + '$expand': 'Metadata,CustomId', + '$select': 'Metadata,Id,Title,ShortDescription,LongDescription,CreatedDate,CustomId,AverageUserRating,Categories,ShowName', + '$format': 'json', + }) + metadata = video_data.get('Metadata', {}) + + return { + '_type': 'url_transparent', + 'id': video_id, + 'title': video_data['Title'], + 'url': smuggle_url('ooyala:' + video_data['CustomId'], {'supportedformats': 'm3u8,hds'}), + 'description': video_data.get('LongDescription') or video_data.get('ShortDescription'), + 'series': video_data.get('ShowName'), + 'episode': metadata.get('EpisodeTitle'), + 'episode_number': int_or_none(metadata.get('EpisodeNumber')), + 'categories': video_data.get('Categories'), + 'average_rating': video_data.get('AverageUserRating'), + 'timestamp': parse_iso8601(video_data.get('CreatedDate')), + 'ie_key': 'Ooyala', + } diff --git a/youtube_dl/extractor/tvanouvelles.py b/youtube_dl/extractor/tvanouvelles.py new file mode 100644 index 000000000..1086176a2 --- /dev/null +++ b/youtube_dl/extractor/tvanouvelles.py @@ -0,0 +1,65 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from .brightcove import BrightcoveNewIE + + +class TVANouvellesIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?tvanouvelles\.ca/videos/(?P\d+)' + _TEST = { + 'url': 'http://www.tvanouvelles.ca/videos/5117035533001', + 'info_dict': { + 'id': '5117035533001', + 'ext': 'mp4', + 'title': 'L’industrie du taxi dénonce l’entente entre Québec et Uber: explications', + 'description': 'md5:479653b7c8cf115747bf5118066bd8b3', + 'uploader_id': '1741764581', + 'timestamp': 1473352030, + 'upload_date': '20160908', + }, + 'add_ie': ['BrightcoveNew'], + } + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1741764581/default_default/index.html?videoId=%s' + + def _real_extract(self, url): + brightcove_id = self._match_id(url) + return self.url_result( + self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, + BrightcoveNewIE.ie_key(), brightcove_id) + + +class TVANouvellesArticleIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?tvanouvelles\.ca/(?:[^/]+/)+(?P[^/?#&]+)' + _TEST = { + 'url': 'http://www.tvanouvelles.ca/2016/11/17/des-policiers-qui-ont-la-meche-un-peu-courte', + 'info_dict': { + 'id': 'des-policiers-qui-ont-la-meche-un-peu-courte', + 'title': 'Des policiers qui ont «la mèche un peu courte»?', + 'description': 'md5:92d363c8eb0f0f030de9a4a84a90a3a0', + }, + 'playlist_mincount': 4, + } + + @classmethod + def suitable(cls, url): + return False if TVANouvellesIE.suitable(url) else super(TVANouvellesArticleIE, cls).suitable(url) + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + entries = [ + self.url_result( + 'http://www.tvanouvelles.ca/videos/%s' % mobj.group('id'), + ie=TVANouvellesIE.ie_key(), video_id=mobj.group('id')) + for mobj in re.finditer( + r'data-video-id=(["\'])?(?P\d+)', webpage)] + + title = self._og_search_title(webpage, fatal=False) + description = self._og_search_description(webpage) + + return self.playlist_result(entries, display_id, title, description) diff --git a/youtube_dl/extractor/tvc.py b/youtube_dl/extractor/tvc.py index 4065354dd..008f64cc2 100644 --- a/youtube_dl/extractor/tvc.py +++ b/youtube_dl/extractor/tvc.py @@ -19,7 +19,7 @@ class TVCIE(InfoExtractor): 'id': '74622', 'ext': 'mp4', 'title': 'События. "События". Эфир от 22.05.2015 14:30', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 1122, }, } @@ -72,7 +72,7 @@ class TVCArticleIE(InfoExtractor): 'ext': 'mp4', 'title': 'События. "События". Эфир от 22.05.2015 14:30', 'description': 'md5:ad7aa7db22903f983e687b8a3e98c6dd', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 1122, }, }, { @@ -82,7 +82,7 @@ class TVCArticleIE(InfoExtractor): 'ext': 'mp4', 'title': 'Эксперты: в столице встал вопрос о максимально безопасных остановках', 'description': 'md5:f2098f71e21f309e89f69b525fd9846e', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 278, }, }, { @@ -92,7 +92,7 @@ class TVCArticleIE(InfoExtractor): 'ext': 'mp4', 'title': 'Ещё не поздно. Эфир от 03.08.2013', 'description': 'md5:51fae9f3f8cfe67abce014e428e5b027', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 3316, }, }] diff --git a/youtube_dl/extractor/tvigle.py b/youtube_dl/extractor/tvigle.py index ead4c00c7..f3817ab28 100644 --- a/youtube_dl/extractor/tvigle.py +++ b/youtube_dl/extractor/tvigle.py @@ -1,4 +1,4 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals import re diff --git a/youtube_dl/extractor/tvland.py b/youtube_dl/extractor/tvland.py index cb76a2a58..957cf1ea2 100644 --- a/youtube_dl/extractor/tvland.py +++ b/youtube_dl/extractor/tvland.py @@ -6,7 +6,7 @@ from .mtv import MTVServicesInfoExtractor class TVLandIE(MTVServicesInfoExtractor): IE_NAME = 'tvland.com' - _VALID_URL = r'https?://(?:www\.)?tvland\.com/(?:video-clips|episodes)/(?P[^/?#.]+)' + _VALID_URL = r'https?://(?:www\.)?tvland\.com/(?:video-clips|(?:full-)?episodes)/(?P[^/?#.]+)' _FEED_URL = 'http://www.tvland.com/feeds/mrss/' _TESTS = [{ # Geo-restricted. Without a proxy metadata are still there. With a @@ -28,4 +28,7 @@ class TVLandIE(MTVServicesInfoExtractor): 'upload_date': '20151228', 'timestamp': 1451289600, }, + }, { + 'url': 'http://www.tvland.com/full-episodes/iu0hz6/younger-a-kiss-is-just-a-kiss-season-3-ep-301', + 'only_matching': True, }] diff --git a/youtube_dl/extractor/tvnoe.py b/youtube_dl/extractor/tvnoe.py new file mode 100644 index 000000000..6d5c74826 --- /dev/null +++ b/youtube_dl/extractor/tvnoe.py @@ -0,0 +1,49 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .jwplatform import JWPlatformBaseIE +from ..utils import ( + clean_html, + get_element_by_class, + js_to_json, +) + + +class TVNoeIE(JWPlatformBaseIE): + _VALID_URL = r'https?://(?:www\.)?tvnoe\.cz/video/(?P[0-9]+)' + _TEST = { + 'url': 'http://www.tvnoe.cz/video/10362', + 'md5': 'aee983f279aab96ec45ab6e2abb3c2ca', + 'info_dict': { + 'id': '10362', + 'ext': 'mp4', + 'series': 'Noční univerzita', + 'title': 'prof. Tomáš Halík, Th.D. - Návrat náboženství a střet civilizací', + 'description': 'md5:f337bae384e1a531a52c55ebc50fff41', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + iframe_url = self._search_regex( + r']+src="([^"]+)"', webpage, 'iframe URL') + + ifs_page = self._download_webpage(iframe_url, video_id) + jwplayer_data = self._parse_json( + self._find_jwplayer_data(ifs_page), + video_id, transform_source=js_to_json) + info_dict = self._parse_jwplayer_data( + jwplayer_data, video_id, require_title=False, base_url=iframe_url) + + info_dict.update({ + 'id': video_id, + 'title': clean_html(get_element_by_class( + 'field-name-field-podnazev', webpage)), + 'description': clean_html(get_element_by_class( + 'field-name-body', webpage)), + 'series': clean_html(get_element_by_class('title', webpage)) + }) + + return info_dict diff --git a/youtube_dl/extractor/tvp.py b/youtube_dl/extractor/tvp.py index 2abfb7830..06ea2b40a 100644 --- a/youtube_dl/extractor/tvp.py +++ b/youtube_dl/extractor/tvp.py @@ -69,7 +69,8 @@ class TVPIE(InfoExtractor): webpage = self._download_webpage(url, page_id) video_id = self._search_regex([ r']+src="[^"]*?object_id=(\d+)', - "object_id\s*:\s*'(\d+)'"], webpage, 'video id') + r"object_id\s*:\s*'(\d+)'", + r'data-video-id="(\d+)"'], webpage, 'video id', default=page_id) return { '_type': 'url_transparent', 'url': 'tvp:' + video_id, @@ -138,6 +139,9 @@ class TVPEmbedIE(InfoExtractor): # formats.extend(self._extract_mpd_formats( # video_url_base + '.ism/video.mpd', # video_id, mpd_id='dash', fatal=False)) + formats.extend(self._extract_ism_formats( + video_url_base + '.ism/Manifest', + video_id, 'mss', fatal=False)) formats.extend(self._extract_f4m_formats( video_url_base + '.ism/video.f4m', video_id, f4m_id='hds', fatal=False)) diff --git a/youtube_dl/extractor/tvplay.py b/youtube_dl/extractor/tvplay.py index 150bde663..3eda0a399 100644 --- a/youtube_dl/extractor/tvplay.py +++ b/youtube_dl/extractor/tvplay.py @@ -15,21 +15,31 @@ from ..utils import ( int_or_none, parse_iso8601, qualities, + try_get, update_url_query, ) class TVPlayIE(InfoExtractor): - IE_DESC = 'TV3Play and related services' - _VALID_URL = r'''(?x)https?://(?:www\.)? - (?:tvplay(?:\.skaties)?\.lv/parraides| - (?:tv3play|play\.tv3)\.lt/programos| - tv3play(?:\.tv3)?\.ee/sisu| - tv(?:3|6|8|10)play\.se/program| - (?:(?:tv3play|viasat4play|tv6play)\.no|tv3play\.dk)/programmer| - play\.novatv\.bg/programi - )/[^/]+/(?P\d+) - ''' + IE_NAME = 'mtg' + IE_DESC = 'MTG services' + _VALID_URL = r'''(?x) + (?: + mtg:| + https?:// + (?:www\.)? + (?: + tvplay(?:\.skaties)?\.lv/parraides| + (?:tv3play|play\.tv3)\.lt/programos| + tv3play(?:\.tv3)?\.ee/sisu| + (?:tv(?:3|6|8|10)play|viafree)\.se/program| + (?:(?:tv3play|viasat4play|tv6play|viafree)\.no|(?:tv3play|viafree)\.dk)/programmer| + play\.novatv\.bg/programi + ) + /(?:[^/]+/)+ + ) + (?P\d+) + ''' _TESTS = [ { 'url': 'http://www.tvplay.lv/parraides/vinas-melo-labak/418113?autostart=true', @@ -194,9 +204,22 @@ class TVPlayIE(InfoExtractor): 'url': 'http://tvplay.skaties.lv/parraides/vinas-melo-labak/418113?autostart=true', 'only_matching': True, }, + { + # views is null + 'url': 'http://tvplay.skaties.lv/parraides/tv3-zinas/760183', + 'only_matching': True, + }, { 'url': 'http://tv3play.tv3.ee/sisu/kodu-keset-linna/238551?autostart=true', 'only_matching': True, + }, + { + 'url': 'http://www.viafree.se/program/underhallning/i-like-radio-live/sasong-1/676869', + 'only_matching': True, + }, + { + 'url': 'mtg:418113', + 'only_matching': True, } ] @@ -204,13 +227,13 @@ class TVPlayIE(InfoExtractor): video_id = self._match_id(url) video = self._download_json( - 'http://playapi.mtgx.tv/v1/videos/%s' % video_id, video_id, 'Downloading video JSON') + 'http://playapi.mtgx.tv/v3/videos/%s' % video_id, video_id, 'Downloading video JSON') title = video['title'] try: streams = self._download_json( - 'http://playapi.mtgx.tv/v1/videos/stream/%s' % video_id, + 'http://playapi.mtgx.tv/v3/videos/stream/%s' % video_id, video_id, 'Downloading streams JSON') except ExtractorError as e: if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: @@ -289,8 +312,114 @@ class TVPlayIE(InfoExtractor): 'season_number': season_number, 'duration': int_or_none(video.get('duration')), 'timestamp': parse_iso8601(video.get('created_at')), - 'view_count': int_or_none(video.get('views', {}).get('total')), + 'view_count': try_get(video, lambda x: x['views']['total'], int), 'age_limit': int_or_none(video.get('age_limit', 0)), 'formats': formats, 'subtitles': subtitles, } + + +class ViafreeIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?:// + (?:www\.)? + viafree\. + (?: + (?:dk|no)/programmer| + se/program + ) + /(?:[^/]+/)+(?P[^/?#&]+) + ''' + _TESTS = [{ + 'url': 'http://www.viafree.se/program/livsstil/husraddarna/sasong-2/avsnitt-2', + 'info_dict': { + 'id': '395375', + 'ext': 'mp4', + 'title': 'Husräddarna S02E02', + 'description': 'md5:4db5c933e37db629b5a2f75dfb34829e', + 'series': 'Husräddarna', + 'season': 'Säsong 2', + 'season_number': 2, + 'duration': 2576, + 'timestamp': 1400596321, + 'upload_date': '20140520', + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': [TVPlayIE.ie_key()], + }, { + # with relatedClips + 'url': 'http://www.viafree.se/program/reality/sommaren-med-youtube-stjarnorna/sasong-1/avsnitt-1', + 'info_dict': { + 'id': '758770', + 'ext': 'mp4', + 'title': 'Sommaren med YouTube-stjärnorna S01E01', + 'description': 'md5:2bc69dce2c4bb48391e858539bbb0e3f', + 'series': 'Sommaren med YouTube-stjärnorna', + 'season': 'Säsong 1', + 'season_number': 1, + 'duration': 1326, + 'timestamp': 1470905572, + 'upload_date': '20160811', + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': [TVPlayIE.ie_key()], + }, { + # Different og:image URL schema + 'url': 'http://www.viafree.se/program/reality/sommaren-med-youtube-stjarnorna/sasong-1/avsnitt-2', + 'only_matching': True, + }, { + 'url': 'http://www.viafree.no/programmer/underholdning/det-beste-vorspielet/sesong-2/episode-1', + 'only_matching': True, + }, { + 'url': 'http://www.viafree.dk/programmer/reality/paradise-hotel/saeson-7/episode-5', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if TVPlayIE.suitable(url) else super(ViafreeIE, cls).suitable(url) + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + data = self._parse_json( + self._search_regex( + r'(?s)window\.App\s*=\s*({.+?})\s*;\s*\d+)' + _VALID_URL = r'https?://(?:www\.)?24video\.(?:net|me|xxx|sex)/(?:video/(?:view|xml)/|player/new24_play\.swf\?id=)(?P\d+)' - _TESTS = [ - { - 'url': 'http://www.24video.net/video/view/1044982', - 'md5': 'e09fc0901d9eaeedac872f154931deeb', - 'info_dict': { - 'id': '1044982', - 'ext': 'mp4', - 'title': 'Эротика каменного века', - 'description': 'Как смотрели порно в каменном веке.', - 'thumbnail': 're:^https?://.*\.jpg$', - 'uploader': 'SUPERTELO', - 'duration': 31, - 'timestamp': 1275937857, - 'upload_date': '20100607', - 'age_limit': 18, - 'like_count': int, - 'dislike_count': int, - }, + _TESTS = [{ + 'url': 'http://www.24video.net/video/view/1044982', + 'md5': 'e09fc0901d9eaeedac872f154931deeb', + 'info_dict': { + 'id': '1044982', + 'ext': 'mp4', + 'title': 'Эротика каменного века', + 'description': 'Как смотрели порно в каменном веке.', + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': 'SUPERTELO', + 'duration': 31, + 'timestamp': 1275937857, + 'upload_date': '20100607', + 'age_limit': 18, + 'like_count': int, + 'dislike_count': int, }, - { - 'url': 'http://www.24video.net/player/new24_play.swf?id=1044982', - 'only_matching': True, - } - ] + }, { + 'url': 'http://www.24video.net/player/new24_play.swf?id=1044982', + 'only_matching': True, + }, { + 'url': 'http://www.24video.me/video/view/1044982', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage( - 'http://www.24video.net/video/view/%s' % video_id, video_id) + 'http://www.24video.sex/video/view/%s' % video_id, video_id) title = self._og_search_title(webpage) description = self._html_search_regex( @@ -64,16 +64,16 @@ class TwentyFourVideoIE(InfoExtractor): r'(\d+) просмотр', webpage, 'view count', fatal=False)) comment_count = int_or_none(self._html_search_regex( - r'
(\d+) комментари', + r']+href="#tab-comments"[^>]*>(\d+) комментари', webpage, 'comment count', fatal=False)) # Sets some cookies self._download_xml( - r'http://www.24video.net/video/xml/%s?mode=init' % video_id, + r'http://www.24video.sex/video/xml/%s?mode=init' % video_id, video_id, 'Downloading init XML') video_xml = self._download_xml( - 'http://www.24video.net/video/xml/%s?mode=play' % video_id, + 'http://www.24video.sex/video/xml/%s?mode=play' % video_id, video_id, 'Downloading video XML') video = xpath_element(video_xml, './/video', 'video', fatal=True) diff --git a/youtube_dl/extractor/twentymin.py b/youtube_dl/extractor/twentymin.py index b721ecb0a..4fd1aa4bf 100644 --- a/youtube_dl/extractor/twentymin.py +++ b/youtube_dl/extractor/twentymin.py @@ -4,91 +4,88 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import remove_end +from ..utils import ( + int_or_none, + try_get, +) class TwentyMinutenIE(InfoExtractor): IE_NAME = '20min' - _VALID_URL = r'https?://(?:www\.)?20min\.ch/(?:videotv/*\?.*\bvid=(?P\d+)|(?:[^/]+/)*(?P[^/#?]+))' + _VALID_URL = r'''(?x) + https?:// + (?:www\.)?20min\.ch/ + (?: + videotv/*\?.*?\bvid=| + videoplayer/videoplayer\.html\?.*?\bvideoId@ + ) + (?P\d+) + ''' _TESTS = [{ - # regular video 'url': 'http://www.20min.ch/videotv/?vid=469148&cid=2', - 'md5': 'b52d6bc6ea6398e6a38f12cfd418149c', + 'md5': 'e7264320db31eed8c38364150c12496e', 'info_dict': { 'id': '469148', - 'ext': 'flv', - 'title': '85 000 Franken für 15 perfekte Minuten', - 'description': 'Was die Besucher vom Silvesterzauber erwarten können. (Video: Alice Grosjean/Murat Temel)', - 'thumbnail': 'http://thumbnails.20min-tv.ch/server063/469148/frame-72-469148.jpg' - } - }, { - # news article with video - 'url': 'http://www.20min.ch/schweiz/news/story/-Wir-muessen-mutig-nach-vorne-schauen--22050469', - 'md5': 'cd4cbb99b94130cff423e967cd275e5e', - 'info_dict': { - 'id': '469408', - 'display_id': '-Wir-muessen-mutig-nach-vorne-schauen--22050469', - 'ext': 'flv', - 'title': '«Wir müssen mutig nach vorne schauen»', - 'description': 'Kein Land sei innovativer als die Schweiz, sagte Johann Schneider-Ammann in seiner Neujahrsansprache. Das Land müsse aber seine Hausaufgaben machen.', - 'thumbnail': 'http://www.20min.ch/images/content/2/2/0/22050469/10/teaserbreit.jpg' - }, - 'skip': '"This video is no longer available" is shown both on the web page and in the downloaded file.', - }, { - # YouTube embed - 'url': 'http://www.20min.ch/ro/sports/football/story/Il-marque-une-bicyclette-de-plus-de-30-metres--21115184', - 'md5': 'cec64d59aa01c0ed9dbba9cf639dd82f', - 'info_dict': { - 'id': 'ivM7A7SpDOs', 'ext': 'mp4', - 'title': 'GOLAZO DE CHILENA DE JAVI GÓMEZ, FINALISTA AL BALÓN DE CLM 2016', - 'description': 'md5:903c92fbf2b2f66c09de514bc25e9f5a', - 'upload_date': '20160424', - 'uploader': 'RTVCM Castilla-La Mancha', - 'uploader_id': 'RTVCM', + 'title': '85 000 Franken für 15 perfekte Minuten', + 'thumbnail': r're:https?://.*\.jpg$', + }, + }, { + 'url': 'http://www.20min.ch/videoplayer/videoplayer.html?params=client@twentyDE|videoId@523629', + 'info_dict': { + 'id': '523629', + 'ext': 'mp4', + 'title': 'So kommen Sie bei Eis und Schnee sicher an', + 'description': 'md5:117c212f64b25e3d95747e5276863f7d', + 'thumbnail': r're:https?://.*\.jpg$', + }, + 'params': { + 'skip_download': True, }, - 'add_ie': ['Youtube'], }, { 'url': 'http://www.20min.ch/videotv/?cid=44&vid=468738', 'only_matching': True, - }, { - 'url': 'http://www.20min.ch/ro/sortir/cinema/story/Grandir-au-bahut--c-est-dur-18927411', - 'only_matching': True, }] + @staticmethod + def _extract_urls(webpage): + return [m.group('url') for m in re.finditer( + r']+src=(["\'])(?P(?:https?://)?(?:www\.)?20min\.ch/videoplayer/videoplayer.html\?.*?\bvideoId@\d+.*?)\1', + webpage)] + def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - display_id = mobj.group('display_id') or video_id + video_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) + video = self._download_json( + 'http://api.20min.ch/video/%s/show' % video_id, + video_id)['content'] - youtube_url = self._html_search_regex( - r']+src="((?:https?:)?//www\.youtube\.com/embed/[^"]+)"', - webpage, 'YouTube embed URL', default=None) - if youtube_url is not None: - return self.url_result(youtube_url, 'Youtube') + title = video['title'] - title = self._html_search_regex( - r'

.*?(.+?)

', - webpage, 'title', default=None) - if not title: - title = remove_end(re.sub( - r'^20 [Mm]inuten.*? -', '', self._og_search_title(webpage)), ' - News') + formats = [{ + 'format_id': format_id, + 'url': 'http://podcast.20min-tv.ch/podcast/20min/%s%s.mp4' % (video_id, p), + 'quality': quality, + } for quality, (format_id, p) in enumerate([('sd', ''), ('hd', 'h')])] + self._sort_formats(formats) - if not video_id: - video_id = self._search_regex( - r'"file\d?"\s*,\s*\"(\d+)', webpage, 'video id') + description = video.get('lead') + thumbnail = video.get('thumbnail') - description = self._html_search_meta( - 'description', webpage, 'description') - thumbnail = self._og_search_thumbnail(webpage) + def extract_count(kind): + return try_get( + video, + lambda x: int_or_none(x['communityobject']['thumbs_%s' % kind])) + + like_count = extract_count('up') + dislike_count = extract_count('down') return { 'id': video_id, - 'display_id': display_id, - 'url': 'http://speed.20min-tv.ch/%sm.flv' % video_id, 'title': title, 'description': description, 'thumbnail': thumbnail, + 'like_count': like_count, + 'dislike_count': dislike_count, + 'formats': formats, } diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index 890f55180..1ca159a4d 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -7,6 +7,7 @@ import random from .common import InfoExtractor from ..compat import ( + compat_HTTPError, compat_parse_qs, compat_str, compat_urllib_parse_urlencode, @@ -14,13 +15,14 @@ from ..compat import ( compat_urlparse, ) from ..utils import ( + clean_html, ExtractorError, int_or_none, js_to_json, orderedSet, parse_duration, parse_iso8601, - sanitized_Request, + update_url_query, urlencode_postdata, ) @@ -31,6 +33,7 @@ class TwitchBaseIE(InfoExtractor): _API_BASE = 'https://api.twitch.tv' _USHER_BASE = 'https://usher.ttvnw.net' _LOGIN_URL = 'http://www.twitch.tv/login' + _CLIENT_ID = 'jzkbprff40iqj646a697cyrvl0zt2m6' _NETRC_MACHINE = 'twitch' def _handle_error(self, response): @@ -42,16 +45,10 @@ class TwitchBaseIE(InfoExtractor): '%s returned error: %s - %s' % (self.IE_NAME, error, response.get('message')), expected=True) - def _download_json(self, url, video_id, note='Downloading JSON metadata'): - headers = { - 'Referer': 'http://api.twitch.tv/crossdomain/receiver.html?v=2', - 'X-Requested-With': 'XMLHttpRequest', - } - for cookie in self._downloader.cookiejar: - if cookie.name == 'api_token': - headers['Twitch-Api-Token'] = cookie.value - request = sanitized_Request(url, headers=headers) - response = super(TwitchBaseIE, self)._download_json(request, video_id, note) + def _call_api(self, path, item_id, note): + response = self._download_json( + '%s/%s' % (self._API_BASE, path), item_id, note, + headers={'Client-ID': self._CLIENT_ID}) self._handle_error(response) return response @@ -63,9 +60,17 @@ class TwitchBaseIE(InfoExtractor): if username is None: return + def fail(message): + raise ExtractorError( + 'Unable to login. Twitch said: %s' % message, expected=True) + login_page, handle = self._download_webpage_handle( self._LOGIN_URL, None, 'Downloading login page') + # Some TOR nodes and public proxies are blocked completely + if 'blacklist_message' in login_page: + fail(clean_html(login_page)) + login_form = self._hidden_inputs(login_page) login_form.update({ @@ -82,21 +87,24 @@ class TwitchBaseIE(InfoExtractor): if not post_url.startswith('http'): post_url = compat_urlparse.urljoin(redirect_url, post_url) - request = sanitized_Request( - post_url, urlencode_postdata(login_form)) - request.add_header('Referer', redirect_url) - response = self._download_webpage( - request, None, 'Logging in as %s' % username) + headers = {'Referer': redirect_url} - error_message = self._search_regex( - r']+class="subwindow_notice"[^>]*>([^<]+)
', - response, 'error message', default=None) - if error_message: - raise ExtractorError( - 'Unable to login. Twitch said: %s' % error_message, expected=True) + try: + response = self._download_json( + post_url, None, 'Logging in as %s' % username, + data=urlencode_postdata(login_form), + headers=headers) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400: + response = self._parse_json( + e.cause.read().decode('utf-8'), None) + fail(response['message']) + raise - if '>Reset your password<' in response: - self.report_warning('Twitch asks you to reset your password, go to https://secure.twitch.tv/reset/submit') + if response.get('redirect'): + self._download_webpage( + response['redirect'], None, 'Downloading login redirect page', + headers=headers) def _prefer_source(self, formats): try: @@ -109,14 +117,14 @@ class TwitchBaseIE(InfoExtractor): class TwitchItemBaseIE(TwitchBaseIE): def _download_info(self, item, item_id): - return self._extract_info(self._download_json( - '%s/kraken/videos/%s%s' % (self._API_BASE, item, item_id), item_id, + return self._extract_info(self._call_api( + 'kraken/videos/%s%s' % (item, item_id), item_id, 'Downloading %s info JSON' % self._ITEM_TYPE)) def _extract_media(self, item_id): info = self._download_info(self._ITEM_SHORTCUT, item_id) - response = self._download_json( - '%s/api/videos/%s%s' % (self._API_BASE, self._ITEM_SHORTCUT, item_id), item_id, + response = self._call_api( + 'api/videos/%s%s' % (self._ITEM_SHORTCUT, item_id), item_id, 'Downloading %s playlist JSON' % self._ITEM_TYPE) entries = [] chunks = response['chunks'] @@ -198,7 +206,14 @@ class TwitchChapterIE(TwitchItemBaseIE): class TwitchVodIE(TwitchItemBaseIE): IE_NAME = 'twitch:vod' - _VALID_URL = r'%s/[^/]+/v/(?P\d+)' % TwitchBaseIE._VALID_URL_BASE + _VALID_URL = r'''(?x) + https?:// + (?: + (?:www\.)?twitch\.tv/(?:[^/]+/v|videos)/| + player\.twitch\.tv/\?.*?\bvideo=v + ) + (?P\d+) + ''' _ITEM_TYPE = 'vod' _ITEM_SHORTCUT = 'v' @@ -208,7 +223,7 @@ class TwitchVodIE(TwitchItemBaseIE): 'id': 'v6528877', 'ext': 'mp4', 'title': 'LCK Summer Split - Week 6 Day 1', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 17208, 'timestamp': 1435131709, 'upload_date': '20150624', @@ -228,7 +243,7 @@ class TwitchVodIE(TwitchItemBaseIE): 'id': 'v11230755', 'ext': 'mp4', 'title': 'Untitled Broadcast', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 1638, 'timestamp': 1439746708, 'upload_date': '20150816', @@ -240,14 +255,21 @@ class TwitchVodIE(TwitchItemBaseIE): # m3u8 download 'skip_download': True, }, + 'skip': 'HTTP Error 404: Not Found', + }, { + 'url': 'http://player.twitch.tv/?t=5m10s&video=v6528877', + 'only_matching': True, + }, { + 'url': 'https://www.twitch.tv/videos/6528877', + 'only_matching': True, }] def _real_extract(self, url): item_id = self._match_id(url) info = self._download_info(self._ITEM_SHORTCUT, item_id) - access_token = self._download_json( - '%s/api/vods/%s/access_token' % (self._API_BASE, item_id), item_id, + access_token = self._call_api( + 'api/vods/%s/access_token' % item_id, item_id, 'Downloading %s access token' % self._ITEM_TYPE) formats = self._extract_m3u8_formats( @@ -271,16 +293,28 @@ class TwitchVodIE(TwitchItemBaseIE): if 't' in query: info['start_time'] = parse_duration(query['t'][0]) + if info.get('timestamp') is not None: + info['subtitles'] = { + 'rechat': [{ + 'url': update_url_query( + 'https://rechat.twitch.tv/rechat-messages', { + 'video_id': 'v%s' % item_id, + 'start': info['timestamp'], + }), + 'ext': 'json', + }], + } + return info class TwitchPlaylistBaseIE(TwitchBaseIE): - _PLAYLIST_URL = '%s/kraken/channels/%%s/videos/?offset=%%d&limit=%%d' % TwitchBaseIE._API_BASE + _PLAYLIST_PATH = 'kraken/channels/%s/videos/?offset=%d&limit=%d' _PAGE_LIMIT = 100 def _extract_playlist(self, channel_id): - info = self._download_json( - '%s/kraken/channels/%s' % (self._API_BASE, channel_id), + info = self._call_api( + 'kraken/channels/%s' % channel_id, channel_id, 'Downloading channel info JSON') channel_name = info.get('display_name') or info.get('name') entries = [] @@ -289,10 +323,10 @@ class TwitchPlaylistBaseIE(TwitchBaseIE): broken_paging_detected = False counter_override = None for counter in itertools.count(1): - response = self._download_json( - self._PLAYLIST_URL % (channel_id, offset, limit), + response = self._call_api( + self._PLAYLIST_PATH % (channel_id, offset, limit), channel_id, - 'Downloading %s videos JSON page %s' + 'Downloading %s JSON page %s' % (self._PLAYLIST_TYPE, counter_override or counter)) page_entries = self._extract_playlist_page(response) if not page_entries: @@ -342,19 +376,72 @@ class TwitchProfileIE(TwitchPlaylistBaseIE): } -class TwitchPastBroadcastsIE(TwitchPlaylistBaseIE): - IE_NAME = 'twitch:past_broadcasts' - _VALID_URL = r'%s/(?P[^/]+)/profile/past_broadcasts/?(?:\#.*)?$' % TwitchBaseIE._VALID_URL_BASE - _PLAYLIST_URL = TwitchPlaylistBaseIE._PLAYLIST_URL + '&broadcasts=true' - _PLAYLIST_TYPE = 'past broadcasts' +class TwitchVideosBaseIE(TwitchPlaylistBaseIE): + _VALID_URL_VIDEOS_BASE = r'%s/(?P[^/]+)/videos' % TwitchBaseIE._VALID_URL_BASE + _PLAYLIST_PATH = TwitchPlaylistBaseIE._PLAYLIST_PATH + '&broadcast_type=' + + +class TwitchAllVideosIE(TwitchVideosBaseIE): + IE_NAME = 'twitch:videos:all' + _VALID_URL = r'%s/all' % TwitchVideosBaseIE._VALID_URL_VIDEOS_BASE + _PLAYLIST_PATH = TwitchVideosBaseIE._PLAYLIST_PATH + 'archive,upload,highlight' + _PLAYLIST_TYPE = 'all videos' _TEST = { - 'url': 'http://www.twitch.tv/spamfish/profile/past_broadcasts', + 'url': 'https://www.twitch.tv/spamfish/videos/all', 'info_dict': { 'id': 'spamfish', 'title': 'Spamfish', }, - 'playlist_mincount': 54, + 'playlist_mincount': 869, + } + + +class TwitchUploadsIE(TwitchVideosBaseIE): + IE_NAME = 'twitch:videos:uploads' + _VALID_URL = r'%s/uploads' % TwitchVideosBaseIE._VALID_URL_VIDEOS_BASE + _PLAYLIST_PATH = TwitchVideosBaseIE._PLAYLIST_PATH + 'upload' + _PLAYLIST_TYPE = 'uploads' + + _TEST = { + 'url': 'https://www.twitch.tv/spamfish/videos/uploads', + 'info_dict': { + 'id': 'spamfish', + 'title': 'Spamfish', + }, + 'playlist_mincount': 0, + } + + +class TwitchPastBroadcastsIE(TwitchVideosBaseIE): + IE_NAME = 'twitch:videos:past-broadcasts' + _VALID_URL = r'%s/past-broadcasts' % TwitchVideosBaseIE._VALID_URL_VIDEOS_BASE + _PLAYLIST_PATH = TwitchVideosBaseIE._PLAYLIST_PATH + 'archive' + _PLAYLIST_TYPE = 'past broadcasts' + + _TEST = { + 'url': 'https://www.twitch.tv/spamfish/videos/past-broadcasts', + 'info_dict': { + 'id': 'spamfish', + 'title': 'Spamfish', + }, + 'playlist_mincount': 0, + } + + +class TwitchHighlightsIE(TwitchVideosBaseIE): + IE_NAME = 'twitch:videos:highlights' + _VALID_URL = r'%s/highlights' % TwitchVideosBaseIE._VALID_URL_VIDEOS_BASE + _PLAYLIST_PATH = TwitchVideosBaseIE._PLAYLIST_PATH + 'highlight' + _PLAYLIST_TYPE = 'highlights' + + _TEST = { + 'url': 'https://www.twitch.tv/spamfish/videos/highlights', + 'info_dict': { + 'id': 'spamfish', + 'title': 'Spamfish', + }, + 'playlist_mincount': 805, } @@ -389,15 +476,12 @@ class TwitchStreamIE(TwitchBaseIE): def _real_extract(self, url): channel_id = self._match_id(url) - stream = self._download_json( - '%s/kraken/streams/%s' % (self._API_BASE, channel_id), channel_id, + stream = self._call_api( + 'kraken/streams/%s?stream_type=all' % channel_id, channel_id, 'Downloading stream JSON').get('stream') - # Fallback on profile extraction if stream is offline if not stream: - return self.url_result( - 'http://www.twitch.tv/%s/profile' % channel_id, - 'TwitchProfile', channel_id) + raise ExtractorError('%s is offline' % channel_id, expected=True) # Channel name may be typed if different case than the original channel name # (e.g. http://www.twitch.tv/TWITCHPLAYSPOKEMON) that will lead to constructing @@ -405,13 +489,14 @@ class TwitchStreamIE(TwitchBaseIE): # JSON and fallback to lowercase if it's not available. channel_id = stream.get('channel', {}).get('name') or channel_id.lower() - access_token = self._download_json( - '%s/api/channels/%s/access_token' % (self._API_BASE, channel_id), channel_id, + access_token = self._call_api( + 'api/channels/%s/access_token' % channel_id, channel_id, 'Downloading channel access token') query = { 'allow_source': 'true', 'allow_audio_only': 'true', + 'allow_spectre': 'true', 'p': random.randint(1000000, 10000000), 'player': 'twitchweb', 'segment_preference': '4', @@ -468,7 +553,7 @@ class TwitchClipsIE(InfoExtractor): 'id': 'AggressiveCobraPoooound', 'ext': 'mp4', 'title': 'EA Play 2016 Live from the Novo Theatre', - 'thumbnail': 're:^https?://.*\.jpg', + 'thumbnail': r're:^https?://.*\.jpg', 'creator': 'EA', 'uploader': 'stereotype_', 'uploader_id': 'stereotype_', diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py index b73842986..37e3bc412 100644 --- a/youtube_dl/extractor/twitter.py +++ b/youtube_dl/extractor/twitter.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_urlparse from ..utils import ( determine_ext, float_or_none, @@ -13,6 +14,8 @@ from ..utils import ( ExtractorError, ) +from .periscope import PeriscopeIE + class TwitterBaseIE(InfoExtractor): def _get_vmap_video_url(self, vmap_url, video_id): @@ -22,7 +25,7 @@ class TwitterBaseIE(InfoExtractor): class TwitterCardIE(TwitterBaseIE): IE_NAME = 'twitter:card' - _VALID_URL = r'https?://(?:www\.)?twitter\.com/i/(?:cards/tfw/v1|videos/tweet)/(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?twitter\.com/i/(?:cards/tfw/v1|videos(?:/tweet)?)/(?P\d+)' _TESTS = [ { 'url': 'https://twitter.com/i/cards/tfw/v1/560070183650213889', @@ -31,7 +34,7 @@ class TwitterCardIE(TwitterBaseIE): 'id': '560070183650213889', 'ext': 'mp4', 'title': 'Twitter Card', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 30.033, } }, @@ -42,18 +45,18 @@ class TwitterCardIE(TwitterBaseIE): 'id': '623160978427936768', 'ext': 'mp4', 'title': 'Twitter Card', - 'thumbnail': 're:^https?://.*\.jpg', + 'thumbnail': r're:^https?://.*\.jpg', 'duration': 80.155, }, }, { 'url': 'https://twitter.com/i/cards/tfw/v1/654001591733886977', - 'md5': 'd4724ffe6d2437886d004fa5de1043b3', + 'md5': 'b6d9683dd3f48e340ded81c0e917ad46', 'info_dict': { 'id': 'dq4Oj5quskI', 'ext': 'mp4', 'title': 'Ubuntu 11.10 Overview', - 'description': 'Take a quick peek at what\'s new and improved in Ubuntu 11.10.\n\nOnce installed take a look at 10 Things to Do After Installing: http://www.omgubuntu.co.uk/2011/10/10...', + 'description': 'md5:a831e97fa384863d6e26ce48d1c43376', 'upload_date': '20111013', 'uploader': 'OMG! Ubuntu!', 'uploader_id': 'omgubuntu', @@ -79,8 +82,11 @@ class TwitterCardIE(TwitterBaseIE): 'id': '705235433198714880', 'ext': 'mp4', 'title': 'Twitter web player', - 'thumbnail': 're:^https?://.*\.jpg', + 'thumbnail': r're:^https?://.*\.jpg', }, + }, { + 'url': 'https://twitter.com/i/videos/752274308186120192', + 'only_matching': True, }, ] @@ -100,12 +106,17 @@ class TwitterCardIE(TwitterBaseIE): return self.url_result(iframe_url) config = self._parse_json(self._html_search_regex( - r'data-(?:player-)?config="([^"]+)"', webpage, 'data player config'), + r'data-(?:player-)?config="([^"]+)"', webpage, + 'data player config', default='{}'), video_id) if config.get('source_type') == 'vine': return self.url_result(config['player_url'], 'Vine') + periscope_url = PeriscopeIE._extract_url(webpage) + if periscope_url: + return self.url_result(periscope_url, PeriscopeIE.ie_key()) + def _search_dimensions_in_video_url(a_format, video_url): m = re.search(r'/(?P\d+)x(?P\d+)/', video_url) if m: @@ -190,7 +201,7 @@ class TwitterIE(InfoExtractor): 'id': '643211948184596480', 'ext': 'mp4', 'title': 'FREE THE NIPPLE - FTN supporters on Hollywood Blvd today!', - 'thumbnail': 're:^https?://.*\.jpg', + 'thumbnail': r're:^https?://.*\.jpg', 'description': 'FREE THE NIPPLE on Twitter: "FTN supporters on Hollywood Blvd today! http://t.co/c7jHH749xJ"', 'uploader': 'FREE THE NIPPLE', 'uploader_id': 'freethenipple', @@ -206,7 +217,7 @@ class TwitterIE(InfoExtractor): 'ext': 'mp4', 'title': 'Gifs - tu vai cai tu vai cai tu nao eh capaz disso tu vai cai', 'description': 'Gifs on Twitter: "tu vai cai tu vai cai tu nao eh capaz disso tu vai cai https://t.co/tM46VHFlO5"', - 'thumbnail': 're:^https?://.*\.png', + 'thumbnail': r're:^https?://.*\.png', 'uploader': 'Gifs', 'uploader_id': 'giphz', }, @@ -244,10 +255,10 @@ class TwitterIE(InfoExtractor): 'info_dict': { 'id': '700207533655363584', 'ext': 'mp4', - 'title': 'Donte The Dumbass - BEAT PROD: @suhmeduh #Damndaniel', - 'description': 'Donte The Dumbass on Twitter: "BEAT PROD: @suhmeduh https://t.co/HBrQ4AfpvZ #Damndaniel https://t.co/byBooq2ejZ"', - 'thumbnail': 're:^https?://.*\.jpg', - 'uploader': 'Donte The Dumbass', + 'title': 'JG - BEAT PROD: @suhmeduh #Damndaniel', + 'description': 'JG on Twitter: "BEAT PROD: @suhmeduh https://t.co/HBrQ4AfpvZ #Damndaniel https://t.co/byBooq2ejZ"', + 'thumbnail': r're:^https?://.*\.jpg', + 'uploader': 'JG', 'uploader_id': 'jaydingeer', }, 'params': { @@ -278,6 +289,18 @@ class TwitterIE(InfoExtractor): 'params': { 'skip_download': True, # requires ffmpeg }, + }, { + 'url': 'https://twitter.com/OPP_HSD/status/779210622571536384', + 'info_dict': { + 'id': '1zqKVVlkqLaKB', + 'ext': 'mp4', + 'title': 'Sgt Kerry Schmidt - Ontario Provincial Police - Road rage, mischief, assault, rollover and fire in one occurrence', + 'upload_date': '20160923', + 'uploader_id': 'OPP_HSD', + 'uploader': 'Sgt Kerry Schmidt - Ontario Provincial Police', + 'timestamp': 1474613214, + }, + 'add_ie': ['Periscope'], }] def _real_extract(self, url): @@ -328,13 +351,22 @@ class TwitterIE(InfoExtractor): }) return info + twitter_card_url = None if 'class="PlayableMedia' in webpage: + twitter_card_url = '%s//twitter.com/i/videos/tweet/%s' % (self.http_scheme(), twid) + else: + twitter_card_iframe_url = self._search_regex( + r'data-full-card-iframe-url=([\'"])(?P(?:(?!\1).)+)\1', + webpage, 'Twitter card iframe URL', default=None, group='url') + if twitter_card_iframe_url: + twitter_card_url = compat_urlparse.urljoin(url, twitter_card_iframe_url) + + if twitter_card_url: info.update({ '_type': 'url_transparent', 'ie_key': 'TwitterCard', - 'url': '%s//twitter.com/i/videos/tweet/%s' % (self.http_scheme(), twid), + 'url': twitter_card_url, }) - return info raise ExtractorError('There\'s no video in this tweet.') @@ -342,7 +374,7 @@ class TwitterIE(InfoExtractor): class TwitterAmplifyIE(TwitterBaseIE): IE_NAME = 'twitter:amplify' - _VALID_URL = 'https?://amp\.twimg\.com/v/(?P[0-9a-f\-]{36})' + _VALID_URL = r'https?://amp\.twimg\.com/v/(?P[0-9a-f\-]{36})' _TEST = { 'url': 'https://amp.twimg.com/v/0ba0c3c7-0af3-4c0a-bed5-7efd1ffa2951', diff --git a/youtube_dl/extractor/udemy.py b/youtube_dl/extractor/udemy.py index 89b869559..cce29c6e0 100644 --- a/youtube_dl/extractor/udemy.py +++ b/youtube_dl/extractor/udemy.py @@ -5,6 +5,7 @@ import re from .common import InfoExtractor from ..compat import ( compat_HTTPError, + compat_str, compat_urllib_request, compat_urlparse, ) @@ -207,7 +208,7 @@ class UdemyIE(InfoExtractor): if youtube_url: return self.url_result(youtube_url, 'Youtube') - video_id = asset['id'] + video_id = compat_str(asset['id']) thumbnail = asset.get('thumbnail_url') or asset.get('thumbnailUrl') duration = float_or_none(asset.get('data', {}).get('duration')) @@ -307,7 +308,7 @@ class UdemyIE(InfoExtractor): class UdemyCourseIE(UdemyIE): IE_NAME = 'udemy:course' - _VALID_URL = r'https?://www\.udemy\.com/(?P[^/?#&]+)' + _VALID_URL = r'https?://(?:www\.)?udemy\.com/(?P[^/?#&]+)' _TESTS = [] @classmethod diff --git a/youtube_dl/extractor/udn.py b/youtube_dl/extractor/udn.py index 57dd73aef..daf45d0b4 100644 --- a/youtube_dl/extractor/udn.py +++ b/youtube_dl/extractor/udn.py @@ -23,7 +23,7 @@ class UDNEmbedIE(InfoExtractor): 'id': '300040', 'ext': 'mp4', 'title': '生物老師男變女 全校挺"做自己"', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', }, 'params': { # m3u8 download diff --git a/youtube_dl/extractor/uktvplay.py b/youtube_dl/extractor/uktvplay.py new file mode 100644 index 000000000..2137502a1 --- /dev/null +++ b/youtube_dl/extractor/uktvplay.py @@ -0,0 +1,33 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class UKTVPlayIE(InfoExtractor): + _VALID_URL = r'https?://uktvplay\.uktv\.co\.uk/.+?\?.*?\bvideo=(?P\d+)' + _TEST = { + 'url': 'https://uktvplay.uktv.co.uk/shows/world-at-war/c/200/watch-online/?video=2117008346001', + 'md5': '', + 'info_dict': { + 'id': '2117008346001', + 'ext': 'mp4', + 'title': 'Pincers', + 'description': 'Pincers', + 'uploader_id': '1242911124001', + 'upload_date': '20130124', + 'timestamp': 1359049267, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'expected_warnings': ['Failed to download MPD manifest'] + } + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1242911124001/H1xnMOqP_default/index.html?videoId=%s' + + def _real_extract(self, url): + video_id = self._match_id(url) + return self.url_result( + self.BRIGHTCOVE_URL_TEMPLATE % video_id, + 'BrightcoveNew', video_id) diff --git a/youtube_dl/extractor/uol.py b/youtube_dl/extractor/uol.py new file mode 100644 index 000000000..e67083004 --- /dev/null +++ b/youtube_dl/extractor/uol.py @@ -0,0 +1,143 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + clean_html, + int_or_none, + parse_duration, + update_url_query, + str_or_none, +) + + +class UOLIE(InfoExtractor): + IE_NAME = 'uol.com.br' + _VALID_URL = r'https?://(?:.+?\.)?uol\.com\.br/.*?(?:(?:mediaId|v)=|view/(?:[a-z0-9]+/)?|video(?:=|/(?:\d{4}/\d{2}/\d{2}/)?))(?P\d+|[\w-]+-[A-Z0-9]+)' + _TESTS = [{ + 'url': 'http://player.mais.uol.com.br/player_video_v3.swf?mediaId=15951931', + 'md5': '25291da27dc45e0afb5718a8603d3816', + 'info_dict': { + 'id': '15951931', + 'ext': 'mp4', + 'title': 'Miss simpatia é encontrada morta', + 'description': 'md5:3f8c11a0c0556d66daf7e5b45ef823b2', + } + }, { + 'url': 'http://tvuol.uol.com.br/video/incendio-destroi-uma-das-maiores-casas-noturnas-de-londres-04024E9A3268D4C95326', + 'md5': 'e41a2fb7b7398a3a46b6af37b15c00c9', + 'info_dict': { + 'id': '15954259', + 'ext': 'mp4', + 'title': 'Incêndio destrói uma das maiores casas noturnas de Londres', + 'description': 'Em Londres, um incêndio destruiu uma das maiores boates da cidade. Não há informações sobre vítimas.', + } + }, { + 'url': 'http://mais.uol.com.br/static/uolplayer/index.html?mediaId=15951931', + 'only_matching': True, + }, { + 'url': 'http://mais.uol.com.br/view/15954259', + 'only_matching': True, + }, { + 'url': 'http://noticias.band.uol.com.br/brasilurgente/video/2016/08/05/15951931/miss-simpatia-e-encontrada-morta.html', + 'only_matching': True, + }, { + 'url': 'http://videos.band.uol.com.br/programa.asp?e=noticias&pr=brasil-urgente&v=15951931&t=Policia-desmonte-base-do-PCC-na-Cracolandia', + 'only_matching': True, + }, { + 'url': 'http://mais.uol.com.br/view/cphaa0gl2x8r/incendio-destroi-uma-das-maiores-casas-noturnas-de-londres-04024E9A3268D4C95326', + 'only_matching': True, + }, { + 'url': 'http://noticias.uol.com.br//videos/assistir.htm?video=rafaela-silva-inspira-criancas-no-judo-04024D983968D4C95326', + 'only_matching': True, + }, { + 'url': 'http://mais.uol.com.br/view/e0qbgxid79uv/15275470', + 'only_matching': True, + }] + + _FORMATS = { + '2': { + 'width': 640, + 'height': 360, + }, + '5': { + 'width': 1080, + 'height': 720, + }, + '6': { + 'width': 426, + 'height': 240, + }, + '7': { + 'width': 1920, + 'height': 1080, + }, + '8': { + 'width': 192, + 'height': 144, + }, + '9': { + 'width': 568, + 'height': 320, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + media_id = None + + if video_id.isdigit(): + media_id = video_id + + if not media_id: + embed_page = self._download_webpage( + 'https://jsuol.com.br/c/tv/uol/embed/?params=[embed,%s]' % video_id, + video_id, 'Downloading embed page', fatal=False) + if embed_page: + media_id = self._search_regex( + (r'uol\.com\.br/(\d+)', r'mediaId=(\d+)'), + embed_page, 'media id', default=None) + + if not media_id: + webpage = self._download_webpage(url, video_id) + media_id = self._search_regex(r'mediaId=(\d+)', webpage, 'media id') + + video_data = self._download_json( + 'http://mais.uol.com.br/apiuol/v3/player/getMedia/%s.json' % media_id, + media_id)['item'] + title = video_data['title'] + + query = { + 'ver': video_data.get('numRevision', 2), + 'r': 'http://mais.uol.com.br', + } + formats = [] + for f in video_data.get('formats', []): + f_url = f.get('url') or f.get('secureUrl') + if not f_url: + continue + format_id = str_or_none(f.get('id')) + fmt = { + 'format_id': format_id, + 'url': update_url_query(f_url, query), + } + fmt.update(self._FORMATS.get(format_id, {})) + formats.append(fmt) + self._sort_formats(formats) + + tags = [] + for tag in video_data.get('tags', []): + tag_description = tag.get('description') + if not tag_description: + continue + tags.append(tag_description) + + return { + 'id': media_id, + 'title': title, + 'description': clean_html(video_data.get('desMedia')), + 'thumbnail': video_data.get('thumbnail'), + 'duration': int_or_none(video_data.get('durationSeconds')) or parse_duration(video_data.get('duration')), + 'tags': tags, + 'formats': formats, + } diff --git a/youtube_dl/extractor/uplynk.py b/youtube_dl/extractor/uplynk.py new file mode 100644 index 000000000..f06bf5b12 --- /dev/null +++ b/youtube_dl/extractor/uplynk.py @@ -0,0 +1,70 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + float_or_none, + ExtractorError, +) + + +class UplynkIE(InfoExtractor): + IE_NAME = 'uplynk' + _VALID_URL = r'https?://.*?\.uplynk\.com/(?Pext/[0-9a-f]{32}/(?P[^/?&]+)|(?P[0-9a-f]{32}))\.(?:m3u8|json)(?:.*?\bpbs=(?P[^&]+))?' + _TEST = { + 'url': 'http://content.uplynk.com/e89eaf2ce9054aa89d92ddb2d817a52e.m3u8', + 'info_dict': { + 'id': 'e89eaf2ce9054aa89d92ddb2d817a52e', + 'ext': 'mp4', + 'title': '030816-kgo-530pm-solar-eclipse-vid_web.mp4', + 'uploader_id': '4413701bf5a1488db55b767f8ae9d4fa', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + } + + def _extract_uplynk_info(self, uplynk_content_url): + path, external_id, video_id, session_id = re.match(UplynkIE._VALID_URL, uplynk_content_url).groups() + display_id = video_id or external_id + formats = self._extract_m3u8_formats( + 'http://content.uplynk.com/%s.m3u8' % path, + display_id, 'mp4', 'm3u8_native') + if session_id: + for f in formats: + f['extra_param_to_segment_url'] = 'pbs=' + session_id + self._sort_formats(formats) + asset = self._download_json('http://content.uplynk.com/player/assetinfo/%s.json' % path, display_id) + if asset.get('error') == 1: + raise ExtractorError('% said: %s' % (self.IE_NAME, asset['msg']), expected=True) + + return { + 'id': asset['asset'], + 'title': asset['desc'], + 'thumbnail': asset.get('default_poster_url'), + 'duration': float_or_none(asset.get('duration')), + 'uploader_id': asset.get('owner'), + 'formats': formats, + } + + def _real_extract(self, url): + return self._extract_uplynk_info(url) + + +class UplynkPreplayIE(UplynkIE): + IE_NAME = 'uplynk:preplay' + _VALID_URL = r'https?://.*?\.uplynk\.com/preplay2?/(?Pext/[0-9a-f]{32}/(?P[^/?&]+)|(?P[0-9a-f]{32}))\.json' + _TEST = None + + def _real_extract(self, url): + path, external_id, video_id = re.match(self._VALID_URL, url).groups() + display_id = video_id or external_id + preplay = self._download_json(url, display_id) + content_url = 'http://content.uplynk.com/%s.m3u8' % path + session_id = preplay.get('sid') + if session_id: + content_url += '?pbs=' + session_id + return self._extract_uplynk_info(content_url) diff --git a/youtube_dl/extractor/urort.py b/youtube_dl/extractor/urort.py index 8872cfcb2..8f6edab4b 100644 --- a/youtube_dl/extractor/urort.py +++ b/youtube_dl/extractor/urort.py @@ -21,7 +21,7 @@ class UrortIE(InfoExtractor): 'id': '33124-24', 'ext': 'mp3', 'title': 'The Bomb', - 'thumbnail': 're:^https?://.+\.jpg', + 'thumbnail': r're:^https?://.+\.jpg', 'uploader': 'Gerilja', 'uploader_id': 'Gerilja', 'upload_date': '20100323', diff --git a/youtube_dl/extractor/urplay.py b/youtube_dl/extractor/urplay.py index ce3bf6b02..8e6fd4731 100644 --- a/youtube_dl/extractor/urplay.py +++ b/youtube_dl/extractor/urplay.py @@ -5,17 +5,20 @@ from .common import InfoExtractor class URPlayIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?urplay\.se/program/(?P[0-9]+)' - _TEST = { + _VALID_URL = r'https?://(?:www\.)?ur(?:play|skola)\.se/(?:program|Produkter)/(?P[0-9]+)' + _TESTS = [{ 'url': 'http://urplay.se/program/190031-tripp-trapp-trad-sovkudde', - 'md5': '15ca67b63fd8fb320ac2bcd854bad7b6', + 'md5': 'ad5f0de86f16ca4c8062cd103959a9eb', 'info_dict': { 'id': '190031', 'ext': 'mp4', 'title': 'Tripp, Trapp, Träd : Sovkudde', 'description': 'md5:b86bffdae04a7e9379d1d7e5947df1d1', - } - } + }, + }, { + 'url': 'http://urskola.se/Produkter/155794-Smasagor-meankieli-Grodan-i-vida-varlden', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) @@ -27,30 +30,17 @@ class URPlayIE(InfoExtractor): formats = [] for quality_attr, quality, preference in (('', 'sd', 0), ('_hd', 'hd', 1)): - file_rtmp = urplayer_data.get('file_rtmp' + quality_attr) - if file_rtmp: - formats.append({ - 'url': 'rtmp://%s/urplay/mp4:%s' % (host, file_rtmp), - 'format_id': quality + '-rtmp', - 'ext': 'flv', - 'preference': preference, - }) file_http = urplayer_data.get('file_http' + quality_attr) or urplayer_data.get('file_http_sub' + quality_attr) if file_http: - file_http_base_url = 'http://%s/%s' % (host, file_http) - formats.extend(self._extract_f4m_formats( - file_http_base_url + 'manifest.f4m', video_id, - preference, '%s-hds' % quality, fatal=False)) - formats.extend(self._extract_m3u8_formats( - file_http_base_url + 'playlist.m3u8', video_id, 'mp4', - 'm3u8_native', preference, '%s-hls' % quality, fatal=False)) + formats.extend(self._extract_wowza_formats( + 'http://%s/%splaylist.m3u8' % (host, file_http), video_id, skip_protocols=['rtmp', 'rtsp'])) self._sort_formats(formats) subtitles = {} for subtitle in urplayer_data.get('subtitles', []): subtitle_url = subtitle.get('file') kind = subtitle.get('kind') - if subtitle_url or kind and kind != 'captions': + if not subtitle_url or (kind and kind != 'captions'): continue subtitles.setdefault(subtitle.get('label', 'Svenska'), []).append({ 'url': subtitle_url, diff --git a/youtube_dl/extractor/usanetwork.py b/youtube_dl/extractor/usanetwork.py new file mode 100644 index 000000000..823340776 --- /dev/null +++ b/youtube_dl/extractor/usanetwork.py @@ -0,0 +1,76 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .adobepass import AdobePassIE +from ..utils import ( + extract_attributes, + smuggle_url, + update_url_query, +) + + +class USANetworkIE(AdobePassIE): + _VALID_URL = r'https?://(?:www\.)?usanetwork\.com/(?:[^/]+/videos|movies)/(?P[^/?#]+)' + _TEST = { + 'url': 'http://www.usanetwork.com/mrrobot/videos/hpe-cybersecurity', + 'md5': '33c0d2ba381571b414024440d08d57fd', + 'info_dict': { + 'id': '3086229', + 'ext': 'mp4', + 'title': 'HPE Cybersecurity', + 'description': 'The more we digitize our world, the more vulnerable we are.', + 'upload_date': '20160818', + 'timestamp': 1471535460, + 'uploader': 'NBCU-USA', + }, + } + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + player_params = extract_attributes(self._search_regex( + r'(]+data-usa-tve-player-container[^>]*>)', webpage, 'player params')) + video_id = player_params['data-mpx-guid'] + title = player_params['data-episode-title'] + + account_pid, path = re.search( + r'data-src="(?:https?)?//player\.theplatform\.com/p/([^/]+)/.*?/(media/guid/\d+/\d+)', + webpage).groups() + + query = { + 'mbr': 'true', + } + if player_params.get('data-is-full-episode') == '1': + query['manifest'] = 'm3u' + + if player_params.get('data-entitlement') == 'auth': + adobe_pass = {} + drupal_settings = self._search_regex( + r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);', + webpage, 'drupal settings', fatal=False) + if drupal_settings: + drupal_settings = self._parse_json(drupal_settings, video_id, fatal=False) + if drupal_settings: + adobe_pass = drupal_settings.get('adobePass', {}) + resource = self._get_mvpd_resource( + adobe_pass.get('adobePassResourceId', 'usa'), + title, video_id, player_params.get('data-episode-rating', 'TV-14')) + query['auth'] = self._extract_mvpd_auth( + url, video_id, adobe_pass.get('adobePassRequestorId', 'usa'), resource) + + info = self._search_json_ld(webpage, video_id, default={}) + info.update({ + '_type': 'url_transparent', + 'url': smuggle_url(update_url_query( + 'http://link.theplatform.com/s/%s/%s' % (account_pid, path), + query), {'force_smil_url': True}), + 'id': video_id, + 'title': title, + 'series': player_params.get('data-show-title'), + 'episode': title, + 'ie_key': 'ThePlatform', + }) + return info diff --git a/youtube_dl/extractor/ustream.py b/youtube_dl/extractor/ustream.py index 54605d863..5737d4d16 100644 --- a/youtube_dl/extractor/ustream.py +++ b/youtube_dl/extractor/ustream.py @@ -1,20 +1,25 @@ from __future__ import unicode_literals +import random import re from .common import InfoExtractor from ..compat import ( + compat_str, compat_urlparse, ) from ..utils import ( + encode_data_uri, ExtractorError, int_or_none, float_or_none, + mimetype2ext, + str_or_none, ) class UstreamIE(InfoExtractor): - _VALID_URL = r'https?://www\.ustream\.tv/(?Precorded|embed|embed/recorded)/(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?ustream\.tv/(?Precorded|embed|embed/recorded)/(?P\d+)' IE_NAME = 'ustream' _TESTS = [{ 'url': 'http://www.ustream.tv/recorded/20274954', @@ -47,8 +52,115 @@ class UstreamIE(InfoExtractor): 'id': '10299409', }, 'playlist_count': 3, + }, { + 'url': 'http://www.ustream.tv/recorded/91343263', + 'info_dict': { + 'id': '91343263', + 'ext': 'mp4', + 'title': 'GitHub Universe - General Session - Day 1', + 'upload_date': '20160914', + 'description': 'GitHub Universe - General Session - Day 1', + 'timestamp': 1473872730, + 'uploader': 'wa0dnskeqkr', + 'uploader_id': '38977840', + }, + 'params': { + 'skip_download': True, # m3u8 download + }, }] + @staticmethod + def _extract_url(webpage): + mobj = re.search( + r']+?src=(["\'])(?Phttp://www\.ustream\.tv/embed/.+?)\1', webpage) + if mobj is not None: + return mobj.group('url') + + def _get_stream_info(self, url, video_id, app_id_ver, extra_note=None): + def num_to_hex(n): + return hex(n)[2:] + + rnd = random.randrange + + if not extra_note: + extra_note = '' + + conn_info = self._download_json( + 'http://r%d-1-%s-recorded-lp-live.ums.ustream.tv/1/ustream' % (rnd(1e8), video_id), + video_id, note='Downloading connection info' + extra_note, + query={ + 'type': 'viewer', + 'appId': app_id_ver[0], + 'appVersion': app_id_ver[1], + 'rsid': '%s:%s' % (num_to_hex(rnd(1e8)), num_to_hex(rnd(1e8))), + 'rpin': '_rpin.%d' % rnd(1e15), + 'referrer': url, + 'media': video_id, + 'application': 'recorded', + }) + host = conn_info[0]['args'][0]['host'] + connection_id = conn_info[0]['args'][0]['connectionId'] + + return self._download_json( + 'http://%s/1/ustream?connectionId=%s' % (host, connection_id), + video_id, note='Downloading stream info' + extra_note) + + def _get_streams(self, url, video_id, app_id_ver): + # Sometimes the return dict does not have 'stream' + for trial_count in range(3): + stream_info = self._get_stream_info( + url, video_id, app_id_ver, + extra_note=' (try %d)' % (trial_count + 1) if trial_count > 0 else '') + if 'stream' in stream_info[0]['args'][0]: + return stream_info[0]['args'][0]['stream'] + return [] + + def _parse_segmented_mp4(self, dash_stream_info): + def resolve_dash_template(template, idx, chunk_hash): + return template.replace('%', compat_str(idx), 1).replace('%', chunk_hash) + + formats = [] + for stream in dash_stream_info['streams']: + # Use only one provider to avoid too many formats + provider = dash_stream_info['providers'][0] + fragments = [{ + 'url': resolve_dash_template( + provider['url'] + stream['initUrl'], 0, dash_stream_info['hashes']['0']) + }] + for idx in range(dash_stream_info['videoLength'] // dash_stream_info['chunkTime']): + fragments.append({ + 'url': resolve_dash_template( + provider['url'] + stream['segmentUrl'], idx, + dash_stream_info['hashes'][compat_str(idx // 10 * 10)]) + }) + content_type = stream['contentType'] + kind = content_type.split('/')[0] + f = { + 'format_id': '-'.join(filter(None, [ + 'dash', kind, str_or_none(stream.get('bitrate'))])), + 'protocol': 'http_dash_segments', + # TODO: generate a MPD doc for external players? + 'url': encode_data_uri(b'', 'text/xml'), + 'ext': mimetype2ext(content_type), + 'height': stream.get('height'), + 'width': stream.get('width'), + 'fragments': fragments, + } + if kind == 'video': + f.update({ + 'vcodec': stream.get('codec'), + 'acodec': 'none', + 'vbr': stream.get('bitrate'), + }) + else: + f.update({ + 'vcodec': 'none', + 'acodec': stream.get('codec'), + 'abr': stream.get('bitrate'), + }) + formats.append(f) + return formats + def _real_extract(self, url): m = re.match(self._VALID_URL, url) video_id = m.group('id') @@ -86,7 +198,22 @@ class UstreamIE(InfoExtractor): 'url': video_url, 'ext': format_id, 'filesize': filesize, - } for format_id, video_url in video['media_urls'].items()] + } for format_id, video_url in video['media_urls'].items() if video_url] + + if not formats: + hls_streams = self._get_streams(url, video_id, app_id_ver=(11, 2)) + if hls_streams: + # m3u8_native leads to intermittent ContentTooShortError + formats.extend(self._extract_m3u8_formats( + hls_streams[0]['url'], video_id, ext='mp4', m3u8_id='hls')) + + ''' + # DASH streams handling is incomplete as 'url' is missing + dash_streams = self._get_streams(url, video_id, app_id_ver=(3, 1)) + if dash_streams: + formats.extend(self._parse_segmented_mp4(dash_streams)) + ''' + self._sort_formats(formats) description = video.get('description') @@ -117,7 +244,7 @@ class UstreamIE(InfoExtractor): class UstreamChannelIE(InfoExtractor): - _VALID_URL = r'https?://www\.ustream\.tv/channel/(?P.+)' + _VALID_URL = r'https?://(?:www\.)?ustream\.tv/channel/(?P.+)' IE_NAME = 'ustream:channel' _TEST = { 'url': 'http://www.ustream.tv/channel/channeljapan', diff --git a/youtube_dl/extractor/ustudio.py b/youtube_dl/extractor/ustudio.py index 3484a2046..56509beed 100644 --- a/youtube_dl/extractor/ustudio.py +++ b/youtube_dl/extractor/ustudio.py @@ -22,7 +22,7 @@ class UstudioIE(InfoExtractor): 'ext': 'mp4', 'title': 'San Francisco: Golden Gate Bridge', 'description': 'md5:23925500697f2c6d4830e387ba51a9be', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'upload_date': '20111107', 'uploader': 'Tony Farley', } diff --git a/youtube_dl/extractor/varzesh3.py b/youtube_dl/extractor/varzesh3.py index 84698371a..f474ed73f 100644 --- a/youtube_dl/extractor/varzesh3.py +++ b/youtube_dl/extractor/varzesh3.py @@ -22,7 +22,7 @@ class Varzesh3IE(InfoExtractor): 'ext': 'mp4', 'title': '۵ واکنش برتر دروازه‌بانان؛هفته ۲۶ بوندسلیگا', 'description': 'فصل ۲۰۱۵-۲۰۱۴', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', }, 'skip': 'HTTP 404 Error', }, { @@ -67,7 +67,7 @@ class Varzesh3IE(InfoExtractor): webpage, display_id, default=None) if video_id is None: video_id = self._search_regex( - 'var\s+VideoId\s*=\s*(\d+);', webpage, 'video id', + r'var\s+VideoId\s*=\s*(\d+);', webpage, 'video id', default=display_id) return { diff --git a/youtube_dl/extractor/vbox7.py b/youtube_dl/extractor/vbox7.py index dff1bb702..bef639462 100644 --- a/youtube_dl/extractor/vbox7.py +++ b/youtube_dl/extractor/vbox7.py @@ -1,18 +1,42 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor -from ..compat import compat_urlparse -from ..utils import ( - ExtractorError, - sanitized_Request, - urlencode_postdata, -) +from ..utils import ExtractorError class Vbox7IE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?vbox7\.com/play:(?P[^/]+)' - _TEST = { + _VALID_URL = r'''(?x) + https?:// + (?:[^/]+\.)?vbox7\.com/ + (?: + play:| + (?: + emb/external\.php| + player/ext\.swf + )\?.*?\bvid= + ) + (?P[\da-fA-F]+) + ''' + _TESTS = [{ + 'url': 'http://vbox7.com/play:0946fff23c', + 'md5': 'a60f9ab3a3a2f013ef9a967d5f7be5bf', + 'info_dict': { + 'id': '0946fff23c', + 'ext': 'mp4', + 'title': 'Борисов: Притеснен съм за бъдещето на България', + 'description': 'По думите му е опасно страната ни да бъде обявена за "сигурна"', + 'thumbnail': r're:^https?://.*\.jpg$', + 'timestamp': 1470982814, + 'upload_date': '20160812', + 'uploader': 'zdraveibulgaria', + }, + 'params': { + 'proxy': '127.0.0.1:8118', + }, + }, { 'url': 'http://vbox7.com/play:249bb972c2', 'md5': '99f65c0c9ef9b682b97313e052734c3f', 'info_dict': { @@ -20,43 +44,61 @@ class Vbox7IE(InfoExtractor): 'ext': 'mp4', 'title': 'Смях! Чудо - чист за секунди - Скрита камера', }, - } + 'skip': 'georestricted', + }, { + 'url': 'http://vbox7.com/emb/external.php?vid=a240d20f9c&autoplay=1', + 'only_matching': True, + }, { + 'url': 'http://i49.vbox7.com/player/ext.swf?vid=0946fff23c&autoplay=1', + 'only_matching': True, + }] + + @staticmethod + def _extract_url(webpage): + mobj = re.search( + r']+src=(?P["\'])(?P(?:https?:)?//vbox7\.com/emb/external\.php.+?)(?P=q)', + webpage) + if mobj: + return mobj.group('url') def _real_extract(self, url): video_id = self._match_id(url) - # need to get the page 3 times for the correct jsSecretToken cookie - # which is necessary for the correct title - def get_session_id(): - redirect_page = self._download_webpage(url, video_id) - session_id_url = self._search_regex( - r'var\s*url\s*=\s*\'([^\']+)\';', redirect_page, - 'session id url') - self._download_webpage( - compat_urlparse.urljoin(url, session_id_url), video_id, - 'Getting session id') + response = self._download_json( + 'https://www.vbox7.com/ajax/video/nextvideo.php?vid=%s' % video_id, + video_id) - get_session_id() - get_session_id() + if 'error' in response: + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, response['error']), expected=True) - webpage = self._download_webpage(url, video_id, - 'Downloading redirect page') + video = response['options'] - title = self._html_search_regex(r'(.*)', - webpage, 'title').split('/')[0].strip() + title = video['title'] + video_url = video['src'] - info_url = 'http://vbox7.com/play/magare.do' - data = urlencode_postdata({'as3': '1', 'vid': video_id}) - info_request = sanitized_Request(info_url, data) - info_request.add_header('Content-Type', 'application/x-www-form-urlencoded') - info_response = self._download_webpage(info_request, video_id, 'Downloading info webpage') - if info_response is None: - raise ExtractorError('Unable to extract the media url') - (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&')) + if '/na.mp4' in video_url: + self.raise_geo_restricted() - return { + uploader = video.get('uploader') + + webpage = self._download_webpage( + 'http://vbox7.com/play:%s' % video_id, video_id, fatal=None) + + info = {} + + if webpage: + info = self._search_json_ld( + webpage.replace('"/*@context"', '"@context"'), video_id, + fatal=False) + + info.update({ 'id': video_id, - 'url': final_url, 'title': title, - 'thumbnail': thumbnail_url, - } + 'url': video_url, + 'uploader': uploader, + 'thumbnail': self._proto_relative_url( + info.get('thumbnail') or self._og_search_thumbnail(webpage), + 'http:'), + }) + return info diff --git a/youtube_dl/extractor/vessel.py b/youtube_dl/extractor/vessel.py index 2cd617b91..80a643dfe 100644 --- a/youtube_dl/extractor/vessel.py +++ b/youtube_dl/extractor/vessel.py @@ -13,7 +13,7 @@ from ..utils import ( class VesselIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?vessel\.com/(?:videos|embed)/(?P[0-9a-zA-Z]+)' + _VALID_URL = r'https?://(?:www\.)?vessel\.com/(?:videos|embed)/(?P[0-9a-zA-Z-_]+)' _API_URL_TEMPLATE = 'https://www.vessel.com/api/view/items/%s' _LOGIN_URL = 'https://www.vessel.com/api/account/login' _NETRC_MACHINE = 'vessel' @@ -24,7 +24,7 @@ class VesselIE(InfoExtractor): 'id': 'HDN7G5UMs', 'ext': 'mp4', 'title': 'Nvidia GeForce GTX Titan X - The Best Video Card on the Market?', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'upload_date': '20150317', 'description': 'Did Nvidia pull out all the stops on the Titan X, or does its performance leave something to be desired?', 'timestamp': int, @@ -32,12 +32,18 @@ class VesselIE(InfoExtractor): }, { 'url': 'https://www.vessel.com/embed/G4U7gUJ6a?w=615&h=346', 'only_matching': True, + }, { + 'url': 'https://www.vessel.com/videos/F01_dsLj1', + 'only_matching': True, + }, { + 'url': 'https://www.vessel.com/videos/RRX-sir-J', + 'only_matching': True, }] @staticmethod def _extract_urls(webpage): return [url for _, url in re.findall( - r']+src=(["\'])((?:https?:)?//(?:www\.)?vessel\.com/embed/[0-9a-zA-Z]+.*?)\1', + r']+src=(["\'])((?:https?:)?//(?:www\.)?vessel\.com/embed/[0-9a-zA-Z-_]+.*?)\1', webpage)] @staticmethod diff --git a/youtube_dl/extractor/vesti.py b/youtube_dl/extractor/vesti.py index cb64ae0bd..5ab716880 100644 --- a/youtube_dl/extractor/vesti.py +++ b/youtube_dl/extractor/vesti.py @@ -1,4 +1,4 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals import re diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py index 388b4debe..c4e37f694 100644 --- a/youtube_dl/extractor/vevo.py +++ b/youtube_dl/extractor/vevo.py @@ -4,9 +4,9 @@ import re from .common import InfoExtractor from ..compat import ( - compat_etree_fromstring, compat_str, compat_urlparse, + compat_HTTPError, ) from ..utils import ( ExtractorError, @@ -31,7 +31,7 @@ class VevoIE(VevoBaseIE): (currently used by MTVIE and MySpaceIE) ''' _VALID_URL = r'''(?x) - (?:https?://www\.vevo\.com/watch/(?!playlist|genre)(?:[^/]+/(?:[^/]+/)?)?| + (?:https?://(?:www\.)?vevo\.com/watch/(?!playlist|genre)(?:[^/]+/(?:[^/]+/)?)?| https?://cache\.vevo\.com/m/html/embed\.html\?video=| https?://videoplayer\.vevo\.com/embed/embedded\?videoId=| vevo:) @@ -51,7 +51,7 @@ class VevoIE(VevoBaseIE): 'artist': 'Hurts', 'genre': 'Pop', }, - 'expected_warnings': ['Unable to download SMIL file'], + 'expected_warnings': ['Unable to download SMIL file', 'Unable to download info'], }, { 'note': 'v3 SMIL format', 'url': 'http://www.vevo.com/watch/cassadee-pope/i-wish-i-could-break-your-heart/USUV71302923', @@ -67,7 +67,7 @@ class VevoIE(VevoBaseIE): 'artist': 'Cassadee Pope', 'genre': 'Country', }, - 'expected_warnings': ['Unable to download SMIL file'], + 'expected_warnings': ['Unable to download SMIL file', 'Unable to download info'], }, { 'note': 'Age-limited video', 'url': 'https://www.vevo.com/watch/justin-timberlake/tunnel-vision-explicit/USRV81300282', @@ -83,7 +83,7 @@ class VevoIE(VevoBaseIE): 'artist': 'Justin Timberlake', 'genre': 'Pop', }, - 'expected_warnings': ['Unable to download SMIL file'], + 'expected_warnings': ['Unable to download SMIL file', 'Unable to download info'], }, { 'note': 'No video_info', 'url': 'http://www.vevo.com/watch/k-camp-1/Till-I-Die/USUV71503000', @@ -91,15 +91,33 @@ class VevoIE(VevoBaseIE): 'info_dict': { 'id': 'USUV71503000', 'ext': 'mp4', - 'title': 'K Camp - Till I Die', + 'title': 'K Camp ft. T.I. - Till I Die', 'age_limit': 18, 'timestamp': 1449468000, 'upload_date': '20151207', 'uploader': 'K Camp', 'track': 'Till I Die', 'artist': 'K Camp', - 'genre': 'Rap/Hip-Hop', + 'genre': 'Hip-Hop', }, + 'expected_warnings': ['Unable to download SMIL file', 'Unable to download info'], + }, { + 'note': 'Featured test', + 'url': 'https://www.vevo.com/watch/lemaitre/Wait/USUV71402190', + 'md5': 'd28675e5e8805035d949dc5cf161071d', + 'info_dict': { + 'id': 'USUV71402190', + 'ext': 'mp4', + 'title': 'Lemaitre ft. LoLo - Wait', + 'age_limit': 0, + 'timestamp': 1413432000, + 'upload_date': '20141016', + 'uploader': 'Lemaitre', + 'track': 'Wait', + 'artist': 'Lemaitre', + 'genre': 'Electronic', + }, + 'expected_warnings': ['Unable to download SMIL file', 'Unable to download info'], }, { 'note': 'Only available via webpage', 'url': 'http://www.vevo.com/watch/GBUV71600656', @@ -122,21 +140,6 @@ class VevoIE(VevoBaseIE): 'url': 'http://www.vevo.com/watch/INS171400764', 'only_matching': True, }] - _SMIL_BASE_URL = 'http://smil.lvl3.vevo.com' - _SOURCE_TYPES = { - 0: 'youtube', - 1: 'brightcove', - 2: 'http', - 3: 'hls_ios', - 4: 'hls', - 5: 'smil', # http - 7: 'f4m_cc', - 8: 'f4m_ak', - 9: 'f4m_l3', - 10: 'ism', - 13: 'smil', # rtmp - 18: 'dash', - } _VERSIONS = { 0: 'youtube', # only in AuthenticateVideo videoVersions 1: 'level3', @@ -145,41 +148,6 @@ class VevoIE(VevoBaseIE): 4: 'amazon', } - def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None): - formats = [] - els = smil.findall('.//{http://www.w3.org/2001/SMIL20/Language}video') - for el in els: - src = el.attrib['src'] - m = re.match(r'''(?xi) - (?P[a-z0-9]+): - (?P - [/a-z0-9]+ # The directory and main part of the URL - _(?P[0-9]+)k - _(?P[0-9]+)x(?P[0-9]+) - _(?P[a-z0-9]+) - _(?P[0-9]+) - _(?P[a-z0-9]+) - _(?P[0-9]+) - \.[a-z0-9]+ # File extension - )''', src) - if not m: - continue - - format_url = self._SMIL_BASE_URL + m.group('path') - formats.append({ - 'url': format_url, - 'format_id': 'smil_' + m.group('tbr'), - 'vcodec': m.group('vcodec'), - 'acodec': m.group('acodec'), - 'tbr': int(m.group('tbr')), - 'vbr': int(m.group('vbr')), - 'abr': int(m.group('abr')), - 'ext': m.group('ext'), - 'width': int(m.group('width')), - 'height': int(m.group('height')), - }) - return formats - def _initialize_api(self, video_id): req = sanitized_Request( 'http://www.vevo.com/auth', data=b'') @@ -188,7 +156,7 @@ class VevoIE(VevoBaseIE): note='Retrieving oauth token', errnote='Unable to retrieve oauth token') - if 'THIS PAGE IS CURRENTLY UNAVAILABLE IN YOUR REGION' in webpage: + if re.search(r'(?i)THIS PAGE IS CURRENTLY UNAVAILABLE IN YOUR REGION', webpage): self.raise_geo_restricted( '%s said: This page is currently unavailable in your region' % self.IE_NAME) @@ -196,145 +164,91 @@ class VevoIE(VevoBaseIE): self._api_url_template = self.http_scheme() + '//apiv2.vevo.com/%s?token=' + auth_info['access_token'] def _call_api(self, path, *args, **kwargs): - return self._download_json(self._api_url_template % path, *args, **kwargs) + try: + data = self._download_json(self._api_url_template % path, *args, **kwargs) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError): + errors = self._parse_json(e.cause.read().decode(), None)['errors'] + error_message = ', '.join([error['message'] for error in errors]) + raise ExtractorError('%s said: %s' % (self.IE_NAME, error_message), expected=True) + raise + return data def _real_extract(self, url): video_id = self._match_id(url) - json_url = 'http://api.vevo.com/VideoService/AuthenticateVideo?isrc=%s' % video_id - response = self._download_json( - json_url, video_id, 'Downloading video info', - 'Unable to download info', fatal=False) or {} - video_info = response.get('video') or {} + self._initialize_api(video_id) + + video_info = self._call_api( + 'video/%s' % video_id, video_id, 'Downloading api video info', + 'Failed to download video info') + + video_versions = self._call_api( + 'video/%s/streams' % video_id, video_id, + 'Downloading video versions info', + 'Failed to download video versions info', + fatal=False) + + # Some videos are only available via webpage (e.g. + # https://github.com/rg3/youtube-dl/issues/9366) + if not video_versions: + webpage = self._download_webpage(url, video_id) + video_versions = self._extract_json(webpage, video_id, 'streams')[video_id][0] + + uploader = None artist = None featured_artist = None - uploader = None - view_count = None + artists = video_info.get('artists') + for curr_artist in artists: + if curr_artist.get('role') == 'Featured': + featured_artist = curr_artist['name'] + else: + artist = uploader = curr_artist['name'] + formats = [] + for video_version in video_versions: + version = self._VERSIONS.get(video_version['version']) + version_url = video_version.get('url') + if not version_url: + continue - if not video_info: - try: - self._initialize_api(video_id) - except ExtractorError: - ytid = response.get('errorInfo', {}).get('ytid') - if ytid: - self.report_warning( - 'Video is geoblocked, trying with the YouTube video %s' % ytid) - return self.url_result(ytid, 'Youtube', ytid) - - raise - - video_info = self._call_api( - 'video/%s' % video_id, video_id, 'Downloading api video info', - 'Failed to download video info') - - video_versions = self._call_api( - 'video/%s/streams' % video_id, video_id, - 'Downloading video versions info', - 'Failed to download video versions info', - fatal=False) - - # Some videos are only available via webpage (e.g. - # https://github.com/rg3/youtube-dl/issues/9366) - if not video_versions: - webpage = self._download_webpage(url, video_id) - video_versions = self._extract_json(webpage, video_id, 'streams')[video_id][0] - - timestamp = parse_iso8601(video_info.get('releaseDate')) - artists = video_info.get('artists') - if artists: - artist = uploader = artists[0]['name'] - view_count = int_or_none(video_info.get('views', {}).get('total')) - - for video_version in video_versions: - version = self._VERSIONS.get(video_version['version']) - version_url = video_version.get('url') - if not version_url: + if '.ism' in version_url: + continue + elif '.mpd' in version_url: + formats.extend(self._extract_mpd_formats( + version_url, video_id, mpd_id='dash-%s' % version, + note='Downloading %s MPD information' % version, + errnote='Failed to download %s MPD information' % version, + fatal=False)) + elif '.m3u8' in version_url: + formats.extend(self._extract_m3u8_formats( + version_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls-%s' % version, + note='Downloading %s m3u8 information' % version, + errnote='Failed to download %s m3u8 information' % version, + fatal=False)) + else: + m = re.search(r'''(?xi) + _(?P[0-9]+)x(?P[0-9]+) + _(?P[a-z0-9]+) + _(?P[0-9]+) + _(?P[a-z0-9]+) + _(?P[0-9]+) + \.(?P[a-z0-9]+)''', version_url) + if not m: continue - if '.ism' in version_url: - continue - elif '.mpd' in version_url: - formats.extend(self._extract_mpd_formats( - version_url, video_id, mpd_id='dash-%s' % version, - note='Downloading %s MPD information' % version, - errnote='Failed to download %s MPD information' % version, - fatal=False)) - elif '.m3u8' in version_url: - formats.extend(self._extract_m3u8_formats( - version_url, video_id, 'mp4', 'm3u8_native', - m3u8_id='hls-%s' % version, - note='Downloading %s m3u8 information' % version, - errnote='Failed to download %s m3u8 information' % version, - fatal=False)) - else: - m = re.search(r'''(?xi) - _(?P[0-9]+)x(?P[0-9]+) - _(?P[a-z0-9]+) - _(?P[0-9]+) - _(?P[a-z0-9]+) - _(?P[0-9]+) - \.(?P[a-z0-9]+)''', version_url) - if not m: - continue - - formats.append({ - 'url': version_url, - 'format_id': 'http-%s-%s' % (version, video_version['quality']), - 'vcodec': m.group('vcodec'), - 'acodec': m.group('acodec'), - 'vbr': int(m.group('vbr')), - 'abr': int(m.group('abr')), - 'ext': m.group('ext'), - 'width': int(m.group('width')), - 'height': int(m.group('height')), - }) - else: - timestamp = int_or_none(self._search_regex( - r'/Date\((\d+)\)/', - video_info['releaseDate'], 'release date', fatal=False), - scale=1000) - artists = video_info.get('mainArtists') - if artists: - artist = uploader = artists[0]['artistName'] - - featured_artists = video_info.get('featuredArtists') - if featured_artists: - featured_artist = featured_artists[0]['artistName'] - - smil_parsed = False - for video_version in video_info['videoVersions']: - version = self._VERSIONS.get(video_version['version']) - if version == 'youtube': - continue - else: - source_type = self._SOURCE_TYPES.get(video_version['sourceType']) - renditions = compat_etree_fromstring(video_version['data']) - if source_type == 'http': - for rend in renditions.findall('rendition'): - attr = rend.attrib - formats.append({ - 'url': attr['url'], - 'format_id': 'http-%s-%s' % (version, attr['name']), - 'height': int_or_none(attr.get('frameheight')), - 'width': int_or_none(attr.get('frameWidth')), - 'tbr': int_or_none(attr.get('totalBitrate')), - 'vbr': int_or_none(attr.get('videoBitrate')), - 'abr': int_or_none(attr.get('audioBitrate')), - 'vcodec': attr.get('videoCodec'), - 'acodec': attr.get('audioCodec'), - }) - elif source_type == 'hls': - formats.extend(self._extract_m3u8_formats( - renditions.find('rendition').attrib['url'], video_id, - 'mp4', 'm3u8_native', m3u8_id='hls-%s' % version, - note='Downloading %s m3u8 information' % version, - errnote='Failed to download %s m3u8 information' % version, - fatal=False)) - elif source_type == 'smil' and version == 'level3' and not smil_parsed: - formats.extend(self._extract_smil_formats( - renditions.find('rendition').attrib['url'], video_id, False)) - smil_parsed = True + formats.append({ + 'url': version_url, + 'format_id': 'http-%s-%s' % (version, video_version['quality']), + 'vcodec': m.group('vcodec'), + 'acodec': m.group('acodec'), + 'vbr': int(m.group('vbr')), + 'abr': int(m.group('abr')), + 'ext': m.group('ext'), + 'width': int(m.group('width')), + 'height': int(m.group('height')), + }) self._sort_formats(formats) track = video_info['title'] @@ -355,17 +269,15 @@ class VevoIE(VevoBaseIE): else: age_limit = None - duration = video_info.get('duration') - return { 'id': video_id, 'title': title, 'formats': formats, 'thumbnail': video_info.get('imageUrl') or video_info.get('thumbnailUrl'), - 'timestamp': timestamp, + 'timestamp': parse_iso8601(video_info.get('releaseDate')), 'uploader': uploader, - 'duration': duration, - 'view_count': view_count, + 'duration': int_or_none(video_info.get('duration')), + 'view_count': int_or_none(video_info.get('views', {}).get('total')), 'age_limit': age_limit, 'track': track, 'artist': uploader, @@ -374,7 +286,7 @@ class VevoIE(VevoBaseIE): class VevoPlaylistIE(VevoBaseIE): - _VALID_URL = r'https?://www\.vevo\.com/watch/(?Pplaylist|genre)/(?P[^/?#&]+)' + _VALID_URL = r'https?://(?:www\.)?vevo\.com/watch/(?Pplaylist|genre)/(?P[^/?#&]+)' _TESTS = [{ 'url': 'http://www.vevo.com/watch/playlist/dadbf4e7-b99f-4184-9670-6f0e547b6a29', diff --git a/youtube_dl/extractor/vgtv.py b/youtube_dl/extractor/vgtv.py index b11cd254c..8a574bc26 100644 --- a/youtube_dl/extractor/vgtv.py +++ b/youtube_dl/extractor/vgtv.py @@ -8,6 +8,7 @@ from .xstream import XstreamIE from ..utils import ( ExtractorError, float_or_none, + try_get, ) @@ -21,6 +22,7 @@ class VGTVIE(XstreamIE): 'fvn.no/fvntv': 'fvntv', 'aftenposten.no/webtv': 'aptv', 'ap.vgtv.no/webtv': 'aptv', + 'tv.aftonbladet.se/abtv': 'abtv', } _APP_NAME_TO_VENDOR = { @@ -29,6 +31,7 @@ class VGTVIE(XstreamIE): 'satv': 'sa', 'fvntv': 'fvn', 'aptv': 'ap', + 'abtv': 'ab', } _VALID_URL = r'''(?x) @@ -39,7 +42,8 @@ class VGTVIE(XstreamIE): /? (?: \#!/(?:video|live)/| - embed?.*id= + embed?.*id=| + articles/ )| (?P %s @@ -57,7 +61,7 @@ class VGTVIE(XstreamIE): 'ext': 'mp4', 'title': 'Hevnen er søt: Episode 10 - Abu', 'description': 'md5:e25e4badb5f544b04341e14abdc72234', - 'thumbnail': 're:^https?://.*\.jpg', + 'thumbnail': r're:^https?://.*\.jpg', 'duration': 648.000, 'timestamp': 1404626400, 'upload_date': '20140706', @@ -72,7 +76,7 @@ class VGTVIE(XstreamIE): 'ext': 'flv', 'title': 'OPPTAK: VGTV følger EM-kvalifiseringen', 'description': 'md5:3772d9c0dc2dff92a886b60039a7d4d3', - 'thumbnail': 're:^https?://.*\.jpg', + 'thumbnail': r're:^https?://.*\.jpg', 'duration': 9103.0, 'timestamp': 1410113864, 'upload_date': '20140907', @@ -92,7 +96,7 @@ class VGTVIE(XstreamIE): 'ext': 'mp4', 'title': 'V75 fra Solvalla 30.05.15', 'description': 'md5:b3743425765355855f88e096acc93231', - 'thumbnail': 're:^https?://.*\.jpg', + 'thumbnail': r're:^https?://.*\.jpg', 'duration': 25966, 'timestamp': 1432975582, 'upload_date': '20150530', @@ -129,6 +133,19 @@ class VGTVIE(XstreamIE): 'url': 'http://ap.vgtv.no/webtv#!/video/111084/de-nye-bysyklene-lettere-bedre-gir-stoerre-hjul-og-feste-til-mobil', 'only_matching': True, }, + { + # geoblocked + 'url': 'http://www.vgtv.no/#!/video/127205/inside-the-mind-of-favela-funk', + 'only_matching': True, + }, + { + 'url': 'http://tv.aftonbladet.se/abtv/articles/36015', + 'only_matching': True, + }, + { + 'url': 'abtv:140026', + 'only_matching': True, + } ] def _real_extract(self, url): @@ -183,7 +200,7 @@ class VGTVIE(XstreamIE): format_info = { 'url': mp4_url, } - mobj = re.search('(\d+)_(\d+)_(\d+)', mp4_url) + mobj = re.search(r'(\d+)_(\d+)_(\d+)', mp4_url) if mobj: tbr = int(mobj.group(3)) format_info.update({ @@ -196,6 +213,12 @@ class VGTVIE(XstreamIE): info['formats'].extend(formats) + if not info['formats']: + properties = try_get( + data, lambda x: x['streamConfiguration']['properties'], list) + if properties and 'geoblocked' in properties: + raise self.raise_geo_restricted() + self._sort_formats(info['formats']) info.update({ @@ -223,7 +246,7 @@ class BTArticleIE(InfoExtractor): 'ext': 'mp4', 'title': 'Alrekstad internat', 'description': 'md5:dc81a9056c874fedb62fc48a300dac58', - 'thumbnail': 're:^https?://.*\.jpg', + 'thumbnail': r're:^https?://.*\.jpg', 'duration': 191, 'timestamp': 1289991323, 'upload_date': '20101117', diff --git a/youtube_dl/extractor/vice.py b/youtube_dl/extractor/vice.py index e2b2ce098..8a00c8fee 100644 --- a/youtube_dl/extractor/vice.py +++ b/youtube_dl/extractor/vice.py @@ -1,12 +1,93 @@ +# coding: utf-8 from __future__ import unicode_literals import re +import time +import hashlib +import json +from .adobepass import AdobePassIE from .common import InfoExtractor -from ..utils import ExtractorError +from ..compat import compat_HTTPError +from ..utils import ( + int_or_none, + parse_age_limit, + str_or_none, + parse_duration, + ExtractorError, + extract_attributes, +) -class ViceIE(InfoExtractor): +class ViceBaseIE(AdobePassIE): + def _extract_preplay_video(self, url, webpage): + watch_hub_data = extract_attributes(self._search_regex( + r'(?s)()', webpage, 'watch hub')) + video_id = watch_hub_data['vms-id'] + title = watch_hub_data['video-title'] + + query = {} + is_locked = watch_hub_data.get('video-locked') == '1' + if is_locked: + resource = self._get_mvpd_resource( + 'VICELAND', title, video_id, + watch_hub_data.get('video-rating')) + query['tvetoken'] = self._extract_mvpd_auth(url, video_id, 'VICELAND', resource) + + # signature generation algorithm is reverse engineered from signatureGenerator in + # webpack:///../shared/~/vice-player/dist/js/vice-player.js in + # https://www.viceland.com/assets/common/js/web.vendor.bundle.js + exp = int(time.time()) + 14400 + query.update({ + 'exp': exp, + 'sign': hashlib.sha512(('%s:GET:%d' % (video_id, exp)).encode()).hexdigest(), + }) + + try: + host = 'www.viceland' if is_locked else self._PREPLAY_HOST + preplay = self._download_json('https://%s.com/en_us/preplay/%s' % (host, video_id), video_id, query=query) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400: + error = json.loads(e.cause.read().decode()) + raise ExtractorError('%s said: %s' % (self.IE_NAME, error['details']), expected=True) + raise + + video_data = preplay['video'] + base = video_data['base'] + uplynk_preplay_url = preplay['preplayURL'] + episode = video_data.get('episode', {}) + channel = video_data.get('channel', {}) + + subtitles = {} + cc_url = preplay.get('ccURL') + if cc_url: + subtitles['en'] = [{ + 'url': cc_url, + }] + + return { + '_type': 'url_transparent', + 'url': uplynk_preplay_url, + 'id': video_id, + 'title': title, + 'description': base.get('body'), + 'thumbnail': watch_hub_data.get('cover-image') or watch_hub_data.get('thumbnail'), + 'duration': parse_duration(video_data.get('video_duration') or watch_hub_data.get('video-duration')), + 'timestamp': int_or_none(video_data.get('created_at')), + 'age_limit': parse_age_limit(video_data.get('video_rating')), + 'series': video_data.get('show_title') or watch_hub_data.get('show-title'), + 'episode_number': int_or_none(episode.get('episode_number') or watch_hub_data.get('episode')), + 'episode_id': str_or_none(episode.get('id') or video_data.get('episode_id')), + 'season_number': int_or_none(watch_hub_data.get('season')), + 'season_id': str_or_none(episode.get('season_id')), + 'uploader': channel.get('base', {}).get('title') or watch_hub_data.get('channel-title'), + 'uploader_id': str_or_none(channel.get('id')), + 'subtitles': subtitles, + 'ie_key': 'UplynkPreplay', + } + + +class ViceIE(ViceBaseIE): _VALID_URL = r'https?://(?:.+?\.)?vice\.com/(?:[^/]+/)?videos?/(?P[^/?#&]+)' _TESTS = [{ @@ -21,7 +102,7 @@ class ViceIE(InfoExtractor): 'add_ie': ['Ooyala'], }, { 'url': 'http://www.vice.com/video/how-to-hack-a-car', - 'md5': '6fb2989a3fed069fb8eab3401fc2d3c9', + 'md5': 'a7ecf64ee4fa19b916c16f4b56184ae2', 'info_dict': { 'id': '3jstaBeXgAs', 'ext': 'mp4', @@ -32,6 +113,22 @@ class ViceIE(InfoExtractor): 'upload_date': '20140529', }, 'add_ie': ['Youtube'], + }, { + 'url': 'https://video.vice.com/en_us/video/the-signal-from-tolva/5816510690b70e6c5fd39a56', + 'md5': '', + 'info_dict': { + 'id': '5816510690b70e6c5fd39a56', + 'ext': 'mp4', + 'uploader': 'Waypoint', + 'title': 'The Signal From Tölva', + 'uploader_id': '57f7d621e05ca860fa9ccaf9', + 'timestamp': 1477941983938, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'add_ie': ['UplynkPreplay'], }, { 'url': 'https://news.vice.com/video/experimenting-on-animals-inside-the-monkey-lab', 'only_matching': True, @@ -42,21 +139,21 @@ class ViceIE(InfoExtractor): 'url': 'https://munchies.vice.com/en/videos/watch-the-trailer-for-our-new-series-the-pizza-show', 'only_matching': True, }] + _PREPLAY_HOST = 'video.vice' def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - try: - embed_code = self._search_regex( - r'embedCode=([^&\'"]+)', webpage, - 'ooyala embed code', default=None) - if embed_code: - return self.url_result('ooyala:%s' % embed_code, 'Ooyala') - youtube_id = self._search_regex( - r'data-youtube-id="([^"]+)"', webpage, 'youtube id') + webpage, urlh = self._download_webpage_handle(url, video_id) + embed_code = self._search_regex( + r'embedCode=([^&\'"]+)', webpage, + 'ooyala embed code', default=None) + if embed_code: + return self.url_result('ooyala:%s' % embed_code, 'Ooyala') + youtube_id = self._search_regex( + r'data-youtube-id="([^"]+)"', webpage, 'youtube id', default=None) + if youtube_id: return self.url_result(youtube_id, 'Youtube') - except ExtractorError: - raise ExtractorError('The page doesn\'t contain a video', expected=True) + return self._extract_preplay_video(urlh.geturl(), webpage) class ViceShowIE(InfoExtractor): diff --git a/youtube_dl/extractor/viceland.py b/youtube_dl/extractor/viceland.py new file mode 100644 index 000000000..0eff055a6 --- /dev/null +++ b/youtube_dl/extractor/viceland.py @@ -0,0 +1,33 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .vice import ViceBaseIE + + +class VicelandIE(ViceBaseIE): + _VALID_URL = r'https?://(?:www\.)?viceland\.com/[^/]+/video/[^/]+/(?P[a-f0-9]+)' + _TEST = { + 'url': 'https://www.viceland.com/en_us/video/cyberwar-trailer/57608447973ee7705f6fbd4e', + 'info_dict': { + 'id': '57608447973ee7705f6fbd4e', + 'ext': 'mp4', + 'title': 'CYBERWAR (Trailer)', + 'description': 'Tapping into the geopolitics of hacking and surveillance, Ben Makuch travels the world to meet with hackers, government officials, and dissidents to investigate the ecosystem of cyberwarfare.', + 'age_limit': 14, + 'timestamp': 1466008539, + 'upload_date': '20160615', + 'uploader_id': '11', + 'uploader': 'Viceland', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'add_ie': ['UplynkPreplay'], + } + _PREPLAY_HOST = 'www.viceland' + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + return self._extract_preplay_video(url, webpage) diff --git a/youtube_dl/extractor/vidbit.py b/youtube_dl/extractor/vidbit.py index e7ac5a842..91f45b7cc 100644 --- a/youtube_dl/extractor/vidbit.py +++ b/youtube_dl/extractor/vidbit.py @@ -20,7 +20,7 @@ class VidbitIE(InfoExtractor): 'ext': 'mp4', 'title': 'Intro to VidBit', 'description': 'md5:5e0d6142eec00b766cbf114bfd3d16b7', - 'thumbnail': 're:https?://.*\.jpg$', + 'thumbnail': r're:https?://.*\.jpg$', 'upload_date': '20160618', 'view_count': int, 'comment_count': int, diff --git a/youtube_dl/extractor/viddler.py b/youtube_dl/extractor/viddler.py index 8d92aee87..67808e7e6 100644 --- a/youtube_dl/extractor/viddler.py +++ b/youtube_dl/extractor/viddler.py @@ -26,7 +26,7 @@ class ViddlerIE(InfoExtractor): 'timestamp': 1335371429, 'upload_date': '20120425', 'duration': 100.89, - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'view_count': int, 'comment_count': int, 'categories': ['video content', 'high quality video', 'video made easy', 'how to produce video with limited resources', 'viddler'], diff --git a/youtube_dl/extractor/videa.py b/youtube_dl/extractor/videa.py new file mode 100644 index 000000000..311df58f4 --- /dev/null +++ b/youtube_dl/extractor/videa.py @@ -0,0 +1,97 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + mimetype2ext, + parse_codecs, + xpath_element, + xpath_text, +) + + +class VideaIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?:// + videa\.hu/ + (?: + videok/(?:[^/]+/)*[^?#&]+-| + player\?.*?\bv=| + player/v/ + ) + (?P[^?#&]+) + ''' + _TESTS = [{ + 'url': 'http://videa.hu/videok/allatok/az-orult-kigyasz-285-kigyot-kigyo-8YfIAjxwWGwT8HVQ', + 'md5': '97a7af41faeaffd9f1fc864a7c7e7603', + 'info_dict': { + 'id': '8YfIAjxwWGwT8HVQ', + 'ext': 'mp4', + 'title': 'Az őrült kígyász 285 kígyót enged szabadon', + 'thumbnail': 'http://videa.hu/static/still/1.4.1.1007274.1204470.3', + 'duration': 21, + }, + }, { + 'url': 'http://videa.hu/videok/origo/jarmuvek/supercars-elozes-jAHDWfWSJH5XuFhH', + 'only_matching': True, + }, { + 'url': 'http://videa.hu/player?v=8YfIAjxwWGwT8HVQ', + 'only_matching': True, + }, { + 'url': 'http://videa.hu/player/v/8YfIAjxwWGwT8HVQ?autoplay=1', + 'only_matching': True, + }] + + @staticmethod + def _extract_urls(webpage): + return [url for _, url in re.findall( + r']+src=(["\'])(?P(?:https?:)?//videa\.hu/player\?.*?\bv=.+?)\1', + webpage)] + + def _real_extract(self, url): + video_id = self._match_id(url) + + info = self._download_xml( + 'http://videa.hu/videaplayer_get_xml.php', video_id, + query={'v': video_id}) + + video = xpath_element(info, './/video', 'video', fatal=True) + sources = xpath_element(info, './/video_sources', 'sources', fatal=True) + + title = xpath_text(video, './title', fatal=True) + + formats = [] + for source in sources.findall('./video_source'): + source_url = source.text + if not source_url: + continue + f = parse_codecs(source.get('codecs')) + f.update({ + 'url': source_url, + 'ext': mimetype2ext(source.get('mimetype')) or 'mp4', + 'format_id': source.get('name'), + 'width': int_or_none(source.get('width')), + 'height': int_or_none(source.get('height')), + }) + formats.append(f) + self._sort_formats(formats) + + thumbnail = xpath_text(video, './poster_src') + duration = int_or_none(xpath_text(video, './duration')) + + age_limit = None + is_adult = xpath_text(video, './is_adult_content', default=None) + if is_adult: + age_limit = 18 if is_adult == '1' else 0 + + return { + 'id': video_id, + 'title': title, + 'thumbnail': thumbnail, + 'duration': duration, + 'age_limit': age_limit, + 'formats': formats, + } diff --git a/youtube_dl/extractor/videodetective.py b/youtube_dl/extractor/videodetective.py index 2ed5d9643..a19411a05 100644 --- a/youtube_dl/extractor/videodetective.py +++ b/youtube_dl/extractor/videodetective.py @@ -6,7 +6,7 @@ from .internetvideoarchive import InternetVideoArchiveIE class VideoDetectiveIE(InfoExtractor): - _VALID_URL = r'https?://www\.videodetective\.com/[^/]+/[^/]+/(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?videodetective\.com/[^/]+/[^/]+/(?P\d+)' _TEST = { 'url': 'http://www.videodetective.com/movies/kick-ass-2/194487', diff --git a/youtube_dl/extractor/videomega.py b/youtube_dl/extractor/videomega.py index 4f0dcd18c..c02830ddd 100644 --- a/youtube_dl/extractor/videomega.py +++ b/youtube_dl/extractor/videomega.py @@ -19,7 +19,7 @@ class VideoMegaIE(InfoExtractor): 'id': 'AOSQBJYKIDDIKYJBQSOA', 'ext': 'mp4', 'title': '1254207', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', } }, { 'url': 'http://videomega.tv/cdn.php?ref=AOSQBJYKIDDIKYJBQSOA&width=1070&height=600', diff --git a/youtube_dl/extractor/videomore.py b/youtube_dl/extractor/videomore.py index 04e95c66e..9b56630de 100644 --- a/youtube_dl/extractor/videomore.py +++ b/youtube_dl/extractor/videomore.py @@ -6,8 +6,7 @@ import re from .common import InfoExtractor from ..utils import ( int_or_none, - parse_age_limit, - parse_iso8601, + xpath_element, xpath_text, ) @@ -17,38 +16,32 @@ class VideomoreIE(InfoExtractor): _VALID_URL = r'videomore:(?P\d+)$|https?://videomore\.ru/(?:(?:embed|[^/]+/[^/]+)/|[^/]+\?.*\btrack_id=)(?P\d+)(?:[/?#&]|\.(?:xml|json)|$)' _TESTS = [{ 'url': 'http://videomore.ru/kino_v_detalayah/5_sezon/367617', - 'md5': '70875fbf57a1cd004709920381587185', + 'md5': '44455a346edc0d509ac5b5a5b531dc35', 'info_dict': { 'id': '367617', 'ext': 'flv', - 'title': 'В гостях Алексей Чумаков и Юлия Ковальчук', - 'description': 'В гостях – лучшие романтические комедии года, «Выживший» Иньярриту и «Стив Джобс» Дэнни Бойла.', + 'title': 'Кино в деталях 5 сезон В гостях Алексей Чумаков и Юлия Ковальчук', 'series': 'Кино в деталях', 'episode': 'В гостях Алексей Чумаков и Юлия Ковальчук', - 'episode_number': None, - 'season': 'Сезон 2015', - 'season_number': 5, - 'thumbnail': 're:^https?://.*\.jpg', + 'thumbnail': r're:^https?://.*\.jpg', 'duration': 2910, - 'age_limit': 16, 'view_count': int, + 'comment_count': int, + 'age_limit': 16, }, }, { 'url': 'http://videomore.ru/embed/259974', 'info_dict': { 'id': '259974', 'ext': 'flv', - 'title': '80 серия', - 'description': '«Медведей» ждет решающий матч. Макеев выясняет отношения со Стрельцовым. Парни узнают подробности прошлого Макеева.', + 'title': 'Молодежка 2 сезон 40 серия', 'series': 'Молодежка', - 'episode': '80 серия', - 'episode_number': 40, - 'season': '2 сезон', - 'season_number': 2, - 'thumbnail': 're:^https?://.*\.jpg', + 'episode': '40 серия', + 'thumbnail': r're:^https?://.*\.jpg', 'duration': 2809, - 'age_limit': 16, 'view_count': int, + 'comment_count': int, + 'age_limit': 16, }, 'params': { 'skip_download': True, @@ -58,14 +51,9 @@ class VideomoreIE(InfoExtractor): 'info_dict': { 'id': '341073', 'ext': 'flv', - 'title': 'Команда проиграла из-за Бакина?', - 'description': 'Молодежка 3 сезон скоро', - 'series': 'Молодежка', + 'title': 'Промо Команда проиграла из-за Бакина?', 'episode': 'Команда проиграла из-за Бакина?', - 'episode_number': None, - 'season': 'Промо', - 'season_number': 99, - 'thumbnail': 're:^https?://.*\.jpg', + 'thumbnail': r're:^https?://.*\.jpg', 'duration': 29, 'age_limit': 16, 'view_count': int, @@ -96,8 +84,13 @@ class VideomoreIE(InfoExtractor): @staticmethod def _extract_url(webpage): mobj = re.search( - r']+data=(["\'])https?://videomore.ru/player\.swf\?.*config=(?Phttps?://videomore\.ru/(?:[^/]+/)+\d+\.xml).*\1', + r']+data=(["\'])https?://videomore\.ru/player\.swf\?.*config=(?Phttps?://videomore\.ru/(?:[^/]+/)+\d+\.xml).*\1', webpage) + if not mobj: + mobj = re.search( + r']+src=([\'"])(?Phttps?://videomore\.ru/embed/\d+)', + webpage) + if mobj: return mobj.group('url') @@ -109,43 +102,33 @@ class VideomoreIE(InfoExtractor): 'http://videomore.ru/video/tracks/%s.xml' % video_id, video_id, 'Downloading video XML') - video_url = xpath_text(video, './/video_url', 'video url', fatal=True) + item = xpath_element(video, './/playlist/item', fatal=True) + + title = xpath_text( + item, ('./title', './episode_name'), 'title', fatal=True) + + video_url = xpath_text(item, './video_url', 'video url', fatal=True) formats = self._extract_f4m_formats(video_url, video_id, f4m_id='hds') self._sort_formats(formats) - data = self._download_json( - 'http://videomore.ru/video/tracks/%s.json' % video_id, - video_id, 'Downloading video JSON') + thumbnail = xpath_text(item, './thumbnail_url') + duration = int_or_none(xpath_text(item, './duration')) + view_count = int_or_none(xpath_text(item, './views')) + comment_count = int_or_none(xpath_text(item, './count_comments')) + age_limit = int_or_none(xpath_text(item, './min_age')) - title = data.get('title') or data['project_title'] - description = data.get('description') or data.get('description_raw') - timestamp = parse_iso8601(data.get('published_at')) - duration = int_or_none(data.get('duration')) - view_count = int_or_none(data.get('views')) - age_limit = parse_age_limit(data.get('min_age')) - thumbnails = [{ - 'url': thumbnail, - } for thumbnail in data.get('big_thumbnail_urls', [])] - - series = data.get('project_title') - episode = data.get('title') - episode_number = int_or_none(data.get('episode_of_season') or None) - season = data.get('season_title') - season_number = int_or_none(data.get('season_pos') or None) + series = xpath_text(item, './project_name') + episode = xpath_text(item, './episode_name') return { 'id': video_id, 'title': title, - 'description': description, 'series': series, 'episode': episode, - 'episode_number': episode_number, - 'season': season, - 'season_number': season_number, - 'thumbnails': thumbnails, - 'timestamp': timestamp, + 'thumbnail': thumbnail, 'duration': duration, 'view_count': view_count, + 'comment_count': comment_count, 'age_limit': age_limit, 'formats': formats, } @@ -162,7 +145,7 @@ class VideomoreVideoIE(InfoExtractor): 'ext': 'flv', 'title': 'Ёлки 3', 'description': '', - 'thumbnail': 're:^https?://.*\.jpg', + 'thumbnail': r're:^https?://.*\.jpg', 'duration': 5579, 'age_limit': 6, 'view_count': int, @@ -185,7 +168,7 @@ class VideomoreVideoIE(InfoExtractor): 'ext': 'flv', 'title': '1 серия. Здравствуй, Аквавилль!', 'description': 'md5:c6003179538b5d353e7bcd5b1372b2d7', - 'thumbnail': 're:^https?://.*\.jpg', + 'thumbnail': r're:^https?://.*\.jpg', 'duration': 754, 'age_limit': 6, 'view_count': int, diff --git a/youtube_dl/extractor/videott.py b/youtube_dl/extractor/videott.py deleted file mode 100644 index 0f798711b..000000000 --- a/youtube_dl/extractor/videott.py +++ /dev/null @@ -1,65 +0,0 @@ -from __future__ import unicode_literals - -import re -import base64 - -from .common import InfoExtractor -from ..utils import ( - unified_strdate, - int_or_none, -) - - -class VideoTtIE(InfoExtractor): - _WORKING = False - ID_NAME = 'video.tt' - IE_DESC = 'video.tt - Your True Tube' - _VALID_URL = r'https?://(?:www\.)?video\.tt/(?:(?:video|embed)/|watch_video\.php\?v=)(?P[\da-zA-Z]{9})' - - _TESTS = [{ - 'url': 'http://www.video.tt/watch_video.php?v=amd5YujV8', - 'md5': 'b13aa9e2f267effb5d1094443dff65ba', - 'info_dict': { - 'id': 'amd5YujV8', - 'ext': 'flv', - 'title': 'Motivational video Change your mind in just 2.50 mins', - 'description': '', - 'upload_date': '20130827', - 'uploader': 'joseph313', - } - }, { - 'url': 'http://video.tt/embed/amd5YujV8', - 'only_matching': True, - }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - - settings = self._download_json( - 'http://www.video.tt/player_control/settings.php?v=%s' % video_id, video_id, - 'Downloading video JSON')['settings'] - - video = settings['video_details']['video'] - - formats = [ - { - 'url': base64.b64decode(res['u'].encode('utf-8')).decode('utf-8'), - 'ext': 'flv', - 'format_id': res['l'], - } for res in settings['res'] if res['u'] - ] - - return { - 'id': video_id, - 'title': video['title'], - 'description': video['description'], - 'thumbnail': settings['config']['thumbnail'], - 'upload_date': unified_strdate(video['added']), - 'uploader': video['owner'], - 'view_count': int_or_none(video['view_count']), - 'comment_count': None if video.get('comment_count') == '--' else int_or_none(video['comment_count']), - 'like_count': int_or_none(video['liked']), - 'dislike_count': int_or_none(video['disliked']), - 'formats': formats, - } diff --git a/youtube_dl/extractor/vidio.py b/youtube_dl/extractor/vidio.py index 6898042de..4e4b4e38c 100644 --- a/youtube_dl/extractor/vidio.py +++ b/youtube_dl/extractor/vidio.py @@ -18,7 +18,7 @@ class VidioIE(InfoExtractor): 'ext': 'mp4', 'title': 'DJ_AMBRED - Booyah (Live 2015)', 'description': 'md5:27dc15f819b6a78a626490881adbadf8', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 149, 'like_count': int, }, diff --git a/youtube_dl/extractor/vidme.py b/youtube_dl/extractor/vidme.py index b1156d531..e9ff336c4 100644 --- a/youtube_dl/extractor/vidme.py +++ b/youtube_dl/extractor/vidme.py @@ -23,7 +23,7 @@ class VidmeIE(InfoExtractor): 'ext': 'mp4', 'title': 'Fishing for piranha - the easy way', 'description': 'source: https://www.facebook.com/photo.php?v=312276045600871', - 'thumbnail': 're:^https?://.*\.jpg', + 'thumbnail': r're:^https?://.*\.jpg', 'timestamp': 1406313244, 'upload_date': '20140725', 'age_limit': 0, @@ -39,7 +39,7 @@ class VidmeIE(InfoExtractor): 'id': 'Gc6M', 'ext': 'mp4', 'title': 'O Mere Dil ke chain - Arnav and Khushi VM', - 'thumbnail': 're:^https?://.*\.jpg', + 'thumbnail': r're:^https?://.*\.jpg', 'timestamp': 1441211642, 'upload_date': '20150902', 'uploader': 'SunshineM', @@ -61,7 +61,7 @@ class VidmeIE(InfoExtractor): 'ext': 'mp4', 'title': 'The Carver', 'description': 'md5:e9c24870018ae8113be936645b93ba3c', - 'thumbnail': 're:^https?://.*\.jpg', + 'thumbnail': r're:^https?://.*\.jpg', 'timestamp': 1433203629, 'upload_date': '20150602', 'uploader': 'Thomas', @@ -82,7 +82,7 @@ class VidmeIE(InfoExtractor): 'id': 'Wmur', 'ext': 'mp4', 'title': 'naked smoking & stretching', - 'thumbnail': 're:^https?://.*\.jpg', + 'thumbnail': r're:^https?://.*\.jpg', 'timestamp': 1430931613, 'upload_date': '20150506', 'uploader': 'naked-yogi', @@ -115,7 +115,7 @@ class VidmeIE(InfoExtractor): 'id': 'e5g', 'ext': 'mp4', 'title': 'Video upload (e5g)', - 'thumbnail': 're:^https?://.*\.jpg', + 'thumbnail': r're:^https?://.*\.jpg', 'timestamp': 1401480195, 'upload_date': '20140530', 'uploader': None, diff --git a/youtube_dl/extractor/vidzi.py b/youtube_dl/extractor/vidzi.py index d49cc6cbc..9950c62ad 100644 --- a/youtube_dl/extractor/vidzi.py +++ b/youtube_dl/extractor/vidzi.py @@ -1,10 +1,14 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .jwplatform import JWPlatformBaseIE from ..utils import ( decode_packed_codes, js_to_json, + NO_DEFAULT, + PACKED_CODES_RE, ) @@ -35,10 +39,17 @@ class VidziIE(JWPlatformBaseIE): title = self._html_search_regex( r'(?s)

(.*?)

', webpage, 'title') - code = decode_packed_codes(webpage).replace('\\\'', '\'') - jwplayer_data = self._parse_json( - self._search_regex(r'setup\(([^)]+)\)', code, 'jwplayer data'), - video_id, transform_source=js_to_json) + packed_codes = [mobj.group(0) for mobj in re.finditer( + PACKED_CODES_RE, webpage)] + for num, pc in enumerate(packed_codes, 1): + code = decode_packed_codes(pc).replace('\\\'', '\'') + jwplayer_data = self._parse_json( + self._search_regex( + r'setup\(([^)]+)\)', code, 'jwplayer data', + default=NO_DEFAULT if num == len(packed_codes) else '{}'), + video_id, transform_source=js_to_json) + if jwplayer_data: + break info_dict = self._parse_jwplayer_data(jwplayer_data, video_id, require_title=False) info_dict['title'] = title diff --git a/youtube_dl/extractor/vier.py b/youtube_dl/extractor/vier.py index 6645c6186..d26fb49b3 100644 --- a/youtube_dl/extractor/vier.py +++ b/youtube_dl/extractor/vier.py @@ -48,8 +48,8 @@ class VierIE(InfoExtractor): [r'data-filename="([^"]+)"', r'"filename"\s*:\s*"([^"]+)"'], webpage, 'filename') - playlist_url = 'http://vod.streamcloud.be/%s/mp4:_definst_/%s.mp4/playlist.m3u8' % (application, filename) - formats = self._extract_m3u8_formats(playlist_url, display_id, 'mp4') + playlist_url = 'http://vod.streamcloud.be/%s/_definst_/mp4:%s.mp4/playlist.m3u8' % (application, filename) + formats = self._extract_wowza_formats(playlist_url, display_id, skip_protocols=['dash']) self._sort_formats(formats) title = self._og_search_title(webpage, default=display_id) diff --git a/youtube_dl/extractor/viewlift.py b/youtube_dl/extractor/viewlift.py index 19500eba8..18735cfb2 100644 --- a/youtube_dl/extractor/viewlift.py +++ b/youtube_dl/extractor/viewlift.py @@ -14,7 +14,7 @@ from ..utils import ( class ViewLiftBaseIE(InfoExtractor): - _DOMAINS_REGEX = '(?:snagfilms|snagxtreme|funnyforfree|kiddovid|winnersview|monumentalsportsnetwork|vayafilm)\.com|kesari\.tv' + _DOMAINS_REGEX = r'(?:snagfilms|snagxtreme|funnyforfree|kiddovid|winnersview|monumentalsportsnetwork|vayafilm)\.com|kesari\.tv' class ViewLiftEmbedIE(ViewLiftBaseIE): @@ -110,7 +110,7 @@ class ViewLiftIE(ViewLiftBaseIE): 'ext': 'mp4', 'title': 'Lost for Life', 'description': 'md5:fbdacc8bb6b455e464aaf98bc02e1c82', - 'thumbnail': 're:^https?://.*\.jpg', + 'thumbnail': r're:^https?://.*\.jpg', 'duration': 4489, 'categories': ['Documentary', 'Crime', 'Award Winning', 'Festivals'] } @@ -123,7 +123,7 @@ class ViewLiftIE(ViewLiftBaseIE): 'ext': 'mp4', 'title': 'India', 'description': 'md5:5c168c5a8f4719c146aad2e0dfac6f5f', - 'thumbnail': 're:^https?://.*\.jpg', + 'thumbnail': r're:^https?://.*\.jpg', 'duration': 979, 'categories': ['Documentary', 'Sports', 'Politics'] } @@ -160,7 +160,7 @@ class ViewLiftIE(ViewLiftBaseIE): snag = self._parse_json( self._search_regex( - 'Snag\.page\.data\s*=\s*(\[.+?\]);', webpage, 'snag'), + r'Snag\.page\.data\s*=\s*(\[.+?\]);', webpage, 'snag'), display_id) for item in snag: diff --git a/youtube_dl/extractor/viewster.py b/youtube_dl/extractor/viewster.py index a93196a07..52dd95e2f 100644 --- a/youtube_dl/extractor/viewster.py +++ b/youtube_dl/extractor/viewster.py @@ -157,7 +157,7 @@ class ViewsterIE(InfoExtractor): formats.extend(m3u8_formats) else: qualities_basename = self._search_regex( - '/([^/]+)\.csmil/', + r'/([^/]+)\.csmil/', manifest_url, 'qualities basename', default=None) if not qualities_basename: continue diff --git a/youtube_dl/extractor/viidea.py b/youtube_dl/extractor/viidea.py index a4f914d14..4adcd1830 100644 --- a/youtube_dl/extractor/viidea.py +++ b/youtube_dl/extractor/viidea.py @@ -40,7 +40,7 @@ class ViideaIE(InfoExtractor): 'ext': 'mp4', 'title': 'Automatics, robotics and biocybernetics', 'description': 'md5:815fc1deb6b3a2bff99de2d5325be482', - 'thumbnail': 're:http://.*\.jpg', + 'thumbnail': r're:http://.*\.jpg', 'timestamp': 1372349289, 'upload_date': '20130627', 'duration': 565, @@ -58,7 +58,7 @@ class ViideaIE(InfoExtractor): 'ext': 'flv', 'title': 'NLP at Google', 'description': 'md5:fc7a6d9bf0302d7cc0e53f7ca23747b3', - 'thumbnail': 're:http://.*\.jpg', + 'thumbnail': r're:http://.*\.jpg', 'timestamp': 1284375600, 'upload_date': '20100913', 'duration': 5352, @@ -74,7 +74,7 @@ class ViideaIE(InfoExtractor): 'id': '23181', 'title': 'Deep Learning Summer School, Montreal 2015', 'description': 'md5:0533a85e4bd918df52a01f0e1ebe87b7', - 'thumbnail': 're:http://.*\.jpg', + 'thumbnail': r're:http://.*\.jpg', 'timestamp': 1438560000, }, 'playlist_count': 30, @@ -85,7 +85,7 @@ class ViideaIE(InfoExtractor): 'id': '9737', 'display_id': 'mlss09uk_bishop_ibi', 'title': 'Introduction To Bayesian Inference', - 'thumbnail': 're:http://.*\.jpg', + 'thumbnail': r're:http://.*\.jpg', 'timestamp': 1251622800, }, 'playlist': [{ @@ -94,7 +94,7 @@ class ViideaIE(InfoExtractor): 'display_id': 'mlss09uk_bishop_ibi_part1', 'ext': 'wmv', 'title': 'Introduction To Bayesian Inference (Part 1)', - 'thumbnail': 're:http://.*\.jpg', + 'thumbnail': r're:http://.*\.jpg', 'duration': 4622, 'timestamp': 1251622800, 'upload_date': '20090830', @@ -105,7 +105,7 @@ class ViideaIE(InfoExtractor): 'display_id': 'mlss09uk_bishop_ibi_part2', 'ext': 'wmv', 'title': 'Introduction To Bayesian Inference (Part 2)', - 'thumbnail': 're:http://.*\.jpg', + 'thumbnail': r're:http://.*\.jpg', 'duration': 5641, 'timestamp': 1251622800, 'upload_date': '20090830', diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py index 4351ac457..9c48701c1 100644 --- a/youtube_dl/extractor/viki.py +++ b/youtube_dl/extractor/viki.py @@ -1,11 +1,12 @@ # coding: utf-8 from __future__ import unicode_literals -import json -import time -import hmac import hashlib +import hmac import itertools +import json +import re +import time from .common import InfoExtractor from ..utils import ( @@ -276,10 +277,14 @@ class VikiIE(VikiBaseIE): height = int_or_none(self._search_regex( r'^(\d+)[pP]$', format_id, 'height', default=None)) for protocol, format_dict in stream_dict.items(): + # rtmps URLs does not seem to work + if protocol == 'rtmps': + continue + format_url = format_dict['url'] if format_id == 'm3u8': m3u8_formats = self._extract_m3u8_formats( - format_dict['url'], video_id, 'mp4', - entry_protocol='m3u8_native', preference=-1, + format_url, video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='m3u8-%s' % protocol, fatal=False) # Despite CODECS metadata in m3u8 all video-only formats # are actually video+audio @@ -287,9 +292,23 @@ class VikiIE(VikiBaseIE): if f.get('acodec') == 'none' and f.get('vcodec') != 'none': f['acodec'] = None formats.extend(m3u8_formats) + elif format_url.startswith('rtmp'): + mobj = re.search( + r'^(?Prtmp://[^/]+/(?P.+?))/(?Pmp4:.+)$', + format_url) + if not mobj: + continue + formats.append({ + 'format_id': 'rtmp-%s' % format_id, + 'ext': 'flv', + 'url': mobj.group('url'), + 'play_path': mobj.group('playpath'), + 'app': mobj.group('app'), + 'page_url': url, + }) else: formats.append({ - 'url': format_dict['url'], + 'url': format_url, 'format_id': '%s-%s' % (format_id, protocol), 'height': height, }) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 7e854f326..61cc469bf 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -1,4 +1,4 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals import json @@ -21,12 +21,12 @@ from ..utils import ( sanitized_Request, smuggle_url, std_headers, - unified_strdate, + try_get, + unified_timestamp, unsmuggle_url, urlencode_postdata, unescapeHTML, parse_filesize, - try_get, ) @@ -92,29 +92,30 @@ class VimeoBaseInfoExtractor(InfoExtractor): def _vimeo_sort_formats(self, formats): # Bitrates are completely broken. Single m3u8 may contain entries in kbps and bps # at the same time without actual units specified. This lead to wrong sorting. - self._sort_formats(formats, field_preference=('preference', 'height', 'width', 'fps', 'format_id')) + self._sort_formats(formats, field_preference=('preference', 'height', 'width', 'fps', 'tbr', 'format_id')) def _parse_config(self, config, video_id): + video_data = config['video'] # Extract title - video_title = config['video']['title'] + video_title = video_data['title'] # Extract uploader, uploader_url and uploader_id - video_uploader = config['video'].get('owner', {}).get('name') - video_uploader_url = config['video'].get('owner', {}).get('url') + video_uploader = video_data.get('owner', {}).get('name') + video_uploader_url = video_data.get('owner', {}).get('url') video_uploader_id = video_uploader_url.split('/')[-1] if video_uploader_url else None # Extract video thumbnail - video_thumbnail = config['video'].get('thumbnail') + video_thumbnail = video_data.get('thumbnail') if video_thumbnail is None: - video_thumbs = config['video'].get('thumbs') + video_thumbs = video_data.get('thumbs') if video_thumbs and isinstance(video_thumbs, dict): _, video_thumbnail = sorted((int(width if width.isdigit() else 0), t_url) for (width, t_url) in video_thumbs.items())[-1] # Extract video duration - video_duration = int_or_none(config['video'].get('duration')) + video_duration = int_or_none(video_data.get('duration')) formats = [] - config_files = config['video'].get('files') or config['request'].get('files', {}) + config_files = video_data.get('files') or config['request'].get('files', {}) for f in config_files.get('progressive', []): video_url = f.get('url') if not video_url: @@ -127,10 +128,33 @@ class VimeoBaseInfoExtractor(InfoExtractor): 'fps': int_or_none(f.get('fps')), 'tbr': int_or_none(f.get('bitrate')), }) - m3u8_url = config_files.get('hls', {}).get('url') - if m3u8_url: - formats.extend(self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) + + for files_type in ('hls', 'dash'): + for cdn_name, cdn_data in config_files.get(files_type, {}).get('cdns', {}).items(): + manifest_url = cdn_data.get('url') + if not manifest_url: + continue + format_id = '%s-%s' % (files_type, cdn_name) + if files_type == 'hls': + formats.extend(self._extract_m3u8_formats( + manifest_url, video_id, 'mp4', + 'm3u8_native', m3u8_id=format_id, + note='Downloading %s m3u8 information' % cdn_name, + fatal=False)) + elif files_type == 'dash': + mpd_pattern = r'/%s/(?:sep/)?video/' % video_id + mpd_manifest_urls = [] + if re.search(mpd_pattern, manifest_url): + for suffix, repl in (('', 'video'), ('_sep', 'sep/video')): + mpd_manifest_urls.append((format_id + suffix, re.sub( + mpd_pattern, '/%s/%s/' % (video_id, repl), manifest_url))) + else: + mpd_manifest_urls = [(format_id, manifest_url)] + for f_id, m_url in mpd_manifest_urls: + formats.extend(self._extract_mpd_formats( + m_url.replace('/master.json', '/master.mpd'), video_id, f_id, + 'Downloading %s MPD information' % cdn_name, + fatal=False)) subtitles = {} text_tracks = config['request'].get('text_tracks') @@ -189,11 +213,13 @@ class VimeoIE(VimeoBaseInfoExtractor): 'ext': 'mp4', 'title': "youtube-dl test video - \u2605 \" ' \u5e78 / \\ \u00e4 \u21ad \U0001d550", 'description': 'md5:2d3305bad981a06ff79f027f19865021', + 'timestamp': 1355990239, 'upload_date': '20121220', - 'uploader_url': 're:https?://(?:www\.)?vimeo\.com/user7108434', + 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/user7108434', 'uploader_id': 'user7108434', 'uploader': 'Filippo Valsorda', 'duration': 10, + 'license': 'by-sa', }, }, { @@ -203,7 +229,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'info_dict': { 'id': '68093876', 'ext': 'mp4', - 'uploader_url': 're:https?://(?:www\.)?vimeo\.com/openstreetmapus', + 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/openstreetmapus', 'uploader_id': 'openstreetmapus', 'uploader': 'OpenStreetMap US', 'title': 'Andy Allan - Putting the Carto into OpenStreetMap Cartography', @@ -220,7 +246,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'ext': 'mp4', 'title': 'Kathy Sierra: Building the minimum Badass User, Business of Software 2012', 'uploader': 'The BLN & Business of Software', - 'uploader_url': 're:https?://(?:www\.)?vimeo\.com/theblnbusinessofsoftware', + 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/theblnbusinessofsoftware', 'uploader_id': 'theblnbusinessofsoftware', 'duration': 3610, 'description': None, @@ -234,12 +260,13 @@ class VimeoIE(VimeoBaseInfoExtractor): 'id': '68375962', 'ext': 'mp4', 'title': 'youtube-dl password protected test video', + 'timestamp': 1371200155, 'upload_date': '20130614', - 'uploader_url': 're:https?://(?:www\.)?vimeo\.com/user18948128', + 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/user18948128', 'uploader_id': 'user18948128', 'uploader': 'Jaime Marquínez Ferrándiz', 'duration': 10, - 'description': 'This is "youtube-dl password protected test video" by on Vimeo, the home for high quality videos and the people who love them.', + 'description': 'md5:dca3ea23adb29ee387127bc4ddfce63f', }, 'params': { 'videopassword': 'youtube-dl', @@ -253,10 +280,11 @@ class VimeoIE(VimeoBaseInfoExtractor): 'ext': 'mp4', 'title': 'Key & Peele: Terrorist Interrogation', 'description': 'md5:8678b246399b070816b12313e8b4eb5c', - 'uploader_url': 're:https?://(?:www\.)?vimeo\.com/atencio', + 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/atencio', 'uploader_id': 'atencio', 'uploader': 'Peter Atencio', - 'upload_date': '20130927', + 'timestamp': 1380339469, + 'upload_date': '20130928', 'duration': 187, }, }, @@ -268,8 +296,9 @@ class VimeoIE(VimeoBaseInfoExtractor): 'ext': 'mp4', 'title': 'The New Vimeo Player (You Know, For Videos)', 'description': 'md5:2ec900bf97c3f389378a96aee11260ea', + 'timestamp': 1381846109, 'upload_date': '20131015', - 'uploader_url': 're:https?://(?:www\.)?vimeo\.com/staff', + 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/staff', 'uploader_id': 'staff', 'uploader': 'Vimeo Staff', 'duration': 62, @@ -284,21 +313,22 @@ class VimeoIE(VimeoBaseInfoExtractor): 'ext': 'mp4', 'title': 'Pier Solar OUYA Official Trailer', 'uploader': 'Tulio Gonçalves', - 'uploader_url': 're:https?://(?:www\.)?vimeo\.com/user28849593', + 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/user28849593', 'uploader_id': 'user28849593', }, }, { # contains original format 'url': 'https://vimeo.com/33951933', - 'md5': '2d9f5475e0537f013d0073e812ab89e6', + 'md5': '53c688fa95a55bf4b7293d37a89c5c53', 'info_dict': { 'id': '33951933', 'ext': 'mp4', 'title': 'FOX CLASSICS - Forever Classic ID - A Full Minute', 'uploader': 'The DMCI', - 'uploader_url': 're:https?://(?:www\.)?vimeo\.com/dmci', + 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/dmci', 'uploader_id': 'dmci', + 'timestamp': 1324343742, 'upload_date': '20111220', 'description': 'md5:ae23671e82d05415868f7ad1aec21147', }, @@ -309,11 +339,12 @@ class VimeoIE(VimeoBaseInfoExtractor): 'url': 'https://vimeo.com/channels/tributes/6213729', 'info_dict': { 'id': '6213729', - 'ext': 'mp4', + 'ext': 'mov', 'title': 'Vimeo Tribute: The Shining', 'uploader': 'Casey Donahue', - 'uploader_url': 're:https?://(?:www\.)?vimeo\.com/caseydonahue', + 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/caseydonahue', 'uploader_id': 'caseydonahue', + 'timestamp': 1250886430, 'upload_date': '20090821', 'description': 'md5:bdbf314014e58713e6e5b66eb252f4a6', }, @@ -322,6 +353,22 @@ class VimeoIE(VimeoBaseInfoExtractor): }, 'expected_warnings': ['Unable to download JSON metadata'], }, + { + # redirects to ondemand extractor and should be passed through it + # for successful extraction + 'url': 'https://vimeo.com/73445910', + 'info_dict': { + 'id': '73445910', + 'ext': 'mp4', + 'title': 'The Reluctant Revolutionary', + 'uploader': '10Ft Films', + 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/tenfootfilms', + 'uploader_id': 'tenfootfilms', + }, + 'params': { + 'skip_download': True, + }, + }, { 'url': 'http://vimeo.com/moogaloop.swf?clip_id=2539741', 'only_matching': True, @@ -351,24 +398,32 @@ class VimeoIE(VimeoBaseInfoExtractor): ] @staticmethod - def _extract_vimeo_url(url, webpage): + def _smuggle_referrer(url, referrer_url): + return smuggle_url(url, {'http_headers': {'Referer': referrer_url}}) + + @staticmethod + def _extract_urls(url, webpage): + urls = [] # Look for embedded (iframe) Vimeo player - mobj = re.search( - r']+?src=(["\'])(?P(?:https?:)?//player\.vimeo\.com/video/.+?)\1', webpage) - if mobj: - player_url = unescapeHTML(mobj.group('url')) - surl = smuggle_url(player_url, {'http_headers': {'Referer': url}}) - return surl - # Look for embedded (swf embed) Vimeo player - mobj = re.search( - r']+?src="((?:https?:)?//(?:www\.)?vimeo\.com/moogaloop\.swf.+?)"', webpage) - if mobj: - return mobj.group(1) - # Look more for non-standard embedded Vimeo player - mobj = re.search( - r']+src=(?P[\'"])(?P(?:https?:)?//(?:www\.)?vimeo\.com/[0-9]+)(?P=q1)', webpage) - if mobj: - return mobj.group('url') + for mobj in re.finditer( + r']+?src=(["\'])(?P(?:https?:)?//player\.vimeo\.com/video/.+?)\1', + webpage): + urls.append(VimeoIE._smuggle_referrer(unescapeHTML(mobj.group('url')), url)) + PLAIN_EMBED_RE = ( + # Look for embedded (swf embed) Vimeo player + r']+?src=(["\'])(?P(?:https?:)?//(?:www\.)?vimeo\.com/moogaloop\.swf.+?)\1', + # Look more for non-standard embedded Vimeo player + r']+src=(["\'])(?P(?:https?:)?//(?:www\.)?vimeo\.com/[0-9]+)\1', + ) + for embed_re in PLAIN_EMBED_RE: + for mobj in re.finditer(embed_re, webpage): + urls.append(mobj.group('url')) + return urls + + @staticmethod + def _extract_url(url, webpage): + urls = VimeoIE._extract_urls(url, webpage) + return urls[0] if urls else None def _verify_player_video_password(self, url, video_id): password = self._downloader.params.get('videopassword') @@ -406,7 +461,12 @@ class VimeoIE(VimeoBaseInfoExtractor): # Retrieve video webpage to extract further information request = sanitized_Request(url, headers=headers) try: - webpage = self._download_webpage(request, video_id) + webpage, urlh = self._download_webpage_handle(request, video_id) + # Some URLs redirect to ondemand can't be extracted with + # this extractor right away thus should be passed through + # ondemand extractor (e.g. https://vimeo.com/73445910) + if VimeoOndemandIE.suitable(urlh.geturl()): + return self.url_result(urlh.geturl(), VimeoOndemandIE.ie_key()) except ExtractorError as ee: if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403: errmsg = ee.cause.read() @@ -433,6 +493,9 @@ class VimeoIE(VimeoBaseInfoExtractor): '%s said: %s' % (self.IE_NAME, seed_status['title']), expected=True) + cc_license = None + timestamp = None + # Extract the config JSON try: try: @@ -446,14 +509,18 @@ class VimeoIE(VimeoBaseInfoExtractor): vimeo_clip_page_config = self._search_regex( r'vimeo\.clip_page_config\s*=\s*({.+?});', webpage, 'vimeo clip page config') - config_url = self._parse_json( - vimeo_clip_page_config, video_id)['player']['config_url'] + page_config = self._parse_json(vimeo_clip_page_config, video_id) + config_url = page_config['player']['config_url'] + cc_license = page_config.get('cc_license') + timestamp = try_get( + page_config, lambda x: x['clip']['uploaded_on'], + compat_str) config_json = self._download_webpage(config_url, video_id) config = json.loads(config_json) except RegexNotFoundError: # For pro videos or player.vimeo.com urls # We try to find out to which variable is assigned the config dic - m_variable_name = re.search('(\w)\.video\.id', webpage) + m_variable_name = re.search(r'(\w)\.video\.id', webpage) if m_variable_name is not None: config_re = r'%s=({[^}].+?});' % re.escape(m_variable_name.group(1)) else: @@ -516,10 +583,10 @@ class VimeoIE(VimeoBaseInfoExtractor): self._downloader.report_warning('Cannot find video description') # Extract upload date - video_upload_date = None - mobj = re.search(r']+datetime="([^"]+)"', webpage) - if mobj is not None: - video_upload_date = unified_strdate(mobj.group(1)) + if not timestamp: + timestamp = self._search_regex( + r']+datetime="([^"]+)"', webpage, + 'timestamp', default=None) try: view_count = int(self._search_regex(r'UserPlays:(\d+)', webpage, 'view count')) @@ -556,15 +623,22 @@ class VimeoIE(VimeoBaseInfoExtractor): info_dict = self._parse_config(config, video_id) formats.extend(info_dict['formats']) self._vimeo_sort_formats(formats) + + if not cc_license: + cc_license = self._search_regex( + r']+rel=["\']license["\'][^>]+href=(["\'])(?P(?:(?!\1).)+)\1', + webpage, 'license', default=None, group='license') + info_dict.update({ 'id': video_id, 'formats': formats, - 'upload_date': video_upload_date, + 'timestamp': unified_timestamp(timestamp), 'description': video_description, 'webpage_url': url, 'view_count': view_count, 'like_count': like_count, 'comment_count': comment_count, + 'license': cc_license, }) return info_dict @@ -582,9 +656,26 @@ class VimeoOndemandIE(VimeoBaseInfoExtractor): 'ext': 'mp4', 'title': 'המעבדה - במאי יותם פלדמן', 'uploader': 'גם סרטים', - 'uploader_url': 're:https?://(?:www\.)?vimeo\.com/gumfilms', + 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/gumfilms', 'uploader_id': 'gumfilms', }, + 'params': { + 'format': 'best[protocol=https]', + }, + }, { + # requires Referer to be passed along with og:video:url + 'url': 'https://vimeo.com/ondemand/36938/126682985', + 'info_dict': { + 'id': '126682985', + 'ext': 'mp4', + 'title': 'Rävlock, rätt läte på rätt plats', + 'uploader': 'Lindroth & Norin', + 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/user14430847', + 'uploader_id': 'user14430847', + }, + 'params': { + 'skip_download': True, + }, }, { 'url': 'https://vimeo.com/ondemand/nazmaalik', 'only_matching': True, @@ -599,7 +690,12 @@ class VimeoOndemandIE(VimeoBaseInfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - return self.url_result(self._og_search_video_url(webpage), VimeoIE.ie_key()) + return self.url_result( + # Some videos require Referer to be passed along with og:video:url + # similarly to generic vimeo embeds (e.g. + # https://vimeo.com/ondemand/36938/126682985). + VimeoIE._smuggle_referrer(self._og_search_video_url(webpage), url), + VimeoIE.ie_key()) class VimeoChannelIE(VimeoBaseInfoExtractor): @@ -664,12 +760,12 @@ class VimeoChannelIE(VimeoBaseInfoExtractor): # Try extracting href first since not all videos are available via # short https://vimeo.com/id URL (e.g. https://vimeo.com/channels/tributes/6213729) clips = re.findall( - r'id="clip_(\d+)"[^>]*>\s*]+href="(/(?:[^/]+/)*\1)', webpage) + r'id="clip_(\d+)"[^>]*>\s*]+href="(/(?:[^/]+/)*\1)(?:[^>]+\btitle="([^"]+)")?', webpage) if clips: - for video_id, video_url in clips: + for video_id, video_url, video_title in clips: yield self.url_result( compat_urlparse.urljoin(base_url, video_url), - VimeoIE.ie_key(), video_id=video_id) + VimeoIE.ie_key(), video_id=video_id, video_title=video_title) # More relaxed fallback else: for video_id in re.findall(r'id=["\']clip_(\d+)', webpage): @@ -794,7 +890,7 @@ class VimeoReviewIE(VimeoBaseInfoExtractor): 'title': 're:(?i)^Death by dogma versus assembling agile . Sander Hoogendoorn', 'uploader': 'DevWeek Events', 'duration': 2773, - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'uploader_id': 'user22258446', } }, { @@ -810,6 +906,7 @@ class VimeoReviewIE(VimeoBaseInfoExtractor): 'params': { 'videopassword': 'holygrail', }, + 'skip': 'video gone', }] def _real_initialize(self): @@ -818,8 +915,13 @@ class VimeoReviewIE(VimeoBaseInfoExtractor): def _get_config_url(self, webpage_url, video_id, video_password_verified=False): webpage = self._download_webpage(webpage_url, video_id) config_url = self._html_search_regex( - r'data-config-url="([^"]+)"', webpage, 'config URL', - default=NO_DEFAULT if video_password_verified else None) + r'data-config-url=(["\'])(?P(?:(?!\1).)+)\1', webpage, + 'config URL', default=None, group='url') + if not config_url: + data = self._parse_json(self._search_regex( + r'window\s*=\s*_extend\(window,\s*({.+?})\);', webpage, 'data', + default=NO_DEFAULT if video_password_verified else '{}'), video_id) + config_url = data.get('vimeo_esi', {}).get('config', {}).get('configUrl') if config_url is None: self._verify_video_password(webpage_url, video_id, webpage) config_url = self._get_config_url( diff --git a/youtube_dl/extractor/vimple.py b/youtube_dl/extractor/vimple.py index 92321d66e..c74b43766 100644 --- a/youtube_dl/extractor/vimple.py +++ b/youtube_dl/extractor/vimple.py @@ -28,23 +28,24 @@ class SprutoBaseIE(InfoExtractor): class VimpleIE(SprutoBaseIE): IE_DESC = 'Vimple - one-click video hosting' - _VALID_URL = r'https?://(?:player\.vimple\.ru/iframe|vimple\.ru)/(?P[\da-f-]{32,36})' - _TESTS = [ - { - 'url': 'http://vimple.ru/c0f6b1687dcd4000a97ebe70068039cf', - 'md5': '2e750a330ed211d3fd41821c6ad9a279', - 'info_dict': { - 'id': 'c0f6b168-7dcd-4000-a97e-be70068039cf', - 'ext': 'mp4', - 'title': 'Sunset', - 'duration': 20, - 'thumbnail': 're:https?://.*?\.jpg', - }, - }, { - 'url': 'http://player.vimple.ru/iframe/52e1beec-1314-4a83-aeac-c61562eadbf9', - 'only_matching': True, - } - ] + _VALID_URL = r'https?://(?:player\.vimple\.(?:ru|co)/iframe|vimple\.(?:ru|co))/(?P[\da-f-]{32,36})' + _TESTS = [{ + 'url': 'http://vimple.ru/c0f6b1687dcd4000a97ebe70068039cf', + 'md5': '2e750a330ed211d3fd41821c6ad9a279', + 'info_dict': { + 'id': 'c0f6b168-7dcd-4000-a97e-be70068039cf', + 'ext': 'mp4', + 'title': 'Sunset', + 'duration': 20, + 'thumbnail': r're:https?://.*?\.jpg', + }, + }, { + 'url': 'http://player.vimple.ru/iframe/52e1beec-1314-4a83-aeac-c61562eadbf9', + 'only_matching': True, + }, { + 'url': 'http://vimple.co/04506a053f124483b8fb05ed73899f19', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) diff --git a/youtube_dl/extractor/vine.py b/youtube_dl/extractor/vine.py index 0183f052a..4957a07f7 100644 --- a/youtube_dl/extractor/vine.py +++ b/youtube_dl/extractor/vine.py @@ -6,8 +6,9 @@ import itertools from .common import InfoExtractor from ..utils import ( + determine_ext, int_or_none, - unified_strdate, + unified_timestamp, ) @@ -20,50 +21,16 @@ class VineIE(InfoExtractor): 'id': 'b9KOOWX7HUx', 'ext': 'mp4', 'title': 'Chicken.', - 'alt_title': 'Vine by Jack Dorsey', + 'alt_title': 'Vine by Jack', + 'timestamp': 1368997951, 'upload_date': '20130519', - 'uploader': 'Jack Dorsey', + 'uploader': 'Jack', 'uploader_id': '76', 'view_count': int, 'like_count': int, 'comment_count': int, 'repost_count': int, }, - }, { - 'url': 'https://vine.co/v/MYxVapFvz2z', - 'md5': '7b9a7cbc76734424ff942eb52c8f1065', - 'info_dict': { - 'id': 'MYxVapFvz2z', - 'ext': 'mp4', - 'title': 'Fuck Da Police #Mikebrown #justice #ferguson #prayforferguson #protesting #NMOS14', - 'alt_title': 'Vine by Mars Ruiz', - 'upload_date': '20140815', - 'uploader': 'Mars Ruiz', - 'uploader_id': '1102363502380728320', - 'view_count': int, - 'like_count': int, - 'comment_count': int, - 'repost_count': int, - }, - }, { - 'url': 'https://vine.co/v/bxVjBbZlPUH', - 'md5': 'ea27decea3fa670625aac92771a96b73', - 'info_dict': { - 'id': 'bxVjBbZlPUH', - 'ext': 'mp4', - 'title': '#mw3 #ac130 #killcam #angelofdeath', - 'alt_title': 'Vine by Z3k3', - 'upload_date': '20130430', - 'uploader': 'Z3k3', - 'uploader_id': '936470460173008896', - 'view_count': int, - 'like_count': int, - 'comment_count': int, - 'repost_count': int, - }, - }, { - 'url': 'https://vine.co/oembed/MYxVapFvz2z.json', - 'only_matching': True, }, { 'url': 'https://vine.co/v/e192BnZnZ9V', 'info_dict': { @@ -71,6 +38,7 @@ class VineIE(InfoExtractor): 'ext': 'mp4', 'title': 'ยิ้ม~ เขิน~ อาย~ น่าร้ากอ้ะ >//< @n_whitewo @orlameena #lovesicktheseries #lovesickseason2', 'alt_title': 'Vine by Pimry_zaa', + 'timestamp': 1436057405, 'upload_date': '20150705', 'uploader': 'Pimry_zaa', 'uploader_id': '1135760698325307392', @@ -82,43 +50,60 @@ class VineIE(InfoExtractor): 'params': { 'skip_download': True, }, + }, { + 'url': 'https://vine.co/v/MYxVapFvz2z', + 'only_matching': True, + }, { + 'url': 'https://vine.co/v/bxVjBbZlPUH', + 'only_matching': True, + }, { + 'url': 'https://vine.co/oembed/MYxVapFvz2z.json', + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage('https://vine.co/v/' + video_id, video_id) - data = self._parse_json( - self._search_regex( - r'window\.POST_DATA\s*=\s*({.+?});\s*', - webpage, 'vine data'), - video_id) + data = self._download_json( + 'https://archive.vine.co/posts/%s.json' % video_id, video_id) - data = data[list(data.keys())[0]] - - formats = [{ - 'format_id': '%(format)s-%(rate)s' % f, - 'vcodec': f.get('format'), - 'quality': f.get('rate'), - 'url': f['videoUrl'], - } for f in data['videoUrls'] if f.get('videoUrl')] + def video_url(kind): + for url_suffix in ('Url', 'URL'): + format_url = data.get('video%s%s' % (kind, url_suffix)) + if format_url: + return format_url + formats = [] + for quality, format_id in enumerate(('low', '', 'dash')): + format_url = video_url(format_id.capitalize()) + if not format_url: + continue + # DASH link returns plain mp4 + if format_id == 'dash' and determine_ext(format_url) == 'mpd': + formats.extend(self._extract_mpd_formats( + format_url, video_id, mpd_id='dash', fatal=False)) + else: + formats.append({ + 'url': format_url, + 'format_id': format_id or 'standard', + 'quality': quality, + }) self._sort_formats(formats) username = data.get('username') return { 'id': video_id, - 'title': data.get('description') or self._og_search_title(webpage), - 'alt_title': 'Vine by %s' % username if username else self._og_search_description(webpage, default=None), + 'title': data.get('description'), + 'alt_title': 'Vine by %s' % username if username else None, 'thumbnail': data.get('thumbnailUrl'), - 'upload_date': unified_strdate(data.get('created')), + 'timestamp': unified_timestamp(data.get('created')), 'uploader': username, 'uploader_id': data.get('userIdStr'), - 'view_count': int_or_none(data.get('loops', {}).get('count')), - 'like_count': int_or_none(data.get('likes', {}).get('count')), - 'comment_count': int_or_none(data.get('comments', {}).get('count')), - 'repost_count': int_or_none(data.get('reposts', {}).get('count')), + 'view_count': int_or_none(data.get('loops')), + 'like_count': int_or_none(data.get('likes')), + 'comment_count': int_or_none(data.get('comments')), + 'repost_count': int_or_none(data.get('reposts')), 'formats': formats, } diff --git a/youtube_dl/extractor/viu.py b/youtube_dl/extractor/viu.py new file mode 100644 index 000000000..3fd889c8e --- /dev/null +++ b/youtube_dl/extractor/viu.py @@ -0,0 +1,249 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + ExtractorError, + int_or_none, +) + + +class ViuBaseIE(InfoExtractor): + def _real_initialize(self): + viu_auth_res = self._request_webpage( + 'https://www.viu.com/api/apps/v2/authenticate', None, + 'Requesting Viu auth', query={ + 'acct': 'test', + 'appid': 'viu_desktop', + 'fmt': 'json', + 'iid': 'guest', + 'languageid': 'default', + 'platform': 'desktop', + 'userid': 'guest', + 'useridtype': 'guest', + 'ver': '1.0' + }, headers=self.geo_verification_headers()) + self._auth_token = viu_auth_res.info()['X-VIU-AUTH'] + + def _call_api(self, path, *args, **kwargs): + headers = self.geo_verification_headers() + headers.update({ + 'X-VIU-AUTH': self._auth_token + }) + headers.update(kwargs.get('headers', {})) + kwargs['headers'] = headers + response = self._download_json( + 'https://www.viu.com/api/' + path, *args, **kwargs)['response'] + if response.get('status') != 'success': + raise ExtractorError('%s said: %s' % ( + self.IE_NAME, response['message']), expected=True) + return response + + +class ViuIE(ViuBaseIE): + _VALID_URL = r'(?:viu:|https?://www\.viu\.com/[a-z]{2}/media/)(?P\d+)' + _TESTS = [{ + 'url': 'https://www.viu.com/en/media/1116705532?containerId=playlist-22168059', + 'info_dict': { + 'id': '1116705532', + 'ext': 'mp4', + 'title': 'Citizen Khan - Ep 1', + 'description': 'md5:d7ea1604f49e5ba79c212c551ce2110e', + }, + 'params': { + 'skip_download': 'm3u8 download', + }, + 'skip': 'Geo-restricted to India', + }, { + 'url': 'https://www.viu.com/en/media/1130599965', + 'info_dict': { + 'id': '1130599965', + 'ext': 'mp4', + 'title': 'Jealousy Incarnate - Episode 1', + 'description': 'md5:d3d82375cab969415d2720b6894361e9', + }, + 'params': { + 'skip_download': 'm3u8 download', + }, + 'skip': 'Geo-restricted to Indonesia', + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + video_data = self._call_api( + 'clip/load', video_id, 'Downloading video data', query={ + 'appid': 'viu_desktop', + 'fmt': 'json', + 'id': video_id + })['item'][0] + + title = video_data['title'] + + m3u8_url = None + url_path = video_data.get('urlpathd') or video_data.get('urlpath') + tdirforwhole = video_data.get('tdirforwhole') + # #EXT-X-BYTERANGE is not supported by native hls downloader + # and ffmpeg (#10955) + # hls_file = video_data.get('hlsfile') + hls_file = video_data.get('jwhlsfile') + if url_path and tdirforwhole and hls_file: + m3u8_url = '%s/%s/%s' % (url_path, tdirforwhole, hls_file) + else: + # m3u8_url = re.sub( + # r'(/hlsc_)[a-z]+(\d+\.m3u8)', + # r'\1whe\2', video_data['href']) + m3u8_url = video_data['href'] + formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4') + self._sort_formats(formats) + + subtitles = {} + for key, value in video_data.items(): + mobj = re.match(r'^subtitle_(?P[^_]+)_(?P(vtt|srt))', key) + if not mobj: + continue + subtitles.setdefault(mobj.group('lang'), []).append({ + 'url': value, + 'ext': mobj.group('ext') + }) + + return { + 'id': video_id, + 'title': title, + 'description': video_data.get('description'), + 'series': video_data.get('moviealbumshowname'), + 'episode': title, + 'episode_number': int_or_none(video_data.get('episodeno')), + 'duration': int_or_none(video_data.get('duration')), + 'formats': formats, + 'subtitles': subtitles, + } + + +class ViuPlaylistIE(ViuBaseIE): + IE_NAME = 'viu:playlist' + _VALID_URL = r'https?://www\.viu\.com/[^/]+/listing/playlist-(?P\d+)' + _TEST = { + 'url': 'https://www.viu.com/en/listing/playlist-22461380', + 'info_dict': { + 'id': '22461380', + 'title': 'The Good Wife', + }, + 'playlist_count': 16, + 'skip': 'Geo-restricted to Indonesia', + } + + def _real_extract(self, url): + playlist_id = self._match_id(url) + playlist_data = self._call_api( + 'container/load', playlist_id, + 'Downloading playlist info', query={ + 'appid': 'viu_desktop', + 'fmt': 'json', + 'id': 'playlist-' + playlist_id + })['container'] + + entries = [] + for item in playlist_data.get('item', []): + item_id = item.get('id') + if not item_id: + continue + item_id = compat_str(item_id) + entries.append(self.url_result( + 'viu:' + item_id, 'Viu', item_id)) + + return self.playlist_result( + entries, playlist_id, playlist_data.get('title')) + + +class ViuOTTIE(InfoExtractor): + IE_NAME = 'viu:ott' + _VALID_URL = r'https?://(?:www\.)?viu\.com/ott/(?P[a-z]{2})/[a-z]{2}-[a-z]{2}/vod/(?P\d+)' + _TESTS = [{ + 'url': 'http://www.viu.com/ott/sg/en-us/vod/3421/The%20Prime%20Minister%20and%20I', + 'info_dict': { + 'id': '3421', + 'ext': 'mp4', + 'title': 'A New Beginning', + 'description': 'md5:1e7486a619b6399b25ba6a41c0fe5b2c', + }, + 'params': { + 'skip_download': 'm3u8 download', + }, + 'skip': 'Geo-restricted to Singapore', + }, { + 'url': 'http://www.viu.com/ott/hk/zh-hk/vod/7123/%E5%A4%A7%E4%BA%BA%E5%A5%B3%E5%AD%90', + 'info_dict': { + 'id': '7123', + 'ext': 'mp4', + 'title': '這就是我的生活之道', + 'description': 'md5:4eb0d8b08cf04fcdc6bbbeb16043434f', + }, + 'params': { + 'skip_download': 'm3u8 download', + }, + 'skip': 'Geo-restricted to Hong Kong', + }] + + def _real_extract(self, url): + country_code, video_id = re.match(self._VALID_URL, url).groups() + + product_data = self._download_json( + 'http://www.viu.com/ott/%s/index.php' % country_code, video_id, + 'Downloading video info', query={ + 'r': 'vod/ajax-detail', + 'platform_flag_label': 'web', + 'product_id': video_id, + })['data'] + + video_data = product_data.get('current_product') + if not video_data: + raise ExtractorError('This video is not available in your region.', expected=True) + + stream_data = self._download_json( + 'https://d1k2us671qcoau.cloudfront.net/distribute_web_%s.php' % country_code, + video_id, 'Downloading stream info', query={ + 'ccs_product_id': video_data['ccs_product_id'], + })['data']['stream'] + + stream_sizes = stream_data.get('size', {}) + formats = [] + for vid_format, stream_url in stream_data.get('url', {}).items(): + height = int_or_none(self._search_regex( + r's(\d+)p', vid_format, 'height', default=None)) + formats.append({ + 'format_id': vid_format, + 'url': stream_url, + 'height': height, + 'ext': 'mp4', + 'filesize': int_or_none(stream_sizes.get(vid_format)) + }) + self._sort_formats(formats) + + subtitles = {} + for sub in video_data.get('subtitle', []): + sub_url = sub.get('url') + if not sub_url: + continue + subtitles.setdefault(sub.get('name'), []).append({ + 'url': sub_url, + 'ext': 'srt', + }) + + title = video_data['synopsis'].strip() + + return { + 'id': video_id, + 'title': title, + 'description': video_data.get('description'), + 'series': product_data.get('series', {}).get('name'), + 'episode': title, + 'episode_number': int_or_none(video_data.get('number')), + 'duration': int_or_none(stream_data.get('duration')), + 'thumbnail': video_data.get('cover_image_url'), + 'formats': formats, + 'subtitles': subtitles, + } diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index 3ee66e23e..6e6c3a0e1 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -1,8 +1,8 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals +import collections import re -import json import sys from .common import InfoExtractor @@ -16,15 +16,15 @@ from ..utils import ( get_element_by_class, int_or_none, orderedSet, - parse_duration, remove_start, str_to_int, unescapeHTML, - unified_strdate, + unified_timestamp, urlencode_postdata, ) -from .vimeo import VimeoIE +from .dailymotion import DailymotionIE from .pladform import PladformIE +from .vimeo import VimeoIE class VKBaseIE(InfoExtractor): @@ -52,8 +52,9 @@ class VKBaseIE(InfoExtractor): # what actually happens. # We will workaround this VK issue by resetting the remixlhk cookie to # the first one manually. - cookies = url_handle.headers.get('Set-Cookie') - if cookies: + for header, cookies in url_handle.headers.items(): + if header.lower() != 'set-cookie': + continue if sys.version_info[0] >= 3: cookies = cookies.encode('iso-8859-1') cookies = cookies.decode('utf-8') @@ -61,6 +62,7 @@ class VKBaseIE(InfoExtractor): if remixlhk: value, domain = remixlhk.groups() self._set_cookie(domain, 'remixlhk', value) + break login_page = self._download_webpage( 'https://login.vk.com/?act=login', None, @@ -103,6 +105,7 @@ class VKIE(VKBaseIE): 'title': 'ProtivoGunz - Хуёвая песня', 'uploader': 're:(?:Noize MC|Alexander Ilyashenko).*', 'duration': 195, + 'timestamp': 1329060660, 'upload_date': '20120212', 'view_count': int, }, @@ -116,6 +119,7 @@ class VKIE(VKBaseIE): 'uploader': 'Tom Cruise', 'title': 'No name', 'duration': 9, + 'timestamp': 1374374880, 'upload_date': '20130721', 'view_count': int, } @@ -192,6 +196,7 @@ class VKIE(VKBaseIE): 'upload_date': '20150709', 'view_count': int, }, + 'skip': 'Removed', }, { # youtube embed @@ -208,6 +213,23 @@ class VKIE(VKBaseIE): 'view_count': int, }, }, + { + # dailymotion embed + 'url': 'https://vk.com/video-37468416_456239855', + 'info_dict': { + 'id': 'k3lz2cmXyRuJQSjGHUv', + 'ext': 'mp4', + 'title': 'md5:d52606645c20b0ddbb21655adaa4f56f', + 'description': 'md5:c651358f03c56f1150b555c26d90a0fd', + 'uploader': 'AniLibria.Tv', + 'upload_date': '20160914', + 'uploader_id': 'x1p5vl5', + 'timestamp': 1473877246, + }, + 'params': { + 'skip_download': True, + }, + }, { # video key is extra_data not url\d+ 'url': 'http://vk.com/video-110305615_171782105', @@ -217,10 +239,30 @@ class VKIE(VKBaseIE): 'ext': 'mp4', 'title': 'S-Dance, репетиции к The way show', 'uploader': 'THE WAY SHOW | 17 апреля', + 'timestamp': 1454870100, 'upload_date': '20160207', 'view_count': int, }, }, + { + # finished live stream, postlive_mp4 + 'url': 'https://vk.com/videos-387766?z=video-387766_456242764%2Fpl_-387766_-2', + 'md5': '90d22d051fccbbe9becfccc615be6791', + 'info_dict': { + 'id': '456242764', + 'ext': 'mp4', + 'title': 'ИгроМир 2016 — день 1', + 'uploader': 'Игромания', + 'duration': 5239, + 'view_count': int, + }, + }, + { + # live stream, hls and rtmp links, most likely already finished live + # stream by the time you are reading this comment + 'url': 'https://vk.com/video-140332_456239111', + 'only_matching': True, + }, { # removed video, just testing that we match the pattern 'url': 'http://vk.com/feed?z=video-43215063_166094326%2Fbb50cacd3177146d7a', @@ -298,7 +340,7 @@ class VKIE(VKBaseIE): if youtube_url: return self.url_result(youtube_url, 'Youtube') - vimeo_url = VimeoIE._extract_vimeo_url(url, info_page) + vimeo_url = VimeoIE._extract_url(url, info_page) if vimeo_url is not None: return self.url_result(vimeo_url) @@ -313,6 +355,10 @@ class VKIE(VKBaseIE): m_rutube.group(1).replace('\\', '')) return self.url_result(rutube_url) + dailymotion_urls = DailymotionIE._extract_urls(info_page) + if dailymotion_urls: + return self.url_result(dailymotion_urls[0], DailymotionIE.ie_key()) + m_opts = re.search(r'(?s)var\s+opts\s*=\s*({.+?});', info_page) if m_opts: m_opts_url = re.search(r"url\s*:\s*'((?!/\b)[^']+)", m_opts.group(1)) @@ -322,46 +368,80 @@ class VKIE(VKBaseIE): opts_url = 'http:' + opts_url return self.url_result(opts_url) - data_json = self._search_regex(r'var\s+vars\s*=\s*({.+?});', info_page, 'vars') - data = json.loads(data_json) + # vars does not look to be served anymore since 24.10.2016 + data = self._parse_json( + self._search_regex( + r'var\s+vars\s*=\s*({.+?});', info_page, 'vars', default='{}'), + video_id, fatal=False) - # Extract upload date - upload_date = None - mobj = re.search(r'id="mv_date(?:_views)?_wrap"[^>]*>([a-zA-Z]+ [0-9]+), ([0-9]+) at', info_page) - if mobj is not None: - mobj.group(1) + ' ' + mobj.group(2) - upload_date = unified_strdate(mobj.group(1) + ' ' + mobj.group(2)) + # is served instead + if not data: + data = self._parse_json( + self._search_regex( + r'\s*({.+?})\s*', info_page, 'json', default='{}'), + video_id) + if data: + data = data['player']['params'][0] - view_count = None - views = self._html_search_regex( - r'"mv_views_count_number"[^>]*>(.+?\bviews?)<', - info_page, 'view count', default=None) - if views: - view_count = str_to_int(self._search_regex( - r'([\d,.]+)', views, 'view count', fatal=False)) + if not data: + data = self._parse_json( + self._search_regex( + r'var\s+playerParams\s*=\s*({.+?})\s*;\s*\n', info_page, + 'player params'), + video_id)['params'][0] + + title = unescapeHTML(data['md_title']) + + # 2 = live + # 3 = post live (finished live) + is_live = data.get('live') == 2 + if is_live: + title = self._live_title(title) + + timestamp = unified_timestamp(self._html_search_regex( + r'class=["\']mv_info_date[^>]+>([^<]+)(?:<|from)', info_page, + 'upload date', fatal=False)) + + view_count = str_to_int(self._search_regex( + r'class=["\']mv_views_count[^>]+>\s*([\d,.]+)', + info_page, 'view count', fatal=False)) formats = [] - for k, v in data.items(): - if not k.startswith('url') and not k.startswith('cache') and k != 'extra_data' or not v: + for format_id, format_url in data.items(): + if not isinstance(format_url, compat_str) or not format_url.startswith(('http', '//', 'rtmp')): continue - height = int_or_none(self._search_regex( - r'^(?:url|cache)(\d+)', k, 'height', default=None)) - formats.append({ - 'format_id': k, - 'url': v, - 'height': height, - }) + if (format_id.startswith(('url', 'cache')) or + format_id in ('extra_data', 'live_mp4', 'postlive_mp4')): + height = int_or_none(self._search_regex( + r'^(?:url|cache)(\d+)', format_id, 'height', default=None)) + formats.append({ + 'format_id': format_id, + 'url': format_url, + 'height': height, + }) + elif format_id == 'hls': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', + entry_protocol='m3u8' if is_live else 'm3u8_native', + m3u8_id=format_id, fatal=False, live=is_live)) + elif format_id == 'rtmp': + formats.append({ + 'format_id': format_id, + 'url': format_url, + 'ext': 'flv', + }) self._sort_formats(formats) return { - 'id': compat_str(data['vid']), + 'id': compat_str(data.get('vid') or video_id), 'formats': formats, - 'title': unescapeHTML(data['md_title']), + 'title': title, 'thumbnail': data.get('jpg'), 'uploader': data.get('md_author'), 'duration': data.get('duration'), - 'upload_date': upload_date, + 'timestamp': timestamp, 'view_count': view_count, + 'is_live': is_live, } @@ -445,6 +525,9 @@ class VKWallPostIE(VKBaseIE): 'skip_download': True, }, }], + 'params': { + 'usenetrc': True, + }, 'skip': 'Requires vk account credentials', }, { # single YouTube embed, no leading - @@ -454,6 +537,9 @@ class VKWallPostIE(VKBaseIE): 'title': 'Sergey Gorbunov - Wall post 85155021_6319', }, 'playlist_count': 1, + 'params': { + 'usenetrc': True, + }, 'skip': 'Requires vk account credentials', }, { # wall page URL @@ -481,37 +567,41 @@ class VKWallPostIE(VKBaseIE): raise ExtractorError('VK said: %s' % error, expected=True) description = clean_html(get_element_by_class('wall_post_text', webpage)) - uploader = clean_html(get_element_by_class( - 'fw_post_author', webpage)) or self._og_search_description(webpage) + uploader = clean_html(get_element_by_class('author', webpage)) thumbnail = self._og_search_thumbnail(webpage) entries = [] - for audio in re.finditer(r'''(?sx) - ]+ - id=(?P["\'])audio_info(?P\d+_\d+).*?(?P=q1)[^>]+ - value=(?P["\'])(?Phttp.+?)(?P=q2) - .+? - ''', webpage): - audio_html = audio.group(0) - audio_id = audio.group('id') - duration = parse_duration(get_element_by_class('duration', audio_html)) - track = self._html_search_regex( - r']+id=["\']title%s[^>]*>([^<]+)' % audio_id, - audio_html, 'title', default=None) - artist = self._html_search_regex( - r'>([^<]+)
\s*&ndash', audio_html, - 'artist', default=None) - entries.append({ - 'id': audio_id, - 'url': audio.group('url'), - 'title': '%s - %s' % (artist, track) if artist and track else audio_id, - 'thumbnail': thumbnail, - 'duration': duration, - 'uploader': uploader, - 'artist': artist, - 'track': track, - }) + audio_ids = re.findall(r'data-full-id=["\'](\d+_\d+)', webpage) + if audio_ids: + al_audio = self._download_webpage( + 'https://vk.com/al_audio.php', post_id, + note='Downloading audio info', fatal=False, + data=urlencode_postdata({ + 'act': 'reload_audio', + 'al': '1', + 'ids': ','.join(audio_ids) + })) + if al_audio: + Audio = collections.namedtuple( + 'Audio', ['id', 'user_id', 'url', 'track', 'artist', 'duration']) + audios = self._parse_json( + self._search_regex( + r'(.+?)', al_audio, 'audios', default='[]'), + post_id, fatal=False, transform_source=unescapeHTML) + if isinstance(audios, list): + for audio in audios: + a = Audio._make(audio[:6]) + entries.append({ + 'id': '%s_%s' % (a.user_id, a.id), + 'url': a.url, + 'title': '%s - %s' % (a.artist, a.track) if a.artist and a.track else a.id, + 'thumbnail': thumbnail, + 'duration': a.duration, + 'uploader': uploader, + 'artist': a.artist, + 'track': a.track, + }) for video in re.finditer( r']+href=(["\'])(?P/video(?:-?[\d_]+).*?)\1', webpage): diff --git a/youtube_dl/extractor/vlive.py b/youtube_dl/extractor/vlive.py index 8d671cca7..b9718901b 100644 --- a/youtube_dl/extractor/vlive.py +++ b/youtube_dl/extractor/vlive.py @@ -2,22 +2,29 @@ from __future__ import unicode_literals import re +import time +import itertools from .common import InfoExtractor +from ..compat import ( + compat_urllib_parse_urlencode, + compat_str, +) from ..utils import ( dict_get, ExtractorError, float_or_none, int_or_none, remove_start, + try_get, + urlencode_postdata, ) -from ..compat import compat_urllib_parse_urlencode class VLiveIE(InfoExtractor): IE_NAME = 'vlive' _VALID_URL = r'https?://(?:(?:www|m)\.)?vlive\.tv/video/(?P[0-9]+)' - _TEST = { + _TESTS = [{ 'url': 'http://www.vlive.tv/video/1326', 'md5': 'cc7314812855ce56de70a06a27314983', 'info_dict': { @@ -27,7 +34,20 @@ class VLiveIE(InfoExtractor): 'creator': "Girl's Day", 'view_count': int, }, - } + }, { + 'url': 'http://www.vlive.tv/video/16937', + 'info_dict': { + 'id': '16937', + 'ext': 'mp4', + 'title': '[V LIVE] 첸백시 걍방', + 'creator': 'EXO', + 'view_count': int, + 'subtitles': 'mincount:12', + }, + 'params': { + 'skip_download': True, + }, + }] def _real_extract(self, url): video_id = self._match_id(url) @@ -35,17 +55,23 @@ class VLiveIE(InfoExtractor): webpage = self._download_webpage( 'http://www.vlive.tv/video/%s' % video_id, video_id) - video_params = self._search_regex( - r'\bvlive\.video\.init\(([^)]+)\)', - webpage, 'video params') - status, _, _, live_params, long_video_id, key = re.split( - r'"\s*,\s*"', video_params)[2:8] + VIDEO_PARAMS_RE = r'\bvlive\.video\.init\(([^)]+)' + VIDEO_PARAMS_FIELD = 'video params' + + params = self._parse_json(self._search_regex( + VIDEO_PARAMS_RE, webpage, VIDEO_PARAMS_FIELD, default=''), video_id, + transform_source=lambda s: '[' + s + ']', fatal=False) + + if not params or len(params) < 7: + params = self._search_regex( + VIDEO_PARAMS_RE, webpage, VIDEO_PARAMS_FIELD) + params = [p.strip(r'"') for p in re.split(r'\s*,\s*', params)] + + status, long_video_id, key = params[2], params[5], params[6] status = remove_start(status, 'PRODUCT_') if status == 'LIVE_ON_AIR' or status == 'BIG_EVENT_ON_AIR': - live_params = self._parse_json('"%s"' % live_params, video_id) - live_params = self._parse_json(live_params, video_id) - return self._live(video_id, webpage, live_params) + return self._live(video_id, webpage) elif status == 'VOD_ON_AIR' or status == 'BIG_EVENT_INTRO': if long_video_id and key: return self._replay(video_id, webpage, long_video_id, key) @@ -76,7 +102,22 @@ class VLiveIE(InfoExtractor): 'thumbnail': thumbnail, } - def _live(self, video_id, webpage, live_params): + def _live(self, video_id, webpage): + init_page = self._download_webpage( + 'http://www.vlive.tv/video/init/view', + video_id, note='Downloading live webpage', + data=urlencode_postdata({'videoSeq': video_id}), + headers={ + 'Referer': 'http://www.vlive.tv/video/%s' % video_id, + 'Content-Type': 'application/x-www-form-urlencoded' + }) + + live_params = self._search_regex( + r'"liveStreamInfo"\s*:\s*(".*"),', + init_page, 'live stream info') + live_params = self._parse_json(live_params, video_id) + live_params = self._parse_json(live_params, video_id) + formats = [] for vid in live_params.get('resolutions', []): formats.extend(self._extract_m3u8_formats( @@ -85,10 +126,14 @@ class VLiveIE(InfoExtractor): fatal=False, live=True)) self._sort_formats(formats) - return dict(self._get_common_fields(webpage), - id=video_id, - formats=formats, - is_live=True) + info = self._get_common_fields(webpage) + info.update({ + 'title': self._live_title(info['title']), + 'id': video_id, + 'formats': formats, + 'is_live': True, + }) + return info def _replay(self, video_id, webpage, long_video_id, key): playinfo = self._download_json( @@ -116,14 +161,103 @@ class VLiveIE(InfoExtractor): subtitles = {} for caption in playinfo.get('captions', {}).get('list', []): - lang = dict_get(caption, ('language', 'locale', 'country', 'label')) + lang = dict_get(caption, ('locale', 'language', 'country', 'label')) if lang and caption.get('source'): subtitles[lang] = [{ 'ext': 'vtt', 'url': caption['source']}] - return dict(self._get_common_fields(webpage), - id=video_id, - formats=formats, - view_count=view_count, - subtitles=subtitles) + info = self._get_common_fields(webpage) + info.update({ + 'id': video_id, + 'formats': formats, + 'view_count': view_count, + 'subtitles': subtitles, + }) + return info + + +class VLiveChannelIE(InfoExtractor): + IE_NAME = 'vlive:channel' + _VALID_URL = r'https?://channels\.vlive\.tv/(?P[0-9A-Z]+)' + _TEST = { + 'url': 'http://channels.vlive.tv/FCD4B', + 'info_dict': { + 'id': 'FCD4B', + 'title': 'MAMAMOO', + }, + 'playlist_mincount': 110 + } + _APP_ID = '8c6cc7b45d2568fb668be6e05b6e5a3b' + + def _real_extract(self, url): + channel_code = self._match_id(url) + + webpage = self._download_webpage( + 'http://channels.vlive.tv/%s/video' % channel_code, channel_code) + + app_id = None + + app_js_url = self._search_regex( + r']+src=(["\'])(?Phttp.+?/app\.js.*?)\1', + webpage, 'app js', default=None, group='url') + + if app_js_url: + app_js = self._download_webpage( + app_js_url, channel_code, 'Downloading app JS', fatal=False) + if app_js: + app_id = self._search_regex( + r'Global\.VFAN_APP_ID\s*=\s*[\'"]([^\'"]+)[\'"]', + app_js, 'app id', default=None) + + app_id = app_id or self._APP_ID + + channel_info = self._download_json( + 'http://api.vfan.vlive.tv/vproxy/channelplus/decodeChannelCode', + channel_code, note='Downloading decode channel code', + query={ + 'app_id': app_id, + 'channelCode': channel_code, + '_': int(time.time()) + }) + + channel_seq = channel_info['result']['channelSeq'] + channel_name = None + entries = [] + + for page_num in itertools.count(1): + video_list = self._download_json( + 'http://api.vfan.vlive.tv/vproxy/channelplus/getChannelVideoList', + channel_code, note='Downloading channel list page #%d' % page_num, + query={ + 'app_id': app_id, + 'channelSeq': channel_seq, + 'maxNumOfRows': 1000, + '_': int(time.time()), + 'pageNo': page_num + } + ) + + if not channel_name: + channel_name = try_get( + video_list, + lambda x: x['result']['channelInfo']['channelName'], + compat_str) + + videos = try_get( + video_list, lambda x: x['result']['videoList'], list) + if not videos: + break + + for video in videos: + video_id = video.get('videoSeq') + if not video_id: + continue + video_id = compat_str(video_id) + entries.append( + self.url_result( + 'http://www.vlive.tv/video/%s' % video_id, + ie=VLiveIE.ie_key(), video_id=video_id)) + + return self.playlist_result( + entries, channel_code, channel_name) diff --git a/youtube_dl/extractor/vodlocker.py b/youtube_dl/extractor/vodlocker.py index a938a4007..02c9617d2 100644 --- a/youtube_dl/extractor/vodlocker.py +++ b/youtube_dl/extractor/vodlocker.py @@ -1,4 +1,4 @@ -# -*- coding: utf-8 -*- +# coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor @@ -20,7 +20,7 @@ class VodlockerIE(InfoExtractor): 'id': 'e8wvyzz4sl42', 'ext': 'mp4', 'title': 'Germany vs Brazil', - 'thumbnail': 're:http://.*\.jpg', + 'thumbnail': r're:http://.*\.jpg', }, }] @@ -31,7 +31,8 @@ class VodlockerIE(InfoExtractor): if any(p in webpage for p in ( '>THIS FILE WAS DELETED<', '>File Not Found<', - 'The file you were looking for could not be found, sorry for any inconvenience.<')): + 'The file you were looking for could not be found, sorry for any inconvenience.<', + '>The file was removed')): raise ExtractorError('Video %s does not exist' % video_id, expected=True) fields = self._hidden_inputs(webpage) diff --git a/youtube_dl/extractor/vodplatform.py b/youtube_dl/extractor/vodplatform.py index b49542b16..239644340 100644 --- a/youtube_dl/extractor/vodplatform.py +++ b/youtube_dl/extractor/vodplatform.py @@ -6,7 +6,7 @@ from ..utils import unescapeHTML class VODPlatformIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?vod-platform\.net/embed/(?P[^/?#]+)' + _VALID_URL = r'https?://(?:www\.)?vod-platform\.net/[eE]mbed/(?P[^/?#]+)' _TEST = { # from http://www.lbcgroup.tv/watch/chapter/29143/52844/%D8%A7%D9%84%D9%86%D8%B5%D8%B1%D8%A9-%D9%81%D9%8A-%D8%B6%D9%8A%D8%A7%D9%81%D8%A9-%D8%A7%D9%84%D9%80-cnn/ar 'url': 'http://vod-platform.net/embed/RufMcytHDolTH1MuKHY9Fw', @@ -25,29 +25,8 @@ class VODPlatformIE(InfoExtractor): title = unescapeHTML(self._og_search_title(webpage)) hidden_inputs = self._hidden_inputs(webpage) - base_url = self._search_regex( - '(.*/)(?:playlist.m3u8|manifest.mpd)', - hidden_inputs.get('HiddenmyhHlsLink') or hidden_inputs['HiddenmyDashLink'], - 'base url') - formats = self._extract_m3u8_formats( - base_url + 'playlist.m3u8', video_id, 'mp4', - 'm3u8_native', m3u8_id='hls', fatal=False) - formats.extend(self._extract_mpd_formats( - base_url + 'manifest.mpd', video_id, - mpd_id='dash', fatal=False)) - rtmp_formats = self._extract_smil_formats( - base_url + 'jwplayer.smil', video_id, fatal=False) - for rtmp_format in rtmp_formats: - rtsp_format = rtmp_format.copy() - rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path']) - del rtsp_format['play_path'] - del rtsp_format['ext'] - rtsp_format.update({ - 'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'), - 'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'), - 'protocol': 'rtsp', - }) - formats.extend([rtmp_format, rtsp_format]) + formats = self._extract_wowza_formats( + hidden_inputs.get('HiddenmyhHlsLink') or hidden_inputs['HiddenmyDashLink'], video_id, skip_protocols=['f4m', 'smil']) self._sort_formats(formats) return { diff --git a/youtube_dl/extractor/voicerepublic.py b/youtube_dl/extractor/voicerepublic.py index 4f1a99a89..59e1359c4 100644 --- a/youtube_dl/extractor/voicerepublic.py +++ b/youtube_dl/extractor/voicerepublic.py @@ -26,7 +26,7 @@ class VoiceRepublicIE(InfoExtractor): 'ext': 'm4a', 'title': 'Watching the Watchers: Building a Sousveillance State', 'description': 'Secret surveillance programs have metadata too. The people and companies that operate secret surveillance programs can be surveilled.', - 'thumbnail': 're:^https?://.*\.(?:png|jpg)$', + 'thumbnail': r're:^https?://.*\.(?:png|jpg)$', 'duration': 1800, 'view_count': int, } diff --git a/youtube_dl/extractor/voxmedia.py b/youtube_dl/extractor/voxmedia.py index b1b32ad44..f8e331493 100644 --- a/youtube_dl/extractor/voxmedia.py +++ b/youtube_dl/extractor/voxmedia.py @@ -9,13 +9,16 @@ class VoxMediaIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?(?:theverge|vox|sbnation|eater|polygon|curbed|racked)\.com/(?:[^/]+/)*(?P[^/?]+)' _TESTS = [{ 'url': 'http://www.theverge.com/2014/6/27/5849272/material-world-how-google-discovered-what-software-is-made-of', - 'md5': '73856edf3e89a711e70d5cf7cb280b37', 'info_dict': { 'id': '11eXZobjrG8DCSTgrNjVinU-YmmdYjhe', 'ext': 'mp4', 'title': 'Google\'s new material design direction', 'description': 'md5:2f44f74c4d14a1f800ea73e1c6832ad2', }, + 'params': { + # m3u8 download + 'skip_download': True, + }, 'add_ie': ['Ooyala'], }, { # data-ooyala-id @@ -31,13 +34,16 @@ class VoxMediaIE(InfoExtractor): }, { # volume embed 'url': 'http://www.vox.com/2016/3/31/11336640/mississippi-lgbt-religious-freedom-bill', - 'md5': '375c483c5080ab8cd85c9c84cfc2d1e4', 'info_dict': { 'id': 'wydzk3dDpmRz7PQoXRsTIX6XTkPjYL0b', 'ext': 'mp4', 'title': 'The new frontier of LGBTQ civil rights, explained', 'description': 'md5:0dc58e94a465cbe91d02950f770eb93f', }, + 'params': { + # m3u8 download + 'skip_download': True, + }, 'add_ie': ['Ooyala'], }, { # youtube embed diff --git a/youtube_dl/extractor/vporn.py b/youtube_dl/extractor/vporn.py index 1557a0e04..858ac9e71 100644 --- a/youtube_dl/extractor/vporn.py +++ b/youtube_dl/extractor/vporn.py @@ -7,6 +7,7 @@ from ..utils import ( ExtractorError, parse_duration, str_to_int, + urljoin, ) @@ -22,7 +23,7 @@ class VpornIE(InfoExtractor): 'ext': 'mp4', 'title': 'Violet on her 19th birthday', 'description': 'Violet dances in front of the camera which is sure to get you horny.', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'uploader': 'kileyGrope', 'categories': ['Masturbation', 'Teen'], 'duration': 393, @@ -40,7 +41,7 @@ class VpornIE(InfoExtractor): 'ext': 'mp4', 'title': 'Hana Shower', 'description': 'Hana showers at the bathroom.', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'uploader': 'Hmmmmm', 'categories': ['Big Boobs', 'Erotic', 'Teen', 'Female', '720p'], 'duration': 588, @@ -66,10 +67,9 @@ class VpornIE(InfoExtractor): description = self._html_search_regex( r'class="(?:descr|description_txt)">(.*?)', webpage, 'description', fatal=False) - thumbnail = self._html_search_regex( - r'flashvars\.imageUrl\s*=\s*"([^"]+)"', webpage, 'description', fatal=False, default=None) - if thumbnail: - thumbnail = 'http://www.vporn.com' + thumbnail + thumbnail = urljoin('http://www.vporn.com', self._html_search_regex( + r'flashvars\.imageUrl\s*=\s*"([^"]+)"', webpage, 'description', + default=None)) uploader = self._html_search_regex( r'(?s)Uploaded by:.*?]*>(.+?)', diff --git a/youtube_dl/extractor/vrt.py b/youtube_dl/extractor/vrt.py index bec7ab327..00c72e346 100644 --- a/youtube_dl/extractor/vrt.py +++ b/youtube_dl/extractor/vrt.py @@ -5,7 +5,6 @@ import re from .common import InfoExtractor from ..utils import ( - determine_ext, float_or_none, ) @@ -75,7 +74,6 @@ class VRTIE(InfoExtractor): }, { 'url': 'http://cobra.canvas.be/cm/cobra/videozone/rubriek/film-videozone/1.2377055', - 'md5': '', 'info_dict': { 'id': '2377055', 'ext': 'mp4', @@ -119,39 +117,17 @@ class VRTIE(InfoExtractor): video_id, 'mp4', m3u8_id='hls', fatal=False)) if src: - if determine_ext(src) == 'm3u8': - formats.extend(self._extract_m3u8_formats( - src, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False)) - formats.extend(self._extract_f4m_formats( - src.replace('playlist.m3u8', 'manifest.f4m'), - video_id, f4m_id='hds', fatal=False)) - if 'data-video-geoblocking="true"' not in webpage: - rtmp_formats = self._extract_smil_formats( - src.replace('playlist.m3u8', 'jwplayer.smil'), - video_id, fatal=False) - formats.extend(rtmp_formats) - for rtmp_format in rtmp_formats: - rtmp_format_c = rtmp_format.copy() - rtmp_format_c['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path']) - del rtmp_format_c['play_path'] - del rtmp_format_c['ext'] - http_format = rtmp_format_c.copy() + formats = self._extract_wowza_formats(src, video_id) + if 'data-video-geoblocking="true"' not in webpage: + for f in formats: + if f['url'].startswith('rtsp://'): + http_format = f.copy() http_format.update({ - 'url': rtmp_format_c['url'].replace('rtmp://', 'http://').replace('vod.', 'download.').replace('/_definst_/', '/').replace('mp4:', ''), - 'format_id': rtmp_format['format_id'].replace('rtmp', 'http'), + 'url': f['url'].replace('rtsp://', 'http://').replace('vod.', 'download.').replace('/_definst_/', '/').replace('mp4:', ''), + 'format_id': f['format_id'].replace('rtsp', 'http'), 'protocol': 'http', }) - rtsp_format = rtmp_format_c.copy() - rtsp_format.update({ - 'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'), - 'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'), - 'protocol': 'rtsp', - }) - formats.extend([http_format, rtsp_format]) - else: - formats.extend(self._extract_f4m_formats( - '%s/manifest.f4m' % src, video_id, f4m_id='hds', fatal=False)) + formats.append(http_format) if not formats and 'data-video-geoblocking="true"' in webpage: self.raise_geo_restricted('This video is only available in Belgium') diff --git a/youtube_dl/extractor/vube.py b/youtube_dl/extractor/vube.py index 10ca6acb1..8ce3a6b81 100644 --- a/youtube_dl/extractor/vube.py +++ b/youtube_dl/extractor/vube.py @@ -26,7 +26,7 @@ class VubeIE(InfoExtractor): 'ext': 'mp4', 'title': 'Best Drummer Ever [HD]', 'description': 'md5:2d63c4b277b85c2277761c2cf7337d71', - 'thumbnail': 're:^https?://.*\.jpg', + 'thumbnail': r're:^https?://.*\.jpg', 'uploader': 'William', 'timestamp': 1406876915, 'upload_date': '20140801', @@ -45,7 +45,7 @@ class VubeIE(InfoExtractor): 'ext': 'mp4', 'title': 'Chiara Grispo - Price Tag by Jessie J', 'description': 'md5:8ea652a1f36818352428cb5134933313', - 'thumbnail': 're:^http://frame\.thestaticvube\.com/snap/[0-9x]+/102e7e63057-5ebc-4f5c-4065-6ce4ebde131f\.jpg$', + 'thumbnail': r're:^http://frame\.thestaticvube\.com/snap/[0-9x]+/102e7e63057-5ebc-4f5c-4065-6ce4ebde131f\.jpg$', 'uploader': 'Chiara.Grispo', 'timestamp': 1388743358, 'upload_date': '20140103', @@ -65,7 +65,7 @@ class VubeIE(InfoExtractor): 'ext': 'mp4', 'title': 'My 7 year old Sister and I singing "Alive" by Krewella', 'description': 'md5:40bcacb97796339f1690642c21d56f4a', - 'thumbnail': 're:^http://frame\.thestaticvube\.com/snap/[0-9x]+/102265d5a9f-0f17-4f6b-5753-adf08484ee1e\.jpg$', + 'thumbnail': r're:^http://frame\.thestaticvube\.com/snap/[0-9x]+/102265d5a9f-0f17-4f6b-5753-adf08484ee1e\.jpg$', 'uploader': 'Seraina', 'timestamp': 1396492438, 'upload_date': '20140403', @@ -84,7 +84,7 @@ class VubeIE(InfoExtractor): 'ext': 'mp4', 'title': 'Frozen - Let It Go Cover by Siren Gene', 'description': 'My rendition of "Let It Go" originally sung by Idina Menzel.', - 'thumbnail': 're:^http://frame\.thestaticvube\.com/snap/[0-9x]+/10283ab622a-86c9-4681-51f2-30d1f65774af\.jpg$', + 'thumbnail': r're:^http://frame\.thestaticvube\.com/snap/[0-9x]+/10283ab622a-86c9-4681-51f2-30d1f65774af\.jpg$', 'uploader': 'Siren', 'timestamp': 1395448018, 'upload_date': '20140322', diff --git a/youtube_dl/extractor/vuclip.py b/youtube_dl/extractor/vuclip.py index b73da5cd0..55e087bdb 100644 --- a/youtube_dl/extractor/vuclip.py +++ b/youtube_dl/extractor/vuclip.py @@ -17,12 +17,12 @@ class VuClipIE(InfoExtractor): _VALID_URL = r'https?://(?:m\.)?vuclip\.com/w\?.*?cid=(?P[0-9]+)' _TEST = { - 'url': 'http://m.vuclip.com/w?cid=922692425&fid=70295&z=1010&nvar&frm=index.html', + 'url': 'http://m.vuclip.com/w?cid=1129900602&bu=8589892792&frm=w&z=34801&op=0&oc=843169247§ion=recommend', 'info_dict': { - 'id': '922692425', + 'id': '1129900602', 'ext': '3gp', - 'title': 'The Toy Soldiers - Hollywood Movie Trailer', - 'duration': 177, + 'title': 'Top 10 TV Convicts', + 'duration': 733, } } @@ -54,7 +54,7 @@ class VuClipIE(InfoExtractor): 'url': video_url, }] else: - formats = self._parse_html5_media_entries(url, webpage)[0]['formats'] + formats = self._parse_html5_media_entries(url, webpage, video_id)[0]['formats'] title = remove_end(self._html_search_regex( r'(.*?)-\s*Vuclip', webpage, 'title').strip(), ' - Video') diff --git a/youtube_dl/extractor/vvvvid.py b/youtube_dl/extractor/vvvvid.py new file mode 100644 index 000000000..d44ec85fd --- /dev/null +++ b/youtube_dl/extractor/vvvvid.py @@ -0,0 +1,140 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + int_or_none, + str_or_none, +) + + +class VVVVIDIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?vvvvid\.it/#!(?:show|anime|film|series)/(?P\d+)/[^/]+/(?P\d+)/(?P[0-9]+)' + _TESTS = [{ + # video_type == 'video/vvvvid' + 'url': 'https://www.vvvvid.it/#!show/434/perche-dovrei-guardarlo-di-dario-moccia/437/489048/ping-pong', + 'md5': 'b8d3cecc2e981adc3835adf07f6df91b', + 'info_dict': { + 'id': '489048', + 'ext': 'mp4', + 'title': 'Ping Pong', + }, + }, { + # video_type == 'video/rcs' + 'url': 'https://www.vvvvid.it/#!show/376/death-note-live-action/377/482493/episodio-01', + 'md5': '33e0edfba720ad73a8782157fdebc648', + 'info_dict': { + 'id': '482493', + 'ext': 'mp4', + 'title': 'Episodio 01', + }, + }] + _conn_id = None + + def _real_initialize(self): + self._conn_id = self._download_json( + 'https://www.vvvvid.it/user/login', + None, headers=self.geo_verification_headers())['data']['conn_id'] + + def _real_extract(self, url): + show_id, season_id, video_id = re.match(self._VALID_URL, url).groups() + response = self._download_json( + 'https://www.vvvvid.it/vvvvid/ondemand/%s/season/%s' % (show_id, season_id), + video_id, headers=self.geo_verification_headers(), query={ + 'conn_id': self._conn_id, + }) + if response['result'] == 'error': + raise ExtractorError('%s said: %s' % ( + self.IE_NAME, response['message']), expected=True) + + vid = int(video_id) + video_data = list(filter( + lambda episode: episode.get('video_id') == vid, response['data']))[0] + formats = [] + + # vvvvid embed_info decryption algorithm is reverse engineered from function $ds(h) at vvvvid.js + def ds(h): + g = "MNOPIJKL89+/4567UVWXQRSTEFGHABCDcdefYZabstuvopqr0123wxyzklmnghij" + + def f(m): + l = [] + o = 0 + b = False + m_len = len(m) + while ((not b) and o < m_len): + n = m[o] << 2 + o += 1 + k = -1 + j = -1 + if o < m_len: + n += m[o] >> 4 + o += 1 + if o < m_len: + k = (m[o - 1] << 4) & 255 + k += m[o] >> 2 + o += 1 + if o < m_len: + j = (m[o - 1] << 6) & 255 + j += m[o] + o += 1 + else: + b = True + else: + b = True + else: + b = True + l.append(n) + if k != -1: + l.append(k) + if j != -1: + l.append(j) + return l + + c = [] + for e in h: + c.append(g.index(e)) + + c_len = len(c) + for e in range(c_len * 2 - 1, -1, -1): + a = c[e % c_len] ^ c[(e + 1) % c_len] + c[e % c_len] = a + + c = f(c) + d = '' + for e in c: + d += chr(e) + + return d + + for quality in ('_sd', ''): + embed_code = video_data.get('embed_info' + quality) + if not embed_code: + continue + embed_code = ds(embed_code) + video_type = video_data.get('video_type') + if video_type in ('video/rcs', 'video/kenc'): + formats.extend(self._extract_akamai_formats( + embed_code, video_id)) + else: + formats.extend(self._extract_wowza_formats( + 'http://sb.top-ix.org/videomg/_definst_/mp4:%s/playlist.m3u8' % embed_code, video_id)) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': video_data['title'], + 'formats': formats, + 'thumbnail': video_data.get('thumbnail'), + 'duration': int_or_none(video_data.get('length')), + 'series': video_data.get('show_title'), + 'season_id': season_id, + 'season_number': video_data.get('season_number'), + 'episode_id': str_or_none(video_data.get('id')), + 'epidode_number': int_or_none(video_data.get('number')), + 'episode_title': video_data['title'], + 'view_count': int_or_none(video_data.get('views')), + 'like_count': int_or_none(video_data.get('video_likes')), + } diff --git a/youtube_dl/extractor/vyborymos.py b/youtube_dl/extractor/vyborymos.py new file mode 100644 index 000000000..9e703c4b6 --- /dev/null +++ b/youtube_dl/extractor/vyborymos.py @@ -0,0 +1,55 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str + + +class VyboryMosIE(InfoExtractor): + _VALID_URL = r'https?://vybory\.mos\.ru/(?:#precinct/|account/channels\?.*?\bstation_id=)(?P\d+)' + _TESTS = [{ + 'url': 'http://vybory.mos.ru/#precinct/13636', + 'info_dict': { + 'id': '13636', + 'ext': 'mp4', + 'title': 're:^Участковая избирательная комиссия №2231 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'description': 'Россия, Москва, улица Введенского, 32А', + 'is_live': True, + }, + 'params': { + 'skip_download': True, + } + }, { + 'url': 'http://vybory.mos.ru/account/channels?station_id=13636', + 'only_matching': True, + }] + + def _real_extract(self, url): + station_id = self._match_id(url) + + channels = self._download_json( + 'http://vybory.mos.ru/account/channels?station_id=%s' % station_id, + station_id, 'Downloading channels JSON') + + formats = [] + for cam_num, (sid, hosts, name, _) in enumerate(channels, 1): + for num, host in enumerate(hosts, 1): + formats.append({ + 'url': 'http://%s/master.m3u8?sid=%s' % (host, sid), + 'ext': 'mp4', + 'format_id': 'camera%d-host%d' % (cam_num, num), + 'format_note': '%s, %s' % (name, host), + }) + + info = self._download_json( + 'http://vybory.mos.ru/json/voting_stations/%s/%s.json' + % (compat_str(station_id)[:3], station_id), + station_id, 'Downloading station JSON', fatal=False) + + return { + 'id': station_id, + 'title': self._live_title(info['name'] if info else station_id), + 'description': info.get('address'), + 'is_live': True, + 'formats': formats, + } diff --git a/youtube_dl/extractor/vzaar.py b/youtube_dl/extractor/vzaar.py new file mode 100644 index 000000000..b270f08d1 --- /dev/null +++ b/youtube_dl/extractor/vzaar.py @@ -0,0 +1,55 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + float_or_none, +) + + +class VzaarIE(InfoExtractor): + _VALID_URL = r'https?://(?:(?:www|view)\.)?vzaar\.com/(?:videos/)?(?P\d+)' + _TESTS = [{ + 'url': 'https://vzaar.com/videos/1152805', + 'md5': 'bde5ddfeb104a6c56a93a06b04901dbf', + 'info_dict': { + 'id': '1152805', + 'ext': 'mp4', + 'title': 'sample video (public)', + }, + }, { + 'url': 'https://view.vzaar.com/27272/player', + 'md5': '3b50012ac9bbce7f445550d54e0508f2', + 'info_dict': { + 'id': '27272', + 'ext': 'mp3', + 'title': 'MP3', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + video_data = self._download_json( + 'http://view.vzaar.com/v2/%s/video' % video_id, video_id) + source_url = video_data['sourceUrl'] + + info = { + 'id': video_id, + 'title': video_data['videoTitle'], + 'url': source_url, + 'thumbnail': self._proto_relative_url(video_data.get('poster')), + 'duration': float_or_none(video_data.get('videoDuration')), + } + if 'audio' in source_url: + info.update({ + 'vcodec': 'none', + 'ext': 'mp3', + }) + else: + info.update({ + 'width': int_or_none(video_data.get('width')), + 'height': int_or_none(video_data.get('height')), + 'ext': 'mp4', + }) + return info diff --git a/youtube_dl/extractor/walla.py b/youtube_dl/extractor/walla.py index 8b9488340..cbb548672 100644 --- a/youtube_dl/extractor/walla.py +++ b/youtube_dl/extractor/walla.py @@ -20,7 +20,7 @@ class WallaIE(InfoExtractor): 'ext': 'flv', 'title': 'וואן דיירקשן: ההיסטריה', 'description': 'md5:de9e2512a92442574cdb0913c49bc4d8', - 'thumbnail': 're:^https?://.*\.jpg', + 'thumbnail': r're:^https?://.*\.jpg', 'duration': 3600, }, 'params': { diff --git a/youtube_dl/extractor/wat.py b/youtube_dl/extractor/wat.py index 48fc438ed..20fef1f04 100644 --- a/youtube_dl/extractor/wat.py +++ b/youtube_dl/extractor/wat.py @@ -9,6 +9,7 @@ from ..utils import ( ExtractorError, unified_strdate, HEADRequest, + int_or_none, ) @@ -30,89 +31,136 @@ class WatIE(InfoExtractor): }, { 'url': 'http://www.wat.tv/video/gregory-lemarchal-voix-ange-6z1v7_6ygkj_.html', - 'md5': 'fbc84e4378165278e743956d9c1bf16b', + 'md5': '34bdfa5ca9fd3c7eb88601b635b0424c', 'info_dict': { 'id': '11713075', 'ext': 'mp4', 'title': 'Grégory Lemarchal, une voix d\'ange depuis 10 ans (1/3)', - 'description': 'md5:b7a849cf16a2b733d9cd10c52906dee3', 'upload_date': '20140816', - 'duration': 2910, }, - 'skip': "Ce contenu n'est pas disponible pour l'instant.", + 'expected_warnings': ["Ce contenu n'est pas disponible pour l'instant."], }, ] + _FORMATS = ( + (200, 416, 234), + (400, 480, 270), + (600, 640, 360), + (1200, 640, 360), + (1800, 960, 540), + (2500, 1280, 720), + ) + def _real_extract(self, url): video_id = self._match_id(url) video_id = video_id if video_id.isdigit() and len(video_id) > 6 else compat_str(int(video_id, 36)) # 'contentv4' is used in the website, but it also returns the related # videos, we don't need them - video_info = self._download_json( - 'http://www.wat.tv/interface/contentv3/' + video_id, video_id)['media'] + video_data = self._download_json( + 'http://www.wat.tv/interface/contentv4s/' + video_id, video_id) + video_info = video_data['media'] error_desc = video_info.get('error_desc') if error_desc: - raise ExtractorError( - '%s returned error: %s' % (self.IE_NAME, error_desc), expected=True) + self.report_warning( + '%s returned error: %s' % (self.IE_NAME, error_desc)) chapters = video_info['chapters'] - first_chapter = chapters[0] + if chapters: + first_chapter = chapters[0] - def video_id_for_chapter(chapter): - return chapter['tc_start'].split('-')[0] + def video_id_for_chapter(chapter): + return chapter['tc_start'].split('-')[0] - if video_id_for_chapter(first_chapter) != video_id: - self.to_screen('Multipart video detected') - entries = [self.url_result('wat:%s' % video_id_for_chapter(chapter)) for chapter in chapters] - return self.playlist_result(entries, video_id, video_info['title']) - # Otherwise we can continue and extract just one part, we have to use - # the video id for getting the video url + if video_id_for_chapter(first_chapter) != video_id: + self.to_screen('Multipart video detected') + entries = [self.url_result('wat:%s' % video_id_for_chapter(chapter)) for chapter in chapters] + return self.playlist_result(entries, video_id, video_info['title']) + # Otherwise we can continue and extract just one part, we have to use + # the video id for getting the video url + else: + first_chapter = video_info - date_diffusion = first_chapter.get('date_diffusion') - upload_date = unified_strdate(date_diffusion) if date_diffusion else None + title = first_chapter['title'] def extract_url(path_template, url_type): req_url = 'http://www.wat.tv/get/%s' % (path_template % video_id) - head = self._request_webpage(HEADRequest(req_url), video_id, 'Extracting %s url' % url_type) - red_url = head.geturl() - if req_url == red_url: - raise ExtractorError( - '%s said: Sorry, this video is not available from your country.' % self.IE_NAME, - expected=True) - return red_url + head = self._request_webpage(HEADRequest(req_url), video_id, 'Extracting %s url' % url_type, fatal=False) + if head: + red_url = head.geturl() + if req_url != red_url: + return red_url + return None - m3u8_url = extract_url('ipad/%s.m3u8', 'm3u8') - http_url = extract_url('android5/%s.mp4', 'http') + def remove_bitrate_limit(manifest_url): + return re.sub(r'(?:max|min)_bitrate=\d+&?', '', manifest_url) formats = [] - m3u8_formats = self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls') - formats.extend(m3u8_formats) - formats.extend(self._extract_f4m_formats( - m3u8_url.replace('ios.', 'web.').replace('.m3u8', '.f4m'), - video_id, f4m_id='hds', fatal=False)) - for m3u8_format in m3u8_formats: - vbr, abr = m3u8_format.get('vbr'), m3u8_format.get('abr') - if not vbr or not abr: - continue - f = m3u8_format.copy() - f.update({ - 'url': re.sub(r'%s-\d+00-\d+' % video_id, '%s-%d00-%d' % (video_id, round(vbr / 100), round(abr)), http_url), - 'format_id': f['format_id'].replace('hls', 'http'), - 'protocol': 'http', - }) - formats.append(f) - self._sort_formats(formats) + try: + manifest_urls = self._download_json( + 'http://www.wat.tv/get/webhtml/' + video_id, video_id) + m3u8_url = manifest_urls.get('hls') + if m3u8_url: + m3u8_url = remove_bitrate_limit(m3u8_url) + m3u8_formats = self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) + if m3u8_formats: + formats.extend(m3u8_formats) + formats.extend(self._extract_f4m_formats( + m3u8_url.replace('ios', 'web').replace('.m3u8', '.f4m'), + video_id, f4m_id='hds', fatal=False)) + http_url = extract_url('android5/%s.mp4', 'http') + if http_url: + for m3u8_format in m3u8_formats: + vbr, abr = m3u8_format.get('vbr'), m3u8_format.get('abr') + if not vbr or not abr: + continue + format_id = m3u8_format['format_id'].replace('hls', 'http') + fmt_url = re.sub(r'%s-\d+00-\d+' % video_id, '%s-%d00-%d' % (video_id, round(vbr / 100), round(abr)), http_url) + if self._is_valid_url(fmt_url, video_id, format_id): + f = m3u8_format.copy() + f.update({ + 'url': fmt_url, + 'format_id': format_id, + 'protocol': 'http', + }) + formats.append(f) + mpd_url = manifest_urls.get('mpd') + if mpd_url: + formats.extend(self._extract_mpd_formats(remove_bitrate_limit( + mpd_url), video_id, mpd_id='dash', fatal=False)) + self._sort_formats(formats) + except ExtractorError: + abr = 64 + for vbr, width, height in self._FORMATS: + tbr = vbr + abr + format_id = 'http-%s' % tbr + fmt_url = 'http://dnl.adv.tf1.fr/2/USP-0x0/%s/%s/%s/ssm/%s-%s-64k.mp4' % (video_id[-4:-2], video_id[-2:], video_id, video_id, vbr) + if self._is_valid_url(fmt_url, video_id, format_id): + formats.append({ + 'format_id': format_id, + 'url': fmt_url, + 'vbr': vbr, + 'abr': abr, + 'width': width, + 'height': height, + }) + + date_diffusion = first_chapter.get('date_diffusion') or video_data.get('configv4', {}).get('estatS4') + upload_date = unified_strdate(date_diffusion) if date_diffusion else None + duration = None + files = video_info['files'] + if files: + duration = int_or_none(files[0].get('duration')) return { 'id': video_id, - 'title': first_chapter['title'], - 'thumbnail': first_chapter['preview'], - 'description': first_chapter['description'], - 'view_count': video_info['views'], + 'title': title, + 'thumbnail': first_chapter.get('preview'), + 'description': first_chapter.get('description'), + 'view_count': int_or_none(video_info.get('views')), 'upload_date': upload_date, - 'duration': video_info['files'][0]['duration'], + 'duration': duration, 'formats': formats, } diff --git a/youtube_dl/extractor/watchindianporn.py b/youtube_dl/extractor/watchindianporn.py index 5d3b5bdb4..ed099beea 100644 --- a/youtube_dl/extractor/watchindianporn.py +++ b/youtube_dl/extractor/watchindianporn.py @@ -22,7 +22,7 @@ class WatchIndianPornIE(InfoExtractor): 'display_id': 'hot-milf-from-kerala-shows-off-her-gorgeous-large-breasts-on-camera', 'ext': 'mp4', 'title': 'Hot milf from kerala shows off her gorgeous large breasts on camera', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'uploader': 'LoveJay', 'upload_date': '20160428', 'duration': 226, diff --git a/youtube_dl/extractor/wdr.py b/youtube_dl/extractor/wdr.py index 390f9e830..f7e6360a3 100644 --- a/youtube_dl/extractor/wdr.py +++ b/youtube_dl/extractor/wdr.py @@ -1,4 +1,4 @@ -# -*- coding: utf-8 -*- +# coding: utf-8 from __future__ import unicode_literals import re diff --git a/youtube_dl/extractor/webcaster.py b/youtube_dl/extractor/webcaster.py new file mode 100644 index 000000000..e4b65f54f --- /dev/null +++ b/youtube_dl/extractor/webcaster.py @@ -0,0 +1,102 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + xpath_text, +) + + +class WebcasterIE(InfoExtractor): + _VALID_URL = r'https?://bl\.webcaster\.pro/(?:quote|media)/start/free_(?P[^/]+)' + _TESTS = [{ + # http://video.khl.ru/quotes/393859 + 'url': 'http://bl.webcaster.pro/quote/start/free_c8cefd240aa593681c8d068cff59f407_hd/q393859/eb173f99dd5f558674dae55f4ba6806d/1480289104?sr%3D105%26fa%3D1%26type_id%3D18', + 'md5': '0c162f67443f30916ff1c89425dcd4cd', + 'info_dict': { + 'id': 'c8cefd240aa593681c8d068cff59f407_hd', + 'ext': 'mp4', + 'title': 'Сибирь - Нефтехимик. Лучшие моменты первого периода', + 'thumbnail': r're:^https?://.*\.jpg$', + }, + }, { + 'url': 'http://bl.webcaster.pro/media/start/free_6246c7a4453ac4c42b4398f840d13100_hd/2_2991109016/e8d0d82587ef435480118f9f9c41db41/4635726126', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + video = self._download_xml(url, video_id) + + title = xpath_text(video, './/event_name', 'event name', fatal=True) + + def make_id(parts, separator): + return separator.join(filter(None, parts)) + + formats = [] + for format_id in (None, 'noise'): + track_tag = make_id(('track', format_id), '_') + for track in video.findall('.//iphone/%s' % track_tag): + track_url = track.text + if not track_url: + continue + if determine_ext(track_url) == 'm3u8': + m3u8_formats = self._extract_m3u8_formats( + track_url, video_id, 'mp4', + entry_protocol='m3u8_native', + m3u8_id=make_id(('hls', format_id), '-'), fatal=False) + for f in m3u8_formats: + f.update({ + 'source_preference': 0 if format_id == 'noise' else 1, + 'format_note': track.get('title'), + }) + formats.extend(m3u8_formats) + self._sort_formats(formats) + + thumbnail = xpath_text(video, './/image', 'thumbnail') + + return { + 'id': video_id, + 'title': title, + 'thumbnail': thumbnail, + 'formats': formats, + } + + +class WebcasterFeedIE(InfoExtractor): + _VALID_URL = r'https?://bl\.webcaster\.pro/feed/start/free_(?P[^/]+)' + _TEST = { + 'url': 'http://bl.webcaster.pro/feed/start/free_c8cefd240aa593681c8d068cff59f407_hd/q393859/eb173f99dd5f558674dae55f4ba6806d/1480289104', + 'only_matching': True, + } + + @staticmethod + def _extract_url(ie, webpage): + mobj = re.search( + r'<(?:object|a[^>]+class=["\']webcaster-player["\'])[^>]+data(?:-config)?=(["\']).*?config=(?Phttps?://bl\.webcaster\.pro/feed/start/free_.*?)(?:[?&]|\1)', + webpage) + if mobj: + return mobj.group('url') + for secure in (True, False): + video_url = ie._og_search_video_url( + webpage, secure=secure, default=None) + if video_url: + mobj = re.search( + r'config=(?Phttps?://bl\.webcaster\.pro/feed/start/free_[^?&=]+)', + video_url) + if mobj: + return mobj.group('url') + + def _real_extract(self, url): + video_id = self._match_id(url) + + feed = self._download_xml(url, video_id) + + video_url = xpath_text( + feed, ('video_hd', 'video'), 'video url', fatal=True) + + return self.url_result(video_url, WebcasterIE.ie_key()) diff --git a/youtube_dl/extractor/webofstories.py b/youtube_dl/extractor/webofstories.py index 7aea47ed5..1eb1f6702 100644 --- a/youtube_dl/extractor/webofstories.py +++ b/youtube_dl/extractor/webofstories.py @@ -19,7 +19,7 @@ class WebOfStoriesIE(InfoExtractor): 'id': '4536', 'ext': 'mp4', 'title': 'The temperature of the sun', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'description': 'Hans Bethe talks about calculating the temperature of the sun', 'duration': 238, } @@ -30,7 +30,7 @@ class WebOfStoriesIE(InfoExtractor): 'id': '55908', 'ext': 'mp4', 'title': 'The story of Gemmata obscuriglobus', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'description': 'Planctomycete talks about The story of Gemmata obscuriglobus', 'duration': 169, }, @@ -42,7 +42,7 @@ class WebOfStoriesIE(InfoExtractor): 'id': '54215', 'ext': 'mp4', 'title': '"A Leg to Stand On"', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'description': 'Oliver Sacks talks about the death and resurrection of a limb', 'duration': 97, }, @@ -134,7 +134,7 @@ class WebOfStoriesPlaylistIE(InfoExtractor): entries = [ self.url_result('http://www.webofstories.com/play/%s' % video_number, 'WebOfStories') - for video_number in set(re.findall('href="/playAll/%s\?sId=(\d+)"' % playlist_id, webpage)) + for video_number in set(re.findall(r'href="/playAll/%s\?sId=(\d+)"' % playlist_id, webpage)) ] title = self._search_regex( diff --git a/youtube_dl/extractor/weiqitv.py b/youtube_dl/extractor/weiqitv.py index 3dafbeec2..7e0befd39 100644 --- a/youtube_dl/extractor/weiqitv.py +++ b/youtube_dl/extractor/weiqitv.py @@ -6,7 +6,7 @@ from .common import InfoExtractor class WeiqiTVIE(InfoExtractor): IE_DESC = 'WQTV' - _VALID_URL = r'https?://www\.weiqitv\.com/index/video_play\?videoId=(?P[A-Za-z0-9]+)' + _VALID_URL = r'https?://(?:www\.)?weiqitv\.com/index/video_play\?videoId=(?P[A-Za-z0-9]+)' _TESTS = [{ 'url': 'http://www.weiqitv.com/index/video_play?videoId=53c744f09874f0e76a8b46f3', @@ -37,11 +37,11 @@ class WeiqiTVIE(InfoExtractor): page = self._download_webpage(url, media_id) info_json_str = self._search_regex( - 'var\s+video\s*=\s*(.+});', page, 'info json str') + r'var\s+video\s*=\s*(.+});', page, 'info json str') info_json = self._parse_json(info_json_str, media_id) letvcloud_url = self._search_regex( - 'var\s+letvurl\s*=\s*"([^"]+)', page, 'letvcloud url') + r'var\s+letvurl\s*=\s*"([^"]+)', page, 'letvcloud url') return { '_type': 'url_transparent', diff --git a/youtube_dl/extractor/wrzuta.py b/youtube_dl/extractor/wrzuta.py index bdd7097ba..0f53f1bcb 100644 --- a/youtube_dl/extractor/wrzuta.py +++ b/youtube_dl/extractor/wrzuta.py @@ -1,4 +1,4 @@ -# -*- coding: utf-8 -*- +# coding: utf-8 from __future__ import unicode_literals import re diff --git a/youtube_dl/extractor/wsj.py b/youtube_dl/extractor/wsj.py index a83e68b17..deb7483ae 100644 --- a/youtube_dl/extractor/wsj.py +++ b/youtube_dl/extractor/wsj.py @@ -1,4 +1,4 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor diff --git a/youtube_dl/extractor/xbef.py b/youtube_dl/extractor/xbef.py index e4a2baad2..4c41e98b2 100644 --- a/youtube_dl/extractor/xbef.py +++ b/youtube_dl/extractor/xbef.py @@ -14,7 +14,7 @@ class XBefIE(InfoExtractor): 'ext': 'mp4', 'title': 'md5:7358a9faef8b7b57acda7c04816f170e', 'age_limit': 18, - 'thumbnail': 're:^http://.*\.jpg', + 'thumbnail': r're:^http://.*\.jpg', } } diff --git a/youtube_dl/extractor/xboxclips.py b/youtube_dl/extractor/xboxclips.py index b113ab1c4..d9c277bc3 100644 --- a/youtube_dl/extractor/xboxclips.py +++ b/youtube_dl/extractor/xboxclips.py @@ -1,4 +1,4 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor diff --git a/youtube_dl/extractor/xfileshare.py b/youtube_dl/extractor/xfileshare.py index 995aada0d..e616adce3 100644 --- a/youtube_dl/extractor/xfileshare.py +++ b/youtube_dl/extractor/xfileshare.py @@ -44,7 +44,7 @@ class XFileShareIE(InfoExtractor): 'id': '06y9juieqpmi', 'ext': 'mp4', 'title': 'Rebecca Black My Moment Official Music Video Reaction-6GK87Rc8bzQ', - 'thumbnail': 're:http://.*\.jpg', + 'thumbnail': r're:http://.*\.jpg', }, }, { 'url': 'http://gorillavid.in/embed-z08zf8le23c6-960x480.html', @@ -56,7 +56,7 @@ class XFileShareIE(InfoExtractor): 'id': '3rso4kdn6f9m', 'ext': 'mp4', 'title': 'Micro Pig piglets ready on 16th July 2009-bG0PdrCdxUc', - 'thumbnail': 're:http://.*\.jpg', + 'thumbnail': r're:http://.*\.jpg', } }, { 'url': 'http://movpod.in/0wguyyxi1yca', @@ -67,7 +67,7 @@ class XFileShareIE(InfoExtractor): 'id': '3ivfabn7573c', 'ext': 'mp4', 'title': 'youtube-dl test video \'äBaW_jenozKc.mp4.mp4', - 'thumbnail': 're:http://.*\.jpg', + 'thumbnail': r're:http://.*\.jpg', }, 'skip': 'Video removed', }, { @@ -124,12 +124,14 @@ class XFileShareIE(InfoExtractor): webpage = self._download_webpage(req, video_id, 'Downloading video page') title = (self._search_regex( - [r'style="z-index: [0-9]+;">([^<]+)', + (r'style="z-index: [0-9]+;">([^<]+)', r'([^<]+)', r'h4-fine[^>]*>([^<]+)<', r'>Watch (.+) ', - r'

([^<]+)

'], - webpage, 'title', default=None) or self._og_search_title(webpage)).strip() + r'

([^<]+)

', + r'

]*>([^<]+)<'), # streamin.to + webpage, 'title', default=None) or self._og_search_title( + webpage, default=None) or video_id).strip() def extract_video_url(default=NO_DEFAULT): return self._search_regex( diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py index bd8e1af2e..36a8c9840 100644 --- a/youtube_dl/extractor/xhamster.py +++ b/youtube_dl/extractor/xhamster.py @@ -5,8 +5,8 @@ import re from .common import InfoExtractor from ..utils import ( dict_get, - float_or_none, int_or_none, + parse_duration, unified_strdate, ) @@ -22,7 +22,7 @@ class XHamsterIE(InfoExtractor): 'title': 'FemaleAgent Shy beauty takes the bait', 'upload_date': '20121014', 'uploader': 'Ruseful2011', - 'duration': 893.52, + 'duration': 893, 'age_limit': 18, }, }, { @@ -33,7 +33,7 @@ class XHamsterIE(InfoExtractor): 'title': 'Britney Spears Sexy Booty', 'upload_date': '20130914', 'uploader': 'jojo747400', - 'duration': 200.48, + 'duration': 200, 'age_limit': 18, }, 'params': { @@ -48,7 +48,7 @@ class XHamsterIE(InfoExtractor): 'title': '....', 'upload_date': '20160208', 'uploader': 'parejafree', - 'duration': 72.0, + 'duration': 72, 'age_limit': 18, }, 'params': { @@ -101,9 +101,9 @@ class XHamsterIE(InfoExtractor): r''']+poster=(?P["'])(?P.+?)(?P=q)[^>]*>'''], webpage, 'thumbnail', fatal=False, group='thumbnail') - duration = float_or_none(self._search_regex( - r'(["\'])duration\1\s*:\s*(["\'])(?P.+?)\2', - webpage, 'duration', fatal=False, group='duration')) + duration = parse_duration(self._search_regex( + r'Runtime:\s*\s*([\d:]+)', webpage, + 'duration', fatal=False)) view_count = int_or_none(self._search_regex( r'content=["\']User(?:View|Play)s:(\d+)', diff --git a/youtube_dl/extractor/xiami.py b/youtube_dl/extractor/xiami.py index a6dfc4af9..d017e03de 100644 --- a/youtube_dl/extractor/xiami.py +++ b/youtube_dl/extractor/xiami.py @@ -13,9 +13,12 @@ class XiamiBaseIE(InfoExtractor): webpage = super(XiamiBaseIE, self)._download_webpage(*args, **kwargs) if '>Xiami is currently not available in your country.<' in webpage: self.raise_geo_restricted('Xiami is currently not available in your country') + return webpage def _extract_track(self, track, track_id=None): - title = track['title'] + track_name = track.get('songName') or track.get('name') or track['subName'] + artist = track.get('artist') or track.get('artist_name') or track.get('singers') + title = '%s - %s' % (artist, track_name) if artist else track_name track_url = self._decrypt(track['location']) subtitles = {} @@ -30,9 +33,10 @@ class XiamiBaseIE(InfoExtractor): 'thumbnail': track.get('pic') or track.get('album_pic'), 'duration': int_or_none(track.get('length')), 'creator': track.get('artist', '').split(';')[0], - 'track': title, - 'album': track.get('album_name'), - 'artist': track.get('artist'), + 'track': track_name, + 'track_number': int_or_none(track.get('track')), + 'album': track.get('album_name') or track.get('title'), + 'artist': artist, 'subtitles': subtitles, } @@ -67,14 +71,14 @@ class XiamiBaseIE(InfoExtractor): class XiamiSongIE(XiamiBaseIE): IE_NAME = 'xiami:song' IE_DESC = '虾米音乐' - _VALID_URL = r'https?://(?:www\.)?xiami\.com/song/(?P[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?xiami\.com/song/(?P[^/?#&]+)' _TESTS = [{ 'url': 'http://www.xiami.com/song/1775610518', 'md5': '521dd6bea40fd5c9c69f913c232cb57e', 'info_dict': { 'id': '1775610518', 'ext': 'mp3', - 'title': 'Woman', + 'title': 'HONNE - Woman', 'thumbnail': r're:http://img\.xiami\.net/images/album/.*\.jpg', 'duration': 265, 'creator': 'HONNE', @@ -94,7 +98,7 @@ class XiamiSongIE(XiamiBaseIE): 'info_dict': { 'id': '1775256504', 'ext': 'mp3', - 'title': '悟空', + 'title': '戴荃 - 悟空', 'thumbnail': r're:http://img\.xiami\.net/images/album/.*\.jpg', 'duration': 200, 'creator': '戴荃', @@ -108,6 +112,26 @@ class XiamiSongIE(XiamiBaseIE): }, }, 'skip': 'Georestricted', + }, { + 'url': 'http://www.xiami.com/song/1775953850', + 'info_dict': { + 'id': '1775953850', + 'ext': 'mp3', + 'title': 'До Скону - Чума Пожирает Землю', + 'thumbnail': r're:http://img\.xiami\.net/images/album/.*\.jpg', + 'duration': 683, + 'creator': 'До Скону', + 'track': 'Чума Пожирает Землю', + 'track_number': 7, + 'album': 'Ад', + 'artist': 'До Скону', + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://www.xiami.com/song/xLHGwgd07a1', + 'only_matching': True, }] def _real_extract(self, url): @@ -123,7 +147,7 @@ class XiamiPlaylistBaseIE(XiamiBaseIE): class XiamiAlbumIE(XiamiPlaylistBaseIE): IE_NAME = 'xiami:album' IE_DESC = '虾米音乐 - 专辑' - _VALID_URL = r'https?://(?:www\.)?xiami\.com/album/(?P[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?xiami\.com/album/(?P[^/?#&]+)' _TYPE = '1' _TESTS = [{ 'url': 'http://www.xiami.com/album/2100300444', @@ -135,28 +159,34 @@ class XiamiAlbumIE(XiamiPlaylistBaseIE): }, { 'url': 'http://www.xiami.com/album/512288?spm=a1z1s.6843761.1110925389.6.hhE9p9', 'only_matching': True, + }, { + 'url': 'http://www.xiami.com/album/URVDji2a506', + 'only_matching': True, }] class XiamiArtistIE(XiamiPlaylistBaseIE): IE_NAME = 'xiami:artist' IE_DESC = '虾米音乐 - 歌手' - _VALID_URL = r'https?://(?:www\.)?xiami\.com/artist/(?P[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?xiami\.com/artist/(?P[^/?#&]+)' _TYPE = '2' - _TEST = { + _TESTS = [{ 'url': 'http://www.xiami.com/artist/2132?spm=0.0.0.0.dKaScp', 'info_dict': { 'id': '2132', }, 'playlist_count': 20, 'skip': 'Georestricted', - } + }, { + 'url': 'http://www.xiami.com/artist/bC5Tk2K6eb99', + 'only_matching': True, + }] class XiamiCollectionIE(XiamiPlaylistBaseIE): IE_NAME = 'xiami:collection' IE_DESC = '虾米音乐 - 精选集' - _VALID_URL = r'https?://(?:www\.)?xiami\.com/collect/(?P[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?xiami\.com/collect/(?P[^/?#&]+)' _TYPE = '3' _TEST = { 'url': 'http://www.xiami.com/collect/156527391?spm=a1z1s.2943601.6856193.12.4jpBnr', diff --git a/youtube_dl/extractor/xnxx.py b/youtube_dl/extractor/xnxx.py index bcb140305..e0a6255dc 100644 --- a/youtube_dl/extractor/xnxx.py +++ b/youtube_dl/extractor/xnxx.py @@ -1,4 +1,4 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor diff --git a/youtube_dl/extractor/xuite.py b/youtube_dl/extractor/xuite.py index a66daee46..e0818201a 100644 --- a/youtube_dl/extractor/xuite.py +++ b/youtube_dl/extractor/xuite.py @@ -1,4 +1,4 @@ -# -*- coding: utf-8 -*- +# coding: utf-8 from __future__ import unicode_literals import base64 @@ -24,7 +24,7 @@ class XuiteIE(InfoExtractor): 'id': '3860914', 'ext': 'mp3', 'title': '孤單南半球-歐德陽', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 247.246, 'timestamp': 1314932940, 'upload_date': '20110902', @@ -40,7 +40,7 @@ class XuiteIE(InfoExtractor): 'id': '25925099', 'ext': 'mp4', 'title': 'BigBuckBunny_320x180', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 596.458, 'timestamp': 1454242500, 'upload_date': '20160131', @@ -58,7 +58,7 @@ class XuiteIE(InfoExtractor): 'ext': 'mp4', 'title': '暗殺教室 02', 'description': '字幕:【極影字幕社】', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 1384.907, 'timestamp': 1421481240, 'upload_date': '20150117', diff --git a/youtube_dl/extractor/xvideos.py b/youtube_dl/extractor/xvideos.py index 1dfe031ca..30825daae 100644 --- a/youtube_dl/extractor/xvideos.py +++ b/youtube_dl/extractor/xvideos.py @@ -15,10 +15,10 @@ class XVideosIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?xvideos\.com/video(?P[0-9]+)(?:.*)' _TEST = { 'url': 'http://www.xvideos.com/video4588838/biker_takes_his_girl', - 'md5': '4b46ae6ea5e6e9086e714d883313c0c9', + 'md5': '14cea69fcb84db54293b1e971466c2e1', 'info_dict': { 'id': '4588838', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Biker Takes his Girl', 'age_limit': 18, } @@ -42,24 +42,24 @@ class XVideosIE(InfoExtractor): video_url = compat_urllib_parse_unquote(self._search_regex( r'flv_url=(.+?)&', webpage, 'video URL', default='')) if video_url: - formats.append({'url': video_url}) + formats.append({ + 'url': video_url, + 'format_id': 'flv', + }) - player_args = self._search_regex( - r'(?s)new\s+HTML5Player\((.+?)\)', webpage, ' html5 player', default=None) - if player_args: - for arg in player_args.split(','): - format_url = self._search_regex( - r'(["\'])(?Phttps?://.+?)\1', arg, 'url', - default=None, group='url') - if not format_url: - continue - ext = determine_ext(format_url) - if ext == 'mp4': - formats.append({'url': format_url}) - elif ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - format_url, video_id, 'mp4', - entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) + for kind, _, format_url in re.findall( + r'setVideo([^(]+)\((["\'])(http.+?)\2\)', webpage): + format_id = kind.lower() + if format_id == 'hls': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) + elif format_id in ('urllow', 'urlhigh'): + formats.append({ + 'url': format_url, + 'format_id': '%s-%s' % (determine_ext(format_url, 'mp4'), format_id[3:]), + 'quality': -2 if format_id.endswith('low') else None, + }) self._sort_formats(formats) diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py index b0679dfb7..4951414e9 100644 --- a/youtube_dl/extractor/yahoo.py +++ b/youtube_dl/extractor/yahoo.py @@ -8,7 +8,6 @@ import re from .common import InfoExtractor, SearchInfoExtractor from ..compat import ( compat_urllib_parse, - compat_urllib_parse_urlencode, compat_urlparse, ) from ..utils import ( @@ -17,9 +16,13 @@ from ..utils import ( ExtractorError, int_or_none, mimetype2ext, + determine_ext, ) -from .brightcove import BrightcoveNewIE +from .brightcove import ( + BrightcoveLegacyIE, + BrightcoveNewIE, +) from .nbc import NBCSportsVPlayerIE @@ -39,7 +42,7 @@ class YahooIE(InfoExtractor): }, { 'url': 'http://screen.yahoo.com/wired/codefellas-s1-ep12-cougar-lies-103000935.html', - 'md5': 'c3466d2b6d5dd6b9f41ba9ed04c24b23', + 'md5': '251af144a19ebc4a033e8ba91ac726bb', 'info_dict': { 'id': 'd1dedf8c-d58c-38c3-8963-e899929ae0a9', 'ext': 'mp4', @@ -50,7 +53,7 @@ class YahooIE(InfoExtractor): }, { 'url': 'https://screen.yahoo.com/community/community-sizzle-reel-203225340.html?format=embed', - 'md5': '75ffabdb87c16d4ffe8c036dc4d1c136', + 'md5': '7993e572fac98e044588d0b5260f4352', 'info_dict': { 'id': '4fe78544-8d48-39d8-97cd-13f205d9fcdb', 'ext': 'mp4', @@ -61,7 +64,7 @@ class YahooIE(InfoExtractor): }, { 'url': 'https://tw.news.yahoo.com/%E6%95%A2%E5%95%8F%E5%B8%82%E9%95%B7%20%E9%BB%83%E7%A7%80%E9%9C%9C%E6%89%B9%E8%B3%B4%E6%B8%85%E5%BE%B7%20%E9%9D%9E%E5%B8%B8%E9%AB%98%E5%82%B2-034024051.html', - 'md5': '9035d38f88b1782682a3e89f985be5bb', + 'md5': '45c024bad51e63e9b6f6fad7a43a8c23', 'info_dict': { 'id': 'cac903b3-fcf4-3c14-b632-643ab541712f', 'ext': 'mp4', @@ -72,10 +75,10 @@ class YahooIE(InfoExtractor): }, { 'url': 'https://uk.screen.yahoo.com/editor-picks/cute-raccoon-freed-drain-using-091756545.html', - 'md5': '0b51660361f0e27c9789e7037ef76f4b', + 'md5': '71298482f7c64cbb7fa064e4553ff1c1', 'info_dict': { 'id': 'b3affa53-2e14-3590-852b-0e0db6cd1a58', - 'ext': 'mp4', + 'ext': 'webm', 'title': 'Cute Raccoon Freed From Drain\u00a0Using Angle Grinder', 'description': 'md5:f66c890e1490f4910a9953c941dee944', 'duration': 97, @@ -98,7 +101,7 @@ class YahooIE(InfoExtractor): 'id': '154609075', }, 'playlist': [{ - 'md5': 'f8e336c6b66f503282e5f719641d6565', + 'md5': '000887d0dc609bc3a47c974151a40fb8', 'info_dict': { 'id': 'e624c4bc-3389-34de-9dfc-025f74943409', 'ext': 'mp4', @@ -107,7 +110,7 @@ class YahooIE(InfoExtractor): 'duration': 30, }, }, { - 'md5': '958bcb90b4d6df71c56312137ee1cd5a', + 'md5': '81bc74faf10750fe36e4542f9a184c66', 'info_dict': { 'id': '1fc8ada0-718e-3abe-a450-bf31f246d1a9', 'ext': 'mp4', @@ -139,7 +142,7 @@ class YahooIE(InfoExtractor): 'skip': 'Domain name in.lifestyle.yahoo.com gone', }, { 'url': 'https://www.yahoo.com/movies/v/true-story-trailer-173000497.html', - 'md5': 'b17ac378b1134fa44370fb27db09a744', + 'md5': '2a9752f74cb898af5d1083ea9f661b58', 'info_dict': { 'id': '071c4013-ce30-3a93-a5b2-e0413cd4a9d1', 'ext': 'mp4', @@ -168,7 +171,7 @@ class YahooIE(InfoExtractor): }, { # Query result is embedded in webpage, but explicit request to video API fails with geo restriction 'url': 'https://screen.yahoo.com/community/communitary-community-episode-1-ladders-154501237.html', - 'md5': '1ddbf7c850777548438e5c4f147c7b8c', + 'md5': '4fbafb9c9b6f07aa8f870629f6671b35', 'info_dict': { 'id': '1f32853c-a271-3eef-8cb6-f6d6872cb504', 'ext': 'mp4', @@ -196,6 +199,33 @@ class YahooIE(InfoExtractor): 'description': 'Galactic', 'title': 'Dolla Diva (feat. Maggie Koerner)', }, + 'skip': 'redirect to https://www.yahoo.com/music', + }, + { + # yahoo://article/ + 'url': 'https://www.yahoo.com/movies/video/true-story-trailer-173000497.html', + 'info_dict': { + 'id': '071c4013-ce30-3a93-a5b2-e0413cd4a9d1', + 'ext': 'mp4', + 'title': "'True Story' Trailer", + 'description': 'True Story', + }, + 'params': { + 'skip_download': True, + }, + }, + { + # ytwnews://cavideo/ + 'url': 'https://tw.video.yahoo.com/movie-tw/單車天使-中文版預-092316541.html', + 'info_dict': { + 'id': 'ba133ff2-0793-3510-b636-59dfe9ff6cff', + 'ext': 'mp4', + 'title': '單車天使 - 中文版預', + 'description': '中文版預', + }, + 'params': { + 'skip_download': True, + }, }, ] @@ -213,15 +243,7 @@ class YahooIE(InfoExtractor): entries = [] iframe_urls = re.findall(r']+src="(/video/.+?-\d+\.html\?format=embed.*?)"', webpage) for idx, iframe_url in enumerate(iframe_urls): - iframepage = self._download_webpage( - host + iframe_url, display_id, - note='Downloading iframe webpage for video #%d' % idx) - items_json = self._search_regex( - r'mediaItems: (\[.+?\])$', iframepage, 'items', flags=re.MULTILINE, default=None) - if items_json: - items = json.loads(items_json) - video_id = items[0]['id'] - entries.append(self._get_info(video_id, display_id, webpage)) + entries.append(self.url_result(host + iframe_url, 'Yahoo')) if entries: return self.playlist_result(entries, page_id) @@ -230,6 +252,11 @@ class YahooIE(InfoExtractor): if nbc_sports_url: return self.url_result(nbc_sports_url, NBCSportsVPlayerIE.ie_key()) + # Look for Brightcove Legacy Studio embeds + bc_url = BrightcoveLegacyIE._extract_brightcove_url(webpage) + if bc_url: + return self.url_result(bc_url, BrightcoveLegacyIE.ie_key()) + # Look for Brightcove New Studio embeds bc_url = BrightcoveNewIE._extract_url(webpage) if bc_url: @@ -246,7 +273,9 @@ class YahooIE(InfoExtractor): if config: sapi = config.get('models', {}).get('applet_model', {}).get('data', {}).get('sapi') if sapi and 'query' in sapi: - return self._extract_info(display_id, sapi, webpage) + info = self._extract_info(display_id, sapi, webpage) + self._sort_formats(info['formats']) + return info items_json = self._search_regex( r'mediaItems: ({.*?})$', webpage, 'items', flags=re.MULTILINE, @@ -266,7 +295,8 @@ class YahooIE(InfoExtractor): r'"first_videoid"\s*:\s*"([^"]+)"', r'%s[^}]*"ccm_id"\s*:\s*"([^"]+)"' % re.escape(page_id), r']data-uuid=["\']([^"\']+)', - r'yahoo://article/view\?.*\buuid=([^&"\']+)', + r']+yahoo://article/view\?.*\buuid=([^&"\']+)', + r']+["\']ytwnews://cavideo/(?:[^/]+/)+([\da-fA-F-]+)[&"\']', ] video_id = self._search_regex( CONTENT_ID_REGEXES, webpage, 'content ID') @@ -292,15 +322,17 @@ class YahooIE(InfoExtractor): formats = [] for s in info['streams']: + tbr = int_or_none(s.get('bitrate')) format_info = { 'width': int_or_none(s.get('width')), 'height': int_or_none(s.get('height')), - 'tbr': int_or_none(s.get('bitrate')), + 'tbr': tbr, } host = s['host'] path = s['path'] if host.startswith('rtmp'): + fmt = 'rtmp' format_info.update({ 'url': host, 'play_path': path, @@ -308,14 +340,18 @@ class YahooIE(InfoExtractor): }) else: if s.get('format') == 'm3u8_playlist': - format_info['protocol'] = 'm3u8_native' - format_info['ext'] = 'mp4' + fmt = 'hls' + format_info.update({ + 'protocol': 'm3u8_native', + 'ext': 'mp4', + }) + else: + fmt = format_info['ext'] = determine_ext(path) format_url = compat_urlparse.urljoin(host, path) format_info['url'] = format_url + format_info['format_id'] = fmt + ('-%d' % tbr if tbr else '') formats.append(format_info) - self._sort_formats(formats) - closed_captions = self._html_search_regex( r'"closedcaptions":(\[[^\]]+\])', webpage, 'closed captions', default='[]') @@ -346,17 +382,25 @@ class YahooIE(InfoExtractor): def _get_info(self, video_id, display_id, webpage): region = self._search_regex( r'\\?"region\\?"\s*:\s*\\?"([^"]+?)\\?"', - webpage, 'region', fatal=False, default='US') - data = compat_urllib_parse_urlencode({ - 'protocol': 'http', - 'region': region.upper(), - }) - query_url = ( - 'https://video.media.yql.yahoo.com/v1/video/sapi/streams/' - '{id}?{data}'.format(id=video_id, data=data)) - query_result = self._download_json( - query_url, display_id, 'Downloading video info') - return self._extract_info(display_id, query_result, webpage) + webpage, 'region', fatal=False, default='US').upper() + formats = [] + info = {} + for fmt in ('webm', 'mp4'): + query_result = self._download_json( + 'https://video.media.yql.yahoo.com/v1/video/sapi/streams/' + video_id, + display_id, 'Downloading %s video info' % fmt, query={ + 'protocol': 'http', + 'region': region, + 'format': fmt, + }) + info = self._extract_info(display_id, query_result, webpage) + formats.extend(info['formats']) + formats.extend(self._extract_m3u8_formats( + 'http://video.media.yql.yahoo.com/v1/hls/%s?region=%s' % (video_id, region), + video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) + self._sort_formats(formats) + info['formats'] = formats + return info class YahooSearchIE(SearchInfoExtractor): diff --git a/youtube_dl/extractor/yam.py b/youtube_dl/extractor/yam.py index 63bbc0634..ef5535547 100644 --- a/youtube_dl/extractor/yam.py +++ b/youtube_dl/extractor/yam.py @@ -15,7 +15,7 @@ from ..utils import ( class YamIE(InfoExtractor): IE_DESC = '蕃薯藤yam天空部落' - _VALID_URL = r'https?://mymedia.yam.com/m/(?P\d+)' + _VALID_URL = r'https?://mymedia\.yam\.com/m/(?P\d+)' _TESTS = [{ # An audio hosted on Yam diff --git a/youtube_dl/extractor/yesjapan.py b/youtube_dl/extractor/yesjapan.py index 112a6c030..681338c96 100644 --- a/youtube_dl/extractor/yesjapan.py +++ b/youtube_dl/extractor/yesjapan.py @@ -21,7 +21,7 @@ class YesJapanIE(InfoExtractor): 'ext': 'mp4', 'timestamp': 1416391590, 'upload_date': '20141119', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', } } diff --git a/youtube_dl/extractor/yinyuetai.py b/youtube_dl/extractor/yinyuetai.py index 834d860af..1fd8d35c6 100644 --- a/youtube_dl/extractor/yinyuetai.py +++ b/youtube_dl/extractor/yinyuetai.py @@ -18,7 +18,7 @@ class YinYueTaiIE(InfoExtractor): 'title': '少女时代_PARTY_Music Video Teaser', 'creator': '少女时代', 'duration': 25, - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', }, }, { 'url': 'http://v.yinyuetai.com/video/h5/2322376', diff --git a/youtube_dl/extractor/ynet.py b/youtube_dl/extractor/ynet.py index 0d943c343..c4ae4d88e 100644 --- a/youtube_dl/extractor/ynet.py +++ b/youtube_dl/extractor/ynet.py @@ -17,7 +17,7 @@ class YnetIE(InfoExtractor): 'id': 'L-11659-99244', 'ext': 'flv', 'title': 'איש לא יודע מאיפה באנו', - 'thumbnail': 're:^https?://.*\.jpg', + 'thumbnail': r're:^https?://.*\.jpg', } }, { 'url': 'http://hot.ynet.co.il/home/0,7340,L-8859-84418,00.html', @@ -25,7 +25,7 @@ class YnetIE(InfoExtractor): 'id': 'L-8859-84418', 'ext': 'flv', 'title': "צפו: הנשיקה הלוהטת של תורגי' ויוליה פלוטקין", - 'thumbnail': 're:^https?://.*\.jpg', + 'thumbnail': r're:^https?://.*\.jpg', } } ] diff --git a/youtube_dl/extractor/youjizz.py b/youtube_dl/extractor/youjizz.py index 31e2f9263..b50f34e9b 100644 --- a/youtube_dl/extractor/youjizz.py +++ b/youtube_dl/extractor/youjizz.py @@ -1,21 +1,16 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor -from ..utils import ( - ExtractorError, -) class YouJizzIE(InfoExtractor): _VALID_URL = r'https?://(?:\w+\.)?youjizz\.com/videos/(?:[^/#?]+)?-(?P[0-9]+)\.html(?:$|[?#])' _TESTS = [{ 'url': 'http://www.youjizz.com/videos/zeichentrick-1-2189178.html', - 'md5': '07e15fa469ba384c7693fd246905547c', + 'md5': '78fc1901148284c69af12640e01c6310', 'info_dict': { 'id': '2189178', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Zeichentrick 1', 'age_limit': 18, } @@ -27,38 +22,18 @@ class YouJizzIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) + # YouJizz's HTML5 player has invalid HTML + webpage = webpage.replace('"controls', '" controls') age_limit = self._rta_search(webpage) video_title = self._html_search_regex( r'\s*(.*)\s*', webpage, 'title') - embed_page_url = self._search_regex( - r'(https?://www.youjizz.com/videos/embed/[0-9]+)', - webpage, 'embed page') - webpage = self._download_webpage( - embed_page_url, video_id, note='downloading embed page') + info_dict = self._parse_html5_media_entries(url, webpage, video_id)[0] - # Get the video URL - m_playlist = re.search(r'so.addVariable\("playlist", ?"(?P.+?)"\);', webpage) - if m_playlist is not None: - playlist_url = m_playlist.group('playlist') - playlist_page = self._download_webpage(playlist_url, video_id, - 'Downloading playlist page') - m_levels = list(re.finditer(r'[^"]+)"\)\);', - webpage, 'video URL') - - return { + info_dict.update({ 'id': video_id, - 'url': video_url, 'title': video_title, - 'ext': 'flv', - 'format': 'flv', - 'player_url': embed_page_url, 'age_limit': age_limit, - } + }) + + return info_dict diff --git a/youtube_dl/extractor/youporn.py b/youtube_dl/extractor/youporn.py index 0df2d76ee..34ab878a4 100644 --- a/youtube_dl/extractor/youporn.py +++ b/youtube_dl/extractor/youporn.py @@ -24,7 +24,7 @@ class YouPornIE(InfoExtractor): 'ext': 'mp4', 'title': 'Sex Ed: Is It Safe To Masturbate Daily?', 'description': 'Love & Sex Answers: http://bit.ly/DanAndJenn -- Is It Unhealthy To Masturbate Daily?', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'uploader': 'Ask Dan And Jennifer', 'upload_date': '20101221', 'average_rating': int, @@ -35,7 +35,7 @@ class YouPornIE(InfoExtractor): 'age_limit': 18, }, }, { - # Anonymous User uploader + # Unknown uploader 'url': 'http://www.youporn.com/watch/561726/big-tits-awesome-brunette-on-amazing-webcam-show/?from=related3&al=2&from_id=561726&pos=4', 'info_dict': { 'id': '561726', @@ -43,8 +43,8 @@ class YouPornIE(InfoExtractor): 'ext': 'mp4', 'title': 'Big Tits Awesome Brunette On amazing webcam show', 'description': 'http://sweetlivegirls.com Big Tits Awesome Brunette On amazing webcam show.mp4', - 'thumbnail': 're:^https?://.*\.jpg$', - 'uploader': 'Anonymous User', + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': 'Unknown', 'upload_date': '20111125', 'average_rating': int, 'view_count': int, @@ -140,17 +140,17 @@ class YouPornIE(InfoExtractor): r'>All [Cc]omments? \(([\d,.]+)\)', webpage, 'comment count', fatal=False)) - def extract_tag_box(title): - tag_box = self._search_regex( - (r']+class=["\']tagBoxTitle["\'][^>]*>\s*%s\b.*?\s*' - ']+class=["\']tagBoxContent["\']>(.+?)') % re.escape(title), - webpage, '%s tag box' % title, default=None) + def extract_tag_box(regex, title): + tag_box = self._search_regex(regex, webpage, title, default=None) if not tag_box: return [] return re.findall(r']+href=[^>]+>([^<]+)', tag_box) - categories = extract_tag_box('Category') - tags = extract_tag_box('Tags') + categories = extract_tag_box( + r'(?s)Categories:.*?]+>(.+?)', 'categories') + tags = extract_tag_box( + r'(?s)Tags:.*?\s*]+class=["\']tagBoxContent["\'][^>]*>(.+?)', + 'tags') return { 'id': video_id, diff --git a/youtube_dl/extractor/yourupload.py b/youtube_dl/extractor/yourupload.py index 4e25d6f22..9fa772838 100644 --- a/youtube_dl/extractor/yourupload.py +++ b/youtube_dl/extractor/yourupload.py @@ -2,44 +2,37 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..utils import urljoin class YourUploadIE(InfoExtractor): - _VALID_URL = r'''(?x)https?://(?:www\.)? - (?:yourupload\.com/watch| - embed\.yourupload\.com| - embed\.yucache\.net - )/(?P[A-Za-z0-9]+) - ''' - _TESTS = [ - { - 'url': 'http://yourupload.com/watch/14i14h', - 'md5': '5e2c63385454c557f97c4c4131a393cd', - 'info_dict': { - 'id': '14i14h', - 'ext': 'mp4', - 'title': 'BigBuckBunny_320x180.mp4', - 'thumbnail': 're:^https?://.*\.jpe?g', - } - }, - { - 'url': 'http://embed.yourupload.com/14i14h', - 'only_matching': True, - }, - { - 'url': 'http://embed.yucache.net/14i14h?client_file_id=803349', - 'only_matching': True, - }, - ] + _VALID_URL = r'https?://(?:www\.)?(?:yourupload\.com/(?:watch|embed)|embed\.yourupload\.com)/(?P[A-Za-z0-9]+)' + _TESTS = [{ + 'url': 'http://yourupload.com/watch/14i14h', + 'md5': '5e2c63385454c557f97c4c4131a393cd', + 'info_dict': { + 'id': '14i14h', + 'ext': 'mp4', + 'title': 'BigBuckBunny_320x180.mp4', + 'thumbnail': r're:^https?://.*\.jpe?g', + } + }, { + 'url': 'http://www.yourupload.com/embed/14i14h', + 'only_matching': True, + }, { + 'url': 'http://embed.yourupload.com/14i14h', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) - embed_url = 'http://embed.yucache.net/{0:}'.format(video_id) + embed_url = 'http://www.yourupload.com/embed/%s' % video_id + webpage = self._download_webpage(embed_url, video_id) title = self._og_search_title(webpage) - video_url = self._og_search_video_url(webpage) + video_url = urljoin(embed_url, self._og_search_video_url(webpage)) thumbnail = self._og_search_thumbnail(webpage, default=None) return { diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 268080ba6..76710931a 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -40,6 +40,7 @@ from ..utils import ( sanitized_Request, smuggle_url, str_to_int, + try_get, unescapeHTML, unified_strdate, unsmuggle_url, @@ -91,36 +92,18 @@ class YoutubeBaseInfoExtractor(InfoExtractor): if login_page is False: return - galx = self._search_regex(r'(?s) you can pass the naked ID ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID - (?!.*?&list=) # combined list/video URLs are handled by the playlist IE + (?!.*?\blist=) # combined list/video URLs are handled by the playlist IE (?(1).+)? # if we found the ID, everything can follow $""" _NEXT_URL_RE = r'[\?&]next_url=([^&]+)' @@ -334,6 +317,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40}, '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40}, # Height can vary (https://github.com/rg3/youtube-dl/issues/4559) '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40}, + '212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40}, '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40}, '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60, 'preference': -40}, '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60, 'preference': -40}, @@ -345,6 +329,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'preference': -50, 'container': 'm4a_dash'}, '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'preference': -50, 'container': 'm4a_dash'}, '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'preference': -50, 'container': 'm4a_dash'}, + '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'preference': -50, 'container': 'm4a_dash'}, + '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'preference': -50, 'container': 'm4a_dash'}, # Dash webm '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40}, @@ -387,19 +373,20 @@ class YoutubeIE(YoutubeBaseInfoExtractor): IE_NAME = 'youtube' _TESTS = [ { - 'url': 'http://www.youtube.com/watch?v=BaW_jenozKc&t=1s&end=9', + 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&t=1s&end=9', 'info_dict': { 'id': 'BaW_jenozKc', 'ext': 'mp4', 'title': 'youtube-dl test video "\'/\\ä↭𝕐', 'uploader': 'Philipp Hagemeister', 'uploader_id': 'phihag', - 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/phihag', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag', 'upload_date': '20121002', 'license': 'Standard YouTube License', 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .', 'categories': ['Science & Technology'], 'tags': ['youtube-dl'], + 'duration': 10, 'like_count': int, 'dislike_count': int, 'start_time': 1, @@ -407,7 +394,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): } }, { - 'url': 'http://www.youtube.com/watch?v=UxxajLWwzqY', + 'url': 'https://www.youtube.com/watch?v=UxxajLWwzqY', 'note': 'Test generic use_cipher_signature video (#897)', 'info_dict': { 'id': 'UxxajLWwzqY', @@ -419,9 +406,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'tags': ['Icona Pop i love it', 'sweden', 'pop music', 'big beat records', 'big beat', 'charli', 'xcx', 'charli xcx', 'girls', 'hbo', 'i love it', "i don't care", 'icona', 'pop', 'iconic ep', 'iconic', 'love', 'it'], + 'duration': 180, 'uploader': 'Icona Pop', 'uploader_id': 'IconaPop', - 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/IconaPop', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IconaPop', 'license': 'Standard YouTube License', 'creator': 'Icona Pop', } @@ -436,9 +424,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'title': 'Justin Timberlake - Tunnel Vision (Explicit)', 'alt_title': 'Tunnel Vision', 'description': 'md5:64249768eec3bc4276236606ea996373', + 'duration': 419, 'uploader': 'justintimberlakeVEVO', 'uploader_id': 'justintimberlakeVEVO', - 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/justintimberlakeVEVO', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/justintimberlakeVEVO', 'license': 'Standard YouTube License', 'creator': 'Justin Timberlake', 'age_limit': 18, @@ -455,13 +444,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7', 'uploader': 'SET India', 'uploader_id': 'setindia', - 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/setindia', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/setindia', 'license': 'Standard YouTube License', 'age_limit': 18, } }, { - 'url': 'http://www.youtube.com/watch?v=BaW_jenozKc&v=UxxajLWwzqY', + 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&v=UxxajLWwzqY', 'note': 'Use the first video ID in the URL', 'info_dict': { 'id': 'BaW_jenozKc', @@ -469,12 +458,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'title': 'youtube-dl test video "\'/\\ä↭𝕐', 'uploader': 'Philipp Hagemeister', 'uploader_id': 'phihag', - 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/phihag', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag', 'upload_date': '20121002', 'license': 'Standard YouTube License', 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .', 'categories': ['Science & Technology'], 'tags': ['youtube-dl'], + 'duration': 10, 'like_count': int, 'dislike_count': int, }, @@ -483,14 +473,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor): }, }, { - 'url': 'http://www.youtube.com/watch?v=a9LDPn-MO4I', + 'url': 'https://www.youtube.com/watch?v=a9LDPn-MO4I', 'note': '256k DASH audio (format 141) via DASH manifest', 'info_dict': { 'id': 'a9LDPn-MO4I', 'ext': 'm4a', 'upload_date': '20121002', 'uploader_id': '8KVIDEO', - 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/8KVIDEO', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/8KVIDEO', 'description': '', 'uploader': '8KVIDEO', 'license': 'Standard YouTube License', @@ -510,6 +500,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'ext': 'm4a', 'title': 'Afrojack, Spree Wilson - The Spark ft. Spree Wilson', 'description': 'md5:12e7067fa6735a77bdcbb58cb1187d2d', + 'duration': 244, 'uploader': 'AfrojackVEVO', 'uploader_id': 'AfrojackVEVO', 'upload_date': '20131011', @@ -529,6 +520,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'title': 'Taylor Swift - Shake It Off', 'alt_title': 'Shake It Off', 'description': 'md5:95f66187cd7c8b2c13eb78e1223b63c3', + 'duration': 242, 'uploader': 'TaylorSwiftVEVO', 'uploader_id': 'TaylorSwiftVEVO', 'upload_date': '20140818', @@ -546,10 +538,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'info_dict': { 'id': 'T4XJQO3qol8', 'ext': 'mp4', + 'duration': 219, 'upload_date': '20100909', 'uploader': 'The Amazing Atheist', 'uploader_id': 'TheAmazingAtheist', - 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/TheAmazingAtheist', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheAmazingAtheist', 'license': 'Standard YouTube License', 'title': 'Burning Everyone\'s Koran', 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html', @@ -557,15 +550,16 @@ class YoutubeIE(YoutubeBaseInfoExtractor): }, # Normal age-gate video (No vevo, embed allowed) { - 'url': 'http://youtube.com/watch?v=HtVdAasjOgU', + 'url': 'https://youtube.com/watch?v=HtVdAasjOgU', 'info_dict': { 'id': 'HtVdAasjOgU', 'ext': 'mp4', 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer', - 'description': 're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}', + 'description': r're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}', + 'duration': 142, 'uploader': 'The Witcher', 'uploader_id': 'WitcherGame', - 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/WitcherGame', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame', 'upload_date': '20140605', 'license': 'Standard YouTube License', 'age_limit': 18, @@ -573,15 +567,16 @@ class YoutubeIE(YoutubeBaseInfoExtractor): }, # Age-gate video with encrypted signature { - 'url': 'http://www.youtube.com/watch?v=6kLq3WMV1nU', + 'url': 'https://www.youtube.com/watch?v=6kLq3WMV1nU', 'info_dict': { 'id': '6kLq3WMV1nU', 'ext': 'mp4', 'title': 'Dedication To My Ex (Miss That) (Lyric Video)', 'description': 'md5:33765bb339e1b47e7e72b5490139bb41', + 'duration': 247, 'uploader': 'LloydVEVO', 'uploader_id': 'LloydVEVO', - 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/LloydVEVO', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/LloydVEVO', 'upload_date': '20110629', 'license': 'Standard YouTube License', 'age_limit': 18, @@ -593,9 +588,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'info_dict': { 'id': '__2ABJjxzNo', 'ext': 'mp4', + 'duration': 266, 'upload_date': '20100430', 'uploader_id': 'deadmau5', - 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/deadmau5', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/deadmau5', 'creator': 'deadmau5', 'description': 'md5:12c56784b8032162bb936a5f76d55360', 'uploader': 'deadmau5', @@ -613,9 +609,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'info_dict': { 'id': 'lqQg6PlCWgI', 'ext': 'mp4', + 'duration': 6085, 'upload_date': '20150827', 'uploader_id': 'olympic', - 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/olympic', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/olympic', 'license': 'Standard YouTube License', 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games', 'uploader': 'Olympic', @@ -632,9 +629,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'id': '_b-2C3KPAM0', 'ext': 'mp4', 'stretched_ratio': 16 / 9., + 'duration': 85, 'upload_date': '20110310', 'uploader_id': 'AllenMeow', - 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/AllenMeow', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/AllenMeow', 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯', 'uploader': '孫艾倫', 'license': 'Standard YouTube License', @@ -666,9 +664,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'ext': 'mp4', 'title': 'md5:7b81415841e02ecd4313668cde88737a', 'description': 'md5:116377fd2963b81ec4ce64b542173306', + 'duration': 220, 'upload_date': '20150625', 'uploader_id': 'dorappi2000', - 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/dorappi2000', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/dorappi2000', 'uploader': 'dorappi2000', 'license': 'Standard YouTube License', 'formats': 'mincount:32', @@ -708,10 +707,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'ext': 'mp4', 'title': 'teamPGP: Rocket League Noob Stream (Main Camera)', 'description': 'md5:dc7872fb300e143831327f1bae3af010', + 'duration': 7335, 'upload_date': '20150721', 'uploader': 'Beer Games Beer', 'uploader_id': 'beergamesbeer', - 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/beergamesbeer', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer', 'license': 'Standard YouTube License', }, }, { @@ -720,10 +720,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'ext': 'mp4', 'title': 'teamPGP: Rocket League Noob Stream (kreestuh)', 'description': 'md5:dc7872fb300e143831327f1bae3af010', + 'duration': 7337, 'upload_date': '20150721', 'uploader': 'Beer Games Beer', 'uploader_id': 'beergamesbeer', - 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/beergamesbeer', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer', 'license': 'Standard YouTube License', }, }, { @@ -732,10 +733,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'ext': 'mp4', 'title': 'teamPGP: Rocket League Noob Stream (grizzle)', 'description': 'md5:dc7872fb300e143831327f1bae3af010', + 'duration': 7337, 'upload_date': '20150721', 'uploader': 'Beer Games Beer', 'uploader_id': 'beergamesbeer', - 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/beergamesbeer', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer', 'license': 'Standard YouTube License', }, }, { @@ -744,10 +746,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'ext': 'mp4', 'title': 'teamPGP: Rocket League Noob Stream (zim)', 'description': 'md5:dc7872fb300e143831327f1bae3af010', + 'duration': 7334, 'upload_date': '20150721', 'uploader': 'Beer Games Beer', 'uploader_id': 'beergamesbeer', - 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/beergamesbeer', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer', 'license': 'Standard YouTube License', }, }], @@ -766,11 +769,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'skip': 'Not multifeed anymore', }, { - 'url': 'http://vid.plus/FlRa-iH7PGw', + 'url': 'https://vid.plus/FlRa-iH7PGw', 'only_matching': True, }, { - 'url': 'http://zwearz.com/watch/9lWxNJF-ufM/electra-woman-dyna-girl-official-trailer-grace-helbig.html', + 'url': 'https://zwearz.com/watch/9lWxNJF-ufM/electra-woman-dyna-girl-official-trailer-grace-helbig.html', 'only_matching': True, }, { @@ -785,9 +788,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21', 'alt_title': 'Dark Walk', 'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a', + 'duration': 133, 'upload_date': '20151119', 'uploader_id': 'IronSoulElf', - 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/IronSoulElf', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IronSoulElf', 'uploader': 'IronSoulElf', 'license': 'Standard YouTube License', 'creator': 'Todd Haberman, Daniel Law Heath & Aaron Kaplan', @@ -826,10 +830,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'ext': 'mp4', 'title': 'md5:e41008789470fc2533a3252216f1c1d1', 'description': 'md5:a677553cf0840649b731a3024aeff4cc', + 'duration': 721, 'upload_date': '20150127', 'uploader_id': 'BerkmanCenter', - 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/BerkmanCenter', - 'uploader': 'BerkmanCenter', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/BerkmanCenter', + 'uploader': 'The Berkman Klein Center for Internet & Society', 'license': 'Creative Commons Attribution license (reuse allowed)', }, 'params': { @@ -844,10 +849,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'ext': 'mp4', 'title': 'Democratic Socialism and Foreign Policy | Bernie Sanders', 'description': 'md5:dda0d780d5a6e120758d1711d062a867', + 'duration': 4060, 'upload_date': '20151119', 'uploader': 'Bernie 2016', 'uploader_id': 'UCH1dpzjCEiGAt8CXkryhkZg', - 'uploader_url': 're:https?://(?:www\.)?youtube\.com/channel/UCH1dpzjCEiGAt8CXkryhkZg', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCH1dpzjCEiGAt8CXkryhkZg', 'license': 'Creative Commons Attribution license (reuse allowed)', }, 'params': { @@ -862,6 +868,54 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # YouTube Red paid video (https://github.com/rg3/youtube-dl/issues/10059) 'url': 'https://www.youtube.com/watch?v=i1Ko8UG-Tdo', 'only_matching': True, + }, + { + # Rental video preview + 'url': 'https://www.youtube.com/watch?v=yYr8q0y5Jfg', + 'info_dict': { + 'id': 'uGpuVWrhIzE', + 'ext': 'mp4', + 'title': 'Piku - Trailer', + 'description': 'md5:c36bd60c3fd6f1954086c083c72092eb', + 'upload_date': '20150811', + 'uploader': 'FlixMatrix', + 'uploader_id': 'FlixMatrixKaravan', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/FlixMatrixKaravan', + 'license': 'Standard YouTube License', + }, + 'params': { + 'skip_download': True, + }, + }, + { + # YouTube Red video with episode data + 'url': 'https://www.youtube.com/watch?v=iqKdEhx-dD4', + 'info_dict': { + 'id': 'iqKdEhx-dD4', + 'ext': 'mp4', + 'title': 'Isolation - Mind Field (Ep 1)', + 'description': 'md5:8013b7ddea787342608f63a13ddc9492', + 'duration': 2085, + 'upload_date': '20170118', + 'uploader': 'Vsauce', + 'uploader_id': 'Vsauce', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Vsauce', + 'license': 'Standard YouTube License', + 'series': 'Mind Field', + 'season_number': 1, + 'episode_number': 1, + }, + 'params': { + 'skip_download': True, + }, + 'expected_warnings': [ + 'Skipping DASH manifest', + ], + }, + { + # itag 212 + 'url': '1t24XAntNCY', + 'only_matching': True, } ] @@ -976,8 +1030,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): def _parse_sig_js(self, jscode): funcname = self._search_regex( - r'\.sig\|\|([a-zA-Z0-9$]+)\(', jscode, - 'Initial JS player signature function name') + (r'(["\'])signature\1\s*,\s*(?P[a-zA-Z0-9$]+)\(', + r'\.sig\|\|(?P[a-zA-Z0-9$]+)\('), + jscode, 'Initial JS player signature function name', group='sig') jsi = JSInterpreter(jscode) initial_function = jsi.extract_function(funcname) @@ -998,6 +1053,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if player_url.startswith('//'): player_url = 'https:' + player_url + elif not re.match(r'https?://', player_url): + player_url = compat_urlparse.urljoin( + 'https://www.youtube.com', player_url) try: player_id = (player_url, self._signature_cache_id(s)) if player_id not in self._player_cache: @@ -1272,6 +1330,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # Convert to the same format returned by compat_parse_qs video_info = dict((k, [v]) for k, v in args.items()) add_dash_mpd(video_info) + # Rental video is not rented but preview is available (e.g. + # https://www.youtube.com/watch?v=yYr8q0y5Jfg, + # https://github.com/rg3/youtube-dl/issues/10532) + if not video_info and args.get('ypc_vid'): + return self.url_result( + args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid']) if args.get('livestream') == '1' or args.get('live_playback') == 1: is_live = True if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True): @@ -1442,6 +1506,16 @@ class YoutubeIE(YoutubeBaseInfoExtractor): else: video_alt_title = video_creator = None + m_episode = re.search( + r']+id="watch7-headline"[^>]*>\s*]*>.*?>(?P[^<]+)\s*S(?P\d+)\s*•\s*E(?P\d+)', + video_webpage) + if m_episode: + series = m_episode.group('series') + season_number = int(m_episode.group('season')) + episode_number = int(m_episode.group('episode')) + else: + series = season_number = episode_number = None + m_cat_container = self._search_regex( r'(?s)]*>\s*Category\s*

\s*]*>(.*?)', video_webpage, 'categories', default=None) @@ -1470,11 +1544,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): video_subtitles = self.extract_subtitles(video_id, video_webpage) automatic_captions = self.extract_automatic_captions(video_id, video_webpage) - if 'length_seconds' not in video_info: - self._downloader.report_warning('unable to extract video duration') - video_duration = None - else: - video_duration = int(compat_urllib_parse_unquote_plus(video_info['length_seconds'][0])) + video_duration = try_get( + video_info, lambda x: int_or_none(x['length_seconds'][0])) + if not video_duration: + video_duration = parse_duration(self._html_search_meta( + 'duration', video_webpage, 'video duration')) # annotations video_annotations = None @@ -1731,6 +1805,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'is_live': is_live, 'start_time': start_time, 'end_time': end_time, + 'series': series, + 'season_number': season_number, + 'episode_number': episode_number, } @@ -1772,22 +1849,25 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): _VALID_URL = r"""(?x)(?: (?:https?://)? (?:\w+\.)? - youtube\.com/ (?: - (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/videoseries) - \? (?:.*?[&;])*? (?:p|a|list)= - | p/ + youtube\.com/ + (?: + (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/videoseries) + \? (?:.*?[&;])*? (?:p|a|list)= + | p/ + )| + youtu\.be/[0-9A-Za-z_-]{11}\?.*?\blist= ) ( - (?:PL|LL|EC|UU|FL|RD|UL)?[0-9A-Za-z-_]{10,} + (?:PL|LL|EC|UU|FL|RD|UL|TL)?[0-9A-Za-z-_]{10,} # Top tracks, they can also include dots |(?:MC)[\w\.]* ) .* | - ((?:PL|LL|EC|UU|FL|RD|UL)[0-9A-Za-z-_]{10,}) + ((?:PL|LL|EC|UU|FL|RD|UL|TL)[0-9A-Za-z-_]{10,}) )""" - _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s' + _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s&disable_polymer=true' _VIDEO_RE = r'href="\s*/watch\?v=(?P[0-9A-Za-z_-]{11})&[^"]*?index=(?P\d+)(?:[^>]+>(?P[^<]+))?' IE_NAME = 'youtube:playlist' _TESTS = [{ @@ -1804,6 +1884,7 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): 'title': 'YDL_Empty_List', }, 'playlist_count': 0, + 'skip': 'This playlist is private', }, { 'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.', 'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC', @@ -1835,9 +1916,10 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): 'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl', }, 'playlist_count': 2, + 'skip': 'This playlist is private', }, { 'note': 'embedded', - 'url': 'http://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu', + 'url': 'https://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu', 'playlist_count': 4, 'info_dict': { 'title': 'JODA15', @@ -1845,7 +1927,7 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): } }, { 'note': 'Embedded SWF player', - 'url': 'http://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0', + 'url': 'https://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0', 'playlist_count': 4, 'info_dict': { 'title': 'JODA7', @@ -1858,7 +1940,56 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): 'title': 'Uploads from Interstellar Movie', 'id': 'UUXw-G3eDE9trcvY2sBMM_aA', }, - 'playlist_mincout': 21, + 'playlist_mincount': 21, + }, { + # Playlist URL that does not actually serve a playlist + 'url': 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4', + 'info_dict': { + 'id': 'FqZTN594JQw', + 'ext': 'webm', + 'title': "Smiley's People 01 detective, Adventure Series, Action", + 'uploader': 'STREEM', + 'uploader_id': 'UCyPhqAZgwYWZfxElWVbVJng', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCyPhqAZgwYWZfxElWVbVJng', + 'upload_date': '20150526', + 'license': 'Standard YouTube License', + 'description': 'md5:507cdcb5a49ac0da37a920ece610be80', + 'categories': ['People & Blogs'], + 'tags': list, + 'like_count': int, + 'dislike_count': int, + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': [YoutubeIE.ie_key()], + }, { + 'url': 'https://youtu.be/yeWKywCrFtk?list=PL2qgrgXsNUG5ig9cat4ohreBjYLAPC0J5', + 'info_dict': { + 'id': 'yeWKywCrFtk', + 'ext': 'mp4', + 'title': 'Small Scale Baler and Braiding Rugs', + 'uploader': 'Backus-Page House Museum', + 'uploader_id': 'backuspagemuseum', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/backuspagemuseum', + 'upload_date': '20161008', + 'license': 'Standard YouTube License', + 'description': 'md5:800c0c78d5eb128500bffd4f0b4f2e8a', + 'categories': ['Nonprofits & Activism'], + 'tags': list, + 'like_count': int, + 'dislike_count': int, + }, + 'params': { + 'noplaylist': True, + 'skip_download': True, + }, + }, { + 'url': 'https://youtu.be/uWyaPkt-VOI?list=PL9D9FC436B881BA21', + 'only_matching': True, + }, { + 'url': 'TLGGrESM50VT6acwMjAyMjAxNw', + 'only_matching': True, }] def _real_initialize(self): @@ -1900,14 +2031,18 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): url = self._TEMPLATE_URL % playlist_id page = self._download_webpage(url, playlist_id) - for match in re.findall(r'<div class="yt-alert-message">([^<]+)</div>', page): + # the yt-alert-message now has tabindex attribute (see https://github.com/rg3/youtube-dl/issues/11604) + for match in re.findall(r'<div class="yt-alert-message"[^>]*>([^<]+)</div>', page): match = match.strip() # Check if the playlist exists or is private - if re.match(r'[^<]*(The|This) playlist (does not exist|is private)[^<]*', match): - raise ExtractorError( - 'The playlist doesn\'t exist or is private, use --username or ' - '--netrc to access it.', - expected=True) + mobj = re.match(r'[^<]*(?:The|This) playlist (?P<reason>does not exist|is private)[^<]*', match) + if mobj: + reason = mobj.group('reason') + message = 'This playlist %s' % reason + if 'private' in reason: + message += ', use --username or --netrc to access it' + message += '.' + raise ExtractorError(message, expected=True) elif re.match(r'[^<]*Invalid parameters[^<]*', match): raise ExtractorError( 'Invalid parameters. Maybe URL is incorrect.', @@ -1919,20 +2054,35 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): playlist_title = self._html_search_regex( r'(?s)<h1 class="pl-header-title[^"]*"[^>]*>\s*(.*?)\s*</h1>', - page, 'title') + page, 'title', default=None) - return self.playlist_result(self._entries(page, playlist_id), playlist_id, playlist_title) + has_videos = True + + if not playlist_title: + try: + # Some playlist URLs don't actually serve a playlist (e.g. + # https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4) + next(self._entries(page, playlist_id)) + except StopIteration: + has_videos = False + + return has_videos, self.playlist_result( + self._entries(page, playlist_id), playlist_id, playlist_title) def _check_download_just_video(self, url, playlist_id): # Check if it's a video-specific URL query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) - if 'v' in query_dict: - video_id = query_dict['v'][0] + video_id = query_dict.get('v', [None])[0] or self._search_regex( + r'(?:^|//)youtu\.be/([0-9A-Za-z_-]{11})', url, + 'video id', default=None) + if video_id: if self._downloader.params.get('noplaylist'): self.to_screen('Downloading just video %s because of --no-playlist' % video_id) - return self.url_result(video_id, 'Youtube', video_id=video_id) + return video_id, self.url_result(video_id, 'Youtube', video_id=video_id) else: self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id)) + return video_id, None + return None, None def _real_extract(self, url): # Extract playlist id @@ -1941,7 +2091,7 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): raise ExtractorError('Invalid URL: %s' % url) playlist_id = mobj.group(1) or mobj.group(2) - video = self._check_download_just_video(url, playlist_id) + video_id, video = self._check_download_just_video(url, playlist_id) if video: return video @@ -1949,7 +2099,15 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): # Mixes require a custom extraction process return self._extract_mix(playlist_id) - return self._extract_playlist(playlist_id) + has_videos, playlist = self._extract_playlist(playlist_id) + if has_videos or not video_id: + return playlist + + # Some playlist URLs don't actually serve a playlist (see + # https://github.com/rg3/youtube-dl/issues/10537). + # Fallback to plain video extraction if there is a video id + # along with playlist id. + return self.url_result(video_id, 'Youtube', video_id=video_id) class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor): @@ -2097,18 +2255,18 @@ class YoutubeUserIE(YoutubeChannelIE): class YoutubeLiveIE(YoutubeBaseInfoExtractor): IE_DESC = 'YouTube.com live streams' - _VALID_URL = r'(?P<base_url>https?://(?:\w+\.)?youtube\.com/(?:user|channel)/(?P<id>[^/]+))/live' + _VALID_URL = r'(?P<base_url>https?://(?:\w+\.)?youtube\.com/(?:(?:user|channel|c)/)?(?P<id>[^/]+))/live' IE_NAME = 'youtube:live' _TESTS = [{ - 'url': 'http://www.youtube.com/user/TheYoungTurks/live', + 'url': 'https://www.youtube.com/user/TheYoungTurks/live', 'info_dict': { 'id': 'a48o2S1cPoo', 'ext': 'mp4', 'title': 'The Young Turks - Live Main Show', 'uploader': 'The Young Turks', 'uploader_id': 'TheYoungTurks', - 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks', 'upload_date': '20150715', 'license': 'Standard YouTube License', 'description': 'md5:438179573adcdff3c97ebb1ee632b891', @@ -2121,7 +2279,13 @@ class YoutubeLiveIE(YoutubeBaseInfoExtractor): 'skip_download': True, }, }, { - 'url': 'http://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live', + 'url': 'https://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live', + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/c/CommanderVideoHq/live', + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/TheYoungTurks/live', 'only_matching': True, }] @@ -2146,7 +2310,7 @@ class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor): IE_NAME = 'youtube:playlists' _TESTS = [{ - 'url': 'http://www.youtube.com/user/ThirstForScience/playlists', + 'url': 'https://www.youtube.com/user/ThirstForScience/playlists', 'playlist_mincount': 4, 'info_dict': { 'id': 'ThirstForScience', @@ -2154,7 +2318,7 @@ class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor): }, }, { # with "Load more" button - 'url': 'http://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd', + 'url': 'https://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd', 'playlist_mincount': 70, 'info_dict': { 'id': 'igorkle1', @@ -2186,18 +2350,18 @@ class YoutubeSearchIE(SearchInfoExtractor, YoutubePlaylistIE): videos = [] limit = n + url_query = { + 'search_query': query.encode('utf-8'), + } + url_query.update(self._EXTRA_QUERY_ARGS) + result_url = 'https://www.youtube.com/results?' + compat_urllib_parse_urlencode(url_query) + for pagenum in itertools.count(1): - url_query = { - 'search_query': query.encode('utf-8'), - 'page': pagenum, - 'spf': 'navigate', - } - url_query.update(self._EXTRA_QUERY_ARGS) - result_url = 'https://www.youtube.com/results?' + compat_urllib_parse_urlencode(url_query) data = self._download_json( result_url, video_id='query "%s"' % query, note='Downloading page %s' % pagenum, - errnote='Unable to download API page') + errnote='Unable to download API page', + query={'spf': 'navigate'}) html_content = data[1]['body']['content'] if 'class="search-message' in html_content: @@ -2209,6 +2373,12 @@ class YoutubeSearchIE(SearchInfoExtractor, YoutubePlaylistIE): videos += new_videos if not new_videos or len(videos) > limit: break + next_link = self._html_search_regex( + r'href="(/results\?[^"]*\bsp=[^"]+)"[^>]*>\s*<span[^>]+class="[^"]*\byt-uix-button-content\b[^"]*"[^>]*>Next', + html_content, 'next link', default=None) + if next_link is None: + break + result_url = compat_urlparse.urljoin('https://www.youtube.com/', next_link) if len(videos) > n: videos = videos[:n] @@ -2247,7 +2417,7 @@ class YoutubeSearchURLIE(YoutubePlaylistBaseInfoExtractor): class YoutubeShowIE(YoutubePlaylistsBaseInfoExtractor): IE_DESC = 'YouTube.com (multi-season) shows' - _VALID_URL = r'https?://www\.youtube\.com/show/(?P<id>[^?#]*)' + _VALID_URL = r'https?://(?:www\.)?youtube\.com/show/(?P<id>[^?#]*)' IE_NAME = 'youtube:show' _TESTS = [{ 'url': 'https://www.youtube.com/show/airdisasters', @@ -2316,7 +2486,7 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): class YoutubeWatchLaterIE(YoutubePlaylistIE): IE_NAME = 'youtube:watchlater' IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)' - _VALID_URL = r'https?://www\.youtube\.com/(?:feed/watch_later|(?:playlist|watch)\?(?:.+&)?list=WL)|:ytwatchlater' + _VALID_URL = r'https?://(?:www\.)?youtube\.com/(?:feed/watch_later|(?:playlist|watch)\?(?:.+&)?list=WL)|:ytwatchlater' _TESTS = [{ 'url': 'https://www.youtube.com/playlist?list=WL', @@ -2327,16 +2497,17 @@ class YoutubeWatchLaterIE(YoutubePlaylistIE): }] def _real_extract(self, url): - video = self._check_download_just_video(url, 'WL') + _, video = self._check_download_just_video(url, 'WL') if video: return video - return self._extract_playlist('WL') + _, playlist = self._extract_playlist('WL') + return playlist class YoutubeFavouritesIE(YoutubeBaseInfoExtractor): IE_NAME = 'youtube:favorites' IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)' - _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?' + _VALID_URL = r'https?://(?:www\.)?youtube\.com/my_favorites|:ytfav(?:ou?rites)?' _LOGIN_REQUIRED = True def _real_extract(self, url): @@ -2347,21 +2518,21 @@ class YoutubeFavouritesIE(YoutubeBaseInfoExtractor): class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor): IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)' - _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?' + _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/recommended|:ytrec(?:ommended)?' _FEED_NAME = 'recommended' _PLAYLIST_TITLE = 'Youtube Recommended videos' class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor): IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)' - _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?' + _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?' _FEED_NAME = 'subscriptions' _PLAYLIST_TITLE = 'Youtube Subscriptions' class YoutubeHistoryIE(YoutubeFeedsInfoExtractor): IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)' - _VALID_URL = 'https?://www\.youtube\.com/feed/history|:ythistory' + _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/history|:ythistory' _FEED_NAME = 'history' _PLAYLIST_TITLE = 'Youtube History' @@ -2386,10 +2557,10 @@ class YoutubeTruncatedURLIE(InfoExtractor): ''' _TESTS = [{ - 'url': 'http://www.youtube.com/watch?annotation_id=annotation_3951667041', + 'url': 'https://www.youtube.com/watch?annotation_id=annotation_3951667041', 'only_matching': True, }, { - 'url': 'http://www.youtube.com/watch?', + 'url': 'https://www.youtube.com/watch?', 'only_matching': True, }, { 'url': 'https://www.youtube.com/watch?x-yt-cl=84503534', @@ -2410,7 +2581,7 @@ class YoutubeTruncatedURLIE(InfoExtractor): 'Did you forget to quote the URL? Remember that & is a meta ' 'character in most shells, so you want to put the URL in quotes, ' 'like youtube-dl ' - '"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" ' + '"https://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" ' ' or simply youtube-dl BaW_jenozKc .', expected=True) diff --git a/youtube_dl/extractor/zapiks.py b/youtube_dl/extractor/zapiks.py index 22a9a57e8..bacb82eee 100644 --- a/youtube_dl/extractor/zapiks.py +++ b/youtube_dl/extractor/zapiks.py @@ -24,7 +24,7 @@ class ZapiksIE(InfoExtractor): 'ext': 'mp4', 'title': 'EP2S3 - Bon Appétit - Eh bé viva les pyrénées con!', 'description': 'md5:7054d6f6f620c6519be1fe710d4da847', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 528, 'timestamp': 1359044972, 'upload_date': '20130124', diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py index 2ef177275..a365923fb 100644 --- a/youtube_dl/extractor/zdf.py +++ b/youtube_dl/extractor/zdf.py @@ -1,262 +1,312 @@ # coding: utf-8 from __future__ import unicode_literals -import functools import re from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( - int_or_none, - unified_strdate, - OnDemandPagedList, - xpath_text, determine_ext, + int_or_none, + NO_DEFAULT, + orderedSet, + parse_codecs, qualities, - float_or_none, - ExtractorError, + try_get, + unified_timestamp, + update_url_query, + urljoin, ) -class ZDFIE(InfoExtractor): - _VALID_URL = r'(?:zdf:|zdf:video:|https?://www\.zdf\.de/ZDFmediathek(?:#)?/(.*beitrag/(?:video/)?))(?P<id>[0-9]+)(?:/[^/?]+)?(?:\?.*)?' +class ZDFBaseIE(InfoExtractor): + def _call_api(self, url, player, referrer, video_id): + return self._download_json( + url, video_id, 'Downloading JSON content', + headers={ + 'Referer': referrer, + 'Api-Auth': 'Bearer %s' % player['apiToken'], + }) + + def _extract_player(self, webpage, video_id, fatal=True): + return self._parse_json( + self._search_regex( + r'(?s)data-zdfplayer-jsb=(["\'])(?P<json>{.+?})\1', webpage, + 'player JSON', default='{}' if not fatal else NO_DEFAULT, + group='json'), + video_id) + + +class ZDFIE(ZDFBaseIE): + _VALID_URL = r'https?://www\.zdf\.de/(?:[^/]+/)*(?P<id>[^/?]+)\.html' + _QUALITIES = ('auto', 'low', 'med', 'high', 'veryhigh') _TESTS = [{ - 'url': 'http://www.zdf.de/ZDFmediathek/beitrag/video/2037704/ZDFspezial---Ende-des-Machtpokers--?bc=sts;stt', + 'url': 'https://www.zdf.de/service-und-hilfe/die-neue-zdf-mediathek/zdfmediathek-trailer-100.html', 'info_dict': { - 'id': '2037704', - 'ext': 'webm', - 'title': 'ZDFspezial - Ende des Machtpokers', - 'description': 'Union und SPD haben sich auf einen Koalitionsvertrag geeinigt. Aber was bedeutet das für die Bürger? Sehen Sie hierzu das ZDFspezial "Ende des Machtpokers - Große Koalition für Deutschland".', - 'duration': 1022, - 'uploader': 'spezial', - 'uploader_id': '225948', - 'upload_date': '20131127', - }, - 'skip': 'Videos on ZDF.de are depublicised in short order', + 'id': 'zdfmediathek-trailer-100', + 'ext': 'mp4', + 'title': 'Die neue ZDFmediathek', + 'description': 'md5:3003d36487fb9a5ea2d1ff60beb55e8d', + 'duration': 30, + 'timestamp': 1477627200, + 'upload_date': '20161028', + } + }, { + 'url': 'https://www.zdf.de/filme/taunuskrimi/die-lebenden-und-die-toten-1---ein-taunuskrimi-100.html', + 'only_matching': True, + }, { + 'url': 'https://www.zdf.de/dokumentation/planet-e/planet-e-uebersichtsseite-weitere-dokumentationen-von-planet-e-100.html', + 'only_matching': True, }] - def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None): - param_groups = {} - for param_group in smil.findall(self._xpath_ns('./head/paramGroup', namespace)): - group_id = param_group.attrib.get(self._xpath_ns('id', 'http://www.w3.org/XML/1998/namespace')) - params = {} - for param in param_group: - params[param.get('name')] = param.get('value') - param_groups[group_id] = params + @staticmethod + def _extract_subtitles(src): + subtitles = {} + for caption in try_get(src, lambda x: x['captions'], list) or []: + subtitle_url = caption.get('uri') + if subtitle_url and isinstance(subtitle_url, compat_str): + lang = caption.get('language', 'deu') + subtitles.setdefault(lang, []).append({ + 'url': subtitle_url, + }) + return subtitles + + def _extract_format(self, video_id, formats, format_urls, meta): + format_url = meta.get('url') + if not format_url or not isinstance(format_url, compat_str): + return + if format_url in format_urls: + return + format_urls.add(format_url) + mime_type = meta.get('mimeType') + ext = determine_ext(format_url) + if mime_type == 'application/x-mpegURL' or ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', m3u8_id='hls', + entry_protocol='m3u8_native', fatal=False)) + elif mime_type == 'application/f4m+xml' or ext == 'f4m': + formats.extend(self._extract_f4m_formats( + update_url_query(format_url, {'hdcore': '3.7.0'}), video_id, f4m_id='hds', fatal=False)) + else: + f = parse_codecs(meta.get('mimeCodec')) + format_id = ['http'] + for p in (meta.get('type'), meta.get('quality')): + if p and isinstance(p, compat_str): + format_id.append(p) + f.update({ + 'url': format_url, + 'format_id': '-'.join(format_id), + 'format_note': meta.get('quality'), + 'language': meta.get('language'), + 'quality': qualities(self._QUALITIES)(meta.get('quality')), + 'preference': -10, + }) + formats.append(f) + + def _extract_entry(self, url, content, video_id): + title = content.get('title') or content['teaserHeadline'] + + t = content['mainVideoContent']['http://zdf.de/rels/target'] + + ptmd_path = t.get('http://zdf.de/rels/streams/ptmd') + + if not ptmd_path: + ptmd_path = t[ + 'http://zdf.de/rels/streams/ptmd-template'].replace( + '{playerId}', 'portal') + + ptmd = self._download_json(urljoin(url, ptmd_path), video_id) formats = [] - for video in smil.findall(self._xpath_ns('.//video', namespace)): - src = video.get('src') - if not src: + track_uris = set() + for p in ptmd['priorityList']: + formitaeten = p.get('formitaeten') + if not isinstance(formitaeten, list): continue - bitrate = float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000) - group_id = video.get('paramGroup') - param_group = param_groups[group_id] - for proto in param_group['protocols'].split(','): - formats.append({ - 'url': '%s://%s' % (proto, param_group['host']), - 'app': param_group['app'], - 'play_path': src, - 'ext': 'flv', - 'format_id': '%s-%d' % (proto, bitrate), - 'tbr': bitrate, - }) + for f in formitaeten: + f_qualities = f.get('qualities') + if not isinstance(f_qualities, list): + continue + for quality in f_qualities: + tracks = try_get(quality, lambda x: x['audio']['tracks'], list) + if not tracks: + continue + for track in tracks: + self._extract_format( + video_id, formats, track_uris, { + 'url': track.get('uri'), + 'type': f.get('type'), + 'mimeType': f.get('mimeType'), + 'quality': quality.get('quality'), + 'language': track.get('language'), + }) self._sort_formats(formats) - return formats - def extract_from_xml_url(self, video_id, xml_url): - doc = self._download_xml( - xml_url, video_id, - note='Downloading video info', - errnote='Failed to download video info') - - status_code = doc.find('./status/statuscode') - if status_code is not None and status_code.text != 'ok': - code = status_code.text - if code == 'notVisibleAnymore': - message = 'Video %s is not available' % video_id - else: - message = '%s returned error: %s' % (self.IE_NAME, code) - raise ExtractorError(message, expected=True) - - title = doc.find('.//information/title').text - description = xpath_text(doc, './/information/detail', 'description') - duration = int_or_none(xpath_text(doc, './/details/lengthSec', 'duration')) - uploader = xpath_text(doc, './/details/originChannelTitle', 'uploader') - uploader_id = xpath_text(doc, './/details/originChannelId', 'uploader id') - upload_date = unified_strdate(xpath_text(doc, './/details/airtime', 'upload date')) - subtitles = {} - captions_url = doc.find('.//caption/url') - if captions_url is not None: - subtitles['de'] = [{ - 'url': captions_url.text, - 'ext': 'ttml', - }] - - def xml_to_thumbnails(fnode): - thumbnails = [] - for node in fnode: - thumbnail_url = node.text - if not thumbnail_url: + thumbnails = [] + layouts = try_get( + content, lambda x: x['teaserImageRef']['layouts'], dict) + if layouts: + for layout_key, layout_url in layouts.items(): + if not isinstance(layout_url, compat_str): continue thumbnail = { - 'url': thumbnail_url, + 'url': layout_url, + 'format_id': layout_key, } - if 'key' in node.attrib: - m = re.match('^([0-9]+)x([0-9]+)$', node.attrib['key']) - if m: - thumbnail['width'] = int(m.group(1)) - thumbnail['height'] = int(m.group(2)) + mobj = re.search(r'(?P<width>\d+)x(?P<height>\d+)', layout_key) + if mobj: + thumbnail.update({ + 'width': int(mobj.group('width')), + 'height': int(mobj.group('height')), + }) thumbnails.append(thumbnail) - return thumbnails - - thumbnails = xml_to_thumbnails(doc.findall('.//teaserimages/teaserimage')) - - format_nodes = doc.findall('.//formitaeten/formitaet') - quality = qualities(['veryhigh', 'high', 'med', 'low']) - - def get_quality(elem): - return quality(xpath_text(elem, 'quality')) - format_nodes.sort(key=get_quality) - format_ids = [] - formats = [] - for fnode in format_nodes: - video_url = fnode.find('url').text - is_available = 'http://www.metafilegenerator' not in video_url - if not is_available: - continue - format_id = fnode.attrib['basetype'] - quality = xpath_text(fnode, './quality', 'quality') - format_m = re.match(r'''(?x) - (?P<vcodec>[^_]+)_(?P<acodec>[^_]+)_(?P<container>[^_]+)_ - (?P<proto>[^_]+)_(?P<index>[^_]+)_(?P<indexproto>[^_]+) - ''', format_id) - - ext = determine_ext(video_url, None) or format_m.group('container') - if ext not in ('smil', 'f4m', 'm3u8'): - format_id = format_id + '-' + quality - if format_id in format_ids: - continue - - if ext == 'meta': - continue - elif ext == 'smil': - formats.extend(self._extract_smil_formats( - video_url, video_id, fatal=False)) - elif ext == 'm3u8': - # the certificates are misconfigured (see - # https://github.com/rg3/youtube-dl/issues/8665) - if video_url.startswith('https://'): - continue - formats.extend(self._extract_m3u8_formats( - video_url, video_id, 'mp4', m3u8_id=format_id, fatal=False)) - elif ext == 'f4m': - formats.extend(self._extract_f4m_formats( - video_url, video_id, f4m_id=format_id, fatal=False)) - else: - proto = format_m.group('proto').lower() - - abr = int_or_none(xpath_text(fnode, './audioBitrate', 'abr'), 1000) - vbr = int_or_none(xpath_text(fnode, './videoBitrate', 'vbr'), 1000) - - width = int_or_none(xpath_text(fnode, './width', 'width')) - height = int_or_none(xpath_text(fnode, './height', 'height')) - - filesize = int_or_none(xpath_text(fnode, './filesize', 'filesize')) - - format_note = '' - if not format_note: - format_note = None - - formats.append({ - 'format_id': format_id, - 'url': video_url, - 'ext': ext, - 'acodec': format_m.group('acodec'), - 'vcodec': format_m.group('vcodec'), - 'abr': abr, - 'vbr': vbr, - 'width': width, - 'height': height, - 'filesize': filesize, - 'format_note': format_note, - 'protocol': proto, - '_available': is_available, - }) - format_ids.append(format_id) - - self._sort_formats(formats) return { 'id': video_id, 'title': title, - 'description': description, - 'duration': duration, + 'description': content.get('leadParagraph') or content.get('teasertext'), + 'duration': int_or_none(t.get('duration')), + 'timestamp': unified_timestamp(content.get('editorialDate')), 'thumbnails': thumbnails, - 'uploader': uploader, - 'uploader_id': uploader_id, - 'upload_date': upload_date, + 'subtitles': self._extract_subtitles(ptmd), + 'formats': formats, + } + + def _extract_regular(self, url, player, video_id): + content = self._call_api(player['content'], player, url, video_id) + return self._extract_entry(player['content'], content, video_id) + + def _extract_mobile(self, video_id): + document = self._download_json( + 'https://zdf-cdn.live.cellular.de/mediathekV2/document/%s' % video_id, + video_id)['document'] + + title = document['titel'] + + formats = [] + format_urls = set() + for f in document['formitaeten']: + self._extract_format(video_id, formats, format_urls, f) + self._sort_formats(formats) + + thumbnails = [] + teaser_bild = document.get('teaserBild') + if isinstance(teaser_bild, dict): + for thumbnail_key, thumbnail in teaser_bild.items(): + thumbnail_url = try_get( + thumbnail, lambda x: x['url'], compat_str) + if thumbnail_url: + thumbnails.append({ + 'url': thumbnail_url, + 'id': thumbnail_key, + 'width': int_or_none(thumbnail.get('width')), + 'height': int_or_none(thumbnail.get('height')), + }) + + return { + 'id': video_id, + 'title': title, + 'description': document.get('beschreibung'), + 'duration': int_or_none(document.get('length')), + 'timestamp': unified_timestamp(try_get( + document, lambda x: x['meta']['editorialDate'], compat_str)), + 'thumbnails': thumbnails, + 'subtitles': self._extract_subtitles(document), 'formats': formats, - 'subtitles': subtitles, } def _real_extract(self, url): video_id = self._match_id(url) - xml_url = 'http://www.zdf.de/ZDFmediathek/xmlservice/web/beitragsDetails?ak=web&id=%s' % video_id - return self.extract_from_xml_url(video_id, xml_url) + + webpage = self._download_webpage(url, video_id, fatal=False) + if webpage: + player = self._extract_player(webpage, url, fatal=False) + if player: + return self._extract_regular(url, player, video_id) + + return self._extract_mobile(video_id) -class ZDFChannelIE(InfoExtractor): - _VALID_URL = r'(?:zdf:topic:|https?://www\.zdf\.de/ZDFmediathek(?:#)?/.*kanaluebersicht/(?:[^/]+/)?)(?P<id>[0-9]+)' +class ZDFChannelIE(ZDFBaseIE): + _VALID_URL = r'https?://www\.zdf\.de/(?:[^/]+/)*(?P<id>[^/?#&]+)' _TESTS = [{ - 'url': 'http://www.zdf.de/ZDFmediathek#/kanaluebersicht/1586442/sendung/Titanic', + 'url': 'https://www.zdf.de/sport/das-aktuelle-sportstudio', 'info_dict': { - 'id': '1586442', + 'id': 'das-aktuelle-sportstudio', + 'title': 'das aktuelle sportstudio | ZDF', }, - 'playlist_count': 3, + 'playlist_count': 21, }, { - 'url': 'http://www.zdf.de/ZDFmediathek/kanaluebersicht/aktuellste/332', - 'only_matching': True, + 'url': 'https://www.zdf.de/dokumentation/planet-e', + 'info_dict': { + 'id': 'planet-e', + 'title': 'planet e.', + }, + 'playlist_count': 4, }, { - 'url': 'http://www.zdf.de/ZDFmediathek/kanaluebersicht/meist-gesehen/332', - 'only_matching': True, - }, { - 'url': 'http://www.zdf.de/ZDFmediathek/kanaluebersicht/_/1798716?bc=nrt;nrm?flash=off', + 'url': 'https://www.zdf.de/filme/taunuskrimi/', 'only_matching': True, }] - _PAGE_SIZE = 50 - def _fetch_page(self, channel_id, page): - offset = page * self._PAGE_SIZE - xml_url = ( - 'http://www.zdf.de/ZDFmediathek/xmlservice/web/aktuellste?ak=web&offset=%d&maxLength=%d&id=%s' - % (offset, self._PAGE_SIZE, channel_id)) - doc = self._download_xml( - xml_url, channel_id, - note='Downloading channel info', - errnote='Failed to download channel info') - - title = doc.find('.//information/title').text - description = doc.find('.//information/detail').text - for asset in doc.findall('.//teasers/teaser'): - a_type = asset.find('./type').text - a_id = asset.find('./details/assetId').text - if a_type not in ('video', 'topic'): - continue - yield { - '_type': 'url', - 'playlist_title': title, - 'playlist_description': description, - 'url': 'zdf:%s:%s' % (a_type, a_id), - } + @classmethod + def suitable(cls, url): + return False if ZDFIE.suitable(url) else super(ZDFChannelIE, cls).suitable(url) def _real_extract(self, url): channel_id = self._match_id(url) - entries = OnDemandPagedList( - functools.partial(self._fetch_page, channel_id), self._PAGE_SIZE) - return { - '_type': 'playlist', - 'id': channel_id, - 'entries': entries, - } + webpage = self._download_webpage(url, channel_id) + + entries = [ + self.url_result(item_url, ie=ZDFIE.ie_key()) + for item_url in orderedSet(re.findall( + r'data-plusbar-url=["\'](http.+?\.html)', webpage))] + + return self.playlist_result( + entries, channel_id, self._og_search_title(webpage, fatal=False)) + + r""" + player = self._extract_player(webpage, channel_id) + + channel_id = self._search_regex( + r'docId\s*:\s*(["\'])(?P<id>(?!\1).+?)\1', webpage, + 'channel id', group='id') + + channel = self._call_api( + 'https://api.zdf.de/content/documents/%s.json' % channel_id, + player, url, channel_id) + + items = [] + for module in channel['module']: + for teaser in try_get(module, lambda x: x['teaser'], list) or []: + t = try_get( + teaser, lambda x: x['http://zdf.de/rels/target'], dict) + if not t: + continue + items.extend(try_get( + t, + lambda x: x['resultsWithVideo']['http://zdf.de/rels/search/results'], + list) or []) + items.extend(try_get( + module, + lambda x: x['filterRef']['resultsWithVideo']['http://zdf.de/rels/search/results'], + list) or []) + + entries = [] + entry_urls = set() + for item in items: + t = try_get(item, lambda x: x['http://zdf.de/rels/target'], dict) + if not t: + continue + sharing_url = t.get('http://zdf.de/rels/sharing-url') + if not sharing_url or not isinstance(sharing_url, compat_str): + continue + if sharing_url in entry_urls: + continue + entry_urls.add(sharing_url) + entries.append(self.url_result( + sharing_url, ie=ZDFIE.ie_key(), video_id=t.get('id'))) + + return self.playlist_result(entries, channel_id, channel.get('title')) + """ diff --git a/youtube_dl/extractor/zingmp3.py b/youtube_dl/extractor/zingmp3.py index 437eecb67..adfdcaabf 100644 --- a/youtube_dl/extractor/zingmp3.py +++ b/youtube_dl/extractor/zingmp3.py @@ -1,16 +1,20 @@ -# coding=utf-8 +# coding: utf-8 from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import ExtractorError +from ..utils import ( + ExtractorError, + int_or_none, + update_url_query, +) class ZingMp3BaseInfoExtractor(InfoExtractor): - def _extract_item(self, item, fatal=True): - error_message = item.find('./errormessage').text + def _extract_item(self, item, page_type, fatal=True): + error_message = item.get('msg') if error_message: if not fatal: return @@ -18,25 +22,48 @@ class ZingMp3BaseInfoExtractor(InfoExtractor): '%s returned error: %s' % (self.IE_NAME, error_message), expected=True) - title = item.find('./title').text.strip() - source = item.find('./source').text - extension = item.attrib['type'] - thumbnail = item.find('./backimage').text + formats = [] + for quality, source_url in zip(item.get('qualities') or item.get('quality', []), item.get('source_list') or item.get('source', [])): + if not source_url or source_url == 'require vip': + continue + if not re.match(r'https?://', source_url): + source_url = '//' + source_url + source_url = self._proto_relative_url(source_url, 'http:') + quality_num = int_or_none(quality) + f = { + 'format_id': quality, + 'url': source_url, + } + if page_type == 'video': + f.update({ + 'height': quality_num, + 'ext': 'mp4', + }) + else: + f.update({ + 'abr': quality_num, + 'ext': 'mp3', + }) + formats.append(f) + + cover = item.get('cover') return { - 'title': title, - 'url': source, - 'ext': extension, - 'thumbnail': thumbnail, + 'title': (item.get('name') or item.get('title')).strip(), + 'formats': formats, + 'thumbnail': 'http:/' + cover if cover else None, + 'artist': item.get('artist'), } - def _extract_player_xml(self, player_xml_url, id, playlist_title=None): - player_xml = self._download_xml(player_xml_url, id, 'Downloading Player XML') - items = player_xml.findall('./item') + def _extract_player_json(self, player_json_url, id, page_type, playlist_title=None): + player_json = self._download_json(player_json_url, id, 'Downloading Player JSON') + items = player_json['data'] + if 'item' in items: + items = items['item'] if len(items) == 1: # one single song - data = self._extract_item(items[0]) + data = self._extract_item(items[0], page_type) data['id'] = id return data @@ -45,7 +72,7 @@ class ZingMp3BaseInfoExtractor(InfoExtractor): entries = [] for i, item in enumerate(items, 1): - entry = self._extract_item(item, fatal=False) + entry = self._extract_item(item, page_type, fatal=False) if not entry: continue entry['id'] = '%s-%d' % (id, i) @@ -59,8 +86,8 @@ class ZingMp3BaseInfoExtractor(InfoExtractor): } -class ZingMp3SongIE(ZingMp3BaseInfoExtractor): - _VALID_URL = r'https?://mp3\.zing\.vn/bai-hat/(?P<slug>[^/]+)/(?P<song_id>\w+)\.html' +class ZingMp3IE(ZingMp3BaseInfoExtractor): + _VALID_URL = r'https?://mp3\.zing\.vn/(?:bai-hat|album|playlist|video-clip)/[^/]+/(?P<id>\w+)\.html' _TESTS = [{ 'url': 'http://mp3.zing.vn/bai-hat/Xa-Mai-Xa-Bao-Thy/ZWZB9WAB.html', 'md5': 'ead7ae13693b3205cbc89536a077daed', @@ -68,53 +95,49 @@ class ZingMp3SongIE(ZingMp3BaseInfoExtractor): 'id': 'ZWZB9WAB', 'title': 'Xa Mãi Xa', 'ext': 'mp3', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', }, - }] - IE_NAME = 'zingmp3:song' - IE_DESC = 'mp3.zing.vn songs' - - def _real_extract(self, url): - matched = re.match(self._VALID_URL, url) - slug = matched.group('slug') - song_id = matched.group('song_id') - - webpage = self._download_webpage( - 'http://mp3.zing.vn/bai-hat/%s/%s.html' % (slug, song_id), song_id) - - player_xml_url = self._search_regex( - r'&xmlURL=(?P<xml_url>[^&]+)&', webpage, 'player xml url') - - return self._extract_player_xml(player_xml_url, song_id) - - -class ZingMp3AlbumIE(ZingMp3BaseInfoExtractor): - _VALID_URL = r'https?://mp3\.zing\.vn/(?:album|playlist)/(?P<slug>[^/]+)/(?P<album_id>\w+)\.html' - _TESTS = [{ + }, { + 'url': 'http://mp3.zing.vn/video-clip/Let-It-Go-Frozen-OST-Sungha-Jung/ZW6BAEA0.html', + 'md5': '870295a9cd8045c0e15663565902618d', + 'info_dict': { + 'id': 'ZW6BAEA0', + 'title': 'Let It Go (Frozen OST)', + 'ext': 'mp4', + }, + }, { 'url': 'http://mp3.zing.vn/album/Lau-Dai-Tinh-Ai-Bang-Kieu-Minh-Tuyet/ZWZBWDAF.html', 'info_dict': { '_type': 'playlist', 'id': 'ZWZBWDAF', - 'title': 'Lâu Đài Tình Ái - Bằng Kiều ft. Minh Tuyết | Album 320 lossless', + 'title': 'Lâu Đài Tình Ái - Bằng Kiều,Minh Tuyết | Album 320 lossless', }, 'playlist_count': 10, + 'skip': 'removed at the request of the owner', }, { 'url': 'http://mp3.zing.vn/playlist/Duong-Hong-Loan-apollobee/IWCAACCB.html', 'only_matching': True, }] - IE_NAME = 'zingmp3:album' - IE_DESC = 'mp3.zing.vn albums' + IE_NAME = 'zingmp3' + IE_DESC = 'mp3.zing.vn' def _real_extract(self, url): - matched = re.match(self._VALID_URL, url) - slug = matched.group('slug') - album_id = matched.group('album_id') + page_id = self._match_id(url) - webpage = self._download_webpage( - 'http://mp3.zing.vn/album/%s/%s.html' % (slug, album_id), album_id) - player_xml_url = self._search_regex( - r'&xmlURL=(?P<xml_url>[^&]+)&', webpage, 'player xml url') + webpage = self._download_webpage(url, page_id) - return self._extract_player_xml( - player_xml_url, album_id, - playlist_title=self._og_search_title(webpage)) + player_json_url = self._search_regex([ + r'data-xml="([^"]+)', + r'&xmlURL=([^&]+)&' + ], webpage, 'player xml url') + + playlist_title = None + page_type = self._search_regex(r'/(?:html5)?xml/([^/-]+)', player_json_url, 'page type') + if page_type == 'video': + player_json_url = update_url_query(player_json_url, {'format': 'json'}) + else: + player_json_url = player_json_url.replace('/xml/', '/html5xml/') + if page_type == 'album': + playlist_title = self._og_search_title(webpage) + + return self._extract_player_json(player_json_url, page_id, page_type, playlist_title) diff --git a/youtube_dl/extractor/zippcast.py b/youtube_dl/extractor/zippcast.py deleted file mode 100644 index de819376d..000000000 --- a/youtube_dl/extractor/zippcast.py +++ /dev/null @@ -1,94 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - determine_ext, - str_to_int, -) - - -class ZippCastIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?zippcast\.com/(?:video/|videoview\.php\?.*\bvplay=)(?P<id>[0-9a-zA-Z]+)' - _TESTS = [{ - # m3u8, hq direct link - 'url': 'http://www.zippcast.com/video/c9cfd5c7e44dbc29c81', - 'md5': '5ea0263b5606866c4d6cda0fc5e8c6b6', - 'info_dict': { - 'id': 'c9cfd5c7e44dbc29c81', - 'ext': 'mp4', - 'title': '[Vinesauce] Vinny - Digital Space Traveler', - 'description': 'Muted on youtube, but now uploaded in it\'s original form.', - 'thumbnail': 're:^https?://.*\.jpg$', - 'uploader': 'vinesauce', - 'view_count': int, - 'categories': ['Entertainment'], - 'tags': list, - }, - }, { - # f4m, lq ipod direct link - 'url': 'http://www.zippcast.com/video/b79c0a233e9c6581775', - 'only_matching': True, - }, { - 'url': 'http://www.zippcast.com/videoview.php?vplay=c9cfd5c7e44dbc29c81&auto=no', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage( - 'http://www.zippcast.com/video/%s' % video_id, video_id) - - formats = [] - video_url = self._search_regex( - r'<source[^>]+src=(["\'])(?P<url>.+?)\1', webpage, - 'video url', default=None, group='url') - if video_url: - formats.append({ - 'url': video_url, - 'format_id': 'http', - 'preference': 0, # direct link is almost always of worse quality - }) - src_url = self._search_regex( - r'src\s*:\s*(?:escape\()?(["\'])(?P<url>http://.+?)\1', - webpage, 'src', default=None, group='url') - ext = determine_ext(src_url) - if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - src_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False)) - elif ext == 'f4m': - formats.extend(self._extract_f4m_formats( - src_url, video_id, f4m_id='hds', fatal=False)) - self._sort_formats(formats) - - title = self._og_search_title(webpage) - description = self._og_search_description(webpage) or self._html_search_meta( - 'description', webpage) - uploader = self._search_regex( - r'<a[^>]+href="https?://[^/]+/profile/[^>]+>([^<]+)</a>', - webpage, 'uploader', fatal=False) - thumbnail = self._og_search_thumbnail(webpage) - view_count = str_to_int(self._search_regex( - r'>([\d,.]+) views!', webpage, 'view count', fatal=False)) - - categories = re.findall( - r'<a[^>]+href="https?://[^/]+/categories/[^"]+">([^<]+),?<', - webpage) - tags = re.findall( - r'<a[^>]+href="https?://[^/]+/search/tags/[^"]+">([^<]+),?<', - webpage) - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'uploader': uploader, - 'view_count': view_count, - 'categories': categories, - 'tags': tags, - 'formats': formats, - } diff --git a/youtube_dl/jsinterp.py b/youtube_dl/jsinterp.py index 9737f7002..24cdec28c 100644 --- a/youtube_dl/jsinterp.py +++ b/youtube_dl/jsinterp.py @@ -198,12 +198,12 @@ class JSInterpreter(object): return opfunc(x, y) m = re.match( - r'^(?P<func>%s)\((?P<args>[a-zA-Z0-9_$,]+)\)$' % _NAME_RE, expr) + r'^(?P<func>%s)\((?P<args>[a-zA-Z0-9_$,]*)\)$' % _NAME_RE, expr) if m: fname = m.group('func') argvals = tuple([ int(v) if v.isdigit() else local_vars[v] - for v in m.group('args').split(',')]) + for v in m.group('args').split(',')]) if len(m.group('args')) > 0 else tuple() if fname not in self._functions: self._functions[fname] = self.extract_function(fname) return self._functions[fname](argvals) @@ -213,7 +213,7 @@ class JSInterpreter(object): def extract_object(self, objname): obj = {} obj_m = re.search( - (r'(?:var\s+)?%s\s*=\s*\{' % re.escape(objname)) + + (r'(?<!this\.)%s\s*=\s*\{' % re.escape(objname)) + r'\s*(?P<fields>([a-zA-Z$0-9]+\s*:\s*function\(.*?\)\s*\{.*?\}(?:,\s*)?)*)' + r'\}\s*;', self.code) diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 942d44912..349f44778 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -94,7 +94,7 @@ def parseOpts(overrideArguments=None): setattr(parser.values, option.dest, value.split(',')) def _hide_login_info(opts): - PRIVATE_OPTS = ['-p', '--password', '-u', '--username', '--video-password'] + PRIVATE_OPTS = ['-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'] eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$') def _scrub_eq(o): @@ -178,6 +178,10 @@ def parseOpts(overrideArguments=None): 'When given in the global configuration file /etc/youtube-dl.conf: ' 'Do not read the user configuration in ~/.config/youtube-dl/config ' '(%APPDATA%/youtube-dl/config.txt on Windows)') + general.add_option( + '--config-location', + dest='config_location', metavar='PATH', + help='Location of the configuration file; either the path to the config or its containing directory.') general.add_option( '--flat-playlist', action='store_const', dest='extract_flat', const='in_playlist', @@ -212,23 +216,23 @@ def parseOpts(overrideArguments=None): network.add_option( '--source-address', metavar='IP', dest='source_address', default=None, - help='Client-side IP address to bind to (experimental)', + help='Client-side IP address to bind to', ) network.add_option( '-4', '--force-ipv4', action='store_const', const='0.0.0.0', dest='source_address', - help='Make all connections via IPv4 (experimental)', + help='Make all connections via IPv4', ) network.add_option( '-6', '--force-ipv6', action='store_const', const='::', dest='source_address', - help='Make all connections via IPv6 (experimental)', + help='Make all connections via IPv6', ) network.add_option( '--geo-verification-proxy', dest='geo_verification_proxy', default=None, metavar='URL', help='Use this proxy to verify the IP address for some geo-restricted sites. ' - 'The default proxy specified by --proxy (or none, if the options is not present) is used for the actual downloading. (experimental)' + 'The default proxy specified by --proxy (or none, if the options is not present) is used for the actual downloading.' ) network.add_option( '--cn-verification-proxy', @@ -293,7 +297,7 @@ def parseOpts(overrideArguments=None): '--match-filter', metavar='FILTER', dest='match_filter', default=None, help=( - 'Generic video filter (experimental). ' + 'Generic video filter. ' 'Specify any key (see help for -o for a list of available keys) to' ' match if the key is present, ' '!key to check if the key is not present,' @@ -341,7 +345,7 @@ def parseOpts(overrideArguments=None): authentication.add_option( '-2', '--twofactor', dest='twofactor', metavar='TWOFACTOR', - help='Two-factor auth code') + help='Two-factor authentication code') authentication.add_option( '-n', '--netrc', action='store_true', dest='usenetrc', default=False, @@ -351,6 +355,24 @@ def parseOpts(overrideArguments=None): dest='videopassword', metavar='PASSWORD', help='Video password (vimeo, smotri, youku)') + adobe_pass = optparse.OptionGroup(parser, 'Adobe Pass Options') + adobe_pass.add_option( + '--ap-mso', + dest='ap_mso', metavar='MSO', + help='Adobe Pass multiple-system operator (TV provider) identifier, use --ap-list-mso for a list of available MSOs') + adobe_pass.add_option( + '--ap-username', + dest='ap_username', metavar='USERNAME', + help='Multiple-system operator account login') + adobe_pass.add_option( + '--ap-password', + dest='ap_password', metavar='PASSWORD', + help='Multiple-system operator account password. If this option is left out, youtube-dl will ask interactively.') + adobe_pass.add_option( + '--ap-list-mso', + action='store_true', dest='ap_list_mso', default=False, + help='List all supported multiple-system operators') + video_format = optparse.OptionGroup(parser, 'Video Format Options') video_format.add_option( '-f', '--format', @@ -423,7 +445,15 @@ def parseOpts(overrideArguments=None): downloader.add_option( '--fragment-retries', dest='fragment_retries', metavar='RETRIES', default=10, - help='Number of retries for a fragment (default is %default), or "infinite" (DASH only)') + help='Number of retries for a fragment (default is %default), or "infinite" (DASH and hlsnative only)') + downloader.add_option( + '--skip-unavailable-fragments', + action='store_true', dest='skip_unavailable_fragments', default=True, + help='Skip unavailable fragments (DASH and hlsnative only)') + downloader.add_option( + '--abort-on-unavailable-fragment', + action='store_false', dest='skip_unavailable_fragments', + help='Abort downloading when some fragment is not available') downloader.add_option( '--buffer-size', dest='buffersize', metavar='SIZE', default='1024', @@ -440,10 +470,14 @@ def parseOpts(overrideArguments=None): '--playlist-reverse', action='store_true', help='Download playlist videos in reverse order') + downloader.add_option( + '--playlist-random', + action='store_true', + help='Download playlist videos in random order') downloader.add_option( '--xattr-set-filesize', dest='xattr_set_filesize', action='store_true', - help='Set file xattribute ytdl.filesize with expected filesize (experimental)') + help='Set file xattribute ytdl.filesize with expected file size (experimental)') downloader.add_option( '--hls-prefer-native', dest='hls_prefer_native', action='store_true', default=None, @@ -499,9 +533,20 @@ def parseOpts(overrideArguments=None): dest='bidi_workaround', action='store_true', help='Work around terminals that lack bidirectional text support. Requires bidiv or fribidi executable in PATH') workarounds.add_option( - '--sleep-interval', metavar='SECONDS', + '--sleep-interval', '--min-sleep-interval', metavar='SECONDS', dest='sleep_interval', type=float, - help='Number of seconds to sleep before each download.') + help=( + 'Number of seconds to sleep before each download when used alone ' + 'or a lower bound of a range for randomized sleep before each download ' + '(minimum possible number of seconds to sleep) when used along with ' + '--max-sleep-interval.')) + workarounds.add_option( + '--max-sleep-interval', metavar='SECONDS', + dest='max_sleep_interval', type=float, + help=( + 'Upper bound of a range for randomized sleep before each download ' + '(maximum possible number of seconds to sleep). Must only be used ' + 'along with --min-sleep-interval.')) verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options') verbosity.add_option( @@ -617,26 +662,15 @@ def parseOpts(overrideArguments=None): filesystem.add_option( '-o', '--output', dest='outtmpl', metavar='TEMPLATE', - help=('Output filename template. Use %(title)s to get the title, ' - '%(uploader)s for the uploader name, %(uploader_id)s for the uploader nickname if different, ' - '%(autonumber)s to get an automatically incremented number, ' - '%(ext)s for the filename extension, ' - '%(format)s for the format description (like "22 - 1280x720" or "HD"), ' - '%(format_id)s for the unique id of the format (like YouTube\'s itags: "137"), ' - '%(upload_date)s for the upload date (YYYYMMDD), ' - '%(extractor)s for the provider (youtube, metacafe, etc), ' - '%(id)s for the video id, ' - '%(playlist_title)s, %(playlist_id)s, or %(playlist)s (=title if present, ID otherwise) for the playlist the video is in, ' - '%(playlist_index)s for the position in the playlist. ' - '%(height)s and %(width)s for the width and height of the video format. ' - '%(resolution)s for a textual description of the resolution of the video format. ' - '%% for a literal percent. ' - 'Use - to output to stdout. Can also be used to download to a different directory, ' - 'for example with -o \'/my/downloads/%(uploader)s/%(title)s-%(id)s.%(ext)s\' .')) + help=('Output filename template, see the "OUTPUT TEMPLATE" for all the info')) filesystem.add_option( '--autonumber-size', - dest='autonumber_size', metavar='NUMBER', - help='Specify the number of digits in %(autonumber)s when it is present in output filename template or --auto-number option is given') + dest='autonumber_size', metavar='NUMBER', default=5, type=int, + help='Specify the number of digits in %(autonumber)s when it is present in output filename template or --auto-number option is given (default is %default)') + filesystem.add_option( + '--autonumber-start', + dest='autonumber_start', metavar='NUMBER', default=1, type=int, + help='Specify the start value for %(autonumber)s (default is %default)') filesystem.add_option( '--restrict-filenames', action='store_true', dest='restrictfilenames', default=False, @@ -725,7 +759,7 @@ def parseOpts(overrideArguments=None): help='Convert video files to audio-only files (requires ffmpeg or avconv and ffprobe or avprobe)') postproc.add_option( '--audio-format', metavar='FORMAT', dest='audioformat', default='best', - help='Specify audio format: "best", "aac", "vorbis", "mp3", "m4a", "opus", or "wav"; "%default" by default') + help='Specify audio format: "best", "aac", "vorbis", "mp3", "m4a", "opus", or "wav"; "%default" by default; No effect without -x') postproc.add_option( '--audio-quality', metavar='QUALITY', dest='audioquality', default='5', @@ -809,6 +843,7 @@ def parseOpts(overrideArguments=None): parser.add_option_group(video_format) parser.add_option_group(subtitles) parser.add_option_group(authentication) + parser.add_option_group(adobe_pass) parser.add_option_group(postproc) if overrideArguments is not None: @@ -822,22 +857,32 @@ def parseOpts(overrideArguments=None): return conf command_line_conf = compat_conf(sys.argv[1:]) + opts, args = parser.parse_args(command_line_conf) - if '--ignore-config' in command_line_conf: - system_conf = [] - user_conf = [] + system_conf = user_conf = custom_conf = [] + + if '--config-location' in command_line_conf: + location = compat_expanduser(opts.config_location) + if os.path.isdir(location): + location = os.path.join(location, 'youtube-dl.conf') + if not os.path.exists(location): + parser.error('config-location %s does not exist.' % location) + custom_conf = _readOptions(location) + elif '--ignore-config' in command_line_conf: + pass else: system_conf = _readOptions('/etc/youtube-dl.conf') - if '--ignore-config' in system_conf: - user_conf = [] - else: + if '--ignore-config' not in system_conf: user_conf = _readUserConf() - argv = system_conf + user_conf + command_line_conf + argv = system_conf + user_conf + custom_conf + command_line_conf opts, args = parser.parse_args(argv) if opts.verbose: - write_string('[debug] System config: ' + repr(_hide_login_info(system_conf)) + '\n') - write_string('[debug] User config: ' + repr(_hide_login_info(user_conf)) + '\n') - write_string('[debug] Command-line args: ' + repr(_hide_login_info(command_line_conf)) + '\n') + for conf_label, conf in ( + ('System config', system_conf), + ('User config', user_conf), + ('Custom config', custom_conf), + ('Command-line args', command_line_conf)): + write_string('[debug] %s: %s\n' % (conf_label, repr(_hide_login_info(conf)))) return parser, opts, args diff --git a/youtube_dl/postprocessor/embedthumbnail.py b/youtube_dl/postprocessor/embedthumbnail.py index 3bad5a266..e606a58de 100644 --- a/youtube_dl/postprocessor/embedthumbnail.py +++ b/youtube_dl/postprocessor/embedthumbnail.py @@ -1,4 +1,4 @@ -# -*- coding: utf-8 -*- +# coding: utf-8 from __future__ import unicode_literals @@ -40,7 +40,7 @@ class EmbedThumbnailPP(FFmpegPostProcessor): 'Skipping embedding the thumbnail because the file is missing.') return [], info - if info['ext'] in ('mp3', 'mkv'): + if info['ext'] == 'mp3': options = [ '-c', 'copy', '-map', '0', '-map', '1', '-metadata:s:v', 'title="Album cover"', '-metadata:s:v', 'comment="Cover (Front)"'] diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py index fa99b0c2a..1881f4849 100644 --- a/youtube_dl/postprocessor/ffmpeg.py +++ b/youtube_dl/postprocessor/ffmpeg.py @@ -139,6 +139,30 @@ class FFmpegPostProcessor(PostProcessor): def probe_executable(self): return self._paths[self.probe_basename] + def get_audio_codec(self, path): + if not self.probe_available: + raise PostProcessingError('ffprobe or avprobe not found. Please install one.') + try: + cmd = [ + encodeFilename(self.probe_executable, True), + encodeArgument('-show_streams'), + encodeFilename(self._ffmpeg_filename_argument(path), True)] + if self._downloader.params.get('verbose', False): + self._downloader.to_screen('[debug] %s command line: %s' % (self.basename, shell_quote(cmd))) + handle = subprocess.Popen(cmd, stderr=compat_subprocess_get_DEVNULL(), stdout=subprocess.PIPE, stdin=subprocess.PIPE) + output = handle.communicate()[0] + if handle.wait() != 0: + return None + except (IOError, OSError): + return None + audio_codec = None + for line in output.decode('ascii', 'ignore').split('\n'): + if line.startswith('codec_name='): + audio_codec = line.split('=')[1].strip() + elif line.strip() == 'codec_type=audio' and audio_codec is not None: + return audio_codec + return None + def run_ffmpeg_multiple_files(self, input_paths, out_path, opts): self.check_version() @@ -188,31 +212,6 @@ class FFmpegExtractAudioPP(FFmpegPostProcessor): self._preferredquality = preferredquality self._nopostoverwrites = nopostoverwrites - def get_audio_codec(self, path): - - if not self.probe_available: - raise PostProcessingError('ffprobe or avprobe not found. Please install one.') - try: - cmd = [ - encodeFilename(self.probe_executable, True), - encodeArgument('-show_streams'), - encodeFilename(self._ffmpeg_filename_argument(path), True)] - if self._downloader.params.get('verbose', False): - self._downloader.to_screen('[debug] %s command line: %s' % (self.basename, shell_quote(cmd))) - handle = subprocess.Popen(cmd, stderr=compat_subprocess_get_DEVNULL(), stdout=subprocess.PIPE, stdin=subprocess.PIPE) - output = handle.communicate()[0] - if handle.wait() != 0: - return None - except (IOError, OSError): - return None - audio_codec = None - for line in output.decode('ascii', 'ignore').split('\n'): - if line.startswith('codec_name='): - audio_codec = line.split('=')[1].strip() - elif line.strip() == 'codec_type=audio' and audio_codec is not None: - return audio_codec - return None - def run_ffmpeg(self, path, out_path, codec, more_opts): if codec is None: acodec_opts = [] @@ -280,6 +279,9 @@ class FFmpegExtractAudioPP(FFmpegPostProcessor): prefix, sep, ext = path.rpartition('.') # not os.path.splitext, since the latter does not work on unicode in all setups new_path = prefix + sep + extension + information['filepath'] = new_path + information['ext'] = extension + # If we download foo.mp3 and convert it to... foo.mp3, then don't delete foo.mp3, silly. if (new_path == path or (self._nopostoverwrites and os.path.exists(encodeFilename(new_path)))): @@ -301,9 +303,6 @@ class FFmpegExtractAudioPP(FFmpegPostProcessor): new_path, time.time(), information['filetime'], errnote='Cannot update utime of audio file') - information['filepath'] = new_path - information['ext'] = extension - return [path], information @@ -504,15 +503,15 @@ class FFmpegFixupM4aPP(FFmpegPostProcessor): class FFmpegFixupM3u8PP(FFmpegPostProcessor): def run(self, info): filename = info['filepath'] - temp_filename = prepend_extension(filename, 'temp') + if self.get_audio_codec(filename) == 'aac': + temp_filename = prepend_extension(filename, 'temp') - options = ['-c', 'copy', '-f', 'mp4', '-bsf:a', 'aac_adtstoasc'] - self._downloader.to_screen('[ffmpeg] Fixing malformated aac bitstream in "%s"' % filename) - self.run_ffmpeg(filename, temp_filename, options) - - os.remove(encodeFilename(filename)) - os.rename(encodeFilename(temp_filename), encodeFilename(filename)) + options = ['-c', 'copy', '-f', 'mp4', '-bsf:a', 'aac_adtstoasc'] + self._downloader.to_screen('[ffmpeg] Fixing malformated aac bitstream in "%s"' % filename) + self.run_ffmpeg(filename, temp_filename, options) + os.remove(encodeFilename(filename)) + os.rename(encodeFilename(temp_filename), encodeFilename(filename)) return [], info diff --git a/youtube_dl/postprocessor/metadatafromtitle.py b/youtube_dl/postprocessor/metadatafromtitle.py index 42377fa0f..164edd3a8 100644 --- a/youtube_dl/postprocessor/metadatafromtitle.py +++ b/youtube_dl/postprocessor/metadatafromtitle.py @@ -3,11 +3,6 @@ from __future__ import unicode_literals import re from .common import PostProcessor -from ..utils import PostProcessingError - - -class MetadataFromTitlePPError(PostProcessingError): - pass class MetadataFromTitlePP(PostProcessor): @@ -17,7 +12,7 @@ class MetadataFromTitlePP(PostProcessor): self._titleregex = self.format_to_regex(titleformat) def format_to_regex(self, fmt): - """ + r""" Converts a string like '%(title)s - %(artist)s' to a regex like @@ -38,7 +33,8 @@ class MetadataFromTitlePP(PostProcessor): title = info['title'] match = re.match(self._titleregex, title) if match is None: - raise MetadataFromTitlePPError('Could not interpret title of video as "%s"' % self._titleformat) + self._downloader.to_screen('[fromtitle] Could not interpret title of video as "%s"' % self._titleformat) + return [], info for attribute, value in match.groupdict().items(): value = match.group(attribute) info[attribute] = value diff --git a/youtube_dl/postprocessor/xattrpp.py b/youtube_dl/postprocessor/xattrpp.py index e39ca60aa..fbdfa02ac 100644 --- a/youtube_dl/postprocessor/xattrpp.py +++ b/youtube_dl/postprocessor/xattrpp.py @@ -1,37 +1,15 @@ from __future__ import unicode_literals -import os -import subprocess -import sys -import errno - from .common import PostProcessor from ..compat import compat_os_name from ..utils import ( - check_executable, hyphenate_date, - version_tuple, - PostProcessingError, - encodeArgument, - encodeFilename, + write_xattr, + XAttrMetadataError, + XAttrUnavailableError, ) -class XAttrMetadataError(PostProcessingError): - def __init__(self, code=None, msg='Unknown error'): - super(XAttrMetadataError, self).__init__(msg) - self.code = code - - # Parsing code and msg - if (self.code in (errno.ENOSPC, errno.EDQUOT) or - 'No space left' in self.msg or 'Disk quota excedded' in self.msg): - self.reason = 'NO_SPACE' - elif self.code == errno.E2BIG or 'Argument list too long' in self.msg: - self.reason = 'VALUE_TOO_LONG' - else: - self.reason = 'NOT_SUPPORTED' - - class XAttrMetadataPP(PostProcessor): # @@ -48,88 +26,6 @@ class XAttrMetadataPP(PostProcessor): def run(self, info): """ Set extended attributes on downloaded file (if xattr support is found). """ - # This mess below finds the best xattr tool for the job and creates a - # "write_xattr" function. - try: - # try the pyxattr module... - import xattr - - # Unicode arguments are not supported in python-pyxattr until - # version 0.5.0 - # See https://github.com/rg3/youtube-dl/issues/5498 - pyxattr_required_version = '0.5.0' - if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version): - self._downloader.report_warning( - 'python-pyxattr is detected but is too old. ' - 'youtube-dl requires %s or above while your version is %s. ' - 'Falling back to other xattr implementations' % ( - pyxattr_required_version, xattr.__version__)) - - raise ImportError - - def write_xattr(path, key, value): - try: - xattr.set(path, key, value) - except EnvironmentError as e: - raise XAttrMetadataError(e.errno, e.strerror) - - except ImportError: - if compat_os_name == 'nt': - # Write xattrs to NTFS Alternate Data Streams: - # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29 - def write_xattr(path, key, value): - assert ':' not in key - assert os.path.exists(path) - - ads_fn = path + ':' + key - try: - with open(ads_fn, 'wb') as f: - f.write(value) - except EnvironmentError as e: - raise XAttrMetadataError(e.errno, e.strerror) - else: - user_has_setfattr = check_executable('setfattr', ['--version']) - user_has_xattr = check_executable('xattr', ['-h']) - - if user_has_setfattr or user_has_xattr: - - def write_xattr(path, key, value): - value = value.decode('utf-8') - if user_has_setfattr: - executable = 'setfattr' - opts = ['-n', key, '-v', value] - elif user_has_xattr: - executable = 'xattr' - opts = ['-w', key, value] - - cmd = ([encodeFilename(executable, True)] + - [encodeArgument(o) for o in opts] + - [encodeFilename(path, True)]) - - try: - p = subprocess.Popen( - cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE) - except EnvironmentError as e: - raise XAttrMetadataError(e.errno, e.strerror) - stdout, stderr = p.communicate() - stderr = stderr.decode('utf-8', 'replace') - if p.returncode != 0: - raise XAttrMetadataError(p.returncode, stderr) - - else: - # On Unix, and can't find pyxattr, setfattr, or xattr. - if sys.platform.startswith('linux'): - self._downloader.report_error( - "Couldn't find a tool to set the xattrs. " - "Install either the python 'pyxattr' or 'xattr' " - "modules, or the GNU 'attr' package " - "(which contains the 'setfattr' tool).") - else: - self._downloader.report_error( - "Couldn't find a tool to set the xattrs. " - "Install either the python 'xattr' module, " - "or the 'xattr' binary.") - # Write the metadata to the file's xattrs self._downloader.to_screen('[metadata] Writing metadata to file\'s xattrs') @@ -159,6 +55,10 @@ class XAttrMetadataPP(PostProcessor): return [], info + except XAttrUnavailableError as e: + self._downloader.report_error(str(e)) + return [], info + except XAttrMetadataError as e: if e.reason == 'NO_SPACE': self._downloader.report_warning( diff --git a/youtube_dl/socks.py b/youtube_dl/socks.py index 104807242..0f5d7bdb2 100644 --- a/youtube_dl/socks.py +++ b/youtube_dl/socks.py @@ -55,12 +55,12 @@ class Socks5AddressType(object): ATYP_IPV6 = 0x04 -class ProxyError(IOError): +class ProxyError(socket.error): ERR_SUCCESS = 0x00 def __init__(self, code=None, msg=None): if code is not None and msg is None: - msg = self.CODES.get(code) and 'unknown error' + msg = self.CODES.get(code) or 'unknown error' super(ProxyError, self).__init__(code, msg) @@ -103,6 +103,7 @@ class ProxyType(object): SOCKS4A = 1 SOCKS5 = 2 + Proxy = collections.namedtuple('Proxy', ( 'type', 'host', 'port', 'username', 'password', 'remote_dns')) @@ -122,7 +123,7 @@ class sockssocket(socket.socket): while len(data) < cnt: cur = self.recv(cnt - len(data)) if not cur: - raise IOError('{0} bytes missing'.format(cnt - len(data))) + raise EOFError('{0} bytes missing'.format(cnt - len(data))) data += cur return data diff --git a/youtube_dl/swfinterp.py b/youtube_dl/swfinterp.py index 7cf490aa4..0c7158575 100644 --- a/youtube_dl/swfinterp.py +++ b/youtube_dl/swfinterp.py @@ -115,6 +115,8 @@ def _u30(reader): res = _read_int(reader) assert res & 0xf0000000 == 0 return res + + _u32 = _read_int @@ -176,6 +178,7 @@ class _Undefined(object): return 'undefined' __repr__ = __str__ + undefined = _Undefined() diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 97ddd9883..67a847eba 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*- +# coding: utf-8 from __future__ import unicode_literals @@ -42,11 +42,13 @@ from .compat import ( compat_html_entities_html5, compat_http_client, compat_kwargs, + compat_os_name, compat_parse_qs, compat_shlex_quote, compat_socket_create_connection, compat_str, compat_struct_pack, + compat_struct_unpack, compat_urllib_error, compat_urllib_parse, compat_urllib_parse_urlencode, @@ -84,12 +86,24 @@ std_headers = { } +USER_AGENTS = { + 'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27', +} + + NO_DEFAULT = object() ENGLISH_MONTH_NAMES = [ 'January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December'] +MONTH_NAMES = { + 'en': ENGLISH_MONTH_NAMES, + 'fr': [ + 'janvier', 'février', 'mars', 'avril', 'mai', 'juin', + 'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'], +} + KNOWN_EXTENSIONS = ( 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac', 'flv', 'f4v', 'f4a', 'f4b', @@ -114,14 +128,22 @@ DATE_FORMATS = ( '%d %B %Y', '%d %b %Y', '%B %d %Y', + '%B %dst %Y', + '%B %dnd %Y', + '%B %dth %Y', '%b %d %Y', + '%b %dst %Y', + '%b %dnd %Y', + '%b %dth %Y', '%b %dst %Y %I:%M', '%b %dnd %Y %I:%M', '%b %dth %Y %I:%M', '%Y %m %d', '%Y-%m-%d', '%Y/%m/%d', + '%Y/%m/%d %H:%M', '%Y/%m/%d %H:%M:%S', + '%Y-%m-%d %H:%M', '%Y-%m-%d %H:%M:%S', '%Y-%m-%d %H:%M:%S.%f', '%d.%m.%Y %H:%M', @@ -132,6 +154,8 @@ DATE_FORMATS = ( '%Y-%m-%dT%H:%M:%S', '%Y-%m-%dT%H:%M:%S.%f', '%Y-%m-%dT%H:%M', + '%b %d %Y at %H:%M', + '%b %d %Y at %H:%M:%S', ) DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS) @@ -153,6 +177,8 @@ DATE_FORMATS_MONTH_FIRST.extend([ '%m/%d/%Y %H:%M:%S', ]) +PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)" + def preferredencoding(): """Get preferred encoding. @@ -482,7 +508,7 @@ def sanitize_path(s): if drive_or_unc: norm_path.pop(0) sanitized_path = [ - path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|[\s.]$)', '#', path_part) + path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part) for path_part in norm_path] if drive_or_unc: sanitized_path.insert(0, drive_or_unc + os.path.sep) @@ -766,6 +792,26 @@ class ContentTooShortError(Exception): self.expected = expected +class XAttrMetadataError(Exception): + def __init__(self, code=None, msg='Unknown error'): + super(XAttrMetadataError, self).__init__(msg) + self.code = code + self.msg = msg + + # Parsing code and msg + if (self.code in (errno.ENOSPC, errno.EDQUOT) or + 'No space left' in self.msg or 'Disk quota excedded' in self.msg): + self.reason = 'NO_SPACE' + elif self.code == errno.E2BIG or 'Argument list too long' in self.msg: + self.reason = 'VALUE_TOO_LONG' + else: + self.reason = 'NOT_SUPPORTED' + + +class XAttrUnavailableError(Exception): + pass + + def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs): # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting # expected HTTP responses to meet HTTP/1.0 or later (see also @@ -1144,7 +1190,7 @@ def date_from_str(date_str): return today if date_str == 'yesterday': return today - datetime.timedelta(days=1) - match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str) + match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str) if match is not None: sign = match.group('sign') time = int(match.group('time')) @@ -1502,38 +1548,63 @@ def parse_filesize(s): _UNIT_TABLE = { 'B': 1, 'b': 1, + 'bytes': 1, 'KiB': 1024, 'KB': 1000, 'kB': 1024, 'Kb': 1000, + 'kb': 1000, + 'kilobytes': 1000, + 'kibibytes': 1024, 'MiB': 1024 ** 2, 'MB': 1000 ** 2, 'mB': 1024 ** 2, 'Mb': 1000 ** 2, + 'mb': 1000 ** 2, + 'megabytes': 1000 ** 2, + 'mebibytes': 1024 ** 2, 'GiB': 1024 ** 3, 'GB': 1000 ** 3, 'gB': 1024 ** 3, 'Gb': 1000 ** 3, + 'gb': 1000 ** 3, + 'gigabytes': 1000 ** 3, + 'gibibytes': 1024 ** 3, 'TiB': 1024 ** 4, 'TB': 1000 ** 4, 'tB': 1024 ** 4, 'Tb': 1000 ** 4, + 'tb': 1000 ** 4, + 'terabytes': 1000 ** 4, + 'tebibytes': 1024 ** 4, 'PiB': 1024 ** 5, 'PB': 1000 ** 5, 'pB': 1024 ** 5, 'Pb': 1000 ** 5, + 'pb': 1000 ** 5, + 'petabytes': 1000 ** 5, + 'pebibytes': 1024 ** 5, 'EiB': 1024 ** 6, 'EB': 1000 ** 6, 'eB': 1024 ** 6, 'Eb': 1000 ** 6, + 'eb': 1000 ** 6, + 'exabytes': 1000 ** 6, + 'exbibytes': 1024 ** 6, 'ZiB': 1024 ** 7, 'ZB': 1000 ** 7, 'zB': 1024 ** 7, 'Zb': 1000 ** 7, + 'zb': 1000 ** 7, + 'zettabytes': 1000 ** 7, + 'zebibytes': 1024 ** 7, 'YiB': 1024 ** 8, 'YB': 1000 ** 8, 'yB': 1024 ** 8, 'Yb': 1000 ** 8, + 'yb': 1000 ** 8, + 'yottabytes': 1000 ** 8, + 'yobibytes': 1024 ** 8, } return lookup_unit_table(_UNIT_TABLE, s) @@ -1560,11 +1631,13 @@ def parse_count(s): return lookup_unit_table(_UNIT_TABLE, s) -def month_by_name(name): +def month_by_name(name, lang='en'): """ Return the number of a month by (locale-independently) English name """ + month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en']) + try: - return ENGLISH_MONTH_NAMES.index(name) + 1 + return month_names.index(name) + 1 except ValueError: return None @@ -1630,6 +1703,20 @@ def url_basename(url): return path.strip('/').split('/')[-1] +def base_url(url): + return re.match(r'https?://[^?#&]+/', url).group() + + +def urljoin(base, path): + if not isinstance(path, compat_str) or not path: + return None + if re.match(r'^(?:https?:)?//', path): + return path + if not isinstance(base, compat_str) or not re.match(r'^(?:https?:)?//', base): + return None + return compat_urlparse.urljoin(base, path) + + class HEADRequest(compat_urllib_request.Request): def get_method(self): return 'HEAD' @@ -1686,7 +1773,7 @@ def parse_duration(s): s = s.strip() days, hours, mins, secs, ms = [None] * 5 - m = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?$', s) + m = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?Z?$', s) if m: days, hours, mins, secs, ms = m.groups() else: @@ -1703,11 +1790,11 @@ def parse_duration(s): )? (?: (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s* - )?$''', s) + )?Z?$''', s) if m: days, hours, mins, secs, ms = m.groups() else: - m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)$', s) + m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s) if m: hours, mins = m.groups() else: @@ -1757,8 +1844,12 @@ def get_exe_version(exe, args=['--version'], """ Returns the version of the specified executable, or False if the executable is not present """ try: + # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers + # SIGTTOU if youtube-dl is run in the background. + # See https://github.com/rg3/youtube-dl/issues/955#issuecomment-209789656 out, _ = subprocess.Popen( [encodeArgument(exe)] + args, + stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate() except OSError: return False @@ -1983,11 +2074,27 @@ US_RATINGS = { } +TV_PARENTAL_GUIDELINES = { + 'TV-Y': 0, + 'TV-Y7': 7, + 'TV-G': 0, + 'TV-PG': 0, + 'TV-14': 14, + 'TV-MA': 17, +} + + def parse_age_limit(s): - if s is None: + if type(s) == int: + return s if 0 <= s <= 21 else None + if not isinstance(s, compat_basestring): return None m = re.match(r'^(?P<age>\d{1,2})\+?$', s) - return int(m.group('age')) if m else US_RATINGS.get(s) + if m: + return int(m.group('age')) + if s in US_RATINGS: + return US_RATINGS[s] + return TV_PARENTAL_GUIDELINES.get(s) def strip_jsonp(code): @@ -1996,11 +2103,18 @@ def strip_jsonp(code): def js_to_json(code): + COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*' + SKIP_RE = r'\s*(?:{comment})?\s*'.format(comment=COMMENT_RE) + INTEGER_TABLE = ( + (r'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip=SKIP_RE), 16), + (r'(?s)^(0+[0-7]+){skip}:?$'.format(skip=SKIP_RE), 8), + ) + def fix_kv(m): v = m.group(0) if v in ('true', 'false', 'null'): return v - elif v.startswith('/*') or v == ',': + elif v.startswith('/*') or v.startswith('//') or v == ',': return "" if v[0] in ("'", '"'): @@ -2011,15 +2125,10 @@ def js_to_json(code): '\\x': '\\u00', }.get(m.group(0), m.group(0)), v[1:-1]) - INTEGER_TABLE = ( - (r'^0[xX][0-9a-fA-F]+', 16), - (r'^0+[0-7]+', 8), - ) - for regex, base in INTEGER_TABLE: im = re.match(regex, v) if im: - i = int(im.group(0), base) + i = int(im.group(1), base) return '"%d":' % i if v.endswith(':') else '%d' % i return '"%s"' % v @@ -2027,11 +2136,11 @@ def js_to_json(code): return re.sub(r'''(?sx) "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"| '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'| - /\*.*?\*/|,(?=\s*[\]}])| + {comment}|,(?={skip}[\]}}])| [a-zA-Z_][.a-zA-Z_0-9]*| - \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:\s*:)?| - [0-9]+(?=\s*:) - ''', fix_kv, code) + \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?| + [0-9]+(?={skip}:) + '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code) def qualities(quality_ids): @@ -2105,7 +2214,7 @@ def mimetype2ext(mt): return ext _, _, res = mt.rpartition('/') - res = res.lower() + res = res.split(';')[0].strip().lower() return { '3gpp': '3gp', @@ -2125,6 +2234,7 @@ def mimetype2ext(mt): 'f4m+xml': 'f4m', 'hds+xml': 'f4m', 'vnd.ms-sstr+xml': 'ism', + 'quicktime': 'mov', }.get(res, res) @@ -2140,7 +2250,7 @@ def parse_codecs(codecs_str): if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v'): if not vcodec: vcodec = full_codec - elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac'): + elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3'): if not acodec: acodec = full_codec else: @@ -2263,11 +2373,18 @@ def _match_one(filter_part, dct): m = operator_rex.search(filter_part) if m: op = COMPARISON_OPERATORS[m.group('op')] - if m.group('strval') is not None: + actual_value = dct.get(m.group('key')) + if (m.group('strval') is not None or + # If the original field is a string and matching comparisonvalue is + # a number we should respect the origin of the original field + # and process comparison value as a string (see + # https://github.com/rg3/youtube-dl/issues/11082). + actual_value is not None and m.group('intval') is not None and + isinstance(actual_value, compat_str)): if m.group('op') not in ('=', '!='): raise ValueError( 'Operator %s does not support string values!' % m.group('op')) - comparison_value = m.group('strval') + comparison_value = m.group('strval') or m.group('intval') else: try: comparison_value = int(m.group('intval')) @@ -2279,7 +2396,6 @@ def _match_one(filter_part, dct): raise ValueError( 'Invalid integer value %r in filter part %r' % ( m.group('intval'), filter_part)) - actual_value = dct.get(m.group('key')) if actual_value is None: return m.group('none_inclusive') return op(actual_value, comparison_value) @@ -2392,6 +2508,8 @@ def dfxp2srt(dfxp_data): def cli_option(params, command_option, param): param = params.get(param) + if param: + param = compat_str(param) return [command_option, param] if param is not None else [] @@ -2939,9 +3057,7 @@ def encode_base_n(num, n, table=None): def decode_packed_codes(code): - mobj = re.search( - r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)", - code) + mobj = re.search(PACKED_CODES_RE, code) obfucasted_code, base, count, symbols = mobj.groups() base = int(base) count = int(count) @@ -2969,3 +3085,194 @@ def parse_m3u8_attributes(attrib): def urshift(val, n): return val >> n if val >= 0 else (val + 0x100000000) >> n + + +# Based on png2str() written by @gdkchan and improved by @yokrysty +# Originally posted at https://github.com/rg3/youtube-dl/issues/9706 +def decode_png(png_data): + # Reference: https://www.w3.org/TR/PNG/ + header = png_data[8:] + + if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR': + raise IOError('Not a valid PNG file.') + + int_map = {1: '>B', 2: '>H', 4: '>I'} + unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0] + + chunks = [] + + while header: + length = unpack_integer(header[:4]) + header = header[4:] + + chunk_type = header[:4] + header = header[4:] + + chunk_data = header[:length] + header = header[length:] + + header = header[4:] # Skip CRC + + chunks.append({ + 'type': chunk_type, + 'length': length, + 'data': chunk_data + }) + + ihdr = chunks[0]['data'] + + width = unpack_integer(ihdr[:4]) + height = unpack_integer(ihdr[4:8]) + + idat = b'' + + for chunk in chunks: + if chunk['type'] == b'IDAT': + idat += chunk['data'] + + if not idat: + raise IOError('Unable to read PNG data.') + + decompressed_data = bytearray(zlib.decompress(idat)) + + stride = width * 3 + pixels = [] + + def _get_pixel(idx): + x = idx % stride + y = idx // stride + return pixels[y][x] + + for y in range(height): + basePos = y * (1 + stride) + filter_type = decompressed_data[basePos] + + current_row = [] + + pixels.append(current_row) + + for x in range(stride): + color = decompressed_data[1 + basePos + x] + basex = y * stride + x + left = 0 + up = 0 + + if x > 2: + left = _get_pixel(basex - 3) + if y > 0: + up = _get_pixel(basex - stride) + + if filter_type == 1: # Sub + color = (color + left) & 0xff + elif filter_type == 2: # Up + color = (color + up) & 0xff + elif filter_type == 3: # Average + color = (color + ((left + up) >> 1)) & 0xff + elif filter_type == 4: # Paeth + a = left + b = up + c = 0 + + if x > 2 and y > 0: + c = _get_pixel(basex - stride - 3) + + p = a + b - c + + pa = abs(p - a) + pb = abs(p - b) + pc = abs(p - c) + + if pa <= pb and pa <= pc: + color = (color + a) & 0xff + elif pb <= pc: + color = (color + b) & 0xff + else: + color = (color + c) & 0xff + + current_row.append(color) + + return width, height, pixels + + +def write_xattr(path, key, value): + # This mess below finds the best xattr tool for the job + try: + # try the pyxattr module... + import xattr + + if hasattr(xattr, 'set'): # pyxattr + # Unicode arguments are not supported in python-pyxattr until + # version 0.5.0 + # See https://github.com/rg3/youtube-dl/issues/5498 + pyxattr_required_version = '0.5.0' + if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version): + # TODO: fallback to CLI tools + raise XAttrUnavailableError( + 'python-pyxattr is detected but is too old. ' + 'youtube-dl requires %s or above while your version is %s. ' + 'Falling back to other xattr implementations' % ( + pyxattr_required_version, xattr.__version__)) + + setxattr = xattr.set + else: # xattr + setxattr = xattr.setxattr + + try: + setxattr(path, key, value) + except EnvironmentError as e: + raise XAttrMetadataError(e.errno, e.strerror) + + except ImportError: + if compat_os_name == 'nt': + # Write xattrs to NTFS Alternate Data Streams: + # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29 + assert ':' not in key + assert os.path.exists(path) + + ads_fn = path + ':' + key + try: + with open(ads_fn, 'wb') as f: + f.write(value) + except EnvironmentError as e: + raise XAttrMetadataError(e.errno, e.strerror) + else: + user_has_setfattr = check_executable('setfattr', ['--version']) + user_has_xattr = check_executable('xattr', ['-h']) + + if user_has_setfattr or user_has_xattr: + + value = value.decode('utf-8') + if user_has_setfattr: + executable = 'setfattr' + opts = ['-n', key, '-v', value] + elif user_has_xattr: + executable = 'xattr' + opts = ['-w', key, value] + + cmd = ([encodeFilename(executable, True)] + + [encodeArgument(o) for o in opts] + + [encodeFilename(path, True)]) + + try: + p = subprocess.Popen( + cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE) + except EnvironmentError as e: + raise XAttrMetadataError(e.errno, e.strerror) + stdout, stderr = p.communicate() + stderr = stderr.decode('utf-8', 'replace') + if p.returncode != 0: + raise XAttrMetadataError(p.returncode, stderr) + + else: + # On Unix, and can't find pyxattr, setfattr, or xattr. + if sys.platform.startswith('linux'): + raise XAttrUnavailableError( + "Couldn't find a tool to set the xattrs. " + "Install either the python 'pyxattr' or 'xattr' " + "modules, or the GNU 'attr' package " + "(which contains the 'setfattr' tool).") + else: + raise XAttrUnavailableError( + "Couldn't find a tool to set the xattrs. " + "Install either the python 'xattr' module, " + "or the 'xattr' binary.") diff --git a/youtube_dl/version.py b/youtube_dl/version.py index a4654fa59..0f9b6b703 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2016.08.06' +__version__ = '2017.02.01'