diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index 12de9add2..40a869113 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2019.11.05** +- [ ] I've verified that I'm running youtube-dl version **2020.03.24** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2019.11.05 + [debug] youtube-dl version 2020.03.24 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index 8a6202cf6..7b10df3d4 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2019.11.05** +- [ ] I've verified that I'm running youtube-dl version **2020.03.24** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git 
a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index 83f91d5fe..04bbcfa68 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2019.11.05** +- [ ] I've verified that I'm running youtube-dl version **2020.03.24** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index be8e70f1e..a9e231817 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2019.11.05** +- [ ] I've verified that I'm running youtube-dl version **2020.03.24** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2019.11.05 + [debug] youtube-dl version 2020.03.24 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index 7544d171c..4a3d32d51 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ 
b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2019.11.05** +- [ ] I've verified that I'm running youtube-dl version **2020.03.24** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/.travis.yml b/.travis.yml index 14d95fa84..51afd469a 100644 --- a/.travis.yml +++ b/.travis.yml @@ -13,7 +13,7 @@ dist: trusty env: - YTDL_TEST_SET=core - YTDL_TEST_SET=download -matrix: +jobs: include: - python: 3.7 dist: xenial @@ -35,6 +35,11 @@ matrix: env: YTDL_TEST_SET=download - env: JYTHON=true; YTDL_TEST_SET=core - env: JYTHON=true; YTDL_TEST_SET=download + - name: flake8 + python: 3.8 + dist: xenial + install: pip install flake8 + script: flake8 . fast_finish: true allow_failures: - env: YTDL_TEST_SET=download diff --git a/ChangeLog b/ChangeLog index d46d20082..f753972c4 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,313 @@ +version 2020.03.24 + +Core +- [utils] Revert support for cookie files with spaces used instead of tabs + +Extractors +* [teachable] Update upskillcourses and gns3 domains +* [generic] Look for teachable embeds before wistia ++ [teachable] Extract chapter metadata (#24421) ++ [bilibili] Add support for player.bilibili.com (#24402) ++ [bilibili] Add support for new URL schema with BV ids (#24439, #24442) +* [limelight] Remove disabled API requests (#24255) +* [soundcloud] Fix download URL extraction (#24394) ++ [cbc:watch] Add support for authentication (#19160) +* [hellporno] Fix extraction (#24399) +* [xtube] Fix formats extraction (#24348) +* [ndr] Fix extraction (#24326) +* [nhk] Update m3u8 URL and use native HLS downloader (#24329) +- [nhk] Remove obsolete rtmp formats (#24329) +* [nhk] Relax URL regular expression (#24329) +- [vimeo] Revert fix showcase password protected video extraction (#24224) + + +version 2020.03.08 + +Core ++ [utils] Add support for cookie files 
with spaces used instead of tabs + +Extractors ++ [pornhub] Add support for pornhubpremium.com (#24288) +- [youtube] Remove outdated code and unnecessary requests +* [youtube] Improve extraction in 429 HTTP error conditions (#24283) +* [nhk] Update API version (#24270) + + +version 2020.03.06 + +Extractors +* [youtube] Fix age-gated videos support without login (#24248) +* [vimeo] Fix showcase password protected video extraction (#24224) +* [pornhub] Improve title extraction (#24184) +* [peertube] Improve extraction (#23657) ++ [servus] Add support for new URL schema (#23475, #23583, #24142) +* [vimeo] Fix subtitles URLs (#24209) + + +version 2020.03.01 + +Core +* [YoutubeDL] Force redirect URL to unicode on python 2 +- [options] Remove duplicate short option -v for --version (#24162) + +Extractors +* [xhamster] Fix extraction (#24205) +* [franceculture] Fix extraction (#24204) ++ [telecinco] Add support for article opening videos +* [telecinco] Fix extraction (#24195) +* [xtube] Fix metadata extraction (#21073, #22455) +* [youjizz] Fix extraction (#24181) +- Remove no longer needed compat_str around geturl +* [pornhd] Fix extraction (#24128) ++ [teachable] Add support for multiple videos per lecture (#24101) ++ [wistia] Add support for multiple generic embeds (#8347, #11385) +* [imdb] Fix extraction (#23443) +* [tv2dk:bornholm:play] Fix extraction (#24076) + + +version 2020.02.16 + +Core +* [YoutubeDL] Fix playlist entry indexing with --playlist-items (#10591, + #10622) +* [update] Fix updating via symlinks (#23991) ++ [compat] Introduce compat_realpath (#23991) + +Extractors ++ [npr] Add support for streams (#24042) ++ [24video] Add support for porn.24video.net (#23779, #23784) +- [jpopsuki] Remove extractor (#23858) +* [nova] Improve extraction (#23690) +* [nova:embed] Improve (#23690) +* [nova:embed] Fix extraction (#23672) ++ [abc:iview] Add support for 720p (#22907, #22921) +* [nytimes] Improve format sorting (#24010) ++ [toggle] Add support for mewatch.sg 
(#23895, #23930) +* [thisoldhouse] Fix extraction (#23951) ++ [popcorntimes] Add support for popcorntimes.tv (#23949) +* [sportdeutschland] Update to new API +* [twitch:stream] Lowercase channel id for stream request (#23917) +* [tv5mondeplus] Fix extraction (#23907, #23911) +* [tva] Relax URL regular expression (#23903) +* [vimeo] Fix album extraction (#23864) +* [viewlift] Improve extraction + * Fix extraction (#23851) + + Add support for authentication + + Add support for more domains +* [svt] Fix series extraction (#22297) +* [svt] Fix article extraction (#22897, #22919) +* [soundcloud] Improve private playlist/set tracks extraction (#3707) + + +version 2020.01.24 + +Extractors +* [youtube] Fix sigfunc name extraction (#23819) +* [stretchinternet] Fix extraction (#4319) +* [voicerepublic] Fix extraction +* [azmedien] Fix extraction (#23783) +* [businessinsider] Fix jwplatform id extraction (#22929, #22954) ++ [24video] Add support for 24video.vip (#23753) +* [ivi:compilation] Fix entries extraction (#23770) +* [ard] Improve extraction (#23761) + * Simplify extraction + + Extract age limit and series + * Bypass geo-restriction ++ [nbc] Add support for nbc multi network URLs (#23049) +* [americastestkitchen] Fix extraction +* [zype] Improve extraction + + Extract subtitles (#21258) + + Support URLs with alternative keys/tokens (#21258) + + Extract more metadata +* [orf:tvthek] Improve geo restricted videos detection (#23741) +* [soundcloud] Restore previews extraction (#23739) + + +version 2020.01.15 + +Extractors +* [yourporn] Fix extraction (#21645, #22255, #23459) ++ [canvas] Add support for new API endpoint (#17680, #18629) +* [ndr:base:embed] Improve thumbnails extraction (#23731) ++ [vodplatform] Add support for embed.kwikmotion.com domain ++ [twitter] Add support for promo_video_website cards (#23711) +* [orf:radio] Clean description and improve extraction +* [orf:fm4] Fix extraction (#23599) +* [safari] Fix kaltura session extraction (#23679, #23670) +* 
[lego] Fix extraction and extract subtitle (#23687) +* [cloudflarestream] Improve extraction + + Add support for bytehighway.net domain + + Add support for signed URLs + + Extract thumbnail +* [naver] Improve extraction + * Improve geo-restriction handling + + Extract automatic captions + + Extract uploader metadata + + Extract VLive HLS formats + * Improve metadata extraction +- [pandatv] Remove extractor (#23630) +* [dctp] Fix format extraction (#23656) ++ [scrippsnetworks] Add support for www.discovery.com videos +* [discovery] Fix anonymous token extraction (#23650) +* [nrktv:seriebase] Fix extraction (#23625, #23537) +* [wistia] Improve format extraction and extract subtitles (#22590) +* [vice] Improve extraction (#23631) +* [redtube] Detect private videos (#23518) + + +version 2020.01.01 + +Extractors +* [brightcove] Invalidate policy key cache on failing requests +* [pornhub] Improve locked videos detection (#22449, #22780) ++ [pornhub] Add support for m3u8 formats +* [pornhub] Fix extraction (#22749, #23082) +* [brightcove] Update policy key on failing requests +* [spankbang] Improve removed video detection (#23423) +* [spankbang] Fix extraction (#23307, #23423, #23444) +* [soundcloud] Automatically update client id on failing requests +* [prosiebensat1] Improve geo restriction handling (#23571) +* [brightcove] Cache brightcove player policy keys +* [teachable] Fail with error message if no video URL found +* [teachable] Improve locked lessons detection (#23528) ++ [scrippsnetworks] Add support for Scripps Networks sites (#19857, #22981) +* [mitele] Fix extraction (#21354, #23456) +* [soundcloud] Update client id (#23516) +* [mailru] Relax URL regular expressions (#23509) + + +version 2019.12.25 + +Core +* [utils] Improve str_to_int ++ [downloader/hls] Add ability to override AES decryption key URL (#17521) + +Extractors +* [mediaset] Fix parse formats (#23508) ++ [tv2dk:bornholm:play] Add support for play.tv2bornholm.dk (#23291) ++ [slideslive] Add support 
for url and vimeo service names (#23414) +* [slideslive] Fix extraction (#23413) +* [twitch:clips] Fix extraction (#23375) ++ [soundcloud] Add support for token protected embeds (#18954) +* [vk] Improve extraction + * Fix User Videos extraction (#23356) + * Extract all videos for lists with more than 1000 videos (#23356) + + Add support for video albums (#14327, #14492) +- [kontrtube] Remove extractor +- [videopremium] Remove extractor +- [musicplayon] Remove extractor (#9225) ++ [ufctv] Add support for ufcfightpass.imgdge.com and + ufcfightpass.imggaming.com (#23343) ++ [twitch] Extract m3u8 formats frame rate (#23333) ++ [imggaming] Add support for playlists and extract subtitles ++ [ufcarabia] Add support for UFC Arabia (#23312) +* [ufctv] Fix extraction +* [yahoo] Fix gyao brightcove player id (#23303) +* [vzaar] Override AES decryption key URL (#17521) ++ [vzaar] Add support for AES HLS manifests (#17521, #23299) +* [nrl] Fix extraction +* [teachingchannel] Fix extraction +* [nintendo] Fix extraction and partially add support for Nintendo Direct + videos (#4592) ++ [ooyala] Add better fallback values for domain and streams variables ++ [youtube] Add support for youtubekids.com (#23272) +* [tv2] Detect DRM protection ++ [tv2] Add support for katsomo.fi and mtv.fi (#10543) +* [tv2] Fix tv2.no article extraction +* [msn] Improve extraction + + Add support for YouTube and NBCSports embeds + + Add support for articles with multiple videos + * Improve AOL embed support + * Improve format extraction +* [abcotvs] Relax URL regular expression and improve metadata extraction + (#18014) +* [channel9] Reduce response size +* [adobetv] Improve extraction + * Use OnDemandPagedList for list extractors + * Reduce show extraction requests + * Extract original video format and subtitles + + Add support for adobe tv embeds + + +version 2019.11.28 + +Core ++ [utils] Add generic caesar cipher and rot47 +* [utils] Handle rd-suffixed day parts in unified_strdate (#23199) + +Extractors +* 
[vimeo] Improve extraction + * Fix review extraction + * Fix ondemand extraction + * Make password protected player case as an expected error (#22896) + * Simplify channel based extractors code +- [openload] Remove extractor (#11999) +- [verystream] Remove extractor +- [streamango] Remove extractor (#15406) +* [dailymotion] Improve extraction + * Extract http formats included in m3u8 manifest + * Fix user extraction (#3553, #21415) + + Add support for User Authentication (#11491) + * Fix password protected videos extraction (#23176) + * Respect age limit option and family filter cookie value (#18437) + * Handle video url playlist query param + * Report allowed countries for geo-restricted videos +* [corus] Improve extraction + + Add support for Series Plus, W Network, YTV, ABC Spark, disneychannel.com + and disneylachaine.ca (#20861) + + Add support for self hosted videos (#22075) + * Detect DRM protection (#14910, #9164) +* [vivo] Fix extraction (#22328, #22279) ++ [bitchute] Extract upload date (#22990, #23193) +* [soundcloud] Update client id (#23214) + + +version 2019.11.22 + +Core ++ [extractor/common] Clean jwplayer description HTML tags ++ [extractor/common] Add data, headers and query to all major extract formats + methods + +Extractors +* [chaturbate] Fix extraction (#23010, #23012) ++ [ntvru] Add support for non relative file URLs (#23140) +* [vk] Fix wall audio thumbnails extraction (#23135) +* [ivi] Fix format extraction (#21991) +- [comcarcoff] Remove extractor ++ [drtv] Add support for new URL schema (#23059) ++ [nexx] Add support for Multi Player JS Setup (#23052) ++ [teamcoco] Add support for new videos (#23054) +* [soundcloud] Check if the soundtrack has downloads left (#23045) +* [facebook] Fix posts video data extraction (#22473) +- [addanime] Remove extractor +- [minhateca] Remove extractor +- [daisuki] Remove extractor +* [seeker] Fix extraction +- [revision3] Remove extractors +* [twitch] Fix video comments URL (#18593, #15828) +* [twitter] 
Improve extraction + + Add support for generic embeds (#22168) + * Always extract http formats for native videos (#14934) + + Add support for Twitter Broadcasts (#21369) + + Extract more metadata + * Improve VMap format extraction + * Unify extraction code for both twitter statuses and cards ++ [twitch] Add support for Clip embed URLs +* [lnkgo] Fix extraction (#16834) +* [mixcloud] Improve extraction + * Improve metadata extraction (#11721) + * Fix playlist extraction (#22378) + * Fix user mixes extraction (#15197, #17865) ++ [kinja] Add support for Kinja embeds (#5756, #11282, #22237, #22384) +* [onionstudios] Fix extraction ++ [hotstar] Pass Referer header to format requests (#22836) +* [dplay] Minimize response size ++ [patreon] Extract uploader_id and filesize +* [patreon] Minimize response size +* [roosterteeth] Fix login request (#16094, #22689) + + version 2019.11.05 Extractors @@ -504,7 +814,7 @@ Extractors version 2019.04.17 Extractors -* [openload] Randomize User-Agent (closes #20688) +* [openload] Randomize User-Agent (#20688) + [openload] Add support for oladblock domains (#20471) * [adn] Fix subtitle extraction (#12724) + [aol] Add support for localized websites @@ -1069,7 +1379,7 @@ Extractors + [youtube] Extract channel meta fields (#9676, #12939) * [porntube] Fix extraction (#17541) * [asiancrush] Fix extraction (#15630) -+ [twitch:clips] Extend URL regular expression (closes #17559) ++ [twitch:clips] Extend URL regular expression (#17559) + [vzaar] Add support for HLS * [tube8] Fix metadata extraction (#17520) * [eporner] Extract JSON-LD (#17519) diff --git a/README.md b/README.md index 01f975958..4f54a5240 100644 --- a/README.md +++ b/README.md @@ -835,7 +835,9 @@ In February 2015, the new YouTube player contained a character sequence in a str ### HTTP Error 429: Too Many Requests or 402: Payment Required -These two error codes indicate that the service is blocking your IP address because of overuse. 
Contact the service and ask them to unblock your IP address, or - if you have acquired a whitelisted IP address already - use the [`--proxy` or `--source-address` options](#network-options) to select another IP address. +These two error codes indicate that the service is blocking your IP address because of overuse. Usually this is a soft block, meaning that you can regain access after solving a CAPTCHA. Just open a browser, solve the CAPTCHA the service presents, and then [pass cookies](#how-do-i-pass-cookies-to-youtube-dl) to youtube-dl. Note that if your machine has multiple external IPs, you should also pass the exact IP you used for solving the CAPTCHA with [`--source-address`](#network-options). You may also need to pass your browser's `User-Agent` HTTP header with [`--user-agent`](#workarounds). + +If this is not the case (the service does not offer a CAPTCHA to solve), you can contact the service and ask them to unblock your IP address, or - if you have acquired a whitelisted IP address already - use the [`--proxy` or `--source-address` options](#network-options) to select another IP address. 
### SyntaxError: Non-ASCII character diff --git a/devscripts/create-github-release.py b/devscripts/create-github-release.py index 428111b3f..2ddfa1096 100644 --- a/devscripts/create-github-release.py +++ b/devscripts/create-github-release.py @@ -1,7 +1,6 @@ #!/usr/bin/env python from __future__ import unicode_literals -import base64 import io import json import mimetypes @@ -15,7 +14,6 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from youtube_dl.compat import ( compat_basestring, - compat_input, compat_getpass, compat_print, compat_urllib_request, @@ -40,28 +38,20 @@ class GitHubReleaser(object): try: info = netrc.netrc().authenticators(self._NETRC_MACHINE) if info is not None: - self._username = info[0] - self._password = info[2] + self._token = info[2] compat_print('Using GitHub credentials found in .netrc...') return else: compat_print('No GitHub credentials found in .netrc') except (IOError, netrc.NetrcParseError): compat_print('Unable to parse .netrc') - self._username = compat_input( - 'Type your GitHub username or email address and press [Return]: ') - self._password = compat_getpass( - 'Type your GitHub password and press [Return]: ') + self._token = compat_getpass( + 'Type your GitHub PAT (personal access token) and press [Return]: ') def _call(self, req): if isinstance(req, compat_basestring): req = sanitized_Request(req) - # Authorizing manually since GitHub does not response with 401 with - # WWW-Authenticate header set (see - # https://developer.github.com/v3/#basic-authentication) - b64 = base64.b64encode( - ('%s:%s' % (self._username, self._password)).encode('utf-8')).decode('ascii') - req.add_header('Authorization', 'Basic %s' % b64) + req.add_header('Authorization', 'token %s' % self._token) response = self._opener.open(req).read().decode('utf-8') return json.loads(response) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 536b87479..174b83bf3 100644 --- a/docs/supportedsites.md +++ 
b/docs/supportedsites.md @@ -26,13 +26,13 @@ - **AcademicEarth:Course** - **acast** - **acast:channel** - - **AddAnime** - **ADN**: Anime Digital Network - **AdobeConnect** - - **AdobeTV** - - **AdobeTVChannel** - - **AdobeTVShow** - - **AdobeTVVideo** + - **adobetv** + - **adobetv:channel** + - **adobetv:embed** + - **adobetv:show** + - **adobetv:video** - **AdultSwim** - **aenetworks**: A+E Networks: A&E, Lifetime, History.com, FYI Network and History Vault - **afreecatv**: afreecatv.com @@ -98,6 +98,7 @@ - **BiliBili** - **BilibiliAudio** - **BilibiliAudioAlbum** + - **BiliBiliPlayer** - **BioBioChileTV** - **BIQLE** - **BitChute** @@ -175,7 +176,6 @@ - **CNN** - **CNNArticle** - **CNNBlogs** - - **ComCarCoff** - **ComedyCentral** - **ComedyCentralFullEpisodes** - **ComedyCentralShortname** @@ -203,8 +203,6 @@ - **dailymotion** - **dailymotion:playlist** - **dailymotion:user** - - **DaisukiMotto** - - **DaisukiMottoPlaylist** - **daum.net** - **daum.net:clip** - **daum.net:playlist** @@ -392,7 +390,6 @@ - **JeuxVideo** - **Joj** - **Jove** - - **jpopsuki.tv** - **JWPlatform** - **Kakao** - **Kaltura** @@ -400,13 +397,14 @@ - **Kankan** - **Karaoketv** - **KarriereVideos** + - **Katsomo** - **KeezMovies** - **Ketnet** - **KhanAcademy** - **KickStarter** + - **KinjaEmbed** - **KinoPoisk** - **KonserthusetPlay** - - **kontrtube**: KontrTube.ru - Труба зовёт - **KrasView**: Красвью - **Ku6** - **KUSI** @@ -485,14 +483,12 @@ - **Mgoon** - **MGTV**: 芒果TV - **MiaoPai** - - **Minhateca** - **MinistryGrid** - **Minoto** - **miomio.tv** - **MiTele**: mitele.es - **mixcloud** - **mixcloud:playlist** - - **mixcloud:stream** - **mixcloud:user** - **Mixer:live** - **Mixer:vod** @@ -518,7 +514,6 @@ - **mtvjapan** - **mtvservices:embedded** - **MuenchenTV**: münchen.tv - - **MusicPlayOn** - **mva**: Microsoft Virtual Academy videos - **mva:course**: Microsoft Virtual Academy courses - **Mwave** @@ -623,7 +618,6 @@ - **OnionStudios** - **Ooyala** - **OoyalaExternal** - - 
**Openload** - **OraTV** - **orf:fm4**: radio FM4 - **orf:fm4:story**: fm4.orf.at stories @@ -634,7 +628,6 @@ - **OutsideTV** - **PacktPub** - **PacktPubCourse** - - **PandaTV**: 熊猫TV - **pandora.tv**: 판도라TV - **ParamountNetwork** - **parliamentlive.tv**: UK parliament videos @@ -670,6 +663,7 @@ - **Pokemon** - **PolskieRadio** - **PolskieRadioCategory** + - **Popcorntimes** - **PopcornTV** - **PornCom** - **PornerBros** @@ -723,8 +717,6 @@ - **Restudy** - **Reuters** - **ReverbNation** - - **revision** - - **revision3:embed** - **RICE** - **RMCDecouverte** - **RockstarGames** @@ -769,6 +761,7 @@ - **screen.yahoo:search**: Yahoo screen search - **Screencast** - **ScreencastOMatic** + - **ScrippsNetworks** - **scrippsnetworks:watch** - **SCTE** - **SCTECourse** @@ -832,7 +825,6 @@ - **Steam** - **Stitcher** - **Streamable** - - **Streamango** - **streamcloud.eu** - **StreamCZ** - **StreetVoice** @@ -922,6 +914,7 @@ - **tv2.hu** - **TV2Article** - **TV2DK** + - **TV2DKBornholmPlay** - **TV4**: tv4.se and tv4play.se - **TV5MondePlus**: TV5MONDE+ - **TVA** @@ -958,10 +951,12 @@ - **twitch:vod** - **twitter** - **twitter:amplify** + - **twitter:broadcast** - **twitter:card** - **udemy** - **udemy:course** - **UDNEmbed**: 聯合影音 + - **UFCArabia** - **UFCTV** - **UKTVPlay** - **umg:de**: Universal Music Deutschland @@ -982,7 +977,6 @@ - **Vbox7** - **VeeHD** - **Veoh** - - **verystream** - **Vesti**: Вести.Ru - **Vevo** - **VevoPlaylist** @@ -1002,7 +996,6 @@ - **videomore** - **videomore:season** - **videomore:video** - - **VideoPremium** - **VideoPress** - **Vidio** - **VidLii** @@ -1012,8 +1005,8 @@ - **Vidzi** - **vier**: vier.be and vijf.be - **vier:videos** - - **ViewLift** - - **ViewLiftEmbed** + - **viewlift** + - **viewlift:embed** - **Viidea** - **viki** - **viki:channel** diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index ce9666171..1e204e551 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -816,11 +816,15 @@ class 
TestYoutubeDL(unittest.TestCase): 'webpage_url': 'http://example.com', } - def get_ids(params): + def get_downloaded_info_dicts(params): ydl = YDL(params) - # make a copy because the dictionary can be modified - ydl.process_ie_result(playlist.copy()) - return [int(v['id']) for v in ydl.downloaded_info_dicts] + # make a deep copy because the dictionary and nested entries + # can be modified + ydl.process_ie_result(copy.deepcopy(playlist)) + return ydl.downloaded_info_dicts + + def get_ids(params): + return [int(v['id']) for v in get_downloaded_info_dicts(params)] result = get_ids({}) self.assertEqual(result, [1, 2, 3, 4]) @@ -852,6 +856,22 @@ class TestYoutubeDL(unittest.TestCase): result = get_ids({'playlist_items': '2-4,3-4,3'}) self.assertEqual(result, [2, 3, 4]) + # Tests for https://github.com/ytdl-org/youtube-dl/issues/10591 + # @{ + result = get_downloaded_info_dicts({'playlist_items': '2-4,3-4,3'}) + self.assertEqual(result[0]['playlist_index'], 2) + self.assertEqual(result[1]['playlist_index'], 3) + + result = get_downloaded_info_dicts({'playlist_items': '2-4,3-4,3'}) + self.assertEqual(result[0]['playlist_index'], 2) + self.assertEqual(result[1]['playlist_index'], 3) + self.assertEqual(result[2]['playlist_index'], 4) + + result = get_downloaded_info_dicts({'playlist_items': '4,2'}) + self.assertEqual(result[0]['playlist_index'], 4) + self.assertEqual(result[1]['playlist_index'], 2) + # @} + def test_urlopen_no_file_protocol(self): # see https://github.com/ytdl-org/youtube-dl/issues/8227 ydl = YDL() diff --git a/test/test_subtitles.py b/test/test_subtitles.py index 7d57a628e..17aaaf20d 100644 --- a/test/test_subtitles.py +++ b/test/test_subtitles.py @@ -26,7 +26,6 @@ from youtube_dl.extractor import ( ThePlatformIE, ThePlatformFeedIE, RTVEALaCartaIE, - FunnyOrDieIE, DemocracynowIE, ) @@ -322,18 +321,6 @@ class TestRtveSubtitles(BaseTestSubtitles): self.assertEqual(md5(subtitles['es']), '69e70cae2d40574fb7316f31d6eb7fca') -class 
TestFunnyOrDieSubtitles(BaseTestSubtitles): - url = 'http://www.funnyordie.com/videos/224829ff6d/judd-apatow-will-direct-your-vine' - IE = FunnyOrDieIE - - def test_allsubtitles(self): - self.DL.params['writesubtitles'] = True - self.DL.params['allsubtitles'] = True - subtitles = self.getSubtitles() - self.assertEqual(set(subtitles.keys()), set(['en'])) - self.assertEqual(md5(subtitles['en']), 'c5593c193eacd353596c11c2d4f9ecc4') - - class TestDemocracynowSubtitles(BaseTestSubtitles): url = 'http://www.democracynow.org/shows/2015/7/3' IE = DemocracynowIE diff --git a/test/test_utils.py b/test/test_utils.py index 3920542bb..0896f4150 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -19,6 +19,7 @@ from youtube_dl.utils import ( age_restricted, args_to_str, encode_base_n, + caesar, clean_html, date_from_str, DateRange, @@ -69,6 +70,7 @@ from youtube_dl.utils import ( remove_start, remove_end, remove_quotes, + rot47, shell_quote, smuggle_url, str_to_int, @@ -340,6 +342,8 @@ class TestUtil(unittest.TestCase): self.assertEqual(unified_strdate('July 15th, 2013'), '20130715') self.assertEqual(unified_strdate('September 1st, 2013'), '20130901') self.assertEqual(unified_strdate('Sep 2nd, 2013'), '20130902') + self.assertEqual(unified_strdate('November 3rd, 2019'), '20191103') + self.assertEqual(unified_strdate('October 23rd, 2005'), '20051023') def test_unified_timestamps(self): self.assertEqual(unified_timestamp('December 21, 2010'), 1292889600) @@ -495,6 +499,12 @@ class TestUtil(unittest.TestCase): def test_str_to_int(self): self.assertEqual(str_to_int('123,456'), 123456) self.assertEqual(str_to_int('123.456'), 123456) + self.assertEqual(str_to_int(523), 523) + # Python 3 has no long + if sys.version_info < (3, 0): + eval('self.assertEqual(str_to_int(123456L), 123456)') + self.assertEqual(str_to_int('noninteger'), None) + self.assertEqual(str_to_int([]), None) def test_url_basename(self): self.assertEqual(url_basename('http://foo.de/'), '') @@ -1367,6 +1377,20 
@@ Line 1 self.assertRaises(ValueError, encode_base_n, 0, 70) self.assertRaises(ValueError, encode_base_n, 0, 60, custom_table) + def test_caesar(self): + self.assertEqual(caesar('ace', 'abcdef', 2), 'cea') + self.assertEqual(caesar('cea', 'abcdef', -2), 'ace') + self.assertEqual(caesar('ace', 'abcdef', -2), 'eac') + self.assertEqual(caesar('eac', 'abcdef', 2), 'ace') + self.assertEqual(caesar('ace', 'abcdef', 0), 'ace') + self.assertEqual(caesar('xyz', 'abcdef', 2), 'xyz') + self.assertEqual(caesar('abc', 'acegik', 2), 'ebg') + self.assertEqual(caesar('ebg', 'acegik', -2), 'abc') + + def test_rot47(self): + self.assertEqual(rot47('youtube-dl'), r'J@FEF36\5=') + self.assertEqual(rot47('YOUTUBE-DL'), r'*~&%&qt\s{') + def test_urshift(self): self.assertEqual(urshift(3, 1), 1) self.assertEqual(urshift(-3, 1), 2147483646) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index f5cb46308..19370f62b 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -92,6 +92,7 @@ from .utils import ( YoutubeDLCookieJar, YoutubeDLCookieProcessor, YoutubeDLHandler, + YoutubeDLRedirectHandler, ) from .cache import Cache from .extractor import get_info_extractor, gen_extractor_classes, _LAZY_LOADER @@ -990,7 +991,7 @@ class YoutubeDL(object): 'playlist_title': ie_result.get('title'), 'playlist_uploader': ie_result.get('uploader'), 'playlist_uploader_id': ie_result.get('uploader_id'), - 'playlist_index': i + playliststart, + 'playlist_index': playlistitems[i - 1] if playlistitems else i + playliststart, 'extractor': ie_result['extractor'], 'webpage_url': ie_result['webpage_url'], 'webpage_url_basename': url_basename(ie_result['webpage_url']), @@ -2343,6 +2344,7 @@ class YoutubeDL(object): debuglevel = 1 if self.params.get('debug_printtraffic') else 0 https_handler = make_HTTPS_handler(self.params, debuglevel=debuglevel) ydlh = YoutubeDLHandler(self.params, debuglevel=debuglevel) + redirect_handler = YoutubeDLRedirectHandler() data_handler = 
compat_urllib_request_DataHandler() # When passing our own FileHandler instance, build_opener won't add the @@ -2356,7 +2358,7 @@ class YoutubeDL(object): file_handler.file_open = file_open opener = compat_urllib_request.build_opener( - proxy_handler, https_handler, cookie_processor, ydlh, data_handler, file_handler) + proxy_handler, https_handler, cookie_processor, ydlh, redirect_handler, data_handler, file_handler) # Delete the default user-agent header, which would otherwise apply in # cases where our custom HTTP handler doesn't come into play diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index c75ab131b..d1b86bd13 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -2754,6 +2754,17 @@ else: compat_expanduser = os.path.expanduser +if compat_os_name == 'nt' and sys.version_info < (3, 8): + # os.path.realpath on Windows does not follow symbolic links + # prior to Python 3.8 (see https://bugs.python.org/issue9949) + def compat_realpath(path): + while os.path.islink(path): + path = os.path.abspath(os.readlink(path)) + return path +else: + compat_realpath = os.path.realpath + + if sys.version_info < (3, 0): def compat_print(s): from .utils import preferredencoding @@ -2998,6 +3009,7 @@ __all__ = [ 'compat_os_name', 'compat_parse_qs', 'compat_print', + 'compat_realpath', 'compat_setenv', 'compat_shlex_quote', 'compat_shlex_split', diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py index b59aad73f..84bc34928 100644 --- a/youtube_dl/downloader/hls.py +++ b/youtube_dl/downloader/hls.py @@ -64,7 +64,7 @@ class HlsFD(FragmentFD): s = urlh.read().decode('utf-8', 'ignore') if not self.can_download(s, info_dict): - if info_dict.get('extra_param_to_segment_url'): + if info_dict.get('extra_param_to_segment_url') or info_dict.get('_decryption_key_url'): self.report_error('pycrypto not found. 
Please install it.') return False self.report_warning( @@ -169,7 +169,7 @@ class HlsFD(FragmentFD): if decrypt_info['METHOD'] == 'AES-128': iv = decrypt_info.get('IV') or compat_struct_pack('>8xq', media_sequence) decrypt_info['KEY'] = decrypt_info.get('KEY') or self.ydl.urlopen( - self._prepare_url(info_dict, decrypt_info['URI'])).read() + self._prepare_url(info_dict, info_dict.get('_decryption_key_url') or decrypt_info['URI'])).read() frag_content = AES.new( decrypt_info['KEY'], AES.MODE_CBC, iv).decrypt(frag_content) self._append_fragment(ctx, frag_content) diff --git a/youtube_dl/extractor/abc.py b/youtube_dl/extractor/abc.py index 4ac323bf6..6637f4f35 100644 --- a/youtube_dl/extractor/abc.py +++ b/youtube_dl/extractor/abc.py @@ -110,17 +110,17 @@ class ABCIViewIE(InfoExtractor): # ABC iview programs are normally available for 14 days only. _TESTS = [{ - 'url': 'https://iview.abc.net.au/show/ben-and-hollys-little-kingdom/series/0/video/ZX9371A050S00', - 'md5': 'cde42d728b3b7c2b32b1b94b4a548afc', + 'url': 'https://iview.abc.net.au/show/gruen/series/11/video/LE1927H001S00', + 'md5': '67715ce3c78426b11ba167d875ac6abf', 'info_dict': { - 'id': 'ZX9371A050S00', + 'id': 'LE1927H001S00', 'ext': 'mp4', - 'title': "Gaston's Birthday", - 'series': "Ben And Holly's Little Kingdom", - 'description': 'md5:f9de914d02f226968f598ac76f105bcf', - 'upload_date': '20180604', - 'uploader_id': 'abc4kids', - 'timestamp': 1528140219, + 'title': "Series 11 Ep 1", + 'series': "Gruen", + 'description': 'md5:52cc744ad35045baf6aded2ce7287f67', + 'upload_date': '20190925', + 'uploader_id': 'abc1', + 'timestamp': 1569445289, }, 'params': { 'skip_download': True, @@ -148,7 +148,7 @@ class ABCIViewIE(InfoExtractor): 'hdnea': token, }) - for sd in ('sd', 'sd-low'): + for sd in ('720', 'sd', 'sd-low'): sd_url = try_get( stream, lambda x: x['streams']['hls'][sd], compat_str) if not sd_url: diff --git a/youtube_dl/extractor/abcotvs.py b/youtube_dl/extractor/abcotvs.py index 03b92a39c..0bc69a64f 
100644 --- a/youtube_dl/extractor/abcotvs.py +++ b/youtube_dl/extractor/abcotvs.py @@ -4,29 +4,30 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( + dict_get, int_or_none, - parse_iso8601, + try_get, ) class ABCOTVSIE(InfoExtractor): IE_NAME = 'abcotvs' IE_DESC = 'ABC Owned Television Stations' - _VALID_URL = r'https?://(?:abc(?:7(?:news|ny|chicago)?|11|13|30)|6abc)\.com(?:/[^/]+/(?P[^/]+))?/(?P\d+)' + _VALID_URL = r'https?://(?Pabc(?:7(?:news|ny|chicago)?|11|13|30)|6abc)\.com(?:(?:/[^/]+)*/(?P[^/]+))?/(?P\d+)' _TESTS = [ { 'url': 'http://abc7news.com/entertainment/east-bay-museum-celebrates-vintage-synthesizers/472581/', 'info_dict': { - 'id': '472581', + 'id': '472548', 'display_id': 'east-bay-museum-celebrates-vintage-synthesizers', 'ext': 'mp4', - 'title': 'East Bay museum celebrates vintage synthesizers', + 'title': 'East Bay museum celebrates synthesized music', 'description': 'md5:24ed2bd527096ec2a5c67b9d5a9005f3', 'thumbnail': r're:^https?://.*\.jpg$', - 'timestamp': 1421123075, + 'timestamp': 1421118520, 'upload_date': '20150113', - 'uploader': 'Jonathan Bloom', }, 'params': { # m3u8 download @@ -37,39 +38,63 @@ class ABCOTVSIE(InfoExtractor): 'url': 'http://abc7news.com/472581', 'only_matching': True, }, + { + 'url': 'https://6abc.com/man-75-killed-after-being-struck-by-vehicle-in-chester/5725182/', + 'only_matching': True, + }, ] + _SITE_MAP = { + '6abc': 'wpvi', + 'abc11': 'wtvd', + 'abc13': 'ktrk', + 'abc30': 'kfsn', + 'abc7': 'kabc', + 'abc7chicago': 'wls', + 'abc7news': 'kgo', + 'abc7ny': 'wabc', + } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - display_id = mobj.group('display_id') or video_id + site, display_id, video_id = re.match(self._VALID_URL, url).groups() + display_id = display_id or video_id + station = self._SITE_MAP[site] - webpage = self._download_webpage(url, display_id) + data = 
self._download_json( + 'https://api.abcotvs.com/v2/content', display_id, query={ + 'id': video_id, + 'key': 'otv.web.%s.story' % station, + 'station': station, + })['data'] + video = try_get(data, lambda x: x['featuredMedia']['video'], dict) or data + video_id = compat_str(dict_get(video, ('id', 'publishedKey'), video_id)) + title = video.get('title') or video['linkText'] - m3u8 = self._html_search_meta( - 'contentURL', webpage, 'm3u8 url', fatal=True).split('?')[0] - - formats = self._extract_m3u8_formats(m3u8, display_id, 'mp4') + formats = [] + m3u8_url = video.get('m3u8') + if m3u8_url: + formats = self._extract_m3u8_formats( + video['m3u8'].split('?')[0], display_id, 'mp4', m3u8_id='hls', fatal=False) + mp4_url = video.get('mp4') + if mp4_url: + formats.append({ + 'abr': 128, + 'format_id': 'https', + 'height': 360, + 'url': mp4_url, + 'width': 640, + }) self._sort_formats(formats) - title = self._og_search_title(webpage).strip() - description = self._og_search_description(webpage).strip() - thumbnail = self._og_search_thumbnail(webpage) - timestamp = parse_iso8601(self._search_regex( - r'
\s*
', webpage, + 'description', fatal=False) + + thumbnail = self._search_regex( + r']+class=["\']video-preview[^>]+\bsrc=(["\'])(?P(?:(?!\1).)+)\1', + webpage, 'thumbnail', default=None, + group='value') or self._og_search_thumbnail(webpage) + + creator = self._html_search_meta( + 'video:director', webpage, 'creator', default=None) + + release_date = self._html_search_meta( + 'video:release_date', webpage, default=None) + if release_date: + release_date = release_date.replace('-', '') + + def int_meta(name): + return int_or_none(self._html_search_meta( + name, webpage, default=None)) + + return { + 'id': video_id, + 'display_id': display_id, + 'url': video_url, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'creator': creator, + 'release_date': release_date, + 'duration': int_meta('video:duration'), + 'tbr': int_meta('ya:ovs:bitrate'), + 'width': int_meta('og:video:width'), + 'height': int_meta('og:video:height'), + 'http_headers': { + 'Referer': url, + }, + } diff --git a/youtube_dl/extractor/pornhd.py b/youtube_dl/extractor/pornhd.py index 27d65d4b9..c6052ac9f 100644 --- a/youtube_dl/extractor/pornhd.py +++ b/youtube_dl/extractor/pornhd.py @@ -8,6 +8,7 @@ from ..utils import ( ExtractorError, int_or_none, js_to_json, + merge_dicts, urljoin, ) @@ -27,23 +28,22 @@ class PornHdIE(InfoExtractor): 'view_count': int, 'like_count': int, 'age_limit': 18, - } + }, + 'skip': 'HTTP Error 404: Not Found', }, { - # removed video 'url': 'http://www.pornhd.com/videos/1962/sierra-day-gets-his-cum-all-over-herself-hd-porn-video', - 'md5': '956b8ca569f7f4d8ec563e2c41598441', + 'md5': '1b7b3a40b9d65a8e5b25f7ab9ee6d6de', 'info_dict': { 'id': '1962', 'display_id': 'sierra-day-gets-his-cum-all-over-herself-hd-porn-video', 'ext': 'mp4', - 'title': 'Sierra loves doing laundry', + 'title': 'md5:98c6f8b2d9c229d0f0fde47f61a1a759', 'description': 'md5:8ff0523848ac2b8f9b065ba781ccf294', 'thumbnail': r're:^https?://.*\.jpg', 'view_count': int, 'like_count': int, 
'age_limit': 18, }, - 'skip': 'Not available anymore', }] def _real_extract(self, url): @@ -61,7 +61,13 @@ class PornHdIE(InfoExtractor): r"(?s)sources'?\s*[:=]\s*(\{.+?\})", webpage, 'sources', default='{}')), video_id) + info = {} if not sources: + entries = self._parse_html5_media_entries(url, webpage, video_id) + if entries: + info = entries[0] + + if not sources and not info: message = self._html_search_regex( r'(?s)<(div|p)[^>]+class="no-video"[^>]*>(?P.+?)]+class="description"[^>]*>(?P[^<]+)]+class=["\']video-description[^>]+>(?P.+?)', + r'<(div|p)[^>]+class="description"[^>]*>(?P[^<]+)(?:(?!\1).)+)\1", webpage, - 'thumbnail', fatal=False, group='url') + 'thumbnail', default=None, group='url') like_count = int_or_none(self._search_regex( - (r'(\d+)\s*]+>(?: |\s)*\blikes', + (r'(\d+)\s*likes', + r'(\d+)\s*]+>(?: |\s)*\blikes', r'class=["\']save-count["\'][^>]*>\s*(\d+)'), webpage, 'like count', fatal=False)) - return { + return merge_dicts(info, { 'id': video_id, 'display_id': display_id, 'title': title, @@ -106,4 +118,4 @@ class PornHdIE(InfoExtractor): 'like_count': like_count, 'formats': formats, 'age_limit': 18, - } + }) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index ba0ad7da2..3567a3283 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -17,6 +17,7 @@ from ..utils import ( determine_ext, ExtractorError, int_or_none, + NO_DEFAULT, orderedSet, remove_quotes, str_to_int, @@ -51,7 +52,7 @@ class PornHubIE(PornHubBaseIE): _VALID_URL = r'''(?x) https?:// (?: - (?:[^/]+\.)?(?Ppornhub\.(?:com|net))/(?:(?:view_video\.php|video/show)\?viewkey=|embed/)| + (?:[^/]+\.)?(?Ppornhub(?:premium)?\.(?:com|net))/(?:(?:view_video\.php|video/show)\?viewkey=|embed/)| (?:www\.)?thumbzilla\.com/video/ ) (?P[\da-z]+) @@ -148,6 +149,9 @@ class PornHubIE(PornHubBaseIE): }, { 'url': 'https://www.pornhub.net/view_video.php?viewkey=203640933', 'only_matching': True, + }, { + 'url': 
'https://www.pornhubpremium.com/view_video.php?viewkey=ph5e4acdae54a82', + 'only_matching': True, }] @staticmethod @@ -165,6 +169,13 @@ class PornHubIE(PornHubBaseIE): host = mobj.group('host') or 'pornhub.com' video_id = mobj.group('id') + if 'premium' in host: + if not self._downloader.params.get('cookiefile'): + raise ExtractorError( + 'PornHub Premium requires authentication.' + ' You may want to use --cookies.', + expected=True) + self._set_cookie(host, 'age_verified', '1') def dl_webpage(platform): @@ -188,10 +199,10 @@ class PornHubIE(PornHubBaseIE): # http://www.pornhub.com/view_video.php?viewkey=1331683002), not relying # on that anymore. title = self._html_search_meta( - 'twitter:title', webpage, default=None) or self._search_regex( - (r']+class=["\']title["\'][^>]*>(?P[^<]+)', - r'<div[^>]+data-video-title=(["\'])(?P<title>.+?)\1', - r'shareTitle\s*=\s*(["\'])(?P<title>.+?)\1'), + 'twitter:title', webpage, default=None) or self._html_search_regex( + (r'(?s)<h1[^>]+class=["\']title["\'][^>]*>(?P<title>.+?)</h1>', + r'<div[^>]+data-video-title=(["\'])(?P<title>(?:(?!\1).)+)\1', + r'shareTitle["\']\s*[=:]\s*(["\'])(?P<title>(?:(?!\1).)+)\1'), webpage, 'title', group='title') video_urls = [] @@ -227,12 +238,13 @@ class PornHubIE(PornHubBaseIE): else: thumbnail, duration = [None] * 2 - if not video_urls: - tv_webpage = dl_webpage('tv') - + def extract_js_vars(webpage, pattern, default=NO_DEFAULT): assignments = self._search_regex( - r'(var.+?mediastring.+?)</script>', tv_webpage, - 'encoded url').split(';') + pattern, webpage, 'encoded url', default=default) + if not assignments: + return {} + + assignments = assignments.split(';') js_vars = {} @@ -254,11 +266,35 @@ class PornHubIE(PornHubBaseIE): assn = re.sub(r'var\s+', '', assn) vname, value = assn.split('=', 1) js_vars[vname] = parse_js_value(value) + return js_vars - video_url = js_vars['mediastring'] - if video_url not in video_urls_set: - video_urls.append((video_url, None)) - 
video_urls_set.add(video_url) + def add_video_url(video_url): + v_url = url_or_none(video_url) + if not v_url: + return + if v_url in video_urls_set: + return + video_urls.append((v_url, None)) + video_urls_set.add(v_url) + + if not video_urls: + FORMAT_PREFIXES = ('media', 'quality') + js_vars = extract_js_vars( + webpage, r'(var\s+(?:%s)_.+)' % '|'.join(FORMAT_PREFIXES), + default=None) + if js_vars: + for key, format_url in js_vars.items(): + if any(key.startswith(p) for p in FORMAT_PREFIXES): + add_video_url(format_url) + if not video_urls and re.search( + r'<[^>]+\bid=["\']lockedPlayer', webpage): + raise ExtractorError( + 'Video %s is locked' % video_id, expected=True) + + if not video_urls: + js_vars = extract_js_vars( + dl_webpage('tv'), r'(var.+?mediastring.+?)</script>') + add_video_url(js_vars['mediastring']) for mobj in re.finditer( r'<a[^>]+\bclass=["\']downloadBtn\b[^>]+\bhref=(["\'])(?P<url>(?:(?!\1).)+)\1', @@ -276,10 +312,16 @@ class PornHubIE(PornHubBaseIE): r'/(\d{6}/\d{2})/', video_url, 'upload data', default=None) if upload_date: upload_date = upload_date.replace('/', '') - if determine_ext(video_url) == 'mpd': + ext = determine_ext(video_url) + if ext == 'mpd': formats.extend(self._extract_mpd_formats( video_url, video_id, mpd_id='dash', fatal=False)) continue + elif ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + video_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + continue tbr = None mobj = re.search(r'(?P<height>\d+)[pP]?_(?P<tbr>\d+)[kK]', video_url) if mobj: @@ -373,7 +415,7 @@ class PornHubPlaylistBaseIE(PornHubBaseIE): class PornHubUserIE(PornHubPlaylistBaseIE): - _VALID_URL = r'(?P<url>https?://(?:[^/]+\.)?pornhub\.(?:com|net)/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/?#&]+))(?:[?#&]|/(?!videos)|$)' + _VALID_URL = r'(?P<url>https?://(?:[^/]+\.)?(?P<host>pornhub(?:premium)?\.(?:com|net))/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/?#&]+))(?:[?#&]|/(?!videos)|$)' _TESTS = [{ 
'url': 'https://www.pornhub.com/model/zoe_ph', 'playlist_mincount': 118, @@ -441,7 +483,7 @@ class PornHubPagedPlaylistBaseIE(PornHubPlaylistBaseIE): class PornHubPagedVideoListIE(PornHubPagedPlaylistBaseIE): - _VALID_URL = r'https?://(?:[^/]+\.)?(?P<host>pornhub\.(?:com|net))/(?P<id>(?:[^/]+/)*[^/?#&]+)' + _VALID_URL = r'https?://(?:[^/]+\.)?(?P<host>pornhub(?:premium)?\.(?:com|net))/(?P<id>(?:[^/]+/)*[^/?#&]+)' _TESTS = [{ 'url': 'https://www.pornhub.com/model/zoe_ph/videos', 'only_matching': True, @@ -556,7 +598,7 @@ class PornHubPagedVideoListIE(PornHubPagedPlaylistBaseIE): class PornHubUserVideosUploadIE(PornHubPagedPlaylistBaseIE): - _VALID_URL = r'(?P<url>https?://(?:[^/]+\.)?(?P<host>pornhub\.(?:com|net))/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/]+)/videos/upload)' + _VALID_URL = r'(?P<url>https?://(?:[^/]+\.)?(?P<host>pornhub(?:premium)?\.(?:com|net))/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/]+)/videos/upload)' _TESTS = [{ 'url': 'https://www.pornhub.com/pornstar/jenny-blighe/videos/upload', 'info_dict': { diff --git a/youtube_dl/extractor/prosiebensat1.py b/youtube_dl/extractor/prosiebensat1.py index e19a470a5..1bc4f9b6b 100644 --- a/youtube_dl/extractor/prosiebensat1.py +++ b/youtube_dl/extractor/prosiebensat1.py @@ -16,7 +16,7 @@ from ..utils import ( class ProSiebenSat1BaseIE(InfoExtractor): - _GEO_COUNTRIES = ['DE'] + _GEO_BYPASS = False _ACCESS_ID = None _SUPPORTED_PROTOCOLS = 'dash:clear,hls:clear,progressive:clear' _V4_BASE_URL = 'https://vas-v4.p7s1video.net/4.0/get' @@ -39,14 +39,18 @@ class ProSiebenSat1BaseIE(InfoExtractor): formats = [] if self._ACCESS_ID: raw_ct = self._ENCRYPTION_KEY + clip_id + self._IV + self._ACCESS_ID - server_token = (self._download_json( + protocols = self._download_json( self._V4_BASE_URL + 'protocols', clip_id, 'Downloading protocols JSON', headers=self.geo_verification_headers(), query={ 'access_id': self._ACCESS_ID, 'client_token': sha1((raw_ct).encode()).hexdigest(), 'video_id': clip_id, - }, 
fatal=False) or {}).get('server_token') + }, fatal=False, expected_status=(403,)) or {} + error = protocols.get('error') or {} + if error.get('title') == 'Geo check failed': + self.raise_geo_restricted(countries=['AT', 'CH', 'DE']) + server_token = protocols.get('server_token') if server_token: urls = (self._download_json( self._V4_BASE_URL + 'urls', clip_id, 'Downloading urls JSON', query={ diff --git a/youtube_dl/extractor/redtube.py b/youtube_dl/extractor/redtube.py index 5c84028ef..b1bde1e81 100644 --- a/youtube_dl/extractor/redtube.py +++ b/youtube_dl/extractor/redtube.py @@ -43,8 +43,15 @@ class RedTubeIE(InfoExtractor): webpage = self._download_webpage( 'http://www.redtube.com/%s' % video_id, video_id) - if any(s in webpage for s in ['video-deleted-info', '>This video has been removed']): - raise ExtractorError('Video %s has been removed' % video_id, expected=True) + ERRORS = ( + (('video-deleted-info', '>This video has been removed'), 'has been removed'), + (('private_video_text', '>This video is private', '>Send a friend request to its owner to be able to view it'), 'is private'), + ) + + for patterns, message in ERRORS: + if any(p in webpage for p in patterns): + raise ExtractorError( + 'Video %s %s' % (video_id, message), expected=True) info = self._search_json_ld(webpage, video_id, default={}) diff --git a/youtube_dl/extractor/safari.py b/youtube_dl/extractor/safari.py index bd9ee1647..2cc665122 100644 --- a/youtube_dl/extractor/safari.py +++ b/youtube_dl/extractor/safari.py @@ -8,7 +8,6 @@ from .common import InfoExtractor from ..compat import ( compat_parse_qs, - compat_str, compat_urlparse, ) from ..utils import ( @@ -39,13 +38,13 @@ class SafariBaseIE(InfoExtractor): 'Downloading login page') def is_logged(urlh): - return 'learning.oreilly.com/home/' in compat_str(urlh.geturl()) + return 'learning.oreilly.com/home/' in urlh.geturl() if is_logged(urlh): self.LOGGED_IN = True return - redirect_url = compat_str(urlh.geturl()) + redirect_url = 
urlh.geturl() parsed_url = compat_urlparse.urlparse(redirect_url) qs = compat_parse_qs(parsed_url.query) next_uri = compat_urlparse.urljoin( @@ -165,7 +164,8 @@ class SafariIE(SafariBaseIE): kaltura_session = self._download_json( '%s/player/kaltura_session/?reference_id=%s' % (self._API_BASE, reference_id), video_id, 'Downloading kaltura session JSON', - 'Unable to download kaltura session JSON', fatal=False) + 'Unable to download kaltura session JSON', fatal=False, + headers={'Accept': 'application/json'}) if kaltura_session: session = kaltura_session.get('session') if session: diff --git a/youtube_dl/extractor/scrippsnetworks.py b/youtube_dl/extractor/scrippsnetworks.py index 8b3275735..b40b4c4af 100644 --- a/youtube_dl/extractor/scrippsnetworks.py +++ b/youtube_dl/extractor/scrippsnetworks.py @@ -7,6 +7,7 @@ import re from .aws import AWSIE from .anvato import AnvatoIE +from .common import InfoExtractor from ..utils import ( smuggle_url, urlencode_postdata, @@ -102,3 +103,50 @@ class ScrippsNetworksWatchIE(AWSIE): 'anvato:anvato_scripps_app_web_prod_0837996dbe373629133857ae9eb72e740424d80a:%s' % mcp_id, {'geo_countries': ['US']}), AnvatoIE.ie_key(), video_id=mcp_id) + + +class ScrippsNetworksIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?(?P<site>cookingchanneltv|discovery|(?:diy|food)network|hgtv|travelchannel)\.com/videos/[0-9a-z-]+-(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://www.cookingchanneltv.com/videos/the-best-of-the-best-0260338', + 'info_dict': { + 'id': '0260338', + 'ext': 'mp4', + 'title': 'The Best of the Best', + 'description': 'Catch a new episode of MasterChef Canada Tuedsay at 9/8c.', + 'timestamp': 1475678834, + 'upload_date': '20161005', + 'uploader': 'SCNI-SCND', + }, + 'add_ie': ['ThePlatform'], + }, { + 'url': 'https://www.diynetwork.com/videos/diy-barnwood-tablet-stand-0265790', + 'only_matching': True, + }, { + 'url': 'https://www.foodnetwork.com/videos/chocolate-strawberry-cake-roll-7524591', + 'only_matching': True, + }, { + 
'url': 'https://www.hgtv.com/videos/cookie-decorating-101-0301929', + 'only_matching': True, + }, { + 'url': 'https://www.travelchannel.com/videos/two-climates-one-bag-5302184', + 'only_matching': True, + }, { + 'url': 'https://www.discovery.com/videos/guardians-of-the-glades-cooking-with-tom-cobb-5578368', + 'only_matching': True, + }] + _ACCOUNT_MAP = { + 'cookingchanneltv': 2433005105, + 'discovery': 2706091867, + 'diynetwork': 2433004575, + 'foodnetwork': 2433005105, + 'hgtv': 2433004575, + 'travelchannel': 2433005739, + } + _TP_TEMPL = 'https://link.theplatform.com/s/ip77QC/media/guid/%d/%s?mbr=true' + + def _real_extract(self, url): + site, guid = re.match(self._VALID_URL, url).groups() + return self.url_result(smuggle_url( + self._TP_TEMPL % (self._ACCOUNT_MAP[site], guid), + {'force_smil_url': True}), 'ThePlatform', guid) diff --git a/youtube_dl/extractor/servus.py b/youtube_dl/extractor/servus.py index e579d42cf..9401bf2cf 100644 --- a/youtube_dl/extractor/servus.py +++ b/youtube_dl/extractor/servus.py @@ -7,9 +7,18 @@ from .common import InfoExtractor class ServusIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?servus\.com/(?:(?:at|de)/p/[^/]+|tv/videos)/(?P<id>[aA]{2}-\w+|\d+-\d+)' + _VALID_URL = r'''(?x) + https?:// + (?:www\.)? 
+ (?: + servus\.com/(?:(?:at|de)/p/[^/]+|tv/videos)| + servustv\.com/videos + ) + /(?P<id>[aA]{2}-\w+|\d+-\d+) + ''' _TESTS = [{ - 'url': 'https://www.servus.com/de/p/Die-Gr%C3%BCnen-aus-Sicht-des-Volkes/AA-1T6VBU5PW1W12/', + # new URL schema + 'url': 'https://www.servustv.com/videos/aa-1t6vbu5pw1w12/', 'md5': '3e1dd16775aa8d5cbef23628cfffc1f4', 'info_dict': { 'id': 'AA-1T6VBU5PW1W12', @@ -18,6 +27,10 @@ class ServusIE(InfoExtractor): 'description': 'md5:1247204d85783afe3682644398ff2ec4', 'thumbnail': r're:^https?://.*\.jpg', } + }, { + # old URL schema + 'url': 'https://www.servus.com/de/p/Die-Gr%C3%BCnen-aus-Sicht-des-Volkes/AA-1T6VBU5PW1W12/', + 'only_matching': True, }, { 'url': 'https://www.servus.com/at/p/Wie-das-Leben-beginnt/1309984137314-381415152/', 'only_matching': True, diff --git a/youtube_dl/extractor/shared.py b/youtube_dl/extractor/shared.py index ff575f592..02295d1a4 100644 --- a/youtube_dl/extractor/shared.py +++ b/youtube_dl/extractor/shared.py @@ -1,13 +1,18 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import compat_b64decode +from ..compat import ( + compat_b64decode, + compat_urllib_parse_unquote_plus, +) from ..utils import ( determine_ext, ExtractorError, int_or_none, + js_to_json, KNOWN_EXTENSIONS, parse_filesize, + rot47, url_or_none, urlencode_postdata, ) @@ -112,16 +117,22 @@ class VivoIE(SharedBaseIE): webpage, 'filesize', fatal=False)) def _extract_video_url(self, webpage, video_id, url): - def decode_url(encoded_url): + def decode_url_old(encoded_url): return compat_b64decode(encoded_url).decode('utf-8') - stream_url = url_or_none(decode_url(self._search_regex( + stream_url = self._search_regex( r'data-stream\s*=\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, - 'stream url', default=None, group='url'))) + 'stream url', default=None, group='url') + if stream_url: + stream_url = url_or_none(decode_url_old(stream_url)) if stream_url: return stream_url - return self._parse_json( + + def 
decode_url(encoded_url): + return rot47(compat_urllib_parse_unquote_plus(encoded_url)) + + return decode_url(self._parse_json( self._search_regex( - r'InitializeStream\s*\(\s*(["\'])(?P<url>(?:(?!\1).)+)\1', - webpage, 'stream', group='url'), - video_id, transform_source=decode_url)[0] + r'(?s)InitializeStream\s*\(\s*({.+?})\s*\)\s*;', webpage, + 'stream'), + video_id, transform_source=js_to_json)['source']) diff --git a/youtube_dl/extractor/slideslive.py b/youtube_dl/extractor/slideslive.py index ed84322c5..d9ea76831 100644 --- a/youtube_dl/extractor/slideslive.py +++ b/youtube_dl/extractor/slideslive.py @@ -2,7 +2,7 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import ExtractorError +from ..utils import smuggle_url class SlidesLiveIE(InfoExtractor): @@ -14,9 +14,9 @@ class SlidesLiveIE(InfoExtractor): 'info_dict': { 'id': 'LMtgR8ba0b0', 'ext': 'mp4', - 'title': '38902413: external video', - 'description': '3890241320170925-9-1yd6ech.mp4', - 'uploader': 'SlidesLive Administrator', + 'title': 'GCC IA16 backend', + 'description': 'Watch full version of this video at https://slideslive.com/38902413.', + 'uploader': 'SlidesLive Videos - A', 'uploader_id': 'UC62SdArr41t_-_fX40QCLRw', 'upload_date': '20170925', } @@ -24,16 +24,38 @@ class SlidesLiveIE(InfoExtractor): # video_service_name = youtube 'url': 'https://slideslive.com/38903721/magic-a-scientific-resurrection-of-an-esoteric-legend', 'only_matching': True, + }, { + # video_service_name = url + 'url': 'https://slideslive.com/38922070/learning-transferable-skills-1', + 'only_matching': True, + }, { + # video_service_name = vimeo + 'url': 'https://slideslive.com/38921896/retrospectives-a-venue-for-selfreflection-in-ml-research-3', + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) video_data = self._download_json( - url, video_id, headers={'Accept': 'application/json'}) + 'https://ben.slideslive.com/player/' + video_id, video_id) 
service_name = video_data['video_service_name'].lower() - if service_name == 'youtube': - yt_video_id = video_data['video_service_id'] - return self.url_result(yt_video_id, 'Youtube', video_id=yt_video_id) + assert service_name in ('url', 'vimeo', 'youtube') + service_id = video_data['video_service_id'] + info = { + 'id': video_id, + 'thumbnail': video_data.get('thumbnail'), + 'url': service_id, + } + if service_name == 'url': + info['title'] = video_data['title'] else: - raise ExtractorError( - 'Unsupported service name: {0}'.format(service_name), expected=True) + info.update({ + '_type': 'url_transparent', + 'ie_key': service_name.capitalize(), + 'title': video_data.get('title'), + }) + if service_name == 'vimeo': + info['url'] = smuggle_url( + 'https://player.vimeo.com/video/' + service_id, + {'http_headers': {'Referer': url}}) + return info diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index e8ffb2cbe..422ce1626 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -9,10 +9,13 @@ from .common import ( SearchInfoExtractor ) from ..compat import ( + compat_HTTPError, + compat_kwargs, compat_str, compat_urlparse, ) from ..utils import ( + error_to_compat_str, ExtractorError, float_or_none, HEADRequest, @@ -24,11 +27,17 @@ from ..utils import ( unified_timestamp, update_url_query, url_or_none, + urlhandle_detect_ext, ) class SoundcloudEmbedIE(InfoExtractor): - _VALID_URL = r'https?://(?:w|player|p)\.soundcloud\.com/player/?.*?url=(?P<id>.*)' + _VALID_URL = r'https?://(?:w|player|p)\.soundcloud\.com/player/?.*?\burl=(?P<id>.+)' + _TEST = { + # from https://www.soundi.fi/uutiset/ennakkokuuntelussa-timo-kaukolammen-station-to-station-to-station-julkaisua-juhlitaan-tanaan-g-livelabissa/ + 'url': 'https://w.soundcloud.com/player/?visual=true&url=https%3A%2F%2Fapi.soundcloud.com%2Fplaylists%2F922213810&show_artwork=true&maxwidth=640&maxheight=960&dnt=1&secret_token=s-ziYey', + 'only_matching': 
True, + } @staticmethod def _extract_urls(webpage): @@ -37,8 +46,13 @@ class SoundcloudEmbedIE(InfoExtractor): webpage)] def _real_extract(self, url): - return self.url_result(compat_urlparse.parse_qs( - compat_urlparse.urlparse(url).query)['url'][0]) + query = compat_urlparse.parse_qs( + compat_urlparse.urlparse(url).query) + api_url = query['url'][0] + secret_token = query.get('secret_token') + if secret_token: + api_url = update_url_query(api_url, {'secret_token': secret_token[0]}) + return self.url_result(api_url) class SoundcloudIE(InfoExtractor): @@ -83,7 +97,7 @@ class SoundcloudIE(InfoExtractor): 'repost_count': int, } }, - # not streamable song + # geo-restricted { 'url': 'https://soundcloud.com/the-concept-band/goldrushed-mastered?in=the-concept-band/sets/the-royal-concept-ep', 'info_dict': { @@ -95,18 +109,13 @@ class SoundcloudIE(InfoExtractor): 'uploader_id': '9615865', 'timestamp': 1337635207, 'upload_date': '20120521', - 'duration': 30, + 'duration': 227.155, 'license': 'all-rights-reserved', 'view_count': int, 'like_count': int, 'comment_count': int, 'repost_count': int, }, - 'params': { - # rtmp - 'skip_download': True, - }, - 'skip': 'Preview', }, # private link { @@ -217,7 +226,6 @@ class SoundcloudIE(InfoExtractor): 'skip_download': True, }, }, - # not available via api.soundcloud.com/i1/tracks/id/streams { 'url': 'https://soundcloud.com/giovannisarani/mezzo-valzer', 'md5': 'e22aecd2bc88e0e4e432d7dcc0a1abf7', @@ -226,7 +234,7 @@ class SoundcloudIE(InfoExtractor): 'ext': 'mp3', 'title': 'Mezzo Valzer', 'description': 'md5:4138d582f81866a530317bae316e8b61', - 'uploader': 'Giovanni Sarani', + 'uploader': 'Micronie', 'uploader_id': '3352531', 'timestamp': 1551394171, 'upload_date': '20190228', @@ -238,14 +246,16 @@ class SoundcloudIE(InfoExtractor): 'comment_count': int, 'repost_count': int, }, - 'expected_warnings': ['Unable to download JSON metadata'], - } + }, + { + # with AAC HQ format available via OAuth token + 'url': 
'https://soundcloud.com/wandw/the-chainsmokers-ft-daya-dont-let-me-down-ww-remix-1', + 'only_matching': True, + }, ] - _API_BASE = 'https://api.soundcloud.com/' _API_V2_BASE = 'https://api-v2.soundcloud.com/' _BASE_URL = 'https://soundcloud.com/' - _CLIENT_ID = 'BeGVhOrGmfboy1LtiHTQF6Ejpt9ULJCI' _IMAGE_REPL_RE = r'-([0-9a-z]+)\.jpg' _ARTWORK_MAP = { @@ -261,14 +271,53 @@ class SoundcloudIE(InfoExtractor): 'original': 0, } + def _store_client_id(self, client_id): + self._downloader.cache.store('soundcloud', 'client_id', client_id) + + def _update_client_id(self): + webpage = self._download_webpage('https://soundcloud.com/', None) + for src in reversed(re.findall(r'<script[^>]+src="([^"]+)"', webpage)): + script = self._download_webpage(src, None, fatal=False) + if script: + client_id = self._search_regex( + r'client_id\s*:\s*"([0-9a-zA-Z]{32})"', + script, 'client id', default=None) + if client_id: + self._CLIENT_ID = client_id + self._store_client_id(client_id) + return + raise ExtractorError('Unable to extract client id') + + def _download_json(self, *args, **kwargs): + non_fatal = kwargs.get('fatal') is False + if non_fatal: + del kwargs['fatal'] + query = kwargs.get('query', {}).copy() + for _ in range(2): + query['client_id'] = self._CLIENT_ID + kwargs['query'] = query + try: + return super(SoundcloudIE, self)._download_json(*args, **compat_kwargs(kwargs)) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: + self._store_client_id(None) + self._update_client_id() + continue + elif non_fatal: + self._downloader.report_warning(error_to_compat_str(e)) + return False + raise + + def _real_initialize(self): + self._CLIENT_ID = self._downloader.cache.load('soundcloud', 'client_id') or 'YUKXoArFcqrlQn9tfNHvvyfnDISj04zk' + @classmethod def _resolv_url(cls, url): - return SoundcloudIE._API_V2_BASE + 'resolve?url=' + url + '&client_id=' + cls._CLIENT_ID + return SoundcloudIE._API_V2_BASE + 'resolve?url=' + url - def 
_extract_info_dict(self, info, full_title=None, secret_token=None, version=2): + def _extract_info_dict(self, info, full_title=None, secret_token=None): track_id = compat_str(info['id']) title = info['title'] - track_base_url = self._API_BASE + 'tracks/%s' % track_id format_urls = set() formats = [] @@ -277,26 +326,27 @@ class SoundcloudIE(InfoExtractor): query['secret_token'] = secret_token if info.get('downloadable') and info.get('has_downloads_left'): - format_url = update_url_query( - info.get('download_url') or track_base_url + '/download', query) - format_urls.add(format_url) - if version == 2: - v1_info = self._download_json( - track_base_url, track_id, query=query, fatal=False) or {} - else: - v1_info = info - formats.append({ - 'format_id': 'download', - 'ext': v1_info.get('original_format') or 'mp3', - 'filesize': int_or_none(v1_info.get('original_content_size')), - 'url': format_url, - 'preference': 10, - }) + download_url = update_url_query( + self._API_V2_BASE + 'tracks/' + track_id + '/download', query) + redirect_url = (self._download_json(download_url, track_id, fatal=False) or {}).get('redirectUri') + if redirect_url: + urlh = self._request_webpage( + HEADRequest(redirect_url), track_id, fatal=False) + if urlh: + format_url = urlh.geturl() + format_urls.add(format_url) + formats.append({ + 'format_id': 'download', + 'ext': urlhandle_detect_ext(urlh) or 'mp3', + 'filesize': int_or_none(urlh.headers.get('Content-Length')), + 'url': format_url, + 'preference': 10, + }) def invalid_url(url): - return not url or url in format_urls or re.search(r'/(?:preview|playlist)/0/30/', url) + return not url or url in format_urls - def add_format(f, protocol): + def add_format(f, protocol, is_preview=False): mobj = re.search(r'\.(?P<abr>\d+)\.(?P<ext>[0-9a-z]{3,4})(?=[/?])', stream_url) if mobj: for k, v in mobj.groupdict().items(): @@ -305,16 +355,27 @@ class SoundcloudIE(InfoExtractor): format_id_list = [] if protocol: format_id_list.append(protocol) + ext = 
f.get('ext') + if ext == 'aac': + f['abr'] = '256' for k in ('ext', 'abr'): v = f.get(k) if v: format_id_list.append(v) + preview = is_preview or re.search(r'/(?:preview|playlist)/0/30/', f['url']) + if preview: + format_id_list.append('preview') abr = f.get('abr') if abr: f['abr'] = int(abr) + if protocol == 'hls': + protocol = 'm3u8' if ext == 'aac' else 'm3u8_native' + else: + protocol = 'http' f.update({ 'format_id': '_'.join(format_id_list), - 'protocol': 'm3u8_native' if protocol == 'hls' else 'http', + 'protocol': protocol, + 'preference': -10 if preview else None, }) formats.append(f) @@ -325,7 +386,7 @@ class SoundcloudIE(InfoExtractor): if not isinstance(t, dict): continue format_url = url_or_none(t.get('url')) - if not format_url or t.get('snipped') or '/preview/' in format_url: + if not format_url: continue stream = self._download_json( format_url, track_id, query=query, fatal=False) @@ -348,44 +409,14 @@ class SoundcloudIE(InfoExtractor): add_format({ 'url': stream_url, 'ext': ext, - }, 'http' if protocol == 'progressive' else protocol) - - if not formats: - # Old API, does not work for some tracks (e.g. - # https://soundcloud.com/giovannisarani/mezzo-valzer) - # and might serve preview URLs (e.g. 
- # http://www.soundcloud.com/snbrn/ele) - format_dict = self._download_json( - track_base_url + '/streams', track_id, - 'Downloading track url', query=query, fatal=False) or {} - - for key, stream_url in format_dict.items(): - if invalid_url(stream_url): - continue - format_urls.add(stream_url) - mobj = re.search(r'(http|hls)_([^_]+)_(\d+)_url', key) - if mobj: - protocol, ext, abr = mobj.groups() - add_format({ - 'abr': abr, - 'ext': ext, - 'url': stream_url, - }, protocol) - - if not formats: - # We fallback to the stream_url in the original info, this - # cannot be always used, sometimes it can give an HTTP 404 error - urlh = self._request_webpage( - HEADRequest(info.get('stream_url') or track_base_url + '/stream'), - track_id, query=query, fatal=False) - if urlh: - stream_url = urlh.geturl() - if not invalid_url(stream_url): - add_format({'url': stream_url}, 'http') + }, 'http' if protocol == 'progressive' else protocol, + t.get('snipped') or '/preview/' in format_url) for f in formats: f['vcodec'] = 'none' + if not formats and info.get('policy') == 'BLOCK': + self.raise_geo_restricted() self._sort_formats(formats) user = info.get('user') or {} @@ -441,9 +472,7 @@ class SoundcloudIE(InfoExtractor): track_id = mobj.group('track_id') - query = { - 'client_id': self._CLIENT_ID, - } + query = {} if track_id: info_json_url = self._API_V2_BASE + 'tracks/' + track_id full_title = track_id @@ -457,20 +486,24 @@ class SoundcloudIE(InfoExtractor): resolve_title += '/%s' % token info_json_url = self._resolv_url(self._BASE_URL + resolve_title) - version = 2 info = self._download_json( - info_json_url, full_title, 'Downloading info JSON', query=query, fatal=False) - if not info: - info = self._download_json( - info_json_url.replace(self._API_V2_BASE, self._API_BASE), - full_title, 'Downloading info JSON', query=query) - version = 1 + info_json_url, full_title, 'Downloading info JSON', query=query) - return self._extract_info_dict(info, full_title, token, version) + return 
self._extract_info_dict(info, full_title, token) class SoundcloudPlaylistBaseIE(SoundcloudIE): - def _extract_track_entries(self, tracks, token=None): + def _extract_set(self, playlist, token=None): + playlist_id = compat_str(playlist['id']) + tracks = playlist.get('tracks') or [] + if not all([t.get('permalink_url') for t in tracks]) and token: + tracks = self._download_json( + self._API_V2_BASE + 'tracks', playlist_id, + 'Downloading tracks', query={ + 'ids': ','.join([compat_str(t['id']) for t in tracks]), + 'playlistId': playlist_id, + 'playlistSecretToken': token, + }) entries = [] for track in tracks: track_id = str_or_none(track.get('id')) @@ -483,7 +516,10 @@ class SoundcloudPlaylistBaseIE(SoundcloudIE): url += '?secret_token=' + token entries.append(self.url_result( url, SoundcloudIE.ie_key(), track_id)) - return entries + return self.playlist_result( + entries, playlist_id, + playlist.get('title'), + playlist.get('description')) class SoundcloudSetIE(SoundcloudPlaylistBaseIE): @@ -494,6 +530,7 @@ class SoundcloudSetIE(SoundcloudPlaylistBaseIE): 'info_dict': { 'id': '2284613', 'title': 'The Royal Concept EP', + 'description': 'md5:71d07087c7a449e8941a70a29e34671e', }, 'playlist_mincount': 5, }, { @@ -516,17 +553,13 @@ class SoundcloudSetIE(SoundcloudPlaylistBaseIE): msgs = (compat_str(err['error_message']) for err in info['errors']) raise ExtractorError('unable to download video webpage: %s' % ','.join(msgs)) - entries = self._extract_track_entries(info['tracks'], token) - - return self.playlist_result( - entries, str_or_none(info.get('id')), info.get('title')) + return self._extract_set(info, token) -class SoundcloudPagedPlaylistBaseIE(SoundcloudPlaylistBaseIE): +class SoundcloudPagedPlaylistBaseIE(SoundcloudIE): def _extract_playlist(self, base_url, playlist_id, playlist_title): COMMON_QUERY = { 'limit': 2000000000, - 'client_id': self._CLIENT_ID, 'linked_partitioning': '1', } @@ -712,9 +745,7 @@ class SoundcloudPlaylistIE(SoundcloudPlaylistBaseIE): mobj 
= re.match(self._VALID_URL, url) playlist_id = mobj.group('id') - query = { - 'client_id': self._CLIENT_ID, - } + query = {} token = mobj.group('token') if token: query['secret_token'] = token @@ -723,10 +754,7 @@ class SoundcloudPlaylistIE(SoundcloudPlaylistBaseIE): self._API_V2_BASE + 'playlists/' + playlist_id, playlist_id, 'Downloading playlist', query=query) - entries = self._extract_track_entries(data['tracks'], token) - - return self.playlist_result( - entries, playlist_id, data.get('title'), data.get('description')) + return self._extract_set(data, token) class SoundcloudSearchIE(SearchInfoExtractor, SoundcloudIE): @@ -751,7 +779,6 @@ class SoundcloudSearchIE(SearchInfoExtractor, SoundcloudIE): self._MAX_RESULTS_PER_PAGE) query.update({ 'limit': limit, - 'client_id': self._CLIENT_ID, 'linked_partitioning': 1, 'offset': 0, }) diff --git a/youtube_dl/extractor/spankbang.py b/youtube_dl/extractor/spankbang.py index e040ada29..61ca902ce 100644 --- a/youtube_dl/extractor/spankbang.py +++ b/youtube_dl/extractor/spankbang.py @@ -4,6 +4,7 @@ import re from .common import InfoExtractor from ..utils import ( + determine_ext, ExtractorError, merge_dicts, orderedSet, @@ -64,7 +65,7 @@ class SpankBangIE(InfoExtractor): url.replace('/%s/embed' % video_id, '/%s/video' % video_id), video_id, headers={'Cookie': 'country=US'}) - if re.search(r'<[^>]+\bid=["\']video_removed', webpage): + if re.search(r'<[^>]+\b(?:id|class)=["\']video_removed', webpage): raise ExtractorError( 'Video %s is not available' % video_id, expected=True) @@ -75,11 +76,20 @@ class SpankBangIE(InfoExtractor): if not f_url: return f = parse_resolution(format_id) - f.update({ - 'url': f_url, - 'format_id': format_id, - }) - formats.append(f) + ext = determine_ext(f_url) + if format_id.startswith('m3u8') or ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + f_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + elif format_id.startswith('mpd') or ext == 'mpd': + 
formats.extend(self._extract_mpd_formats( + f_url, video_id, mpd_id='dash', fatal=False)) + elif ext == 'mp4' or f.get('width') or f.get('height'): + f.update({ + 'url': f_url, + 'format_id': format_id, + }) + formats.append(f) STREAM_URL_PREFIX = 'stream_url_' @@ -93,28 +103,22 @@ class SpankBangIE(InfoExtractor): r'data-streamkey\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1', webpage, 'stream key', group='value') - sb_csrf_session = self._get_cookies( - 'https://spankbang.com')['sb_csrf_session'].value - stream = self._download_json( 'https://spankbang.com/api/videos/stream', video_id, 'Downloading stream JSON', data=urlencode_postdata({ 'id': stream_key, 'data': 0, - 'sb_csrf_session': sb_csrf_session, }), headers={ 'Referer': url, - 'X-CSRFToken': sb_csrf_session, + 'X-Requested-With': 'XMLHttpRequest', }) for format_id, format_url in stream.items(): - if format_id.startswith(STREAM_URL_PREFIX): - if format_url and isinstance(format_url, list): - format_url = format_url[0] - extract_format( - format_id[len(STREAM_URL_PREFIX):], format_url) + if format_url and isinstance(format_url, list): + format_url = format_url[0] + extract_format(format_id, format_url) - self._sort_formats(formats) + self._sort_formats(formats, field_preference=('preference', 'height', 'width', 'fps', 'tbr', 'format_id')) info = self._search_json_ld(webpage, video_id, default={}) diff --git a/youtube_dl/extractor/spankwire.py b/youtube_dl/extractor/spankwire.py index 44d8fa52f..35ab9ec37 100644 --- a/youtube_dl/extractor/spankwire.py +++ b/youtube_dl/extractor/spankwire.py @@ -3,34 +3,47 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import ( - compat_urllib_parse_unquote, - compat_urllib_parse_urlparse, -) from ..utils import ( - sanitized_Request, + float_or_none, + int_or_none, + merge_dicts, + str_or_none, str_to_int, - unified_strdate, + url_or_none, ) -from ..aes import aes_decrypt_text class SpankwireIE(InfoExtractor): - _VALID_URL = 
r'https?://(?:www\.)?(?P<url>spankwire\.com/[^/]*/video(?P<id>[0-9]+)/?)' + _VALID_URL = r'''(?x) + https?:// + (?:www\.)?spankwire\.com/ + (?: + [^/]+/video| + EmbedPlayer\.aspx/?\?.*?\bArticleId= + ) + (?P<id>\d+) + ''' _TESTS = [{ # download URL pattern: */<height>P_<tbr>K_<video_id>.mp4 'url': 'http://www.spankwire.com/Buckcherry-s-X-Rated-Music-Video-Crazy-Bitch/video103545/', - 'md5': '8bbfde12b101204b39e4b9fe7eb67095', + 'md5': '5aa0e4feef20aad82cbcae3aed7ab7cd', 'info_dict': { 'id': '103545', 'ext': 'mp4', 'title': 'Buckcherry`s X Rated Music Video Crazy Bitch', 'description': 'Crazy Bitch X rated music video.', + 'duration': 222, 'uploader': 'oreusz', 'uploader_id': '124697', - 'upload_date': '20070507', + 'timestamp': 1178587885, + 'upload_date': '20070508', + 'average_rating': float, + 'view_count': int, + 'comment_count': int, 'age_limit': 18, - } + 'categories': list, + 'tags': list, + }, }, { # download URL pattern: */mp4_<format_id>_<video_id>.mp4 'url': 'http://www.spankwire.com/Titcums-Compiloation-I/video1921551/', @@ -45,83 +58,125 @@ class SpankwireIE(InfoExtractor): 'upload_date': '20150822', 'age_limit': 18, }, + 'params': { + 'proxy': '127.0.0.1:8118' + }, + 'skip': 'removed', + }, { + 'url': 'https://www.spankwire.com/EmbedPlayer.aspx/?ArticleId=156156&autostart=true', + 'only_matching': True, }] + @staticmethod + def _extract_urls(webpage): + return re.findall( + r'<iframe[^>]+\bsrc=["\']((?:https?:)?//(?:www\.)?spankwire\.com/EmbedPlayer\.aspx/?\?.*?\bArticleId=\d+)', + webpage) + def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) - req = sanitized_Request('http://www.' 
+ mobj.group('url')) - req.add_header('Cookie', 'age_verified=1') - webpage = self._download_webpage(req, video_id) + video = self._download_json( + 'https://www.spankwire.com/api/video/%s.json' % video_id, video_id) - title = self._html_search_regex( - r'<h1>([^<]+)', webpage, 'title') - description = self._html_search_regex( - r'(?s)<div\s+id="descriptionContent">(.+?)</div>', - webpage, 'description', fatal=False) - thumbnail = self._html_search_regex( - r'playerData\.screenShot\s*=\s*["\']([^"\']+)["\']', - webpage, 'thumbnail', fatal=False) - - uploader = self._html_search_regex( - r'by:\s*<a [^>]*>(.+?)</a>', - webpage, 'uploader', fatal=False) - uploader_id = self._html_search_regex( - r'by:\s*<a href="/(?:user/viewProfile|Profile\.aspx)\?.*?UserId=(\d+).*?"', - webpage, 'uploader id', fatal=False) - upload_date = unified_strdate(self._html_search_regex( - r'</a> on (.+?) at \d+:\d+', - webpage, 'upload date', fatal=False)) - - view_count = str_to_int(self._html_search_regex( - r'<div id="viewsCounter"><span>([\d,\.]+)</span> views</div>', - webpage, 'view count', fatal=False)) - comment_count = str_to_int(self._html_search_regex( - r'<span\s+id="spCommentCount"[^>]*>([\d,\.]+)</span>', - webpage, 'comment count', fatal=False)) - - videos = re.findall( - r'playerData\.cdnPath([0-9]{3,})\s*=\s*(?:encodeURIComponent\()?["\']([^"\']+)["\']', webpage) - heights = [int(video[0]) for video in videos] - video_urls = list(map(compat_urllib_parse_unquote, [video[1] for video in videos])) - if webpage.find(r'flashvars\.encrypted = "true"') != -1: - password = self._search_regex( - r'flashvars\.video_title = "([^"]+)', - webpage, 'password').replace('+', ' ') - video_urls = list(map( - lambda s: aes_decrypt_text(s, password, 32).decode('utf-8'), - video_urls)) + title = video['title'] formats = [] - for height, video_url in zip(heights, video_urls): - path = compat_urllib_parse_urlparse(video_url).path - m = re.search(r'/(?P<height>\d+)[pP]_(?P<tbr>\d+)[kK]', path) - 
if m: - tbr = int(m.group('tbr')) - height = int(m.group('height')) - else: - tbr = None - formats.append({ - 'url': video_url, - 'format_id': '%dp' % height, - 'height': height, - 'tbr': tbr, + videos = video.get('videos') + if isinstance(videos, dict): + for format_id, format_url in videos.items(): + video_url = url_or_none(format_url) + if not format_url: + continue + height = int_or_none(self._search_regex( + r'(\d+)[pP]', format_id, 'height', default=None)) + m = re.search( + r'/(?P<height>\d+)[pP]_(?P<tbr>\d+)[kK]', video_url) + if m: + tbr = int(m.group('tbr')) + height = height or int(m.group('height')) + else: + tbr = None + formats.append({ + 'url': video_url, + 'format_id': '%dp' % height if height else format_id, + 'height': height, + 'tbr': tbr, + }) + m3u8_url = url_or_none(video.get('HLS')) + if m3u8_url: + formats.extend(self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + self._sort_formats(formats, ('height', 'tbr', 'width', 'format_id')) + + view_count = str_to_int(video.get('viewed')) + + thumbnails = [] + for preference, t in enumerate(('', '2x'), start=0): + thumbnail_url = url_or_none(video.get('poster%s' % t)) + if not thumbnail_url: + continue + thumbnails.append({ + 'url': thumbnail_url, + 'preference': preference, }) - self._sort_formats(formats) - age_limit = self._rta_search(webpage) + def extract_names(key): + entries_list = video.get(key) + if not isinstance(entries_list, list): + return + entries = [] + for entry in entries_list: + name = str_or_none(entry.get('name')) + if name: + entries.append(name) + return entries - return { + categories = extract_names('categories') + tags = extract_names('tags') + + uploader = None + info = {} + + webpage = self._download_webpage( + 'https://www.spankwire.com/_/video%s/' % video_id, video_id, + fatal=False) + if webpage: + info = self._search_json_ld(webpage, video_id, default={}) + thumbnail_url = None + if 'thumbnail' in 
info: + thumbnail_url = url_or_none(info['thumbnail']) + del info['thumbnail'] + if not thumbnail_url: + thumbnail_url = self._og_search_thumbnail(webpage) + if thumbnail_url: + thumbnails.append({ + 'url': thumbnail_url, + 'preference': 10, + }) + uploader = self._html_search_regex( + r'(?s)by\s*<a[^>]+\bclass=["\']uploaded__by[^>]*>(.+?)</a>', + webpage, 'uploader', fatal=False) + if not view_count: + view_count = str_to_int(self._search_regex( + r'data-views=["\']([\d,.]+)', webpage, 'view count', + fatal=False)) + + return merge_dicts({ 'id': video_id, 'title': title, - 'description': description, - 'thumbnail': thumbnail, + 'description': video.get('description'), + 'duration': int_or_none(video.get('duration')), + 'thumbnails': thumbnails, 'uploader': uploader, - 'uploader_id': uploader_id, - 'upload_date': upload_date, + 'uploader_id': str_or_none(video.get('userId')), + 'timestamp': int_or_none(video.get('time_approved_on')), + 'average_rating': float_or_none(video.get('rating')), 'view_count': view_count, - 'comment_count': comment_count, + 'comment_count': int_or_none(video.get('comments')), + 'age_limit': 18, + 'categories': categories, + 'tags': tags, 'formats': formats, - 'age_limit': age_limit, - } + }, info) diff --git a/youtube_dl/extractor/sportdeutschland.py b/youtube_dl/extractor/sportdeutschland.py index a3c35a899..378fc7568 100644 --- a/youtube_dl/extractor/sportdeutschland.py +++ b/youtube_dl/extractor/sportdeutschland.py @@ -13,36 +13,18 @@ from ..utils import ( class SportDeutschlandIE(InfoExtractor): _VALID_URL = r'https?://sportdeutschland\.tv/(?P<sport>[^/?#]+)/(?P<id>[^?#/]+)(?:$|[?#])' _TESTS = [{ - 'url': 'http://sportdeutschland.tv/badminton/live-li-ning-badminton-weltmeisterschaft-2014-kopenhagen', + 'url': 'https://sportdeutschland.tv/badminton/re-live-deutsche-meisterschaften-2020-halbfinals?playlistId=0', 'info_dict': { - 'id': 'live-li-ning-badminton-weltmeisterschaft-2014-kopenhagen', + 'id': 
're-live-deutsche-meisterschaften-2020-halbfinals', 'ext': 'mp4', - 'title': 're:Li-Ning Badminton Weltmeisterschaft 2014 Kopenhagen', - 'categories': ['Badminton'], + 'title': 're:Re-live: Deutsche Meisterschaften 2020.*Halbfinals', + 'categories': ['Badminton-Deutschland'], 'view_count': int, - 'thumbnail': r're:^https?://.*\.jpg$', - 'description': r're:Die Badminton-WM 2014 aus Kopenhagen bei Sportdeutschland\.TV', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)$', 'timestamp': int, - 'upload_date': 're:^201408[23][0-9]$', + 'upload_date': '20200201', + 'description': 're:.*', # meaningless description for THIS video }, - 'params': { - 'skip_download': 'Live stream', - }, - }, { - 'url': 'http://sportdeutschland.tv/li-ning-badminton-wm-2014/lee-li-ning-badminton-weltmeisterschaft-2014-kopenhagen-herren-einzel-wei-vs', - 'info_dict': { - 'id': 'lee-li-ning-badminton-weltmeisterschaft-2014-kopenhagen-herren-einzel-wei-vs', - 'ext': 'mp4', - 'upload_date': '20140825', - 'description': 'md5:60a20536b57cee7d9a4ec005e8687504', - 'timestamp': 1408976060, - 'duration': 2732, - 'title': 'Li-Ning Badminton Weltmeisterschaft 2014 Kopenhagen: Herren Einzel, Wei Lee vs. 
Keun Lee', - 'thumbnail': r're:^https?://.*\.jpg$', - 'view_count': int, - 'categories': ['Li-Ning Badminton WM 2014'], - - } }] def _real_extract(self, url): @@ -50,7 +32,7 @@ class SportDeutschlandIE(InfoExtractor): video_id = mobj.group('id') sport_id = mobj.group('sport') - api_url = 'http://proxy.vidibusdynamic.net/sportdeutschland.tv/api/permalinks/%s/%s?access_token=true' % ( + api_url = 'https://proxy.vidibusdynamic.net/ssl/backend.sportdeutschland.tv/api/permalinks/%s/%s?access_token=true' % ( sport_id, video_id) req = sanitized_Request(api_url, headers={ 'Accept': 'application/vnd.vidibus.v2.html+json', diff --git a/youtube_dl/extractor/srmediathek.py b/youtube_dl/extractor/srmediathek.py index 28baf901c..359dadaa3 100644 --- a/youtube_dl/extractor/srmediathek.py +++ b/youtube_dl/extractor/srmediathek.py @@ -1,14 +1,14 @@ # coding: utf-8 from __future__ import unicode_literals -from .ard import ARDMediathekIE +from .ard import ARDMediathekBaseIE from ..utils import ( ExtractorError, get_element_by_attribute, ) -class SRMediathekIE(ARDMediathekIE): +class SRMediathekIE(ARDMediathekBaseIE): IE_NAME = 'sr:mediathek' IE_DESC = 'Saarländischer Rundfunk' _VALID_URL = r'https?://sr-mediathek(?:\.sr-online)?\.de/index\.php\?.*?&id=(?P<id>[0-9]+)' diff --git a/youtube_dl/extractor/streamango.py b/youtube_dl/extractor/streamango.py deleted file mode 100644 index f1e17dd88..000000000 --- a/youtube_dl/extractor/streamango.py +++ /dev/null @@ -1,128 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import compat_chr -from ..utils import ( - determine_ext, - ExtractorError, - int_or_none, - js_to_json, -) - - -class StreamangoIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?:streamango\.com|fruithosts\.net|streamcherry\.com)/(?:f|embed)/(?P<id>[^/?#&]+)' - _TESTS = [{ - 'url': 'https://streamango.com/f/clapasobsptpkdfe/20170315_150006_mp4', - 'md5': 
'e992787515a182f55e38fc97588d802a', - 'info_dict': { - 'id': 'clapasobsptpkdfe', - 'ext': 'mp4', - 'title': '20170315_150006.mp4', - } - }, { - # no og:title - 'url': 'https://streamango.com/embed/foqebrpftarclpob/asdf_asd_2_mp4', - 'info_dict': { - 'id': 'foqebrpftarclpob', - 'ext': 'mp4', - 'title': 'foqebrpftarclpob', - }, - 'params': { - 'skip_download': True, - }, - 'skip': 'gone', - }, { - 'url': 'https://streamango.com/embed/clapasobsptpkdfe/20170315_150006_mp4', - 'only_matching': True, - }, { - 'url': 'https://fruithosts.net/f/mreodparcdcmspsm/w1f1_r4lph_2018_brrs_720p_latino_mp4', - 'only_matching': True, - }, { - 'url': 'https://streamcherry.com/f/clapasobsptpkdfe/', - 'only_matching': True, - }] - - def _real_extract(self, url): - def decrypt_src(encoded, val): - ALPHABET = '=/+9876543210zyxwvutsrqponmlkjihgfedcbaZYXWVUTSRQPONMLKJIHGFEDCBA' - encoded = re.sub(r'[^A-Za-z0-9+/=]', '', encoded) - decoded = '' - sm = [None] * 4 - i = 0 - str_len = len(encoded) - while i < str_len: - for j in range(4): - sm[j % 4] = ALPHABET.index(encoded[i]) - i += 1 - char_code = ((sm[0] << 0x2) | (sm[1] >> 0x4)) ^ val - decoded += compat_chr(char_code) - if sm[2] != 0x40: - char_code = ((sm[1] & 0xf) << 0x4) | (sm[2] >> 0x2) - decoded += compat_chr(char_code) - if sm[3] != 0x40: - char_code = ((sm[2] & 0x3) << 0x6) | sm[3] - decoded += compat_chr(char_code) - return decoded - - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - title = self._og_search_title(webpage, default=video_id) - - formats = [] - for format_ in re.findall(r'({[^}]*\bsrc\s*:\s*[^}]*})', webpage): - mobj = re.search(r'(src\s*:\s*[^(]+\(([^)]*)\)[\s,]*)', format_) - if mobj is None: - continue - - format_ = format_.replace(mobj.group(0), '') - - video = self._parse_json( - format_, video_id, transform_source=js_to_json, - fatal=False) or {} - - mobj = re.search( - r'([\'"])(?P<src>(?:(?!\1).)+)\1\s*,\s*(?P<val>\d+)', - mobj.group(1)) - if mobj is None: - continue - - 
src = decrypt_src(mobj.group('src'), int_or_none(mobj.group('val'))) - if not src: - continue - - ext = determine_ext(src, default_ext=None) - if video.get('type') == 'application/dash+xml' or ext == 'mpd': - formats.extend(self._extract_mpd_formats( - src, video_id, mpd_id='dash', fatal=False)) - else: - formats.append({ - 'url': src, - 'ext': ext or 'mp4', - 'width': int_or_none(video.get('width')), - 'height': int_or_none(video.get('height')), - 'tbr': int_or_none(video.get('bitrate')), - }) - - if not formats: - error = self._search_regex( - r'<p[^>]+\bclass=["\']lead[^>]+>(.+?)</p>', webpage, - 'error', default=None) - if not error and '>Sorry' in webpage: - error = 'Video %s is not available' % video_id - if error: - raise ExtractorError(error, expected=True) - - self._sort_formats(formats) - - return { - 'id': video_id, - 'url': url, - 'title': title, - 'formats': formats, - } diff --git a/youtube_dl/extractor/stretchinternet.py b/youtube_dl/extractor/stretchinternet.py index ae2ac1b42..4dbead2ba 100644 --- a/youtube_dl/extractor/stretchinternet.py +++ b/youtube_dl/extractor/stretchinternet.py @@ -5,44 +5,28 @@ from ..utils import int_or_none class StretchInternetIE(InfoExtractor): - _VALID_URL = r'https?://portal\.stretchinternet\.com/[^/]+/portal\.htm\?.*?\beventId=(?P<id>\d+)' + _VALID_URL = r'https?://portal\.stretchinternet\.com/[^/]+/(?:portal|full)\.htm\?.*?\beventId=(?P<id>\d+)' _TEST = { - 'url': 'https://portal.stretchinternet.com/umary/portal.htm?eventId=313900&streamType=video', + 'url': 'https://portal.stretchinternet.com/umary/portal.htm?eventId=573272&streamType=video', 'info_dict': { - 'id': '313900', + 'id': '573272', 'ext': 'mp4', - 'title': 'Augustana (S.D.) Baseball vs University of Mary', - 'description': 'md5:7578478614aae3bdd4a90f578f787438', - 'timestamp': 1490468400, - 'upload_date': '20170325', + 'title': 'University of Mary Wrestling vs. 
Upper Iowa', + 'timestamp': 1575668361, + 'upload_date': '20191206', } } def _real_extract(self, url): video_id = self._match_id(url) - stream = self._download_json( - 'https://neo-client.stretchinternet.com/streamservice/v1/media/stream/v%s' - % video_id, video_id) - - video_url = 'https://%s' % stream['source'] - event = self._download_json( - 'https://neo-client.stretchinternet.com/portal-ws/getEvent.json', - video_id, query={ - 'clientID': 99997, - 'eventID': video_id, - 'token': 'asdf', - })['event'] - - title = event.get('title') or event['mobileTitle'] - description = event.get('customText') - timestamp = int_or_none(event.get('longtime')) + 'https://api.stretchinternet.com/trinity/event/tcg/' + video_id, + video_id)[0] return { 'id': video_id, - 'title': title, - 'description': description, - 'timestamp': timestamp, - 'url': video_url, + 'title': event['title'], + 'timestamp': int_or_none(event.get('dateCreated'), 1000), + 'url': 'https://' + event['media'][0]['url'], } diff --git a/youtube_dl/extractor/svt.py b/youtube_dl/extractor/svt.py index 0901c3163..e12389cad 100644 --- a/youtube_dl/extractor/svt.py +++ b/youtube_dl/extractor/svt.py @@ -4,19 +4,14 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import ( - compat_parse_qs, - compat_urllib_parse_urlparse, -) +from ..compat import compat_str from ..utils import ( determine_ext, dict_get, int_or_none, - orderedSet, + str_or_none, strip_or_none, try_get, - urljoin, - compat_str, ) @@ -237,23 +232,23 @@ class SVTPlayIE(SVTPlayBaseIE): class SVTSeriesIE(SVTPlayBaseIE): - _VALID_URL = r'https?://(?:www\.)?svtplay\.se/(?P<id>[^/?&#]+)' + _VALID_URL = r'https?://(?:www\.)?svtplay\.se/(?P<id>[^/?&#]+)(?:.+?\btab=(?P<season_slug>[^&#]+))?' 
_TESTS = [{ 'url': 'https://www.svtplay.se/rederiet', 'info_dict': { - 'id': 'rederiet', + 'id': '14445680', 'title': 'Rederiet', - 'description': 'md5:505d491a58f4fcf6eb418ecab947e69e', + 'description': 'md5:d9fdfff17f5d8f73468176ecd2836039', }, 'playlist_mincount': 318, }, { - 'url': 'https://www.svtplay.se/rederiet?tab=sasong2', + 'url': 'https://www.svtplay.se/rederiet?tab=season-2-14445680', 'info_dict': { - 'id': 'rederiet-sasong2', + 'id': 'season-2-14445680', 'title': 'Rederiet - Säsong 2', - 'description': 'md5:505d491a58f4fcf6eb418ecab947e69e', + 'description': 'md5:d9fdfff17f5d8f73468176ecd2836039', }, - 'playlist_count': 12, + 'playlist_mincount': 12, }] @classmethod @@ -261,83 +256,87 @@ class SVTSeriesIE(SVTPlayBaseIE): return False if SVTIE.suitable(url) or SVTPlayIE.suitable(url) else super(SVTSeriesIE, cls).suitable(url) def _real_extract(self, url): - series_id = self._match_id(url) + series_slug, season_id = re.match(self._VALID_URL, url).groups() - qs = compat_parse_qs(compat_urllib_parse_urlparse(url).query) - season_slug = qs.get('tab', [None])[0] - - if season_slug: - series_id += '-%s' % season_slug - - webpage = self._download_webpage( - url, series_id, 'Downloading series page') - - root = self._parse_json( - self._search_regex( - self._SVTPLAY_RE, webpage, 'content', group='json'), - series_id) + series = self._download_json( + 'https://api.svt.se/contento/graphql', series_slug, + 'Downloading series page', query={ + 'query': '''{ + listablesBySlug(slugs: ["%s"]) { + associatedContent(include: [productionPeriod, season]) { + items { + item { + ... 
on Episode { + videoSvtId + } + } + } + id + name + } + id + longDescription + name + shortDescription + } +}''' % series_slug, + })['data']['listablesBySlug'][0] season_name = None entries = [] - for season in root['relatedVideoContent']['relatedVideosAccordion']: + for season in series['associatedContent']: if not isinstance(season, dict): continue - if season_slug: - if season.get('slug') != season_slug: + if season_id: + if season.get('id') != season_id: continue season_name = season.get('name') - videos = season.get('videos') - if not isinstance(videos, list): + items = season.get('items') + if not isinstance(items, list): continue - for video in videos: - content_url = video.get('contentUrl') - if not content_url or not isinstance(content_url, compat_str): + for item in items: + video = item.get('item') or {} + content_id = video.get('videoSvtId') + if not content_id or not isinstance(content_id, compat_str): continue - entries.append( - self.url_result( - urljoin(url, content_url), - ie=SVTPlayIE.ie_key(), - video_title=video.get('title') - )) + entries.append(self.url_result( + 'svt:' + content_id, SVTPlayIE.ie_key(), content_id)) - metadata = root.get('metaData') - if not isinstance(metadata, dict): - metadata = {} - - title = metadata.get('title') - season_name = season_name or season_slug + title = series.get('name') + season_name = season_name or season_id if title and season_name: title = '%s - %s' % (title, season_name) - elif season_slug: - title = season_slug + elif season_id: + title = season_id return self.playlist_result( - entries, series_id, title, metadata.get('description')) + entries, season_id or series.get('id'), title, + dict_get(series, ('longDescription', 'shortDescription'))) class SVTPageIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?svt\.se/(?:[^/]+/)*(?P<id>[^/?&#]+)' + _VALID_URL = r'https?://(?:www\.)?svt\.se/(?P<path>(?:[^/]+/)*(?P<id>[^/?&#]+))' _TESTS = [{ - 'url': 
'https://www.svt.se/sport/oseedat/guide-sommartraningen-du-kan-gora-var-och-nar-du-vill', + 'url': 'https://www.svt.se/sport/ishockey/bakom-masken-lehners-kamp-mot-mental-ohalsa', 'info_dict': { - 'id': 'guide-sommartraningen-du-kan-gora-var-och-nar-du-vill', - 'title': 'GUIDE: Sommarträning du kan göra var och när du vill', + 'id': '25298267', + 'title': 'Bakom masken – Lehners kamp mot mental ohälsa', }, - 'playlist_count': 7, + 'playlist_count': 4, }, { - 'url': 'https://www.svt.se/nyheter/inrikes/ebba-busch-thor-kd-har-delvis-ratt-om-no-go-zoner', + 'url': 'https://www.svt.se/nyheter/utrikes/svenska-andrea-ar-en-mil-fran-branderna-i-kalifornien', 'info_dict': { - 'id': 'ebba-busch-thor-kd-har-delvis-ratt-om-no-go-zoner', - 'title': 'Ebba Busch Thor har bara delvis rätt om ”no-go-zoner”', + 'id': '24243746', + 'title': 'Svenska Andrea redo att fly sitt hem i Kalifornien', }, - 'playlist_count': 1, + 'playlist_count': 2, }, { # only programTitle 'url': 'http://www.svt.se/sport/ishockey/jagr-tacklar-giroux-under-intervjun', 'info_dict': { - 'id': '2900353', + 'id': '8439V2K', 'ext': 'mp4', 'title': 'Stjärnorna skojar till det - under SVT-intervjun', 'duration': 27, @@ -356,16 +355,26 @@ class SVTPageIE(InfoExtractor): return False if SVTIE.suitable(url) else super(SVTPageIE, cls).suitable(url) def _real_extract(self, url): - playlist_id = self._match_id(url) + path, display_id = re.match(self._VALID_URL, url).groups() - webpage = self._download_webpage(url, playlist_id) + article = self._download_json( + 'https://api.svt.se/nss-api/page/' + path, display_id, + query={'q': 'articles'})['articles']['content'][0] - entries = [ - self.url_result( - 'svt:%s' % video_id, ie=SVTPlayIE.ie_key(), video_id=video_id) - for video_id in orderedSet(re.findall( - r'data-video-id=["\'](\d+)', webpage))] + entries = [] - title = strip_or_none(self._og_search_title(webpage, default=None)) + def _process_content(content): + if content.get('_type') in ('VIDEOCLIP', 'VIDEOEPISODE'): + 
video_id = compat_str(content['image']['svtId']) + entries.append(self.url_result( + 'svt:' + video_id, SVTPlayIE.ie_key(), video_id)) - return self.playlist_result(entries, playlist_id, title) + for media in article.get('media', []): + _process_content(media) + + for obj in article.get('structuredBody', []): + _process_content(obj.get('content') or {}) + + return self.playlist_result( + entries, str_or_none(article.get('id')), + strip_or_none(article.get('title'))) diff --git a/youtube_dl/extractor/teachable.py b/youtube_dl/extractor/teachable.py index 7d2e34b3b..a75369dbe 100644 --- a/youtube_dl/extractor/teachable.py +++ b/youtube_dl/extractor/teachable.py @@ -4,11 +4,12 @@ import re from .common import InfoExtractor from .wistia import WistiaIE -from ..compat import compat_str from ..utils import ( clean_html, ExtractorError, + int_or_none, get_element_by_class, + strip_or_none, urlencode_postdata, urljoin, ) @@ -20,8 +21,8 @@ class TeachableBaseIE(InfoExtractor): _SITES = { # Only notable ones here - 'upskillcourses.com': 'upskill', - 'academy.gns3.com': 'gns3', + 'v1.upskillcourses.com': 'upskill', + 'gns3.teachable.com': 'gns3', 'academyhacker.com': 'academyhacker', 'stackskills.com': 'stackskills', 'market.saleshacker.com': 'saleshacker', @@ -58,7 +59,7 @@ class TeachableBaseIE(InfoExtractor): self._logged_in = True return - login_url = compat_str(urlh.geturl()) + login_url = urlh.geturl() login_form = self._hidden_inputs(login_page) @@ -110,27 +111,29 @@ class TeachableIE(TeachableBaseIE): ''' % TeachableBaseIE._VALID_URL_SUB_TUPLE _TESTS = [{ - 'url': 'http://upskillcourses.com/courses/essential-web-developer-course/lectures/1747100', + 'url': 'https://gns3.teachable.com/courses/gns3-certified-associate/lectures/6842364', 'info_dict': { - 'id': 'uzw6zw58or', - 'ext': 'mp4', - 'title': 'Welcome to the Course!', - 'description': 'md5:65edb0affa582974de4625b9cdea1107', - 'duration': 138.763, - 'timestamp': 1479846621, - 'upload_date': '20161122', + 'id': 
'untlgzk1v7', + 'ext': 'bin', + 'title': 'Overview', + 'description': 'md5:071463ff08b86c208811130ea1c2464c', + 'duration': 736.4, + 'timestamp': 1542315762, + 'upload_date': '20181115', + 'chapter': 'Welcome', + 'chapter_number': 1, }, 'params': { 'skip_download': True, }, }, { - 'url': 'http://upskillcourses.com/courses/119763/lectures/1747100', + 'url': 'http://v1.upskillcourses.com/courses/119763/lectures/1747100', 'only_matching': True, }, { - 'url': 'https://academy.gns3.com/courses/423415/lectures/6885939', + 'url': 'https://gns3.teachable.com/courses/423415/lectures/6885939', 'only_matching': True, }, { - 'url': 'teachable:https://upskillcourses.com/courses/essential-web-developer-course/lectures/1747100', + 'url': 'teachable:https://v1.upskillcourses.com/courses/essential-web-developer-course/lectures/1747100', 'only_matching': True, }] @@ -160,22 +163,51 @@ class TeachableIE(TeachableBaseIE): webpage = self._download_webpage(url, video_id) - wistia_url = WistiaIE._extract_url(webpage) - if not wistia_url: + wistia_urls = WistiaIE._extract_urls(webpage) + if not wistia_urls: if any(re.search(p, webpage) for p in ( r'class=["\']lecture-contents-locked', r'>\s*Lecture contents locked', - r'id=["\']lecture-locked')): + r'id=["\']lecture-locked', + # https://academy.tailoredtutors.co.uk/courses/108779/lectures/1955313 + r'class=["\'](?:inner-)?lesson-locked', + r'>LESSON LOCKED<')): self.raise_login_required('Lecture contents locked') + raise ExtractorError('Unable to find video URL') title = self._og_search_title(webpage, default=None) - return { + chapter = None + chapter_number = None + section_item = self._search_regex( + r'(?s)(?P<li><li[^>]+\bdata-lecture-id=["\']%s[^>]+>.+?</li>)' % video_id, + webpage, 'section item', default=None, group='li') + if section_item: + chapter_number = int_or_none(self._search_regex( + r'data-ss-position=["\'](\d+)', section_item, 'section id', + default=None)) + if chapter_number is not None: + sections = [] + for s in 
re.findall( + r'(?s)<div[^>]+\bclass=["\']section-title[^>]+>(.+?)</div>', webpage): + section = strip_or_none(clean_html(s)) + if not section: + sections = [] + break + sections.append(section) + if chapter_number <= len(sections): + chapter = sections[chapter_number - 1] + + entries = [{ '_type': 'url_transparent', 'url': wistia_url, 'ie_key': WistiaIE.ie_key(), 'title': title, - } + 'chapter': chapter, + 'chapter_number': chapter_number, + } for wistia_url in wistia_urls] + + return self.playlist_result(entries, video_id, title) class TeachableCourseIE(TeachableBaseIE): @@ -187,20 +219,20 @@ class TeachableCourseIE(TeachableBaseIE): /(?:courses|p)/(?:enrolled/)?(?P<id>[^/?#&]+) ''' % TeachableBaseIE._VALID_URL_SUB_TUPLE _TESTS = [{ - 'url': 'http://upskillcourses.com/courses/essential-web-developer-course/', + 'url': 'http://v1.upskillcourses.com/courses/essential-web-developer-course/', 'info_dict': { 'id': 'essential-web-developer-course', 'title': 'The Essential Web Developer Course (Free)', }, 'playlist_count': 192, }, { - 'url': 'http://upskillcourses.com/courses/119763/', + 'url': 'http://v1.upskillcourses.com/courses/119763/', 'only_matching': True, }, { - 'url': 'http://upskillcourses.com/courses/enrolled/119763', + 'url': 'http://v1.upskillcourses.com/courses/enrolled/119763', 'only_matching': True, }, { - 'url': 'https://academy.gns3.com/courses/enrolled/423415', + 'url': 'https://gns3.teachable.com/courses/enrolled/423415', 'only_matching': True, }, { 'url': 'teachable:https://learn.vrdev.school/p/gear-vr-developer-mini', diff --git a/youtube_dl/extractor/teachingchannel.py b/youtube_dl/extractor/teachingchannel.py index e89759714..624cdb3ad 100644 --- a/youtube_dl/extractor/teachingchannel.py +++ b/youtube_dl/extractor/teachingchannel.py @@ -1,35 +1,33 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor -from .ooyala import OoyalaIE class TeachingChannelIE(InfoExtractor): - _VALID_URL = 
r'https?://(?:www\.)?teachingchannel\.org/videos/(?P<title>.+)' + _VALID_URL = r'https?://(?:www\.)?teachingchannel\.org/videos?/(?P<id>[^/?&#]+)' _TEST = { 'url': 'https://www.teachingchannel.org/videos/teacher-teaming-evolution', - 'md5': '3d6361864d7cac20b57c8784da17166f', 'info_dict': { - 'id': 'F3bnlzbToeI6pLEfRyrlfooIILUjz4nM', + 'id': '3swwlzkT', 'ext': 'mp4', 'title': 'A History of Teaming', 'description': 'md5:2a9033db8da81f2edffa4c99888140b3', - 'duration': 422.255, + 'duration': 422, + 'upload_date': '20170316', + 'timestamp': 1489691297, }, 'params': { 'skip_download': True, }, - 'add_ie': ['Ooyala'], + 'add_ie': ['JWPlatform'], } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - title = mobj.group('title') - webpage = self._download_webpage(url, title) - ooyala_code = self._search_regex( - r'data-embed-code=\'(.+?)\'', webpage, 'ooyala code') + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + mid = self._search_regex( + r'(?:data-mid=["\']|id=["\']jw-video-player-)([a-zA-Z0-9]{8})', + webpage, 'media id') - return OoyalaIE._build_url_result(ooyala_code) + return self.url_result('jwplatform:' + mid, 'JWPlatform', mid) diff --git a/youtube_dl/extractor/tele5.py b/youtube_dl/extractor/tele5.py index 33a72083b..364556a1f 100644 --- a/youtube_dl/extractor/tele5.py +++ b/youtube_dl/extractor/tele5.py @@ -1,9 +1,19 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor +from .jwplatform import JWPlatformIE from .nexx import NexxIE -from ..compat import compat_urlparse +from ..compat import ( + compat_str, + compat_urlparse, +) +from ..utils import ( + NO_DEFAULT, + try_get, +) class Tele5IE(InfoExtractor): @@ -44,14 +54,49 @@ class Tele5IE(InfoExtractor): qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) video_id = (qs.get('vid') or qs.get('ve_id') or [None])[0] - if not video_id: + NEXX_ID_RE = r'\d{6,}' + JWPLATFORM_ID_RE = 
r'[a-zA-Z0-9]{8}' + + def nexx_result(nexx_id): + return self.url_result( + 'https://api.nexx.cloud/v3/759/videos/byid/%s' % nexx_id, + ie=NexxIE.ie_key(), video_id=nexx_id) + + nexx_id = jwplatform_id = None + + if video_id: + if re.match(NEXX_ID_RE, video_id): + return nexx_result(video_id) + elif re.match(JWPLATFORM_ID_RE, video_id): + jwplatform_id = video_id + + if not nexx_id: display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - video_id = self._html_search_regex( - (r'id\s*=\s*["\']video-player["\'][^>]+data-id\s*=\s*["\'](\d+)', - r'\s+id\s*=\s*["\']player_(\d{6,})', - r'\bdata-id\s*=\s*["\'](\d{6,})'), webpage, 'video id') + + def extract_id(pattern, name, default=NO_DEFAULT): + return self._html_search_regex( + (r'id\s*=\s*["\']video-player["\'][^>]+data-id\s*=\s*["\'](%s)' % pattern, + r'\s+id\s*=\s*["\']player_(%s)' % pattern, + r'\bdata-id\s*=\s*["\'](%s)' % pattern), webpage, name, + default=default) + + nexx_id = extract_id(NEXX_ID_RE, 'nexx id', default=None) + if nexx_id: + return nexx_result(nexx_id) + + if not jwplatform_id: + jwplatform_id = extract_id(JWPLATFORM_ID_RE, 'jwplatform id') + + media = self._download_json( + 'https://cdn.jwplayer.com/v2/media/' + jwplatform_id, + display_id) + nexx_id = try_get( + media, lambda x: x['playlist'][0]['nexx_id'], compat_str) + + if nexx_id: + return nexx_result(nexx_id) return self.url_result( - 'https://api.nexx.cloud/v3/759/videos/byid/%s' % video_id, - ie=NexxIE.ie_key(), video_id=video_id) + 'jwplatform:%s' % jwplatform_id, ie=JWPlatformIE.ie_key(), + video_id=jwplatform_id) diff --git a/youtube_dl/extractor/telecinco.py b/youtube_dl/extractor/telecinco.py index d37e1b055..9ba3da341 100644 --- a/youtube_dl/extractor/telecinco.py +++ b/youtube_dl/extractor/telecinco.py @@ -11,6 +11,7 @@ from ..utils import ( determine_ext, int_or_none, str_or_none, + try_get, urljoin, ) @@ -24,7 +25,7 @@ class TelecincoIE(InfoExtractor): 'info_dict': { 'id': '1876350223', 'title': 
'Bacalao con kokotxas al pil-pil', - 'description': 'md5:1382dacd32dd4592d478cbdca458e5bb', + 'description': 'md5:716caf5601e25c3c5ab6605b1ae71529', }, 'playlist': [{ 'md5': 'adb28c37238b675dad0f042292f209a7', @@ -55,6 +56,26 @@ class TelecincoIE(InfoExtractor): 'description': 'md5:2771356ff7bfad9179c5f5cd954f1477', 'duration': 50, }, + }, { + # video in opening's content + 'url': 'https://www.telecinco.es/vivalavida/fiorella-sobrina-edmundo-arrocet-entrevista_18_2907195140.html', + 'info_dict': { + 'id': '2907195140', + 'title': 'La surrealista entrevista a la sobrina de Edmundo Arrocet: "No puedes venir aquí y tomarnos por tontos"', + 'description': 'md5:73f340a7320143d37ab895375b2bf13a', + }, + 'playlist': [{ + 'md5': 'adb28c37238b675dad0f042292f209a7', + 'info_dict': { + 'id': 'TpI2EttSDAReWpJ1o0NVh2', + 'ext': 'mp4', + 'title': 'La surrealista entrevista a la sobrina de Edmundo Arrocet: "No puedes venir aquí y tomarnos por tontos"', + 'duration': 1015, + }, + }], + 'params': { + 'skip_download': True, + }, }, { 'url': 'http://www.telecinco.es/informativos/nacional/Pablo_Iglesias-Informativos_Telecinco-entrevista-Pedro_Piqueras_2_1945155182.html', 'only_matching': True, @@ -135,17 +156,28 @@ class TelecincoIE(InfoExtractor): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) article = self._parse_json(self._search_regex( - r'window\.\$REACTBASE_STATE\.article\s*=\s*({.+})', + r'window\.\$REACTBASE_STATE\.article(?:_multisite)?\s*=\s*({.+})', webpage, 'article'), display_id)['article'] title = article.get('title') - description = clean_html(article.get('leadParagraph')) + description = clean_html(article.get('leadParagraph')) or '' if article.get('editorialType') != 'VID': entries = [] - for p in article.get('body', []): - content = p.get('content') - if p.get('type') != 'video' or not content: + body = [article.get('opening')] + body.extend(try_get(article, lambda x: x['body'], list) or []) + for p in body: + if not isinstance(p, 
dict): continue - entries.append(self._parse_content(content, url)) + content = p.get('content') + if not content: + continue + type_ = p.get('type') + if type_ == 'paragraph': + content_str = str_or_none(content) + if content_str: + description += content_str + continue + if type_ == 'video' and isinstance(content, dict): + entries.append(self._parse_content(content, url)) return self.playlist_result( entries, str_or_none(article.get('id')), title, description) content = article['opening']['content'] diff --git a/youtube_dl/extractor/telequebec.py b/youtube_dl/extractor/telequebec.py index ae9f66787..c82c94b3a 100644 --- a/youtube_dl/extractor/telequebec.py +++ b/youtube_dl/extractor/telequebec.py @@ -38,8 +38,6 @@ class TeleQuebecIE(TeleQuebecBaseIE): 'ext': 'mp4', 'title': 'Un petit choc et puis repart!', 'description': 'md5:b04a7e6b3f74e32d7b294cffe8658374', - 'upload_date': '20180222', - 'timestamp': 1519326631, }, 'params': { 'skip_download': True, diff --git a/youtube_dl/extractor/tfo.py b/youtube_dl/extractor/tfo.py index 0e2370cd8..0631cb7ab 100644 --- a/youtube_dl/extractor/tfo.py +++ b/youtube_dl/extractor/tfo.py @@ -17,14 +17,12 @@ class TFOIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?tfo\.org/(?:en|fr)/(?:[^/]+/){2}(?P<id>\d+)' _TEST = { 'url': 'http://www.tfo.org/en/universe/tfo-247/100463871/video-game-hackathon', - 'md5': '47c987d0515561114cf03d1226a9d4c7', + 'md5': 'cafbe4f47a8dae0ca0159937878100d6', 'info_dict': { - 'id': '100463871', + 'id': '7da3d50e495c406b8fc0b997659cc075', 'ext': 'mp4', 'title': 'Video Game Hackathon', 'description': 'md5:558afeba217c6c8d96c60e5421795c07', - 'upload_date': '20160212', - 'timestamp': 1455310233, } } diff --git a/youtube_dl/extractor/thisoldhouse.py b/youtube_dl/extractor/thisoldhouse.py index 6ab147ad7..387f955ee 100644 --- a/youtube_dl/extractor/thisoldhouse.py +++ b/youtube_dl/extractor/thisoldhouse.py @@ -2,43 +2,42 @@ from __future__ import unicode_literals from .common import InfoExtractor -from 
..compat import compat_str -from ..utils import try_get class ThisOldHouseIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?thisoldhouse\.com/(?:watch|how-to|tv-episode)/(?P<id>[^/?#]+)' + _VALID_URL = r'https?://(?:www\.)?thisoldhouse\.com/(?:watch|how-to|tv-episode|(?:[^/]+/)?\d+)/(?P<id>[^/?#]+)' _TESTS = [{ 'url': 'https://www.thisoldhouse.com/how-to/how-to-build-storage-bench', - 'md5': '568acf9ca25a639f0c4ff905826b662f', 'info_dict': { - 'id': '2REGtUDQ', + 'id': '5dcdddf673c3f956ef5db202', 'ext': 'mp4', 'title': 'How to Build a Storage Bench', 'description': 'In the workshop, Tom Silva and Kevin O\'Connor build a storage bench for an entryway.', 'timestamp': 1442548800, 'upload_date': '20150918', - } + }, + 'params': { + 'skip_download': True, + }, }, { 'url': 'https://www.thisoldhouse.com/watch/arlington-arts-crafts-arts-and-crafts-class-begins', 'only_matching': True, }, { 'url': 'https://www.thisoldhouse.com/tv-episode/ask-toh-shelf-rough-electric', 'only_matching': True, + }, { + 'url': 'https://www.thisoldhouse.com/furniture/21017078/how-to-build-a-storage-bench', + 'only_matching': True, + }, { + 'url': 'https://www.thisoldhouse.com/21113884/s41-e13-paradise-lost', + 'only_matching': True, }] + _ZYPE_TMPL = 'https://player.zype.com/embed/%s.html?api_key=hsOk_yMSPYNrT22e9pu8hihLXjaZf0JW5jsOWv4ZqyHJFvkJn6rtToHl09tbbsbe' def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) video_id = self._search_regex( - (r'data-mid=(["\'])(?P<id>(?:(?!\1).)+)\1', - r'id=(["\'])inline-video-player-(?P<id>(?:(?!\1).)+)\1'), - webpage, 'video id', default=None, group='id') - if not video_id: - drupal_settings = self._parse_json(self._search_regex( - r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);', - webpage, 'drupal settings'), display_id) - video_id = try_get( - drupal_settings, lambda x: x['jwplatform']['video_id'], - compat_str) or list(drupal_settings['comScore'])[0] - return 
self.url_result('jwplatform:' + video_id, 'JWPlatform', video_id) + r'<iframe[^>]+src=[\'"](?:https?:)?//thisoldhouse\.chorus\.build/videos/zype/([0-9a-f]{24})', + webpage, 'video id') + return self.url_result(self._ZYPE_TMPL % video_id, 'Zype', video_id) diff --git a/youtube_dl/extractor/toggle.py b/youtube_dl/extractor/toggle.py index 5e5efda0f..ca2e36efe 100644 --- a/youtube_dl/extractor/toggle.py +++ b/youtube_dl/extractor/toggle.py @@ -17,9 +17,9 @@ from ..utils import ( class ToggleIE(InfoExtractor): IE_NAME = 'toggle' - _VALID_URL = r'https?://video\.toggle\.sg/(?:en|zh)/(?:[^/]+/){2,}(?P<id>[0-9]+)' + _VALID_URL = r'https?://(?:(?:www\.)?mewatch|video\.toggle)\.sg/(?:en|zh)/(?:[^/]+/){2,}(?P<id>[0-9]+)' _TESTS = [{ - 'url': 'http://video.toggle.sg/en/series/lion-moms-tif/trailers/lion-moms-premier/343115', + 'url': 'http://www.mewatch.sg/en/series/lion-moms-tif/trailers/lion-moms-premier/343115', 'info_dict': { 'id': '343115', 'ext': 'mp4', @@ -33,7 +33,7 @@ class ToggleIE(InfoExtractor): } }, { 'note': 'DRM-protected video', - 'url': 'http://video.toggle.sg/en/movies/dug-s-special-mission/341413', + 'url': 'http://www.mewatch.sg/en/movies/dug-s-special-mission/341413', 'info_dict': { 'id': '341413', 'ext': 'wvm', @@ -48,7 +48,7 @@ class ToggleIE(InfoExtractor): }, { # this also tests correct video id extraction 'note': 'm3u8 links are geo-restricted, but Android/mp4 is okay', - 'url': 'http://video.toggle.sg/en/series/28th-sea-games-5-show/28th-sea-games-5-show-ep11/332861', + 'url': 'http://www.mewatch.sg/en/series/28th-sea-games-5-show/28th-sea-games-5-show-ep11/332861', 'info_dict': { 'id': '332861', 'ext': 'mp4', @@ -65,19 +65,22 @@ class ToggleIE(InfoExtractor): 'url': 'http://video.toggle.sg/en/clips/seraph-sun-aloysius-will-suddenly-sing-some-old-songs-in-high-pitch-on-set/343331', 'only_matching': True, }, { - 'url': 'http://video.toggle.sg/zh/series/zero-calling-s2-hd/ep13/336367', + 'url': 
'http://www.mewatch.sg/en/clips/seraph-sun-aloysius-will-suddenly-sing-some-old-songs-in-high-pitch-on-set/343331', 'only_matching': True, }, { - 'url': 'http://video.toggle.sg/en/series/vetri-s2/webisodes/jeeva-is-an-orphan-vetri-s2-webisode-7/342302', + 'url': 'http://www.mewatch.sg/zh/series/zero-calling-s2-hd/ep13/336367', 'only_matching': True, }, { - 'url': 'http://video.toggle.sg/en/movies/seven-days/321936', + 'url': 'http://www.mewatch.sg/en/series/vetri-s2/webisodes/jeeva-is-an-orphan-vetri-s2-webisode-7/342302', 'only_matching': True, }, { - 'url': 'https://video.toggle.sg/en/tv-show/news/may-2017-cna-singapore-tonight/fri-19-may-2017/512456', + 'url': 'http://www.mewatch.sg/en/movies/seven-days/321936', 'only_matching': True, }, { - 'url': 'http://video.toggle.sg/en/channels/eleven-plus/401585', + 'url': 'https://www.mewatch.sg/en/tv-show/news/may-2017-cna-singapore-tonight/fri-19-may-2017/512456', + 'only_matching': True, + }, { + 'url': 'http://www.mewatch.sg/en/channels/eleven-plus/401585', 'only_matching': True, }] diff --git a/youtube_dl/extractor/trunews.py b/youtube_dl/extractor/trunews.py index b0c7caabf..cca5b5ceb 100644 --- a/youtube_dl/extractor/trunews.py +++ b/youtube_dl/extractor/trunews.py @@ -1,21 +1,12 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import ( - dict_get, - float_or_none, - int_or_none, - unified_timestamp, - update_url_query, - url_or_none, -) class TruNewsIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?trunews\.com/stream/(?P<id>[^/?#&]+)' _TEST = { 'url': 'https://www.trunews.com/stream/will-democrats-stage-a-circus-during-president-trump-s-state-of-the-union-speech', - 'md5': 'a19c024c3906ff954fac9b96ce66bb08', 'info_dict': { 'id': '5c5a21e65d3c196e1c0020cc', 'display_id': 'will-democrats-stage-a-circus-during-president-trump-s-state-of-the-union-speech', @@ -28,48 +19,16 @@ class TruNewsIE(InfoExtractor): }, 'add_ie': ['Zype'], } + _ZYPE_TEMPL = 
'https://player.zype.com/embed/%s.js?api_key=X5XnahkjCwJrT_l5zUqypnaLEObotyvtUKJWWlONxDoHVjP8vqxlArLV8llxMbyt' def _real_extract(self, url): display_id = self._match_id(url) - video = self._download_json( + zype_id = self._download_json( 'https://api.zype.com/videos', display_id, query={ 'app_key': 'PUVKp9WgGUb3-JUw6EqafLx8tFVP6VKZTWbUOR-HOm__g4fNDt1bCsm_LgYf_k9H', 'per_page': 1, 'active': 'true', 'friendly_title': display_id, - })['response'][0] - - zype_id = video['_id'] - - thumbnails = [] - thumbnails_list = video.get('thumbnails') - if isinstance(thumbnails_list, list): - for thumbnail in thumbnails_list: - if not isinstance(thumbnail, dict): - continue - thumbnail_url = url_or_none(thumbnail.get('url')) - if not thumbnail_url: - continue - thumbnails.append({ - 'url': thumbnail_url, - 'width': int_or_none(thumbnail.get('width')), - 'height': int_or_none(thumbnail.get('height')), - }) - - return { - '_type': 'url_transparent', - 'url': update_url_query( - 'https://player.zype.com/embed/%s.js' % zype_id, - {'api_key': 'X5XnahkjCwJrT_l5zUqypnaLEObotyvtUKJWWlONxDoHVjP8vqxlArLV8llxMbyt'}), - 'ie_key': 'Zype', - 'id': zype_id, - 'display_id': display_id, - 'title': video.get('title'), - 'description': dict_get(video, ('description', 'ott_description', 'short_description')), - 'duration': int_or_none(video.get('duration')), - 'timestamp': unified_timestamp(video.get('published_at')), - 'average_rating': float_or_none(video.get('rating')), - 'view_count': int_or_none(video.get('request_count')), - 'thumbnails': thumbnails, - } + })['response'][0]['_id'] + return self.url_result(self._ZYPE_TEMPL % zype_id, 'Zype', zype_id) diff --git a/youtube_dl/extractor/tumblr.py b/youtube_dl/extractor/tumblr.py index edbb0aa69..ae584ad69 100644 --- a/youtube_dl/extractor/tumblr.py +++ b/youtube_dl/extractor/tumblr.py @@ -4,7 +4,6 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( 
ExtractorError, int_or_none, @@ -151,7 +150,7 @@ class TumblrIE(InfoExtractor): url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id) webpage, urlh = self._download_webpage_handle(url, video_id) - redirect_url = compat_str(urlh.geturl()) + redirect_url = urlh.geturl() if 'tumblr.com/safe-mode' in redirect_url or redirect_url.startswith('/safe-mode'): raise ExtractorError( 'This Tumblr may contain sensitive media. ' diff --git a/youtube_dl/extractor/tv2.py b/youtube_dl/extractor/tv2.py index 1b6590767..4a19b9be6 100644 --- a/youtube_dl/extractor/tv2.py +++ b/youtube_dl/extractor/tv2.py @@ -4,13 +4,16 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_HTTPError from ..utils import ( determine_ext, + ExtractorError, int_or_none, float_or_none, js_to_json, parse_iso8601, remove_end, + strip_or_none, try_get, ) @@ -21,7 +24,7 @@ class TV2IE(InfoExtractor): 'url': 'http://www.tv2.no/v/916509/', 'info_dict': { 'id': '916509', - 'ext': 'mp4', + 'ext': 'flv', 'title': 'Se Frode Gryttens hyllest av Steven Gerrard', 'description': 'TV 2 Sportens huspoet tar avskjed med Liverpools kaptein Steven Gerrard.', 'timestamp': 1431715610, @@ -30,21 +33,32 @@ class TV2IE(InfoExtractor): 'view_count': int, 'categories': list, }, - 'params': { - # m3u8 download - 'skip_download': True, - }, } + _API_DOMAIN = 'sumo.tv2.no' + _PROTOCOLS = ('HDS', 'HLS', 'DASH') + _GEO_COUNTRIES = ['NO'] def _real_extract(self, url): video_id = self._match_id(url) + api_base = 'http://%s/api/web/asset/%s' % (self._API_DOMAIN, video_id) formats = [] format_urls = [] - for protocol in ('HDS', 'HLS'): - data = self._download_json( - 'http://sumo.tv2.no/api/web/asset/%s/play.json?protocol=%s&videoFormat=SMIL+ISMUSP' % (video_id, protocol), - video_id, 'Downloading play JSON')['playback'] + for protocol in self._PROTOCOLS: + try: + data = self._download_json( + api_base + '/play.json?protocol=%s&videoFormat=SMIL+ISMUSP' % protocol, + video_id, 
'Downloading play JSON')['playback'] + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: + error = self._parse_json(e.cause.read().decode(), video_id)['error'] + error_code = error.get('code') + if error_code == 'ASSET_PLAYBACK_INVALID_GEO_LOCATION': + self.raise_geo_restricted(countries=self._GEO_COUNTRIES) + elif error_code == 'SESSION_NOT_AUTHENTICATED': + self.raise_login_required() + raise ExtractorError(error['description']) + raise items = try_get(data, lambda x: x['items']['item']) if not items: continue @@ -65,9 +79,13 @@ class TV2IE(InfoExtractor): formats.extend(self._extract_f4m_formats( video_url, video_id, f4m_id=format_id, fatal=False)) elif ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - video_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id=format_id, fatal=False)) + if not data.get('drmProtected'): + formats.extend(self._extract_m3u8_formats( + video_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id=format_id, fatal=False)) + elif ext == 'mpd': + formats.extend(self._extract_mpd_formats( + video_url, video_id, format_id, fatal=False)) elif ext == 'ism' or video_url.endswith('.ism/Manifest'): pass else: @@ -77,34 +95,30 @@ class TV2IE(InfoExtractor): 'tbr': int_or_none(item.get('bitrate')), 'filesize': int_or_none(item.get('fileSize')), }) + if not formats and data.get('drmProtected'): + raise ExtractorError('This video is DRM protected.', expected=True) self._sort_formats(formats) asset = self._download_json( - 'http://sumo.tv2.no/api/web/asset/%s.json' % video_id, - video_id, 'Downloading metadata JSON')['asset'] - + api_base + '.json', video_id, + 'Downloading metadata JSON')['asset'] title = asset['title'] - description = asset.get('description') - timestamp = parse_iso8601(asset.get('createTime')) - duration = float_or_none(asset.get('accurateDuration') or asset.get('duration')) - view_count = int_or_none(asset.get('views')) - categories = 
asset.get('keywords', '').split(',') thumbnails = [{ 'id': thumbnail.get('@type'), 'url': thumbnail.get('url'), - } for _, thumbnail in asset.get('imageVersions', {}).items()] + } for _, thumbnail in (asset.get('imageVersions') or {}).items()] return { 'id': video_id, 'url': video_url, 'title': title, - 'description': description, + 'description': strip_or_none(asset.get('description')), 'thumbnails': thumbnails, - 'timestamp': timestamp, - 'duration': duration, - 'view_count': view_count, - 'categories': categories, + 'timestamp': parse_iso8601(asset.get('createTime')), + 'duration': float_or_none(asset.get('accurateDuration') or asset.get('duration')), + 'view_count': int_or_none(asset.get('views')), + 'categories': asset.get('keywords', '').split(','), 'formats': formats, } @@ -116,7 +130,7 @@ class TV2ArticleIE(InfoExtractor): 'info_dict': { 'id': '6930542', 'title': 'Russen hetses etter pingvintyveri - innrømmer å ha åpnet luken på buret', - 'description': 'md5:339573779d3eea3542ffe12006190954', + 'description': 'De fire siktede nekter fortsatt for å ha stjålet pingvinbabyene, men innrømmer å ha åpnet luken til de små kyllingene.', }, 'playlist_count': 2, }, { @@ -134,7 +148,7 @@ class TV2ArticleIE(InfoExtractor): if not assets: # New embed pattern - for v in re.findall(r'TV2ContentboxVideo\(({.+?})\)', webpage): + for v in re.findall(r'(?s)TV2ContentboxVideo\(({.+?})\)', webpage): video = self._parse_json( v, playlist_id, transform_source=js_to_json, fatal=False) if not video: @@ -151,3 +165,28 @@ class TV2ArticleIE(InfoExtractor): description = remove_end(self._og_search_description(webpage), ' - TV2.no') return self.playlist_result(entries, playlist_id, title, description) + + +class KatsomoIE(TV2IE): + _VALID_URL = r'https?://(?:www\.)?(?:katsomo|mtv)\.fi/(?:#!/)?(?:[^/]+/[0-9a-z-]+-\d+/[0-9a-z-]+-|[^/]+/\d+/[^/]+/)(?P<id>\d+)' + _TEST = { + 'url': 
'https://www.mtv.fi/sarja/mtv-uutiset-live-33001002003/lahden-pelicans-teki-kovan-ratkaisun-ville-nieminen-pihalle-1181321', + 'info_dict': { + 'id': '1181321', + 'ext': 'mp4', + 'title': 'MTV Uutiset Live', + 'description': 'Päätöksen teki Pelicansin hallitus.', + 'timestamp': 1575116484, + 'upload_date': '20191130', + 'duration': 37.12, + 'view_count': int, + 'categories': list, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + } + _API_DOMAIN = 'api.katsomo.fi' + _PROTOCOLS = ('HLS', 'MPD') + _GEO_COUNTRIES = ['FI'] diff --git a/youtube_dl/extractor/tv2dk.py b/youtube_dl/extractor/tv2dk.py index eb39424df..8bda9348d 100644 --- a/youtube_dl/extractor/tv2dk.py +++ b/youtube_dl/extractor/tv2dk.py @@ -1,10 +1,16 @@ # coding: utf-8 from __future__ import unicode_literals +import json import re from .common import InfoExtractor -from ..utils import extract_attributes +from ..utils import ( + determine_ext, + extract_attributes, + js_to_json, + url_or_none, +) class TV2DKIE(InfoExtractor): @@ -80,3 +86,69 @@ class TV2DKIE(InfoExtractor): 'kaltura:%s:%s' % (partner_id, kaltura_id), 'Kaltura', video_id=kaltura_id)) return self.playlist_result(entries) + + +class TV2DKBornholmPlayIE(InfoExtractor): + _VALID_URL = r'https?://play\.tv2bornholm\.dk/\?.*?\bid=(?P<id>\d+)' + _TEST = { + 'url': 'http://play.tv2bornholm.dk/?area=specifikTV&id=781021', + 'info_dict': { + 'id': '781021', + 'ext': 'mp4', + 'title': '12Nyheder-27.11.19', + }, + 'params': { + 'skip_download': True, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + video = self._download_json( + 'https://play.tv2bornholm.dk/controls/AJAX.aspx/specifikVideo', video_id, + data=json.dumps({ + 'playlist_id': video_id, + 'serienavn': '', + }).encode(), headers={ + 'X-Requested-With': 'XMLHttpRequest', + 'Content-Type': 'application/json; charset=UTF-8', + })['d'] + + # TODO: generalize flowplayer + title = self._search_regex( + 
r'title\s*:\s*(["\'])(?P<value>(?:(?!\1).)+)\1', video, 'title', + group='value') + sources = self._parse_json(self._search_regex( + r'(?s)sources:\s*(\[.+?\]),', video, 'sources'), + video_id, js_to_json) + + formats = [] + srcs = set() + for source in sources: + src = url_or_none(source.get('src')) + if not src: + continue + if src in srcs: + continue + srcs.add(src) + ext = determine_ext(src) + src_type = source.get('type') + if src_type == 'application/x-mpegurl' or ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + src, video_id, ext='mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + elif src_type == 'application/dash+xml' or ext == 'mpd': + formats.extend(self._extract_mpd_formats( + src, video_id, mpd_id='dash', fatal=False)) + else: + formats.append({ + 'url': src, + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + } diff --git a/youtube_dl/extractor/tv4.py b/youtube_dl/extractor/tv4.py index a819d048c..c498b0191 100644 --- a/youtube_dl/extractor/tv4.py +++ b/youtube_dl/extractor/tv4.py @@ -99,7 +99,7 @@ class TV4IE(InfoExtractor): manifest_url.replace('.m3u8', '.f4m'), video_id, f4m_id='hds', fatal=False)) formats.extend(self._extract_ism_formats( - re.sub(r'\.ism/.+?\.m3u8', r'.ism/Manifest', manifest_url), + re.sub(r'\.ism/.*?\.m3u8', r'.ism/Manifest', manifest_url), video_id, ism_id='mss', fatal=False)) if not formats and info.get('is_geo_restricted'): diff --git a/youtube_dl/extractor/tv5mondeplus.py b/youtube_dl/extractor/tv5mondeplus.py index 88b6baa31..b7fe082b9 100644 --- a/youtube_dl/extractor/tv5mondeplus.py +++ b/youtube_dl/extractor/tv5mondeplus.py @@ -3,31 +3,51 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( - clean_html, determine_ext, extract_attributes, - get_element_by_class, int_or_none, parse_duration, - parse_iso8601, ) class TV5MondePlusIE(InfoExtractor): IE_DESC = 'TV5MONDE+' - _VALID_URL = 
r'https?://(?:www\.)?tv5mondeplus\.com/toutes-les-videos/[^/]+/(?P<id>[^/?#]+)' - _TEST = { - 'url': 'http://www.tv5mondeplus.com/toutes-les-videos/documentaire/tdah-mon-amour-tele-quebec-tdah-mon-amour-ep001-enfants', - 'md5': '12130fc199f020673138a83466542ec6', + _VALID_URL = r'https?://(?:www\.)?(?:tv5mondeplus|revoir\.tv5monde)\.com/toutes-les-videos/[^/]+/(?P<id>[^/?#]+)' + _TESTS = [{ + # movie + 'url': 'https://revoir.tv5monde.com/toutes-les-videos/cinema/rendez-vous-a-atlit', + 'md5': '8cbde5ea7b296cf635073e27895e227f', 'info_dict': { - 'id': 'tdah-mon-amour-tele-quebec-tdah-mon-amour-ep001-enfants', + 'id': '822a4756-0712-7329-1859-a13ac7fd1407', + 'display_id': 'rendez-vous-a-atlit', 'ext': 'mp4', - 'title': 'Tdah, mon amour - Enfants', - 'description': 'md5:230e3aca23115afcf8006d1bece6df74', - 'upload_date': '20170401', - 'timestamp': 1491022860, - } - } + 'title': 'Rendez-vous à Atlit', + 'description': 'md5:2893a4c5e1dbac3eedff2d87956e4efb', + 'upload_date': '20200130', + }, + }, { + # series episode + 'url': 'https://revoir.tv5monde.com/toutes-les-videos/series-fictions/c-est-la-vie-ennemie-juree', + 'info_dict': { + 'id': '0df7007c-4900-3936-c601-87a13a93a068', + 'display_id': 'c-est-la-vie-ennemie-juree', + 'ext': 'mp4', + 'title': "C'est la vie - Ennemie jurée", + 'description': 'md5:dfb5c63087b6f35fe0cc0af4fe44287e', + 'upload_date': '20200130', + 'series': "C'est la vie", + 'episode': 'Ennemie jurée', + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://revoir.tv5monde.com/toutes-les-videos/series-fictions/neuf-jours-en-hiver-neuf-jours-en-hiver', + 'only_matching': True, + }, { + 'url': 'https://revoir.tv5monde.com/toutes-les-videos/info-societe/le-journal-de-la-rts-edition-du-30-01-20-19h30', + 'only_matching': True, + }] _GEO_BYPASS = False def _real_extract(self, url): @@ -37,11 +57,7 @@ class TV5MondePlusIE(InfoExtractor): if ">Ce programme n'est malheureusement pas disponible pour votre zone géographique.<" in webpage: 
self.raise_geo_restricted(countries=['FR']) - series = get_element_by_class('video-detail__title', webpage) - title = episode = get_element_by_class( - 'video-detail__subtitle', webpage) or series - if series and series != title: - title = '%s - %s' % (series, title) + title = episode = self._html_search_regex(r'<h1>([^<]+)', webpage, 'title') vpl_data = extract_attributes(self._search_regex( r'(<[^>]+class="video_player_loader"[^>]+>)', webpage, 'video player loader')) @@ -65,15 +81,37 @@ class TV5MondePlusIE(InfoExtractor): }) self._sort_formats(formats) + description = self._html_search_regex( + r'(?s)<div[^>]+class=["\']episode-texte[^>]+>(.+?)</div>', webpage, + 'description', fatal=False) + + series = self._html_search_regex( + r'<p[^>]+class=["\']episode-emission[^>]+>([^<]+)', webpage, + 'series', default=None) + + if series and series != title: + title = '%s - %s' % (series, title) + + upload_date = self._search_regex( + r'(?:date_publication|publish_date)["\']\s*:\s*["\'](\d{4}_\d{2}_\d{2})', + webpage, 'upload date', default=None) + if upload_date: + upload_date = upload_date.replace('_', '') + + video_id = self._search_regex( + (r'data-guid=["\']([\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})', + r'id_contenu["\']\s:\s*(\d+)'), webpage, 'video id', + default=display_id) + return { - 'id': display_id, + 'id': video_id, 'display_id': display_id, 'title': title, - 'description': clean_html(get_element_by_class('video-detail__description', webpage)), + 'description': description, 'thumbnail': vpl_data.get('data-image'), 'duration': int_or_none(vpl_data.get('data-duration')) or parse_duration(self._html_search_meta('duration', webpage)), - 'timestamp': parse_iso8601(self._html_search_meta('uploadDate', webpage)), + 'upload_date': upload_date, 'formats': formats, - 'episode': episode, 'series': series, + 'episode': episode, } diff --git a/youtube_dl/extractor/tva.py b/youtube_dl/extractor/tva.py index 0b863df2f..443f46e8a 100644 --- 
a/youtube_dl/extractor/tva.py +++ b/youtube_dl/extractor/tva.py @@ -9,8 +9,8 @@ from ..utils import ( class TVAIE(InfoExtractor): - _VALID_URL = r'https?://videos\.tva\.ca/details/_(?P<id>\d+)' - _TEST = { + _VALID_URL = r'https?://videos?\.tva\.ca/details/_(?P<id>\d+)' + _TESTS = [{ 'url': 'https://videos.tva.ca/details/_5596811470001', 'info_dict': { 'id': '5596811470001', @@ -24,7 +24,10 @@ class TVAIE(InfoExtractor): # m3u8 download 'skip_download': True, } - } + }, { + 'url': 'https://video.tva.ca/details/_5596811470001', + 'only_matching': True, + }] BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/5481942443001/default_default/index.html?videoId=%s' def _real_extract(self, url): diff --git a/youtube_dl/extractor/twentyfourvideo.py b/youtube_dl/extractor/twentyfourvideo.py index 1d66eeaff..74d14049b 100644 --- a/youtube_dl/extractor/twentyfourvideo.py +++ b/youtube_dl/extractor/twentyfourvideo.py @@ -17,8 +17,8 @@ class TwentyFourVideoIE(InfoExtractor): _VALID_URL = r'''(?x) https?:// (?P<host> - (?:(?:www|porno)\.)?24video\. - (?:net|me|xxx|sexy?|tube|adult|site) + (?:(?:www|porno?)\.)?24video\. 
+ (?:net|me|xxx|sexy?|tube|adult|site|vip) )/ (?: video/(?:(?:view|xml)/)?| @@ -59,6 +59,12 @@ class TwentyFourVideoIE(InfoExtractor): }, { 'url': 'https://porno.24video.net/video/2640421-vsya-takaya-gibkaya-i-v-masle', 'only_matching': True, + }, { + 'url': 'https://www.24video.vip/video/view/1044982', + 'only_matching': True, + }, { + 'url': 'https://porn.24video.net/video/2640421-vsya-takay', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index 8c0d70010..78ee0115c 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -17,12 +17,10 @@ from ..compat import ( from ..utils import ( clean_html, ExtractorError, - float_or_none, int_or_none, orderedSet, parse_duration, parse_iso8601, - qualities, try_get, unified_timestamp, update_url_query, @@ -327,6 +325,7 @@ class TwitchVodIE(TwitchItemBaseIE): 'allow_audio_only': 'true', 'allow_spectre': 'true', 'player': 'twitchweb', + 'playlist_include_framerate': 'true', 'nauth': access_token['token'], 'nauthsig': access_token['sig'], })), @@ -576,8 +575,8 @@ class TwitchStreamIE(TwitchBaseIE): channel_id = self._match_id(url) stream = self._call_api( - 'kraken/streams/%s?stream_type=all' % channel_id, channel_id, - 'Downloading stream JSON').get('stream') + 'kraken/streams/%s?stream_type=all' % channel_id.lower(), + channel_id, 'Downloading stream JSON').get('stream') if not stream: raise ExtractorError('%s is offline' % channel_id, expected=True) @@ -598,6 +597,7 @@ class TwitchStreamIE(TwitchBaseIE): 'allow_spectre': 'true', 'p': random.randint(1000000, 10000000), 'player': 'twitchweb', + 'playlist_include_framerate': 'true', 'segment_preference': '4', 'sig': access_token['sig'].encode('utf-8'), 'token': access_token['token'].encode('utf-8'), @@ -643,7 +643,14 @@ class TwitchStreamIE(TwitchBaseIE): class TwitchClipsIE(TwitchBaseIE): IE_NAME = 'twitch:clips' - _VALID_URL = 
r'https?://(?:clips\.twitch\.tv/(?:embed\?.*?\bclip=|(?:[^/]+/)*)|(?:www\.)?twitch\.tv/[^/]+/clip/)(?P<id>[^/?#&]+)' + _VALID_URL = r'''(?x) + https?:// + (?: + clips\.twitch\.tv/(?:embed\?.*?\bclip=|(?:[^/]+/)*)| + (?:(?:www|go|m)\.)?twitch\.tv/[^/]+/clip/ + ) + (?P<id>[^/?#&]+) + ''' _TESTS = [{ 'url': 'https://clips.twitch.tv/FaintLightGullWholeWheat', @@ -669,68 +676,92 @@ class TwitchClipsIE(TwitchBaseIE): }, { 'url': 'https://clips.twitch.tv/embed?clip=InquisitiveBreakableYogurtJebaited', 'only_matching': True, + }, { + 'url': 'https://m.twitch.tv/rossbroadcast/clip/ConfidentBraveHumanChefFrank', + 'only_matching': True, + }, { + 'url': 'https://go.twitch.tv/rossbroadcast/clip/ConfidentBraveHumanChefFrank', + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) - status = self._download_json( - 'https://clips.twitch.tv/api/v2/clips/%s/status' % video_id, - video_id) + clip = self._download_json( + 'https://gql.twitch.tv/gql', video_id, data=json.dumps({ + 'query': '''{ + clip(slug: "%s") { + broadcaster { + displayName + } + createdAt + curator { + displayName + id + } + durationSeconds + id + tiny: thumbnailURL(width: 86, height: 45) + small: thumbnailURL(width: 260, height: 147) + medium: thumbnailURL(width: 480, height: 272) + title + videoQualities { + frameRate + quality + sourceURL + } + viewCount + } +}''' % video_id, + }).encode(), headers={ + 'Client-ID': self._CLIENT_ID, + })['data']['clip'] + + if not clip: + raise ExtractorError( + 'This clip is no longer available', expected=True) formats = [] - - for option in status['quality_options']: + for option in clip.get('videoQualities', []): if not isinstance(option, dict): continue - source = url_or_none(option.get('source')) + source = url_or_none(option.get('sourceURL')) if not source: continue formats.append({ 'url': source, 'format_id': option.get('quality'), 'height': int_or_none(option.get('quality')), - 'fps': int_or_none(option.get('frame_rate')), + 'fps': 
int_or_none(option.get('frameRate')), }) - self._sort_formats(formats) - info = { + thumbnails = [] + for thumbnail_id in ('tiny', 'small', 'medium'): + thumbnail_url = clip.get(thumbnail_id) + if not thumbnail_url: + continue + thumb = { + 'id': thumbnail_id, + 'url': thumbnail_url, + } + mobj = re.search(r'-(\d+)x(\d+)\.', thumbnail_url) + if mobj: + thumb.update({ + 'height': int(mobj.group(2)), + 'width': int(mobj.group(1)), + }) + thumbnails.append(thumb) + + return { + 'id': clip.get('id') or video_id, + 'title': clip.get('title') or video_id, 'formats': formats, + 'duration': int_or_none(clip.get('durationSeconds')), + 'views': int_or_none(clip.get('viewCount')), + 'timestamp': unified_timestamp(clip.get('createdAt')), + 'thumbnails': thumbnails, + 'creator': try_get(clip, lambda x: x['broadcaster']['displayName'], compat_str), + 'uploader': try_get(clip, lambda x: x['curator']['displayName'], compat_str), + 'uploader_id': try_get(clip, lambda x: x['curator']['id'], compat_str), } - - clip = self._call_api( - 'kraken/clips/%s' % video_id, video_id, fatal=False, headers={ - 'Accept': 'application/vnd.twitchtv.v5+json', - }) - - if clip: - quality_key = qualities(('tiny', 'small', 'medium')) - thumbnails = [] - thumbnails_dict = clip.get('thumbnails') - if isinstance(thumbnails_dict, dict): - for thumbnail_id, thumbnail_url in thumbnails_dict.items(): - thumbnails.append({ - 'id': thumbnail_id, - 'url': thumbnail_url, - 'preference': quality_key(thumbnail_id), - }) - - info.update({ - 'id': clip.get('tracking_id') or video_id, - 'title': clip.get('title') or video_id, - 'duration': float_or_none(clip.get('duration')), - 'views': int_or_none(clip.get('views')), - 'timestamp': unified_timestamp(clip.get('created_at')), - 'thumbnails': thumbnails, - 'creator': try_get(clip, lambda x: x['broadcaster']['display_name'], compat_str), - 'uploader': try_get(clip, lambda x: x['curator']['display_name'], compat_str), - 'uploader_id': try_get(clip, lambda x: 
x['curator']['id'], compat_str), - }) - else: - info.update({ - 'title': video_id, - 'id': video_id, - }) - - return info diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py index 5f8d90fb4..01468981c 100644 --- a/youtube_dl/extractor/twitter.py +++ b/youtube_dl/extractor/twitter.py @@ -251,10 +251,10 @@ class TwitterIE(TwitterBaseIE): 'info_dict': { 'id': '700207533655363584', 'ext': 'mp4', - 'title': 'Simon Vertugo - BEAT PROD: @suhmeduh #Damndaniel', + 'title': 'simon vetugo - BEAT PROD: @suhmeduh #Damndaniel', 'description': 'BEAT PROD: @suhmeduh https://t.co/HBrQ4AfpvZ #Damndaniel https://t.co/byBooq2ejZ', 'thumbnail': r're:^https?://.*\.jpg', - 'uploader': 'Simon Vertugo', + 'uploader': 'simon vetugo', 'uploader_id': 'simonvertugo', 'duration': 30.0, 'timestamp': 1455777459, @@ -376,6 +376,10 @@ class TwitterIE(TwitterBaseIE): # Twitch Clip Embed 'url': 'https://twitter.com/GunB1g/status/1163218564784017422', 'only_matching': True, + }, { + # promo_video_website card + 'url': 'https://twitter.com/GunB1g/status/1163218564784017422', + 'only_matching': True, }] def _real_extract(self, url): @@ -458,10 +462,11 @@ class TwitterIE(TwitterBaseIE): return try_get(o, lambda x: x[x['type'].lower() + '_value']) card_name = card['name'].split(':')[-1] - if card_name == 'amplify': - formats = self._extract_formats_from_vmap_url( - get_binding_value('amplify_url_vmap'), - get_binding_value('amplify_content_id') or twid) + if card_name in ('amplify', 'promo_video_website'): + is_amplify = card_name == 'amplify' + vmap_url = get_binding_value('amplify_url_vmap') if is_amplify else get_binding_value('player_stream_url') + content_id = get_binding_value('%s_content_id' % (card_name if is_amplify else 'player')) + formats = self._extract_formats_from_vmap_url(vmap_url, content_id or twid) self._sort_formats(formats) thumbnails = [] diff --git a/youtube_dl/extractor/ufctv.py b/youtube_dl/extractor/ufctv.py index f3eaee6b3..3d74ba071 100644 --- 
a/youtube_dl/extractor/ufctv.py +++ b/youtube_dl/extractor/ufctv.py @@ -1,73 +1,16 @@ # coding: utf-8 from __future__ import unicode_literals -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - parse_duration, - parse_iso8601, - urlencode_postdata, -) +from .imggaming import ImgGamingBaseIE -class UFCTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?ufc\.tv/video/(?P<id>[^/]+)' +class UFCTVIE(ImgGamingBaseIE): + _VALID_URL = ImgGamingBaseIE._VALID_URL_TEMPL % r'(?:(?:app|www)\.)?(?:ufc\.tv|(?:ufc)?fightpass\.com)|ufcfightpass\.img(?:dge|gaming)\.com' _NETRC_MACHINE = 'ufctv' - _TEST = { - 'url': 'https://www.ufc.tv/video/ufc-219-countdown-full-episode', - 'info_dict': { - 'id': '34167', - 'ext': 'mp4', - 'title': 'UFC 219 Countdown: Full Episode', - 'description': 'md5:26d4e8bf4665ae5878842d7050c3c646', - 'timestamp': 1513962360, - 'upload_date': '20171222', - }, - 'params': { - # m3u8 download - 'skip_download': True, - } - } + _REALM = 'ufc' - def _real_initialize(self): - username, password = self._get_login_info() - if username is None: - return - code = self._download_json( - 'https://www.ufc.tv/secure/authenticate', - None, 'Logging in', data=urlencode_postdata({ - 'username': username, - 'password': password, - 'format': 'json', - })).get('code') - if code and code != 'loginsuccess': - raise ExtractorError(code, expected=True) - - def _real_extract(self, url): - display_id = self._match_id(url) - video_data = self._download_json(url, display_id, query={ - 'format': 'json', - }) - video_id = str(video_data['id']) - title = video_data['name'] - m3u8_url = self._download_json( - 'https://www.ufc.tv/service/publishpoint', video_id, query={ - 'type': 'video', - 'format': 'json', - 'id': video_id, - }, headers={ - 'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0_1 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A402 Safari/604.1', - })['path'] - m3u8_url = m3u8_url.replace('_iphone.', '.') - 
formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4') - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': title, - 'description': video_data.get('description'), - 'duration': parse_duration(video_data.get('runtime')), - 'timestamp': parse_iso8601(video_data.get('releaseDate')), - 'formats': formats, - } +class UFCArabiaIE(ImgGamingBaseIE): + _VALID_URL = ImgGamingBaseIE._VALID_URL_TEMPL % r'(?:(?:app|www)\.)?ufcarabia\.(?:ae|com)' + _NETRC_MACHINE = 'ufcarabia' + _REALM = 'admufc' diff --git a/youtube_dl/extractor/vice.py b/youtube_dl/extractor/vice.py index 8fdfd743d..e37499512 100644 --- a/youtube_dl/extractor/vice.py +++ b/youtube_dl/extractor/vice.py @@ -1,35 +1,50 @@ # coding: utf-8 from __future__ import unicode_literals -import re -import time +import functools import hashlib import json import random +import re +import time from .adobepass import AdobePassIE -from .youtube import YoutubeIE from .common import InfoExtractor +from .youtube import YoutubeIE from ..compat import ( compat_HTTPError, compat_str, ) from ..utils import ( + clean_html, ExtractorError, int_or_none, + OnDemandPagedList, parse_age_limit, str_or_none, try_get, ) -class ViceIE(AdobePassIE): +class ViceBaseIE(InfoExtractor): + def _call_api(self, resource, resource_key, resource_id, locale, fields, args=''): + return self._download_json( + 'https://video.vice.com/api/v1/graphql', resource_id, query={ + 'query': '''{ + %s(locale: "%s", %s: "%s"%s) { + %s + } +}''' % (resource, locale, resource_key, resource_id, args, fields), + })['data'][resource] + + +class ViceIE(ViceBaseIE, AdobePassIE): IE_NAME = 'vice' - _VALID_URL = r'https?://(?:(?:video|vms)\.vice|(?:www\.)?viceland)\.com/(?P<locale>[^/]+)/(?:video/[^/]+|embed)/(?P<id>[\da-f]+)' + _VALID_URL = r'https?://(?:(?:video|vms)\.vice|(?:www\.)?vice(?:land|tv))\.com/(?P<locale>[^/]+)/(?:video/[^/]+|embed)/(?P<id>[\da-f]{24})' _TESTS = [{ 'url': 
'https://video.vice.com/en_us/video/pet-cremator/58c69e38a55424f1227dc3f7', 'info_dict': { - 'id': '5e647f0125e145c9aef2069412c0cbde', + 'id': '58c69e38a55424f1227dc3f7', 'ext': 'mp4', 'title': '10 Questions You Always Wanted To Ask: Pet Cremator', 'description': 'md5:fe856caacf61fe0e74fab15ce2b07ca5', @@ -43,17 +58,16 @@ class ViceIE(AdobePassIE): # m3u8 download 'skip_download': True, }, - 'add_ie': ['UplynkPreplay'], }, { # geo restricted to US 'url': 'https://video.vice.com/en_us/video/the-signal-from-tolva/5816510690b70e6c5fd39a56', 'info_dict': { - 'id': '930c0ad1f47141cc955087eecaddb0e2', + 'id': '5816510690b70e6c5fd39a56', 'ext': 'mp4', - 'uploader': 'waypoint', + 'uploader': 'vice', 'title': 'The Signal From Tölva', 'description': 'md5:3927e3c79f9e8094606a2b3c5b5e55d5', - 'uploader_id': '57f7d621e05ca860fa9ccaf9', + 'uploader_id': '57a204088cb727dec794c67b', 'timestamp': 1477941983, 'upload_date': '20161031', }, @@ -61,15 +75,14 @@ class ViceIE(AdobePassIE): # m3u8 download 'skip_download': True, }, - 'add_ie': ['UplynkPreplay'], }, { 'url': 'https://video.vice.com/alps/video/ulfs-wien-beruchtigste-grafitti-crew-part-1/581b12b60a0e1f4c0fb6ea2f', 'info_dict': { 'id': '581b12b60a0e1f4c0fb6ea2f', 'ext': 'mp4', 'title': 'ULFs - Wien berüchtigste Grafitti Crew - Part 1', - 'description': '<p>Zwischen Hinterzimmer-Tattoos und U-Bahnschächten erzählen uns die Ulfs, wie es ist, "süchtig nach Sachbeschädigung" zu sein.</p>', - 'uploader': 'VICE', + 'description': 'Zwischen Hinterzimmer-Tattoos und U-Bahnschächten erzählen uns die Ulfs, wie es ist, "süchtig nach Sachbeschädigung" zu sein.', + 'uploader': 'vice', 'uploader_id': '57a204088cb727dec794c67b', 'timestamp': 1485368119, 'upload_date': '20170125', @@ -78,9 +91,7 @@ class ViceIE(AdobePassIE): 'params': { # AES-encrypted m3u8 'skip_download': True, - 'proxy': '127.0.0.1:8118', }, - 'add_ie': ['UplynkPreplay'], }, { 'url': 'https://video.vice.com/en_us/video/pizza-show-trailer/56d8c9a54d286ed92f7f30e4', 
'only_matching': True, @@ -98,7 +109,7 @@ class ViceIE(AdobePassIE): @staticmethod def _extract_urls(webpage): return re.findall( - r'<iframe\b[^>]+\bsrc=["\']((?:https?:)?//video\.vice\.com/[^/]+/embed/[\da-f]+)', + r'<iframe\b[^>]+\bsrc=["\']((?:https?:)?//video\.vice\.com/[^/]+/embed/[\da-f]{24})', webpage) @staticmethod @@ -109,31 +120,16 @@ class ViceIE(AdobePassIE): def _real_extract(self, url): locale, video_id = re.match(self._VALID_URL, url).groups() - webpage = self._download_webpage( - 'https://video.vice.com/%s/embed/%s' % (locale, video_id), - video_id) - - video = self._parse_json( - self._search_regex( - r'PREFETCH_DATA\s*=\s*({.+?})\s*;\s*\n', webpage, - 'app state'), video_id)['video'] - video_id = video.get('vms_id') or video.get('id') or video_id - title = video['title'] - is_locked = video.get('locked') + video = self._call_api('videos', 'id', video_id, locale, '''body + locked + rating + thumbnail_url + title''')[0] + title = video['title'].strip() rating = video.get('rating') - thumbnail = video.get('thumbnail_url') - duration = int_or_none(video.get('duration')) - series = try_get( - video, lambda x: x['episode']['season']['show']['title'], - compat_str) - episode_number = try_get( - video, lambda x: x['episode']['episode_number']) - season_number = try_get( - video, lambda x: x['episode']['season']['season_number']) - uploader = None query = {} - if is_locked: + if video.get('locked'): resource = self._get_mvpd_resource( 'VICELAND', title, video_id, rating) query['tvetoken'] = self._extract_mvpd_auth( @@ -148,12 +144,9 @@ class ViceIE(AdobePassIE): query.update({ 'exp': exp, 'sign': hashlib.sha512(('%s:GET:%d' % (video_id, exp)).encode()).hexdigest(), - '_ad_blocked': None, - '_ad_unit': '', - '_debug': '', + 'skipadstitching': 1, 'platform': 'desktop', 'rn': random.randint(10000, 100000), - 'fbprebidtoken': '', }) try: @@ -169,85 +162,94 @@ class ViceIE(AdobePassIE): raise video_data = preplay['video'] - base = video_data['base'] - 
uplynk_preplay_url = preplay['preplayURL'] - episode = video_data.get('episode', {}) - channel = video_data.get('channel', {}) + formats = self._extract_m3u8_formats( + preplay['playURL'], video_id, 'mp4', 'm3u8_native') + self._sort_formats(formats) + episode = video_data.get('episode') or {} + channel = video_data.get('channel') or {} + season = video_data.get('season') or {} subtitles = {} - cc_url = preplay.get('ccURL') - if cc_url: - subtitles['en'] = [{ + for subtitle in preplay.get('subtitleURLs', []): + cc_url = subtitle.get('url') + if not cc_url: + continue + language_code = try_get(subtitle, lambda x: x['languages'][0]['language_code'], compat_str) or 'en' + subtitles.setdefault(language_code, []).append({ 'url': cc_url, - }] + }) return { - '_type': 'url_transparent', - 'url': uplynk_preplay_url, + 'formats': formats, 'id': video_id, 'title': title, - 'description': base.get('body') or base.get('display_body'), - 'thumbnail': thumbnail, - 'duration': int_or_none(video_data.get('video_duration')) or duration, + 'description': clean_html(video.get('body')), + 'thumbnail': video.get('thumbnail_url'), + 'duration': int_or_none(video_data.get('video_duration')), 'timestamp': int_or_none(video_data.get('created_at'), 1000), - 'age_limit': parse_age_limit(video_data.get('video_rating')), - 'series': video_data.get('show_title') or series, - 'episode_number': int_or_none(episode.get('episode_number') or episode_number), + 'age_limit': parse_age_limit(video_data.get('video_rating') or rating), + 'series': try_get(video_data, lambda x: x['show']['base']['display_title'], compat_str), + 'episode_number': int_or_none(episode.get('episode_number')), 'episode_id': str_or_none(episode.get('id') or video_data.get('episode_id')), - 'season_number': int_or_none(season_number), - 'season_id': str_or_none(episode.get('season_id')), - 'uploader': channel.get('base', {}).get('title') or channel.get('name') or uploader, + 'season_number': 
int_or_none(season.get('season_number')), + 'season_id': str_or_none(season.get('id') or video_data.get('season_id')), + 'uploader': channel.get('name'), 'uploader_id': str_or_none(channel.get('id')), 'subtitles': subtitles, - 'ie_key': 'UplynkPreplay', } -class ViceShowIE(InfoExtractor): +class ViceShowIE(ViceBaseIE): IE_NAME = 'vice:show' - _VALID_URL = r'https?://(?:.+?\.)?vice\.com/(?:[^/]+/)?show/(?P<id>[^/?#&]+)' - - _TEST = { - 'url': 'https://munchies.vice.com/en/show/fuck-thats-delicious-2', + _VALID_URL = r'https?://(?:video\.vice|(?:www\.)?vice(?:land|tv))\.com/(?P<locale>[^/]+)/show/(?P<id>[^/?#&]+)' + _PAGE_SIZE = 25 + _TESTS = [{ + 'url': 'https://video.vice.com/en_us/show/fck-thats-delicious', 'info_dict': { - 'id': 'fuck-thats-delicious-2', - 'title': "Fuck, That's Delicious", - 'description': 'Follow the culinary adventures of rapper Action Bronson during his ongoing world tour.', + 'id': '57a2040c8cb727dec794c901', + 'title': 'F*ck, That’s Delicious', + 'description': 'The life and eating habits of rap’s greatest bon vivant, Action Bronson.', }, - 'playlist_count': 17, - } + 'playlist_mincount': 64, + }, { + 'url': 'https://www.vicetv.com/en_us/show/fck-thats-delicious', + 'only_matching': True, + }] + + def _fetch_page(self, locale, show_id, page): + videos = self._call_api('videos', 'show_id', show_id, locale, '''body + id + url''', ', page: %d, per_page: %d' % (page + 1, self._PAGE_SIZE)) + for video in videos: + yield self.url_result( + video['url'], ViceIE.ie_key(), video.get('id')) def _real_extract(self, url): - show_id = self._match_id(url) - webpage = self._download_webpage(url, show_id) + locale, display_id = re.match(self._VALID_URL, url).groups() + show = self._call_api('shows', 'slug', display_id, locale, '''dek + id + title''')[0] + show_id = show['id'] - entries = [ - self.url_result(video_url, ViceIE.ie_key()) - for video_url, _ in re.findall( - r'<h2[^>]+class="article-title"[^>]+data-id="\d+"[^>]*>\s*<a[^>]+href="(%s.*?)"' - % 
ViceIE._VALID_URL, webpage)] + entries = OnDemandPagedList( + functools.partial(self._fetch_page, locale, show_id), + self._PAGE_SIZE) - title = self._search_regex( - r'<title>(.+?)', webpage, 'title', default=None) - if title: - title = re.sub(r'(.+)\s*\|\s*.+$', r'\1', title).strip() - description = self._html_search_meta( - 'description', webpage, 'description') - - return self.playlist_result(entries, show_id, title, description) + return self.playlist_result( + entries, show_id, show.get('title'), show.get('dek')) -class ViceArticleIE(InfoExtractor): +class ViceArticleIE(ViceBaseIE): IE_NAME = 'vice:article' - _VALID_URL = r'https://www\.vice\.com/[^/]+/article/(?P[^?#]+)' + _VALID_URL = r'https://(?:www\.)?vice\.com/(?P[^/]+)/article/(?:[0-9a-z]{6}/)?(?P[^?#]+)' _TESTS = [{ 'url': 'https://www.vice.com/en_us/article/on-set-with-the-woman-making-mormon-porn-in-utah', 'info_dict': { - 'id': '41eae2a47b174a1398357cec55f1f6fc', + 'id': '58dc0a3dee202d2a0ccfcbd8', 'ext': 'mp4', - 'title': 'Mormon War on Porn ', - 'description': 'md5:6394a8398506581d0346b9ab89093fef', + 'title': 'Mormon War on Porn', + 'description': 'md5:1c5d91fe25fa8aa304f9def118b92dbf', 'uploader': 'vice', 'uploader_id': '57a204088cb727dec794c67b', 'timestamp': 1491883129, @@ -258,10 +260,10 @@ class ViceArticleIE(InfoExtractor): # AES-encrypted m3u8 'skip_download': True, }, - 'add_ie': ['UplynkPreplay'], + 'add_ie': [ViceIE.ie_key()], }, { 'url': 'https://www.vice.com/en_us/article/how-to-hack-a-car', - 'md5': '7fe8ebc4fa3323efafc127b82bd821d9', + 'md5': '13010ee0bc694ea87ec40724397c2349', 'info_dict': { 'id': '3jstaBeXgAs', 'ext': 'mp4', @@ -271,15 +273,15 @@ class ViceArticleIE(InfoExtractor): 'uploader_id': 'MotherboardTV', 'upload_date': '20140529', }, - 'add_ie': ['Youtube'], + 'add_ie': [YoutubeIE.ie_key()], }, { 'url': 'https://www.vice.com/en_us/article/znm9dx/karley-sciortino-slutever-reloaded', 'md5': 'a7ecf64ee4fa19b916c16f4b56184ae2', 'info_dict': { - 'id': 
'e2ed435eb67e43efb66e6ef9a6930a88', + 'id': '57f41d3556a0a80f54726060', 'ext': 'mp4', 'title': "Making The World's First Male Sex Doll", - 'description': 'md5:916078ef0e032d76343116208b6cc2c4', + 'description': 'md5:19b00b215b99961cf869c40fbe9df755', 'uploader': 'vice', 'uploader_id': '57a204088cb727dec794c67b', 'timestamp': 1476919911, @@ -288,6 +290,7 @@ class ViceArticleIE(InfoExtractor): }, 'params': { 'skip_download': True, + 'format': 'bestvideo', }, 'add_ie': [ViceIE.ie_key()], }, { @@ -299,14 +302,11 @@ class ViceArticleIE(InfoExtractor): }] def _real_extract(self, url): - display_id = self._match_id(url) + locale, display_id = re.match(self._VALID_URL, url).groups() - webpage = self._download_webpage(url, display_id) - - prefetch_data = self._parse_json(self._search_regex( - r'__APP_STATE\s*=\s*({.+?})(?:\s*\|\|\s*{}\s*)?;\s*\n', - webpage, 'app state'), display_id)['pageData'] - body = prefetch_data['body'] + article = self._call_api('articles', 'slug', display_id, locale, '''body + embed_code''')[0] + body = article['body'] def _url_res(video_url, ie_key): return { @@ -316,7 +316,7 @@ class ViceArticleIE(InfoExtractor): 'ie_key': ie_key, } - vice_url = ViceIE._extract_url(webpage) + vice_url = ViceIE._extract_url(body) if vice_url: return _url_res(vice_url, ViceIE.ie_key()) @@ -332,6 +332,6 @@ class ViceArticleIE(InfoExtractor): video_url = self._html_search_regex( r'data-video-url="([^"]+)"', - prefetch_data['embed_code'], 'video URL') + article['embed_code'], 'video URL') return _url_res(video_url, ViceIE.ie_key()) diff --git a/youtube_dl/extractor/videopremium.py b/youtube_dl/extractor/videopremium.py deleted file mode 100644 index cf690d7b0..000000000 --- a/youtube_dl/extractor/videopremium.py +++ /dev/null @@ -1,46 +0,0 @@ -from __future__ import unicode_literals - -import re -import random - -from .common import InfoExtractor - - -class VideoPremiumIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?videopremium\.(?:tv|me)/(?P\w+)(?:/.*)?' 
- _TEST = { - 'url': 'http://videopremium.tv/4w7oadjsf156', - 'info_dict': { - 'id': '4w7oadjsf156', - 'ext': 'f4v', - 'title': 'youtube-dl_test_video____a_________-BaW_jenozKc.mp4.mp4' - }, - 'params': { - 'skip_download': True, - }, - 'skip': 'Test file has been deleted.', - } - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage_url = 'http://videopremium.tv/' + video_id - webpage = self._download_webpage(webpage_url, video_id) - - if re.match(r'^]*>window\.location\s*=', webpage): - # Download again, we need a cookie - webpage = self._download_webpage( - webpage_url, video_id, - note='Downloading webpage again (with cookie)') - - video_title = self._html_search_regex( - r'\s*(.+?)\s*<', webpage, 'video title') - - return { - 'id': video_id, - 'url': 'rtmp://e%d.md.iplay.md/play' % random.randint(1, 16), - 'play_path': 'mp4:%s.f4v' % video_id, - 'page_url': 'http://videopremium.tv/' + video_id, - 'player_url': 'http://videopremium.tv/uplayer/uppod.swf', - 'ext': 'f4v', - 'title': video_title, - } diff --git a/youtube_dl/extractor/viewlift.py b/youtube_dl/extractor/viewlift.py index 851ad936c..d6b92b1c8 100644 --- a/youtube_dl/extractor/viewlift.py +++ b/youtube_dl/extractor/viewlift.py @@ -1,28 +1,62 @@ from __future__ import unicode_literals -import base64 +import json import re from .common import InfoExtractor -from ..compat import compat_urllib_parse_unquote +from ..compat import compat_HTTPError from ..utils import ( ExtractorError, - clean_html, - determine_ext, int_or_none, - js_to_json, parse_age_limit, - parse_duration, - try_get, ) class ViewLiftBaseIE(InfoExtractor): - _DOMAINS_REGEX = r'(?:(?:main\.)?snagfilms|snagxtreme|funnyforfree|kiddovid|winnersview|(?:monumental|lax)sportsnetwork|vayafilm)\.com|hoichoi\.tv' + _API_BASE = 'https://prod-api.viewlift.com/' + _DOMAINS_REGEX = 
r'(?:(?:main\.)?snagfilms|snagxtreme|funnyforfree|kiddovid|winnersview|(?:monumental|lax)sportsnetwork|vayafilm|failarmy|ftfnext|lnppass\.legapallacanestro|moviespree|app\.myoutdoortv|neoufitness|pflmma|theidentitytb)\.com|(?:hoichoi|app\.horseandcountry|kronon|marquee|supercrosslive)\.tv' + _SITE_MAP = { + 'ftfnext': 'lax', + 'funnyforfree': 'snagfilms', + 'hoichoi': 'hoichoitv', + 'kiddovid': 'snagfilms', + 'laxsportsnetwork': 'lax', + 'legapallacanestro': 'lnp', + 'marquee': 'marquee-tv', + 'monumentalsportsnetwork': 'monumental-network', + 'moviespree': 'bingeflix', + 'pflmma': 'pfl', + 'snagxtreme': 'snagfilms', + 'theidentitytb': 'tampabay', + 'vayafilm': 'snagfilms', + } + _TOKENS = {} + + def _call_api(self, site, path, video_id, query): + token = self._TOKENS.get(site) + if not token: + token_query = {'site': site} + email, password = self._get_login_info(netrc_machine=site) + if email: + resp = self._download_json( + self._API_BASE + 'identity/signin', video_id, + 'Logging in', query=token_query, data=json.dumps({ + 'email': email, + 'password': password, + }).encode()) + else: + resp = self._download_json( + self._API_BASE + 'identity/anonymous-token', video_id, + 'Downloading authorization token', query=token_query) + self._TOKENS[site] = token = resp['authorizationToken'] + return self._download_json( + self._API_BASE + path, video_id, + headers={'Authorization': token}, query=query) class ViewLiftEmbedIE(ViewLiftBaseIE): - _VALID_URL = r'https?://(?:(?:www|embed)\.)?(?:%s)/embed/player\?.*\bfilmId=(?P[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12})' % ViewLiftBaseIE._DOMAINS_REGEX + IE_NAME = 'viewlift:embed' + _VALID_URL = r'https?://(?:(?:www|embed)\.)?(?P%s)/embed/player\?.*\bfilmId=(?P[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12})' % ViewLiftBaseIE._DOMAINS_REGEX _TESTS = [{ 'url': 'http://embed.snagfilms.com/embed/player?filmId=74849a00-85a9-11e1-9660-123139220831&w=500', 'md5': '2924e9215c6eff7a55ed35b72276bd93', @@ -30,6 +64,9 @@ class 
ViewLiftEmbedIE(ViewLiftBaseIE): 'id': '74849a00-85a9-11e1-9660-123139220831', 'ext': 'mp4', 'title': '#whilewewatch', + 'description': 'md5:b542bef32a6f657dadd0df06e26fb0c8', + 'timestamp': 1334350096, + 'upload_date': '20120413', } }, { # invalid labels, 360p is better that 480p @@ -39,7 +76,8 @@ class ViewLiftEmbedIE(ViewLiftBaseIE): 'id': '17ca0950-a74a-11e0-a92a-0026bb61d036', 'ext': 'mp4', 'title': 'Life in Limbo', - } + }, + 'skip': 'The video does not exist', }, { 'url': 'http://www.snagfilms.com/embed/player?filmId=0000014c-de2f-d5d6-abcf-ffef58af0017', 'only_matching': True, @@ -54,67 +92,68 @@ class ViewLiftEmbedIE(ViewLiftBaseIE): return mobj.group('url') def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - if '>This film is not playable in your area.<' in webpage: - raise ExtractorError( - 'Film %s is not playable in your area.' % video_id, expected=True) + domain, film_id = re.match(self._VALID_URL, url).groups() + site = domain.split('.')[-2] + if site in self._SITE_MAP: + site = self._SITE_MAP[site] + try: + content_data = self._call_api( + site, 'entitlement/video/status', film_id, { + 'id': film_id + })['video'] + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + error_message = self._parse_json(e.cause.read().decode(), film_id).get('errorMessage') + if error_message == 'User does not have a valid subscription or has not purchased this content.': + self.raise_login_required() + raise ExtractorError(error_message, expected=True) + raise + gist = content_data['gist'] + title = gist['title'] + video_assets = content_data['streamingInfo']['videoAssets'] formats = [] - has_bitrate = False - sources = self._parse_json(self._search_regex( - r'(?s)sources:\s*(\[.+?\]),', webpage, - 'sources', default='[]'), video_id, js_to_json) - for source in sources: - file_ = source.get('file') - if not file_: + mpeg_video_assets = video_assets.get('mpeg') 
or [] + for video_asset in mpeg_video_assets: + video_asset_url = video_asset.get('url') + if not video_asset: continue - type_ = source.get('type') - ext = determine_ext(file_) - format_id = source.get('label') or ext - if all(v in ('m3u8', 'hls') for v in (type_, ext)): - formats.extend(self._extract_m3u8_formats( - file_, video_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) - else: - bitrate = int_or_none(self._search_regex( - [r'(\d+)kbps', r'_\d{1,2}x\d{1,2}_(\d{3,})\.%s' % ext], - file_, 'bitrate', default=None)) - if not has_bitrate and bitrate: - has_bitrate = True - height = int_or_none(self._search_regex( - r'^(\d+)[pP]$', format_id, 'height', default=None)) - formats.append({ - 'url': file_, - 'format_id': 'http-%s%s' % (format_id, ('-%dk' % bitrate if bitrate else '')), - 'tbr': bitrate, - 'height': height, - }) - if not formats: - hls_url = self._parse_json(self._search_regex( - r'filmInfo\.src\s*=\s*({.+?});', - webpage, 'src'), video_id, js_to_json)['src'] - formats = self._extract_m3u8_formats( - hls_url, video_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False) - field_preference = None if has_bitrate else ('height', 'tbr', 'format_id') - self._sort_formats(formats, field_preference) + bitrate = int_or_none(video_asset.get('bitrate')) + height = int_or_none(self._search_regex( + r'^_?(\d+)[pP]$', video_asset.get('renditionValue'), + 'height', default=None)) + formats.append({ + 'url': video_asset_url, + 'format_id': 'http%s' % ('-%d' % bitrate if bitrate else ''), + 'tbr': bitrate, + 'height': height, + 'vcodec': video_asset.get('codec'), + }) - title = self._search_regex( - [r"title\s*:\s*'([^']+)'", r'([^<]+)'], - webpage, 'title') + hls_url = video_assets.get('hls') + if hls_url: + formats.extend(self._extract_m3u8_formats( + hls_url, film_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) + self._sort_formats(formats, ('height', 'tbr', 'format_id')) - return { - 'id': video_id, + info = { + 'id': film_id, 'title': title, + 
'description': gist.get('description'), + 'thumbnail': gist.get('videoImageUrl'), + 'duration': int_or_none(gist.get('runtime')), + 'age_limit': parse_age_limit(content_data.get('parentalRating')), + 'timestamp': int_or_none(gist.get('publishDate'), 1000), 'formats': formats, } + for k in ('categories', 'tags'): + info[k] = [v['title'] for v in content_data.get(k, []) if v.get('title')] + return info class ViewLiftIE(ViewLiftBaseIE): - _VALID_URL = r'https?://(?:www\.)?(?P%s)(?:/(?:films/title|show|(?:news/)?videos?))?/(?P[^?#]+)' % ViewLiftBaseIE._DOMAINS_REGEX + IE_NAME = 'viewlift' + _VALID_URL = r'https?://(?:www\.)?(?P%s)(?P(?:/(?:films/title|show|(?:news/)?videos?|watch))?/(?P[^?#]+))' % ViewLiftBaseIE._DOMAINS_REGEX _TESTS = [{ 'url': 'http://www.snagfilms.com/films/title/lost_for_life', 'md5': '19844f897b35af219773fd63bdec2942', @@ -151,10 +190,13 @@ class ViewLiftIE(ViewLiftBaseIE): 'id': '00000148-7b53-de26-a9fb-fbf306f70020', 'display_id': 'augie_alone/s_2_ep_12_love', 'ext': 'mp4', - 'title': 'Augie, Alone:S. 2 Ep. 12 - Love', - 'description': 'md5:db2a5c72d994f16a780c1eb353a8f403', + 'title': 'S. 2 Ep. 
12 - Love', + 'description': 'Augie finds love.', 'thumbnail': r're:^https?://.*\.jpg', 'duration': 107, + 'upload_date': '20141012', + 'timestamp': 1413129540, + 'age_limit': 17, }, 'params': { 'skip_download': True, @@ -177,6 +219,9 @@ class ViewLiftIE(ViewLiftBaseIE): # Was once Kaltura embed 'url': 'https://www.monumentalsportsnetwork.com/videos/john-carlson-postgame-2-25-15', 'only_matching': True, + }, { + 'url': 'https://www.marquee.tv/watch/sadlerswells-sacredmonsters', + 'only_matching': True, }] @classmethod @@ -184,119 +229,22 @@ class ViewLiftIE(ViewLiftBaseIE): return False if ViewLiftEmbedIE.suitable(url) else super(ViewLiftIE, cls).suitable(url) def _real_extract(self, url): - domain, display_id = re.match(self._VALID_URL, url).groups() - - webpage = self._download_webpage(url, display_id) - - if ">Sorry, the Film you're looking for is not available.<" in webpage: - raise ExtractorError( - 'Film %s is not available.' % display_id, expected=True) - - initial_store_state = self._search_regex( - r"window\.initialStoreState\s*=.*?JSON\.parse\(unescape\(atob\('([^']+)'\)\)\)", - webpage, 'Initial Store State', default=None) - if initial_store_state: - modules = self._parse_json(compat_urllib_parse_unquote(base64.b64decode( - initial_store_state).decode()), display_id)['page']['data']['modules'] - content_data = next(m['contentData'][0] for m in modules if m.get('moduleType') == 'VideoDetailModule') - gist = content_data['gist'] - film_id = gist['id'] - title = gist['title'] - video_assets = try_get( - content_data, lambda x: x['streamingInfo']['videoAssets'], dict) - if not video_assets: - token = self._download_json( - 'https://prod-api.viewlift.com/identity/anonymous-token', - film_id, 'Downloading authorization token', - query={'site': 'snagfilms'})['authorizationToken'] - video_assets = self._download_json( - 'https://prod-api.viewlift.com/entitlement/video/status', - film_id, headers={ - 'Authorization': token, - 'Referer': url, - }, query={ - 'id': 
film_id - })['video']['streamingInfo']['videoAssets'] - - formats = [] - mpeg_video_assets = video_assets.get('mpeg') or [] - for video_asset in mpeg_video_assets: - video_asset_url = video_asset.get('url') - if not video_asset: - continue - bitrate = int_or_none(video_asset.get('bitrate')) - height = int_or_none(self._search_regex( - r'^_?(\d+)[pP]$', video_asset.get('renditionValue'), - 'height', default=None)) - formats.append({ - 'url': video_asset_url, - 'format_id': 'http%s' % ('-%d' % bitrate if bitrate else ''), - 'tbr': bitrate, - 'height': height, - 'vcodec': video_asset.get('codec'), - }) - - hls_url = video_assets.get('hls') - if hls_url: - formats.extend(self._extract_m3u8_formats( - hls_url, film_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) - self._sort_formats(formats, ('height', 'tbr', 'format_id')) - - info = { - 'id': film_id, - 'display_id': display_id, - 'title': title, - 'description': gist.get('description'), - 'thumbnail': gist.get('videoImageUrl'), - 'duration': int_or_none(gist.get('runtime')), - 'age_limit': parse_age_limit(content_data.get('parentalRating')), - 'timestamp': int_or_none(gist.get('publishDate'), 1000), - 'formats': formats, - } - for k in ('categories', 'tags'): - info[k] = [v['title'] for v in content_data.get(k, []) if v.get('title')] - return info - else: - film_id = self._search_regex(r'filmId=([\da-f-]{36})"', webpage, 'film id') - - snag = self._parse_json( - self._search_regex( - r'Snag\.page\.data\s*=\s*(\[.+?\]);', webpage, 'snag', default='[]'), - display_id) - - for item in snag: - if item.get('data', {}).get('film', {}).get('id') == film_id: - data = item['data']['film'] - title = data['title'] - description = clean_html(data.get('synopsis')) - thumbnail = data.get('image') - duration = int_or_none(data.get('duration') or data.get('runtime')) - categories = [ - category['title'] for category in data.get('categories', []) - if category.get('title')] - break - else: - title = self._html_search_regex( - 
(r'itemprop="title">([^<]+)<', - r'(?s)itemprop="title">(.+?)(.+?)', - webpage, 'description', default=None) or self._og_search_description(webpage) - thumbnail = self._og_search_thumbnail(webpage) - duration = parse_duration(self._search_regex( - r'([^<]+)<', - webpage, 'duration', fatal=False)) - categories = re.findall(r'([^<]+)', webpage) - - return { - '_type': 'url_transparent', - 'url': 'http://%s/embed/player?filmId=%s' % (domain, film_id), - 'id': film_id, - 'display_id': display_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'duration': duration, - 'categories': categories, - 'ie_key': 'ViewLiftEmbed', - } + domain, path, display_id = re.match(self._VALID_URL, url).groups() + site = domain.split('.')[-2] + if site in self._SITE_MAP: + site = self._SITE_MAP[site] + modules = self._call_api( + site, 'content/pages', display_id, { + 'includeContent': 'true', + 'moduleOffset': 1, + 'path': path, + 'site': site, + })['modules'] + film_id = next(m['contentData'][0]['gist']['id'] for m in modules if m.get('moduleType') == 'VideoDetailModule') + return { + '_type': 'url_transparent', + 'url': 'http://%s/embed/player?filmId=%s' % (domain, film_id), + 'id': film_id, + 'display_id': display_id, + 'ie_key': 'ViewLiftEmbed', + } diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 9abd59d98..8cd611e1e 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -15,22 +15,25 @@ from ..compat import ( compat_urlparse, ) from ..utils import ( + clean_html, determine_ext, + dict_get, ExtractorError, js_to_json, int_or_none, merge_dicts, - NO_DEFAULT, OnDemandPagedList, parse_filesize, RegexNotFoundError, sanitized_Request, smuggle_url, std_headers, + str_or_none, try_get, unified_timestamp, unsmuggle_url, urlencode_postdata, + urljoin, unescapeHTML, ) @@ -189,7 +192,7 @@ class VimeoBaseInfoExtractor(InfoExtractor): for tt in text_tracks: subtitles[tt['lang']] = [{ 'ext': 'vtt', - 'url': 
'https://vimeo.com' + tt['url'], + 'url': urljoin('https://vimeo.com', tt['url']), }] thumbnails = [] @@ -210,7 +213,7 @@ class VimeoBaseInfoExtractor(InfoExtractor): video_uploader_url = owner.get('url') return { - 'id': video_id, + 'id': str_or_none(video_data.get('id')) or video_id, 'title': self._live_title(video_title) if is_live else video_title, 'uploader': owner.get('name'), 'uploader_id': video_uploader_url.split('/')[-1] if video_uploader_url else None, @@ -258,11 +261,11 @@ class VimeoIE(VimeoBaseInfoExtractor): (?: (?: www| - (?Pplayer) + player ) \. )? - vimeo(?Ppro)?\.com/ + vimeo(?:pro)?\.com/ (?!(?:channels|album|showcase)/[^/?#]+/?(?:$|[?#])|[^/]+/review/|ondemand/) (?:.*?/)? (?: @@ -284,7 +287,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'id': '56015672', 'ext': 'mp4', 'title': "youtube-dl test video - \u2605 \" ' \u5e78 / \\ \u00e4 \u21ad \U0001d550", - 'description': 'md5:509a9ad5c9bf97c60faee9203aca4479', + 'description': 'md5:2d3305bad981a06ff79f027f19865021', 'timestamp': 1355990239, 'upload_date': '20121220', 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/user7108434', @@ -293,6 +296,9 @@ class VimeoIE(VimeoBaseInfoExtractor): 'duration': 10, 'license': 'by-sa', }, + 'params': { + 'format': 'best[protocol=https]', + }, }, { 'url': 'http://vimeopro.com/openstreetmapus/state-of-the-map-us-2013/video/68093876', @@ -305,8 +311,13 @@ class VimeoIE(VimeoBaseInfoExtractor): 'uploader_id': 'openstreetmapus', 'uploader': 'OpenStreetMap US', 'title': 'Andy Allan - Putting the Carto into OpenStreetMap Cartography', - 'description': 'md5:fd69a7b8d8c34a4e1d2ec2e4afd6ec30', + 'description': 'md5:2c362968038d4499f4d79f88458590c1', 'duration': 1595, + 'upload_date': '20130610', + 'timestamp': 1370893156, + }, + 'params': { + 'format': 'best[protocol=https]', }, }, { @@ -323,6 +334,10 @@ class VimeoIE(VimeoBaseInfoExtractor): 'duration': 3610, 'description': None, }, + 'params': { + 'format': 'best[protocol=https]', + }, + 'expected_warnings': ['Unable to 
download JSON metadata'], }, { 'url': 'http://vimeo.com/68375962', @@ -341,6 +356,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'description': 'md5:dca3ea23adb29ee387127bc4ddfce63f', }, 'params': { + 'format': 'best[protocol=https]', 'videopassword': 'youtube-dl', }, }, @@ -441,10 +457,14 @@ class VimeoIE(VimeoBaseInfoExtractor): 'uploader': '10Ft Films', 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/tenfootfilms', 'uploader_id': 'tenfootfilms', + 'description': 'md5:0fa704e05b04f91f40b7f3ca2e801384', + 'upload_date': '20130830', + 'timestamp': 1377853339, }, 'params': { 'skip_download': True, }, + 'expected_warnings': ['Unable to download JSON metadata'], }, { 'url': 'http://player.vimeo.com/video/68375962', @@ -459,6 +479,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'duration': 10, }, 'params': { + 'format': 'best[protocol=https]', 'videopassword': 'youtube-dl', }, }, @@ -523,7 +544,7 @@ class VimeoIE(VimeoBaseInfoExtractor): def _verify_player_video_password(self, url, video_id, headers): password = self._downloader.params.get('videopassword') if password is None: - raise ExtractorError('This video is protected by a password, use the --video-password option') + raise ExtractorError('This video is protected by a password, use the --video-password option', expected=True) data = urlencode_postdata({ 'password': base64.b64encode(password.encode()), }) @@ -552,28 +573,26 @@ class VimeoIE(VimeoBaseInfoExtractor): r'vimeo\.com/channels/([^/]+)', url, 'channel id', default=None) # Extract ID from URL - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) orig_url = url - if mobj.group('pro'): + is_pro = 'vimeopro.com/' in url + is_player = '://player.vimeo.com/video/' in url + if is_pro: # some videos require portfolio_id to be present in player url # https://github.com/ytdl-org/youtube-dl/issues/20070 url = self._extract_url(url, self._download_webpage(url, video_id)) - elif mobj.group('player'): + if not url: + url = 
'https://vimeo.com/' + video_id + elif is_player: url = 'https://player.vimeo.com/video/' + video_id elif any(p in url for p in ('play_redirect_hls', 'moogaloop.swf')): url = 'https://vimeo.com/' + video_id - # Retrieve video webpage to extract further information - request = sanitized_Request(url, headers=headers) try: - webpage, urlh = self._download_webpage_handle(request, video_id) - redirect_url = compat_str(urlh.geturl()) - # Some URLs redirect to ondemand can't be extracted with - # this extractor right away thus should be passed through - # ondemand extractor (e.g. https://vimeo.com/73445910) - if VimeoOndemandIE.suitable(redirect_url): - return self.url_result(redirect_url, VimeoOndemandIE.ie_key()) + # Retrieve video webpage to extract further information + webpage, urlh = self._download_webpage_handle( + url, video_id, headers=headers) + redirect_url = urlh.geturl() except ExtractorError as ee: if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403: errmsg = ee.cause.read() @@ -600,6 +619,7 @@ class VimeoIE(VimeoBaseInfoExtractor): cc_license = None timestamp = None + video_description = None # Extract the config JSON try: @@ -611,17 +631,17 @@ class VimeoIE(VimeoBaseInfoExtractor): # Sometimes new react-based page is served instead of old one that require # different config URL extraction approach (see # https://github.com/ytdl-org/youtube-dl/pull/7209) - vimeo_clip_page_config = self._search_regex( - r'vimeo\.clip_page_config\s*=\s*({.+?});', webpage, - 'vimeo clip page config') - page_config = self._parse_json(vimeo_clip_page_config, video_id) + page_config = self._parse_json(self._search_regex( + r'vimeo\.(?:clip|vod_title)_page_config\s*=\s*({.+?});', + webpage, 'page config'), video_id) config_url = page_config['player']['config_url'] cc_license = page_config.get('cc_license') timestamp = try_get( page_config, lambda x: x['clip']['uploaded_on'], compat_str) - config_json = self._download_webpage(config_url, video_id) - config = 
json.loads(config_json) + video_description = clean_html(dict_get( + page_config, ('description', 'description_html_escaped'))) + config = self._download_json(config_url, video_id) except RegexNotFoundError: # For pro videos or player.vimeo.com urls # We try to find out to which variable is assigned the config dic @@ -675,14 +695,14 @@ class VimeoIE(VimeoBaseInfoExtractor): {'force_feature_id': True}), 'Vimeo') # Extract video description - - video_description = self._html_search_regex( - r'(?s)]*>(.*?)', - webpage, 'description', default=None) + if not video_description: + video_description = self._html_search_regex( + r'(?s)]*>(.*?)', + webpage, 'description', default=None) if not video_description: video_description = self._html_search_meta( 'description', webpage, default=None) - if not video_description and mobj.group('pro'): + if not video_description and is_pro: orig_webpage = self._download_webpage( orig_url, video_id, note='Downloading webpage for description', @@ -690,7 +710,7 @@ class VimeoIE(VimeoBaseInfoExtractor): if orig_webpage: video_description = self._html_search_meta( 'description', orig_webpage, default=None) - if not video_description and not mobj.group('player'): + if not video_description and not is_player: self._downloader.report_warning('Cannot find video description') # Extract upload date @@ -747,9 +767,9 @@ class VimeoIE(VimeoBaseInfoExtractor): return info_dict -class VimeoOndemandIE(VimeoBaseInfoExtractor): +class VimeoOndemandIE(VimeoIE): IE_NAME = 'vimeo:ondemand' - _VALID_URL = r'https?://(?:www\.)?vimeo\.com/ondemand/(?P[^/?#&]+)' + _VALID_URL = r'https?://(?:www\.)?vimeo\.com/ondemand/([^/]+/)?(?P[^/?#&]+)' _TESTS = [{ # ondemand video not available via https://vimeo.com/id 'url': 'https://vimeo.com/ondemand/20704', @@ -761,24 +781,32 @@ class VimeoOndemandIE(VimeoBaseInfoExtractor): 'uploader': 'גם סרטים', 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/gumfilms', 'uploader_id': 'gumfilms', + 'description': 
'md5:4c027c965e439de4baab621e48b60791', + 'upload_date': '20140906', + 'timestamp': 1410032453, }, 'params': { 'format': 'best[protocol=https]', }, + 'expected_warnings': ['Unable to download JSON metadata'], }, { # requires Referer to be passed along with og:video:url 'url': 'https://vimeo.com/ondemand/36938/126682985', 'info_dict': { - 'id': '126682985', + 'id': '126584684', 'ext': 'mp4', 'title': 'Rävlock, rätt läte på rätt plats', 'uploader': 'Lindroth & Norin', - 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/user14430847', - 'uploader_id': 'user14430847', + 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/lindrothnorin', + 'uploader_id': 'lindrothnorin', + 'description': 'md5:c3c46a90529612c8279fb6af803fc0df', + 'upload_date': '20150502', + 'timestamp': 1430586422, }, 'params': { 'skip_download': True, }, + 'expected_warnings': ['Unable to download JSON metadata'], }, { 'url': 'https://vimeo.com/ondemand/nazmaalik', 'only_matching': True, @@ -790,16 +818,6 @@ class VimeoOndemandIE(VimeoBaseInfoExtractor): 'only_matching': True, }] - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - return self.url_result( - # Some videos require Referer to be passed along with og:video:url - # similarly to generic vimeo embeds (e.g. - # https://vimeo.com/ondemand/36938/126682985). 
- VimeoIE._smuggle_referrer(self._og_search_video_url(webpage), url), - VimeoIE.ie_key()) - class VimeoChannelIE(VimeoBaseInfoExtractor): IE_NAME = 'vimeo:channel' @@ -815,6 +833,7 @@ class VimeoChannelIE(VimeoBaseInfoExtractor): }, 'playlist_mincount': 25, }] + _BASE_URL_TEMPL = 'https://vimeo.com/channels/%s' def _page_url(self, base_url, pagenum): return '%s/videos/page:%d/' % (base_url, pagenum) @@ -823,33 +842,6 @@ class VimeoChannelIE(VimeoBaseInfoExtractor): return self._TITLE or self._html_search_regex( self._TITLE_RE, webpage, 'list title', fatal=False) - def _login_list_password(self, page_url, list_id, webpage): - login_form = self._search_regex( - r'(?s)]+?id="pw_form"(.*?)', - webpage, 'login form', default=None) - if not login_form: - return webpage - - password = self._downloader.params.get('videopassword') - if password is None: - raise ExtractorError('This album is protected by a password, use the --video-password option', expected=True) - fields = self._hidden_inputs(login_form) - token, vuid = self._extract_xsrft_and_vuid(webpage) - fields['token'] = token - fields['password'] = password - post = urlencode_postdata(fields) - password_path = self._search_regex( - r'action="([^"]+)"', login_form, 'password URL') - password_url = compat_urlparse.urljoin(page_url, password_path) - password_request = sanitized_Request(password_url, post) - password_request.add_header('Content-type', 'application/x-www-form-urlencoded') - self._set_vimeo_cookie('vuid', vuid) - self._set_vimeo_cookie('xsrft', token) - - return self._download_webpage( - password_request, list_id, - 'Verifying the password', 'Wrong password') - def _title_and_entries(self, list_id, base_url): for pagenum in itertools.count(1): page_url = self._page_url(base_url, pagenum) @@ -858,7 +850,6 @@ class VimeoChannelIE(VimeoBaseInfoExtractor): 'Downloading page %s' % pagenum) if pagenum == 1: - webpage = self._login_list_password(page_url, list_id, webpage) yield self._extract_list_title(webpage) 
# Try extracting href first since not all videos are available via @@ -886,14 +877,13 @@ class VimeoChannelIE(VimeoBaseInfoExtractor): return self.playlist_result(title_and_entries, list_id, list_title) def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - channel_id = mobj.group('id') - return self._extract_videos(channel_id, 'https://vimeo.com/channels/%s' % channel_id) + channel_id = self._match_id(url) + return self._extract_videos(channel_id, self._BASE_URL_TEMPL % channel_id) class VimeoUserIE(VimeoChannelIE): IE_NAME = 'vimeo:user' - _VALID_URL = r'https://vimeo\.com/(?!(?:[0-9]+|watchlater)(?:$|[?#/]))(?P[^/]+)(?:/videos|[#?]|$)' + _VALID_URL = r'https://vimeo\.com/(?!(?:[0-9]+|watchlater)(?:$|[?#/]))(?P[^/]+)(?:/videos|[#?]|$)' _TITLE_RE = r']+?class="user">([^<>]+?)' _TESTS = [{ 'url': 'https://vimeo.com/nkistudio/videos', @@ -903,14 +893,10 @@ class VimeoUserIE(VimeoChannelIE): }, 'playlist_mincount': 66, }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - name = mobj.group('name') - return self._extract_videos(name, 'https://vimeo.com/%s' % name) + _BASE_URL_TEMPL = 'https://vimeo.com/%s' -class VimeoAlbumIE(VimeoChannelIE): +class VimeoAlbumIE(VimeoBaseInfoExtractor): IE_NAME = 'vimeo:album' _VALID_URL = r'https://vimeo\.com/(?:album|showcase)/(?P\d+)(?:$|[?#]|/(?!video))' _TITLE_RE = r'