diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index 52c5c1c32..97b8afcf9 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2019.09.01** +- [ ] I've verified that I'm running youtube-dl version **2020.01.01** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2019.09.01 + [debug] youtube-dl version 2020.01.01 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index 000d2f55a..de6c44a65 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2019.09.01** +- [ ] I've verified that I'm running youtube-dl version **2020.01.01** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index 77b710606..a9dd5ca52 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2019.09.01** +- [ ] I've verified that I'm running youtube-dl version **2020.01.01** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index ae112d965..8347903ea 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2019.09.01** +- [ ] I've verified that I'm running youtube-dl version **2020.01.01** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2019.09.01 + [debug] youtube-dl version 2020.01.01 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index fccfdf71a..92228513c 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2019.09.01** +- [ ] I've verified that I'm running youtube-dl version **2020.01.01** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/.travis.yml b/.travis.yml index 6d16c2955..14d95fa84 100644 --- a/.travis.yml +++ b/.travis.yml @@ -21,6 +21,12 @@ matrix: - python: 3.7 dist: xenial env: YTDL_TEST_SET=download + - python: 3.8 + dist: xenial + env: YTDL_TEST_SET=core + - python: 3.8 + dist: xenial + env: YTDL_TEST_SET=download - python: 3.8-dev dist: xenial env: YTDL_TEST_SET=core diff --git a/ChangeLog b/ChangeLog index e91e49854..c33169cd8 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,350 @@ +version 2020.01.01 + +Extractors +* [brightcove] Invalidate policy key cache on failing requests +* [pornhub] Improve locked videos detection (#22449, #22780) ++ [pornhub] Add support for m3u8 formats +* [pornhub] Fix extraction (#22749, #23082) +* [brightcove] Update policy key on failing requests +* [spankbang] Improve removed video detection (#23423) +* [spankbang] Fix extraction (#23307, #23423, #23444) +* [soundcloud] Automatically update client id on failing requests +* [prosiebensat1] Improve geo restriction handling (#23571) +* [brightcove] Cache brightcove player policy keys +* [teachable] Fail with error message if no video URL found +* [teachable] Improve locked lessons detection (#23528) ++ [scrippsnetworks] Add support for Scripps Networks sites (#19857, #22981) +* [mitele] Fix extraction (#21354, #23456) +* [soundcloud] Update client id (#23516) +* [mailru] Relax URL regular expressions (#23509) + + +version 2019.12.25 + +Core +* [utils] Improve str_to_int ++ [downloader/hls] Add ability to override AES decryption key URL (#17521) + +Extractors +* [mediaset] Fix parse formats (#23508) ++ [tv2dk:bornholm:play] Add support for play.tv2bornholm.dk (#23291) ++ [slideslive] Add support for url and vimeo service names (#23414) +* [slideslive] Fix extraction (#23413) +* [twitch:clips] Fix extraction (#23375) ++ [soundcloud] Add support for token protected embeds (#18954) +* [vk] Improve extraction + * Fix User Videos extraction (#23356) + * Extract all videos for lists with more than 1000 videos (#23356) + + Add support for video albums (#14327, #14492) +- [kontrtube] Remove extractor +- [videopremium] Remove extractor +- [musicplayon] Remove extractor (#9225) ++ [ufctv] Add support for ufcfightpass.imgdge.com and + ufcfightpass.imggaming.com (#23343) ++ [twitch] Extract m3u8 formats frame rate (#23333) ++ [imggaming] Add support for playlists and extract subtitles ++ [ufcarabia] Add support for UFC Arabia (#23312) +* [ufctv] Fix extraction +* [yahoo] Fix gyao brightcove player id (#23303) +* [vzaar] Override AES decryption key URL (#17521) ++ [vzaar] Add support for AES HLS manifests (#17521, #23299) +* [nrl] Fix extraction +* [teachingchannel] Fix extraction +* [nintendo] Fix extraction and partially add support for Nintendo Direct + videos (#4592) ++ [ooyala] Add better fallback values for domain and streams variables ++ [youtube] Add support youtubekids.com (#23272) +* [tv2] Detect DRM protection ++ [tv2] Add support for katsomo.fi and mtv.fi (#10543) +* [tv2] Fix tv2.no article extraction +* [msn] Improve extraction + + Add support for YouTube and NBCSports embeds + + Add support for articles with multiple videos + * Improve AOL embed support + * Improve format extraction +* [abcotvs] Relax URL regular expression and improve metadata extraction + (#18014) +* [channel9] Reduce response size +* [adobetv] Improve extaction + * Use OnDemandPagedList for list extractors + * Reduce show extraction requests + * Extract original video format and subtitles + + Add support for adobe tv embeds + + +version 2019.11.28 + +Core ++ [utils] Add generic caesar cipher and rot47 +* [utils] Handle rd-suffixed day parts in unified_strdate (#23199) + +Extractors +* [vimeo] Improve extraction + * Fix review extraction + * Fix ondemand extraction + * Make password protected player case as an expected error (#22896) + * Simplify channel based extractors code +- [openload] Remove extractor (#11999) +- [verystream] Remove extractor +- [streamango] Remove extractor (#15406) +* [dailymotion] Improve extraction + * Extract http formats included in m3u8 manifest + * Fix user extraction (#3553, #21415) + + Add suport for User Authentication (#11491) + * Fix password protected videos extraction (#23176) + * Respect age limit option and family filter cookie value (#18437) + * Handle video url playlist query param + * Report allowed countries for geo-restricted videos +* [corus] Improve extraction + + Add support for Series Plus, W Network, YTV, ABC Spark, disneychannel.com + and disneylachaine.ca (#20861) + + Add support for self hosted videos (#22075) + * Detect DRM protection (#14910, #9164) +* [vivo] Fix extraction (#22328, #22279) ++ [bitchute] Extract upload date (#22990, #23193) +* [soundcloud] Update client id (#23214) + + +version 2019.11.22 + +Core ++ [extractor/common] Clean jwplayer description HTML tags ++ [extractor/common] Add data, headers and query to all major extract formats + methods + +Extractors +* [chaturbate] Fix extraction (#23010, #23012) ++ [ntvru] Add support for non relative file URLs (#23140) +* [vk] Fix wall audio thumbnails extraction (#23135) +* [ivi] Fix format extraction (#21991) +- [comcarcoff] Remove extractor ++ [drtv] Add support for new URL schema (#23059) ++ [nexx] Add support for Multi Player JS Setup (#23052) ++ [teamcoco] Add support for new videos (#23054) +* [soundcloud] Check if the soundtrack has downloads left (#23045) +* [facebook] Fix posts video data extraction (#22473) +- [addanime] Remove extractor +- [minhateca] Remove extractor +- [daisuki] Remove extractor +* [seeker] Fix extraction +- [revision3] Remove extractors +* [twitch] Fix video comments URL (#18593, #15828) +* [twitter] Improve extraction + + Add support for generic embeds (#22168) + * Always extract http formats for native videos (#14934) + + Add support for Twitter Broadcasts (#21369) + + Extract more metadata + * Improve VMap format extraction + * Unify extraction code for both twitter statuses and cards ++ [twitch] Add support for Clip embed URLs +* [lnkgo] Fix extraction (#16834) +* [mixcloud] Improve extraction + * Improve metadata extraction (#11721) + * Fix playlist extraction (#22378) + * Fix user mixes extraction (#15197, #17865) ++ [kinja] Add support for Kinja embeds (#5756, #11282, #22237, #22384) +* [onionstudios] Fix extraction ++ [hotstar] Pass Referer header to format requests (#22836) +* [dplay] Minimize response size ++ [patreon] Extract uploader_id and filesize +* [patreon] Minimize response size +* [roosterteeth] Fix login request (#16094, #22689) + + +version 2019.11.05 + +Extractors ++ [scte] Add support for learning.scte.org (#22975) ++ [msn] Add support for Vidible and AOL embeds (#22195, #22227) +* [myspass] Fix video URL extraction and improve metadata extraction (#22448) +* [jamendo] Improve extraction + * Fix album extraction (#18564) + * Improve metadata extraction (#18565, #21379) +* [mediaset] Relax URL guid matching (#18352) ++ [mediaset] Extract unprotected M3U and MPD manifests (#17204) +* [telegraaf] Fix extraction ++ [bellmedia] Add support for marilyn.ca videos (#22193) +* [stv] Fix extraction (#22928) +- [iconosquare] Remove extractor +- [keek] Remove extractor +- [gameone] Remove extractor (#21778) +- [flipagram] Remove extractor +- [bambuser] Remove extractor +* [wistia] Reduce embed extraction false positives ++ [wistia] Add support for inline embeds (#22931) +- [go90] Remove extractor +* [kakao] Remove raw request ++ [kakao] Extract format total bitrate +* [daum] Fix VOD and Clip extracton (#15015) +* [kakao] Improve extraction + + Add support for embed URLs + + Add support for Kakao Legacy vid based embed URLs + * Only extract fields used for extraction + * Strip description and extract tags +* [mixcloud] Fix cloudcast data extraction (#22821) +* [yahoo] Improve extraction + + Add support for live streams (#3597, #3779, #22178) + * Bypass cookie consent page for european domains (#16948, #22576) + + Add generic support for embeds (#20332) +* [tv2] Fix and improve extraction (#22787) ++ [tv2dk] Add support for TV2 DK sites +* [onet] Improve extraction … + + Add support for onet100.vod.pl + + Extract m3u8 formats + * Correct audio only format info +* [fox9] Fix extraction + + +version 2019.10.29 + +Core +* [utils] Actualize major IPv4 address blocks per country + +Extractors ++ [go] Add support for abc.com and freeform.com (#22823, #22864) ++ [mtv] Add support for mtvjapan.com +* [mtv] Fix extraction for mtv.de (#22113) +* [videodetective] Fix extraction +* [internetvideoarchive] Fix extraction +* [nbcnews] Fix extraction (#12569, #12576, #21703, #21923) +- [hark] Remove extractor +- [tutv] Remove extractor +- [learnr] Remove extractor +- [macgamestore] Remove extractor +* [la7] Update Kaltura service URL (#22358) +* [thesun] Fix extraction (#16966) +- [makertv] Remove extractor ++ [tenplay] Add support for 10play.com.au (#21446) +* [soundcloud] Improve extraction + * Improve format extraction (#22123) + + Extract uploader_id and uploader_url (#21916) + + Extract all known thumbnails (#19071, #20659) + * Fix extration for private playlists (#20976) + + Add support for playlist embeds (#20976) + * Skip preview formats (#22806) +* [dplay] Improve extraction + + Add support for dplay.fi, dplay.jp and es.dplay.com (#16969) + * Fix it.dplay.com extraction (#22826) + + Extract creator, tags and thumbnails + * Handle playback API call errors ++ [discoverynetworks] Add support for dplay.co.uk +* [vk] Improve extraction + + Add support for Odnoklassniki embeds + + Extract more videos from user lists (#4470) + + Fix wall post audio extraction (#18332) + * Improve error detection (#22568) ++ [odnoklassniki] Add support for embeds +* [puhutv] Improve extraction + * Fix subtitles extraction + * Transform HLS URLs to HTTP URLs + * Improve metadata extraction +* [ceskatelevize] Skip DRM media ++ [facebook] Extract subtitles (#22777) +* [globo] Handle alternative hash signing method + + +version 2019.10.22 + +Core +* [utils] Improve subtitles_filename (#22753) + +Extractors +* [facebook] Bypass download rate limits (#21018) ++ [contv] Add support for contv.com +- [viewster] Remove extractor +* [xfileshare] Improve extractor (#17032, #17906, #18237, #18239) + * Update the list of domains + + Add support for aa-encoded video data + * Improve jwplayer format extraction + + Add support for Clappr sources +* [mangomolo] Fix video format extraction and add support for player URLs +* [audioboom] Improve metadata extraction +* [twitch] Update VOD URL matching (#22395, #22727) +- [mit] Remove support for video.mit.edu (#22403) +- [servingsys] Remove extractor (#22639) +* [dumpert] Fix extraction (#22428, #22564) +* [atresplayer] Fix extraction (#16277, #16716) + + +version 2019.10.16 + +Core +* [extractor/common] Make _is_valid_url more relaxed + +Extractors +* [vimeo] Improve album videos id extraction (#22599) ++ [globo] Extract subtitles (#22713) +* [bokecc] Improve player params extraction (#22638) +* [nexx] Handle result list (#22666) +* [vimeo] Fix VHX embed extraction +* [nbc] Switch to graphql API (#18581, #22693, #22701) +- [vessel] Remove extractor +- [promptfile] Remove extractor (#6239) +* [kaltura] Fix service URL extraction (#22658) +* [kaltura] Fix embed info strip (#22658) +* [globo] Fix format extraction (#20319) +* [redtube] Improve metadata extraction (#22492, #22615) +* [pornhub:uservideos:upload] Fix extraction (#22619) ++ [telequebec:squat] Add support for squat.telequebec.tv (#18503) +- [wimp] Remove extractor (#22088, #22091) ++ [gfycat] Extend URL regular expression (#22225) ++ [chaturbate] Extend URL regular expression (#22309) +* [peertube] Update instances (#22414) ++ [telequebec] Add support for coucou.telequebec.tv (#22482) ++ [xvideos] Extend URL regular expression (#22471) +- [youtube] Remove support for invidious.enkirton.net (#22543) ++ [openload] Add support for oload.monster (#22592) +* [nrktv:seriebase] Fix extraction (#22596) ++ [youtube] Add support for yt.lelux.fi (#22597) +* [orf:tvthek] Make manifest requests non fatal (#22578) +* [teachable] Skip login when already logged in (#22572) +* [viewlift] Improve extraction (#22545) +* [nonktube] Fix extraction (#22544) + + +version 2019.09.28 + +Core +* [YoutubeDL] Honour all --get-* options with --flat-playlist (#22493) + +Extractors +* [vk] Fix extraction (#22522) +* [heise] Fix kaltura embeds extraction (#22514) +* [ted] Check for resources validity and extract subtitled downloads (#22513) ++ [youtube] Add support for + owxfohz4kjyv25fvlqilyxast7inivgiktls3th44jhk3ej3i7ya.b32.i2p (#22292) ++ [nhk] Add support for clips +* [nhk] Fix video extraction (#22249, #22353) +* [byutv] Fix extraction (#22070) ++ [openload] Add support for oload.online (#22304) ++ [youtube] Add support for invidious.drycat.fr (#22451) +* [jwplatfom] Do not match video URLs (#20596, #22148) +* [youtube:playlist] Unescape playlist uploader (#22483) ++ [bilibili] Add support audio albums and songs (#21094) ++ [instagram] Add support for tv URLs ++ [mixcloud] Allow uppercase letters in format URLs (#19280) +* [brightcove] Delegate all supported legacy URLs to new extractor (#11523, + #12842, #13912, #15669, #16303) +* [hotstar] Use native HLS downloader by default ++ [hotstar] Extract more formats (#22323) +* [9now] Fix extraction (#22361) +* [zdf] Bypass geo restriction ++ [tv4] Extract series metadata +* [tv4] Fix extraction (#22443) + + +version 2019.09.12.1 + +Extractors +* [youtube] Remove quality and tbr for itag 43 (#22372) + + +version 2019.09.12 + +Extractors +* [youtube] Quick extraction tempfix (#22367, #22163) + + version 2019.09.01 Core @@ -310,7 +657,7 @@ Extractors version 2019.04.17 Extractors -* [openload] Randomize User-Agent (closes #20688) +* [openload] Randomize User-Agent (#20688) + [openload] Add support for oladblock domains (#20471) * [adn] Fix subtitle extraction (#12724) + [aol] Add support for localized websites @@ -875,7 +1222,7 @@ Extractors + [youtube] Extract channel meta fields (#9676, #12939) * [porntube] Fix extraction (#17541) * [asiancrush] Fix extraction (#15630) -+ [twitch:clips] Extend URL regular expression (closes #17559) ++ [twitch:clips] Extend URL regular expression (#17559) + [vzaar] Add support for HLS * [tube8] Fix metadata extraction (#17520) * [eporner] Extract JSON-LD (#17519) diff --git a/README.md b/README.md index c39b13616..01f975958 100644 --- a/README.md +++ b/README.md @@ -752,8 +752,8 @@ As a last resort, you can also uninstall the version installed by your package m Afterwards, simply follow [our manual installation instructions](https://ytdl-org.github.io/youtube-dl/download.html): ``` -sudo wget https://yt-dl.org/latest/youtube-dl -O /usr/local/bin/youtube-dl -sudo chmod a+x /usr/local/bin/youtube-dl +sudo wget https://yt-dl.org/downloads/latest/youtube-dl -O /usr/local/bin/youtube-dl +sudo chmod a+rx /usr/local/bin/youtube-dl hash -r ``` diff --git a/devscripts/create-github-release.py b/devscripts/create-github-release.py index 428111b3f..2ddfa1096 100644 --- a/devscripts/create-github-release.py +++ b/devscripts/create-github-release.py @@ -1,7 +1,6 @@ #!/usr/bin/env python from __future__ import unicode_literals -import base64 import io import json import mimetypes @@ -15,7 +14,6 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from youtube_dl.compat import ( compat_basestring, - compat_input, compat_getpass, compat_print, compat_urllib_request, @@ -40,28 +38,20 @@ class GitHubReleaser(object): try: info = netrc.netrc().authenticators(self._NETRC_MACHINE) if info is not None: - self._username = info[0] - self._password = info[2] + self._token = info[2] compat_print('Using GitHub credentials found in .netrc...') return else: compat_print('No GitHub credentials found in .netrc') except (IOError, netrc.NetrcParseError): compat_print('Unable to parse .netrc') - self._username = compat_input( - 'Type your GitHub username or email address and press [Return]: ') - self._password = compat_getpass( - 'Type your GitHub password and press [Return]: ') + self._token = compat_getpass( + 'Type your GitHub PAT (personal access token) and press [Return]: ') def _call(self, req): if isinstance(req, compat_basestring): req = sanitized_Request(req) - # Authorizing manually since GitHub does not response with 401 with - # WWW-Authenticate header set (see - # https://developer.github.com/v3/#basic-authentication) - b64 = base64.b64encode( - ('%s:%s' % (self._username, self._password)).encode('utf-8')).decode('ascii') - req.add_header('Authorization', 'Basic %s' % b64) + req.add_header('Authorization', 'token %s' % self._token) response = self._opener.open(req).read().decode('utf-8') return json.loads(response) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 18bddc138..e471aa79a 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -26,13 +26,13 @@ - **AcademicEarth:Course** - **acast** - **acast:channel** - - **AddAnime** - **ADN**: Anime Digital Network - **AdobeConnect** - - **AdobeTV** - - **AdobeTVChannel** - - **AdobeTVShow** - - **AdobeTVVideo** + - **adobetv** + - **adobetv:channel** + - **adobetv:embed** + - **adobetv:show** + - **adobetv:video** - **AdultSwim** - **aenetworks**: A+E Networks: A&E, Lifetime, History.com, FYI Network and History Vault - **afreecatv**: afreecatv.com @@ -76,8 +76,6 @@ - **awaan:video** - **AZMedien**: AZ Medien videos - **BaiduVideo**: 百度视频 - - **bambuser** - - **bambuser:channel** - **Bandcamp** - **Bandcamp:album** - **Bandcamp:weekly** @@ -98,6 +96,8 @@ - **Bigflix** - **Bild**: Bild.de - **BiliBili** + - **BilibiliAudio** + - **BilibiliAudioAlbum** - **BioBioChileTV** - **BIQLE** - **BitChute** @@ -175,12 +175,12 @@ - **CNN** - **CNNArticle** - **CNNBlogs** - - **ComCarCoff** - **ComedyCentral** - **ComedyCentralFullEpisodes** - **ComedyCentralShortname** - **ComedyCentralTV** - **CondeNast**: Condé Nast media group: Allure, Architectural Digest, Ars Technica, Bon Appétit, Brides, Condé Nast, Condé Nast Traveler, Details, Epicurious, GQ, Glamour, Golf Digest, SELF, Teen Vogue, The New Yorker, Vanity Fair, Vogue, W Magazine, WIRED + - **CONtv** - **Corus** - **Coub** - **Cracked** @@ -202,8 +202,6 @@ - **dailymotion** - **dailymotion:playlist** - **dailymotion:user** - - **DaisukiMotto** - - **DaisukiMottoPlaylist** - **daum.net** - **daum.net:clip** - **daum.net:playlist** @@ -229,7 +227,6 @@ - **DouyuShow** - **DouyuTV**: 斗鱼 - **DPlay** - - **DPlayIt** - **DRBonanza** - **Dropbox** - **DrTuber** @@ -282,12 +279,12 @@ - **FiveThirtyEight** - **FiveTV** - **Flickr** - - **Flipagram** - **Folketinget**: Folketinget (ft.dk; Danish parliament) - **FootyRoom** - **Formula1** - **FOX** - **FOX9** + - **FOX9News** - **Foxgay** - **foxnews**: Fox News and Fox Business Video - **foxnews:article** @@ -313,8 +310,6 @@ - **FXNetworks** - **Gaia** - **GameInformer** - - **GameOne** - - **gameone:playlist** - **GameSpot** - **GameStar** - **Gaskrank** @@ -329,14 +324,12 @@ - **Globo** - **GloboArticle** - **Go** - - **Go90** - **GodTube** - **Golem** - **GoogleDrive** - **Goshgay** - **GPUTechConf** - **Groupon** - - **Hark** - **hbo** - **HearThisAt** - **Heise** @@ -365,7 +358,6 @@ - **Hungama** - **HungamaSong** - **Hypem** - - **Iconosquare** - **ign.com** - **imdb**: Internet Movie Database trailers - **imdb:list**: Internet Movie Database lists @@ -405,14 +397,14 @@ - **Kankan** - **Karaoketv** - **KarriereVideos** - - **keek** + - **Katsomo** - **KeezMovies** - **Ketnet** - **KhanAcademy** - **KickStarter** + - **KinjaEmbed** - **KinoPoisk** - **KonserthusetPlay** - - **kontrtube**: KontrTube.ru - Труба зовёт - **KrasView**: Красвью - **Ku6** - **KUSI** @@ -429,7 +421,6 @@ - **Lcp** - **LcpPlay** - **Le**: 乐视网 - - **Learnr** - **Lecture2Go** - **Lecturio** - **LecturioCourse** @@ -463,11 +454,9 @@ - **lynda**: lynda.com videos - **lynda:course**: lynda.com online courses - **m6** - - **macgamestore**: MacGameStore trailers - **mailru**: Видео@Mail.Ru - **mailru:music**: Музыка@Mail.Ru - **mailru:music:search**: Музыка@Mail.Ru - - **MakerTV** - **MallTV** - **mangomolo:live** - **mangomolo:video** @@ -494,14 +483,12 @@ - **Mgoon** - **MGTV**: 芒果TV - **MiaoPai** - - **Minhateca** - **MinistryGrid** - **Minoto** - **miomio.tv** - **MiTele**: mitele.es - **mixcloud** - **mixcloud:playlist** - - **mixcloud:stream** - **mixcloud:user** - **Mixer:live** - **Mixer:vod** @@ -523,11 +510,10 @@ - **mtg**: MTG services - **mtv** - **mtv.de** - - **mtv81** - **mtv:video** + - **mtvjapan** - **mtvservices:embedded** - **MuenchenTV**: münchen.tv - - **MusicPlayOn** - **mva**: Microsoft Virtual Academy videos - **mva:course**: Microsoft Virtual Academy courses - **Mwave** @@ -632,7 +618,6 @@ - **OnionStudios** - **Ooyala** - **OoyalaExternal** - - **Openload** - **OraTV** - **orf:fm4**: radio FM4 - **orf:fm4:story**: fm4.orf.at stories @@ -692,7 +677,6 @@ - **PornoXO** - **PornTube** - **PressTV** - - **PromptFile** - **prosiebensat1**: ProSiebenSat.1 Digital - **puhutv** - **puhutv:serie** @@ -733,8 +717,6 @@ - **Restudy** - **Reuters** - **ReverbNation** - - **revision** - - **revision3:embed** - **RICE** - **RMCDecouverte** - **RockstarGames** @@ -779,11 +761,13 @@ - **screen.yahoo:search**: Yahoo screen search - **Screencast** - **ScreencastOMatic** + - **ScrippsNetworks** - **scrippsnetworks:watch** + - **SCTE** + - **SCTECourse** - **Seeker** - **SenateISVP** - **SendtoNews** - - **ServingSys** - **Servus** - **Sexu** - **SeznamZpravy** @@ -814,6 +798,7 @@ - **soundcloud:set** - **soundcloud:trackstation** - **soundcloud:user** + - **SoundcloudEmbed** - **soundgasm** - **soundgasm:profile** - **southpark.cc.com** @@ -840,7 +825,6 @@ - **Steam** - **Stitcher** - **Streamable** - - **Streamango** - **streamcloud.eu** - **StreamCZ** - **StreetVoice** @@ -882,9 +866,11 @@ - **TeleQuebec** - **TeleQuebecEmission** - **TeleQuebecLive** + - **TeleQuebecSquat** - **TeleTask** - **Telewebion** - **TennisTV** + - **TenPlay** - **TF1** - **TFO** - **TheIntercept** @@ -923,11 +909,12 @@ - **tunein:topic** - **TunePk** - **Turbo** - - **Tutv** - **tv.dfb.de** - **TV2** - **tv2.hu** - **TV2Article** + - **TV2DK** + - **TV2DKBornholmPlay** - **TV4**: tv4.se and tv4play.se - **TV5MondePlus**: TV5MONDE+ - **TVA** @@ -964,10 +951,12 @@ - **twitch:vod** - **twitter** - **twitter:amplify** + - **twitter:broadcast** - **twitter:card** - **udemy** - **udemy:course** - **UDNEmbed**: 聯合影音 + - **UFCArabia** - **UFCTV** - **UKTVPlay** - **umg:de**: Universal Music Deutschland @@ -988,8 +977,6 @@ - **Vbox7** - **VeeHD** - **Veoh** - - **verystream** - - **Vessel** - **Vesti**: Вести.Ru - **Vevo** - **VevoPlaylist** @@ -1004,13 +991,11 @@ - **Viddler** - **Videa** - **video.google:search**: Google Video search - - **video.mit.edu** - **VideoDetective** - **videofy.me** - **videomore** - **videomore:season** - **videomore:video** - - **VideoPremium** - **VideoPress** - **Vidio** - **VidLii** @@ -1022,7 +1007,6 @@ - **vier:videos** - **ViewLift** - **ViewLiftEmbed** - - **Viewster** - **Viidea** - **viki** - **viki:channel** @@ -1088,7 +1072,6 @@ - **Weibo** - **WeiboMobile** - **WeiqiTV**: WQTV - - **Wimp** - **Wistia** - **wnl**: npo.nl, ntr.nl, omroepwnl.nl, zapp.nl and npo3.nl - **WorldStarHipHop** @@ -1097,7 +1080,7 @@ - **WWE** - **XBef** - **XboxClips** - - **XFileShare**: XFileShare based sites: DaClips, FileHoot, GorillaVid, MovPod, PowerWatch, Rapidvideo.ws, TheVideoBee, Vidto, Streamin.To, XVIDSTAGE, Vid ABC, VidBom, vidlo, RapidVideo.TV, FastVideo.me + - **XFileShare**: XFileShare based sites: ClipWatching, GoUnlimited, GoVid, HolaVid, Streamty, TheVideoBee, Uqload, VidBom, vidlo, VidLocker, VidShare, VUp, XVideoSharing - **XHamster** - **XHamsterEmbed** - **XHamsterUser** diff --git a/test/test_all_urls.py b/test/test_all_urls.py index 465ce0050..81056a999 100644 --- a/test/test_all_urls.py +++ b/test/test_all_urls.py @@ -123,12 +123,6 @@ class TestAllURLsMatching(unittest.TestCase): self.assertMatch('http://video.pbs.org/viralplayer/2365173446/', ['pbs']) self.assertMatch('http://video.pbs.org/widget/partnerplayer/980042464/', ['pbs']) - def test_yahoo_https(self): - # https://github.com/ytdl-org/youtube-dl/issues/2701 - self.assertMatch( - 'https://screen.yahoo.com/smartwatches-latest-wearable-gadgets-163745379-cbs.html', - ['Yahoo']) - def test_no_duplicated_ie_names(self): name_accu = collections.defaultdict(list) for ie in self.ies: diff --git a/test/test_utils.py b/test/test_utils.py index 659c6ece5..0896f4150 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -19,6 +19,7 @@ from youtube_dl.utils import ( age_restricted, args_to_str, encode_base_n, + caesar, clean_html, date_from_str, DateRange, @@ -69,11 +70,13 @@ from youtube_dl.utils import ( remove_start, remove_end, remove_quotes, + rot47, shell_quote, smuggle_url, str_to_int, strip_jsonp, strip_or_none, + subtitles_filename, timeconvert, unescapeHTML, unified_strdate, @@ -261,6 +264,11 @@ class TestUtil(unittest.TestCase): self.assertEqual(replace_extension('.abc', 'temp'), '.abc.temp') self.assertEqual(replace_extension('.abc.ext', 'temp'), '.abc.temp') + def test_subtitles_filename(self): + self.assertEqual(subtitles_filename('abc.ext', 'en', 'vtt'), 'abc.en.vtt') + self.assertEqual(subtitles_filename('abc.ext', 'en', 'vtt', 'ext'), 'abc.en.vtt') + self.assertEqual(subtitles_filename('abc.unexpected_ext', 'en', 'vtt', 'ext'), 'abc.unexpected_ext.en.vtt') + def test_remove_start(self): self.assertEqual(remove_start(None, 'A - '), None) self.assertEqual(remove_start('A - B', 'A - '), 'B') @@ -334,6 +342,8 @@ class TestUtil(unittest.TestCase): self.assertEqual(unified_strdate('July 15th, 2013'), '20130715') self.assertEqual(unified_strdate('September 1st, 2013'), '20130901') self.assertEqual(unified_strdate('Sep 2nd, 2013'), '20130902') + self.assertEqual(unified_strdate('November 3rd, 2019'), '20191103') + self.assertEqual(unified_strdate('October 23rd, 2005'), '20051023') def test_unified_timestamps(self): self.assertEqual(unified_timestamp('December 21, 2010'), 1292889600) @@ -489,6 +499,12 @@ class TestUtil(unittest.TestCase): def test_str_to_int(self): self.assertEqual(str_to_int('123,456'), 123456) self.assertEqual(str_to_int('123.456'), 123456) + self.assertEqual(str_to_int(523), 523) + # Python 3 has no long + if sys.version_info < (3, 0): + eval('self.assertEqual(str_to_int(123456L), 123456)') + self.assertEqual(str_to_int('noninteger'), None) + self.assertEqual(str_to_int([]), None) def test_url_basename(self): self.assertEqual(url_basename('http://foo.de/'), '') @@ -1361,6 +1377,20 @@ Line 1 self.assertRaises(ValueError, encode_base_n, 0, 70) self.assertRaises(ValueError, encode_base_n, 0, 60, custom_table) + def test_caesar(self): + self.assertEqual(caesar('ace', 'abcdef', 2), 'cea') + self.assertEqual(caesar('cea', 'abcdef', -2), 'ace') + self.assertEqual(caesar('ace', 'abcdef', -2), 'eac') + self.assertEqual(caesar('eac', 'abcdef', 2), 'ace') + self.assertEqual(caesar('ace', 'abcdef', 0), 'ace') + self.assertEqual(caesar('xyz', 'abcdef', 2), 'xyz') + self.assertEqual(caesar('abc', 'acegik', 2), 'ebg') + self.assertEqual(caesar('ebg', 'acegik', -2), 'abc') + + def test_rot47(self): + self.assertEqual(rot47('youtube-dl'), r'J@FEF36\5=') + self.assertEqual(rot47('YOUTUBE-DL'), r'*~&%&qt\s{') + def test_urshift(self): self.assertEqual(urshift(3, 1), 1) self.assertEqual(urshift(-3, 1), 2147483646) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 6a44bc7ba..f5cb46308 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -852,8 +852,9 @@ class YoutubeDL(object): extract_flat = self.params.get('extract_flat', False) if ((extract_flat == 'in_playlist' and 'playlist' in extra_info) or extract_flat is True): - if self.params.get('forcejson', False): - self.to_stdout(json.dumps(ie_result)) + self.__forced_printings( + ie_result, self.prepare_filename(ie_result), + incomplete=True) return ie_result if result_type == 'video': @@ -1693,6 +1694,36 @@ class YoutubeDL(object): subs[lang] = f return subs + def __forced_printings(self, info_dict, filename, incomplete): + def print_mandatory(field): + if (self.params.get('force%s' % field, False) + and (not incomplete or info_dict.get(field) is not None)): + self.to_stdout(info_dict[field]) + + def print_optional(field): + if (self.params.get('force%s' % field, False) + and info_dict.get(field) is not None): + self.to_stdout(info_dict[field]) + + print_mandatory('title') + print_mandatory('id') + if self.params.get('forceurl', False) and not incomplete: + if info_dict.get('requested_formats') is not None: + for f in info_dict['requested_formats']: + self.to_stdout(f['url'] + f.get('play_path', '')) + else: + # For RTMP URLs, also include the playpath + self.to_stdout(info_dict['url'] + info_dict.get('play_path', '')) + print_optional('thumbnail') + print_optional('description') + if self.params.get('forcefilename', False) and filename is not None: + self.to_stdout(filename) + if self.params.get('forceduration', False) and info_dict.get('duration') is not None: + self.to_stdout(formatSeconds(info_dict['duration'])) + print_mandatory('format') + if self.params.get('forcejson', False): + self.to_stdout(json.dumps(info_dict)) + def process_info(self, info_dict): """Process a single resolved IE result.""" @@ -1703,9 +1734,8 @@ class YoutubeDL(object): if self._num_downloads >= int(max_downloads): raise MaxDownloadsReached() + # TODO: backward compatibility, to be removed info_dict['fulltitle'] = info_dict['title'] - if len(info_dict['title']) > 200: - info_dict['title'] = info_dict['title'][:197] + '...' if 'format' not in info_dict: info_dict['format'] = info_dict['ext'] @@ -1720,29 +1750,7 @@ class YoutubeDL(object): info_dict['_filename'] = filename = self.prepare_filename(info_dict) # Forced printings - if self.params.get('forcetitle', False): - self.to_stdout(info_dict['fulltitle']) - if self.params.get('forceid', False): - self.to_stdout(info_dict['id']) - if self.params.get('forceurl', False): - if info_dict.get('requested_formats') is not None: - for f in info_dict['requested_formats']: - self.to_stdout(f['url'] + f.get('play_path', '')) - else: - # For RTMP URLs, also include the playpath - self.to_stdout(info_dict['url'] + info_dict.get('play_path', '')) - if self.params.get('forcethumbnail', False) and info_dict.get('thumbnail') is not None: - self.to_stdout(info_dict['thumbnail']) - if self.params.get('forcedescription', False) and info_dict.get('description') is not None: - self.to_stdout(info_dict['description']) - if self.params.get('forcefilename', False) and filename is not None: - self.to_stdout(filename) - if self.params.get('forceduration', False) and info_dict.get('duration') is not None: - self.to_stdout(formatSeconds(info_dict['duration'])) - if self.params.get('forceformat', False): - self.to_stdout(info_dict['format']) - if self.params.get('forcejson', False): - self.to_stdout(json.dumps(info_dict)) + self.__forced_printings(info_dict, filename, incomplete=False) # Do nothing else if in simulate mode if self.params.get('simulate', False): @@ -1806,7 +1814,7 @@ class YoutubeDL(object): ie = self.get_info_extractor(info_dict['extractor_key']) for sub_lang, sub_info in subtitles.items(): sub_format = sub_info['ext'] - sub_filename = subtitles_filename(filename, sub_lang, sub_format) + sub_filename = subtitles_filename(filename, sub_lang, sub_format, info_dict.get('ext')) if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(sub_filename)): self.to_screen('[info] Video subtitle %s.%s is already present' % (sub_lang, sub_format)) else: diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py index b59aad73f..84bc34928 100644 --- a/youtube_dl/downloader/hls.py +++ b/youtube_dl/downloader/hls.py @@ -64,7 +64,7 @@ class HlsFD(FragmentFD): s = urlh.read().decode('utf-8', 'ignore') if not self.can_download(s, info_dict): - if info_dict.get('extra_param_to_segment_url'): + if info_dict.get('extra_param_to_segment_url') or info_dict.get('_decryption_key_url'): self.report_error('pycrypto not found. Please install it.') return False self.report_warning( @@ -169,7 +169,7 @@ class HlsFD(FragmentFD): if decrypt_info['METHOD'] == 'AES-128': iv = decrypt_info.get('IV') or compat_struct_pack('>8xq', media_sequence) decrypt_info['KEY'] = decrypt_info.get('KEY') or self.ydl.urlopen( - self._prepare_url(info_dict, decrypt_info['URI'])).read() + self._prepare_url(info_dict, info_dict.get('_decryption_key_url') or decrypt_info['URI'])).read() frag_content = AES.new( decrypt_info['KEY'], AES.MODE_CBC, iv).decrypt(frag_content) self._append_fragment(ctx, frag_content) diff --git a/youtube_dl/extractor/abcotvs.py b/youtube_dl/extractor/abcotvs.py index 03b92a39c..0bc69a64f 100644 --- a/youtube_dl/extractor/abcotvs.py +++ b/youtube_dl/extractor/abcotvs.py @@ -4,29 +4,30 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( + dict_get, int_or_none, - parse_iso8601, + try_get, ) class ABCOTVSIE(InfoExtractor): IE_NAME = 'abcotvs' IE_DESC = 'ABC Owned Television Stations' - _VALID_URL = r'https?://(?:abc(?:7(?:news|ny|chicago)?|11|13|30)|6abc)\.com(?:/[^/]+/(?P[^/]+))?/(?P\d+)' + _VALID_URL = r'https?://(?Pabc(?:7(?:news|ny|chicago)?|11|13|30)|6abc)\.com(?:(?:/[^/]+)*/(?P[^/]+))?/(?P\d+)' _TESTS = [ { 'url': 'http://abc7news.com/entertainment/east-bay-museum-celebrates-vintage-synthesizers/472581/', 'info_dict': { - 'id': '472581', + 'id': '472548', 'display_id': 'east-bay-museum-celebrates-vintage-synthesizers', 'ext': 'mp4', - 'title': 'East Bay museum celebrates vintage synthesizers', + 'title': 'East Bay museum celebrates synthesized music', 'description': 'md5:24ed2bd527096ec2a5c67b9d5a9005f3', 'thumbnail': r're:^https?://.*\.jpg$', - 'timestamp': 1421123075, + 'timestamp': 1421118520, 'upload_date': '20150113', - 'uploader': 'Jonathan Bloom', }, 'params': { # m3u8 download @@ -37,39 +38,63 @@ class ABCOTVSIE(InfoExtractor): 'url': 'http://abc7news.com/472581', 'only_matching': True, }, + { + 'url': 'https://6abc.com/man-75-killed-after-being-struck-by-vehicle-in-chester/5725182/', + 'only_matching': True, + }, ] + _SITE_MAP = { + '6abc': 'wpvi', + 'abc11': 'wtvd', + 'abc13': 'ktrk', + 'abc30': 'kfsn', + 'abc7': 'kabc', + 'abc7chicago': 'wls', + 'abc7news': 'kgo', + 'abc7ny': 'wabc', + } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - display_id = mobj.group('display_id') or video_id + site, display_id, video_id = re.match(self._VALID_URL, url).groups() + display_id = display_id or video_id + station = self._SITE_MAP[site] - webpage = self._download_webpage(url, display_id) + data = self._download_json( + 'https://api.abcotvs.com/v2/content', display_id, query={ + 'id': video_id, + 'key': 'otv.web.%s.story' % station, + 'station': station, + })['data'] + video = try_get(data, lambda x: x['featuredMedia']['video'], dict) or data + video_id = compat_str(dict_get(video, ('id', 'publishedKey'), video_id)) + title = video.get('title') or video['linkText'] - m3u8 = self._html_search_meta( - 'contentURL', webpage, 'm3u8 url', fatal=True).split('?')[0] - - formats = self._extract_m3u8_formats(m3u8, display_id, 'mp4') + formats = [] + m3u8_url = video.get('m3u8') + if m3u8_url: + formats = self._extract_m3u8_formats( + video['m3u8'].split('?')[0], display_id, 'mp4', m3u8_id='hls', fatal=False) + mp4_url = video.get('mp4') + if mp4_url: + formats.append({ + 'abr': 128, + 'format_id': 'https', + 'height': 360, + 'url': mp4_url, + 'width': 640, + }) self._sort_formats(formats) - title = self._og_search_title(webpage).strip() - description = self._og_search_description(webpage).strip() - thumbnail = self._og_search_thumbnail(webpage) - timestamp = parse_iso8601(self._search_regex( - r'
\s*
', - webpage, 'description', default=None) + if not video_description: + video_description = self._html_search_regex( + r'(?s)]*>(.*?)', + webpage, 'description', default=None) if not video_description: video_description = self._html_search_meta( 'description', webpage, default=None) - if not video_description and mobj.group('pro'): + if not video_description and is_pro: orig_webpage = self._download_webpage( orig_url, video_id, note='Downloading webpage for description', @@ -690,7 +709,7 @@ class VimeoIE(VimeoBaseInfoExtractor): if orig_webpage: video_description = self._html_search_meta( 'description', orig_webpage, default=None) - if not video_description and not mobj.group('player'): + if not video_description and not is_player: self._downloader.report_warning('Cannot find video description') # Extract upload date @@ -730,7 +749,6 @@ class VimeoIE(VimeoBaseInfoExtractor): channel_url = 'https://vimeo.com/channels/%s' % channel_id if channel_id else None info_dict = { - 'id': video_id, 'formats': formats, 'timestamp': unified_timestamp(timestamp), 'description': video_description, @@ -748,9 +766,9 @@ class VimeoIE(VimeoBaseInfoExtractor): return info_dict -class VimeoOndemandIE(VimeoBaseInfoExtractor): +class VimeoOndemandIE(VimeoIE): IE_NAME = 'vimeo:ondemand' - _VALID_URL = r'https?://(?:www\.)?vimeo\.com/ondemand/(?P[^/?#&]+)' + _VALID_URL = r'https?://(?:www\.)?vimeo\.com/ondemand/([^/]+/)?(?P[^/?#&]+)' _TESTS = [{ # ondemand video not available via https://vimeo.com/id 'url': 'https://vimeo.com/ondemand/20704', @@ -762,24 +780,32 @@ class VimeoOndemandIE(VimeoBaseInfoExtractor): 'uploader': 'גם סרטים', 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/gumfilms', 'uploader_id': 'gumfilms', + 'description': 'md5:4c027c965e439de4baab621e48b60791', + 'upload_date': '20140906', + 'timestamp': 1410032453, }, 'params': { 'format': 'best[protocol=https]', }, + 'expected_warnings': ['Unable to download JSON metadata'], }, { # requires Referer to be passed along with og:video:url 'url': 'https://vimeo.com/ondemand/36938/126682985', 'info_dict': { - 'id': '126682985', + 'id': '126584684', 'ext': 'mp4', 'title': 'Rävlock, rätt läte på rätt plats', 'uploader': 'Lindroth & Norin', - 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/user14430847', - 'uploader_id': 'user14430847', + 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/lindrothnorin', + 'uploader_id': 'lindrothnorin', + 'description': 'md5:c3c46a90529612c8279fb6af803fc0df', + 'upload_date': '20150502', + 'timestamp': 1430586422, }, 'params': { 'skip_download': True, }, + 'expected_warnings': ['Unable to download JSON metadata'], }, { 'url': 'https://vimeo.com/ondemand/nazmaalik', 'only_matching': True, @@ -791,16 +817,6 @@ class VimeoOndemandIE(VimeoBaseInfoExtractor): 'only_matching': True, }] - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - return self.url_result( - # Some videos require Referer to be passed along with og:video:url - # similarly to generic vimeo embeds (e.g. - # https://vimeo.com/ondemand/36938/126682985). - VimeoIE._smuggle_referrer(self._og_search_video_url(webpage), url), - VimeoIE.ie_key()) - class VimeoChannelIE(VimeoBaseInfoExtractor): IE_NAME = 'vimeo:channel' @@ -816,6 +832,7 @@ class VimeoChannelIE(VimeoBaseInfoExtractor): }, 'playlist_mincount': 25, }] + _BASE_URL_TEMPL = 'https://vimeo.com/channels/%s' def _page_url(self, base_url, pagenum): return '%s/videos/page:%d/' % (base_url, pagenum) @@ -887,14 +904,13 @@ class VimeoChannelIE(VimeoBaseInfoExtractor): return self.playlist_result(title_and_entries, list_id, list_title) def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - channel_id = mobj.group('id') - return self._extract_videos(channel_id, 'https://vimeo.com/channels/%s' % channel_id) + channel_id = self._match_id(url) + return self._extract_videos(channel_id, self._BASE_URL_TEMPL % channel_id) class VimeoUserIE(VimeoChannelIE): IE_NAME = 'vimeo:user' - _VALID_URL = r'https://vimeo\.com/(?!(?:[0-9]+|watchlater)(?:$|[?#/]))(?P[^/]+)(?:/videos|[#?]|$)' + _VALID_URL = r'https://vimeo\.com/(?!(?:[0-9]+|watchlater)(?:$|[?#/]))(?P[^/]+)(?:/videos|[#?]|$)' _TITLE_RE = r']+?class="user">([^<>]+?)' _TESTS = [{ 'url': 'https://vimeo.com/nkistudio/videos', @@ -904,11 +920,7 @@ class VimeoUserIE(VimeoChannelIE): }, 'playlist_mincount': 66, }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - name = mobj.group('name') - return self._extract_videos(name, 'https://vimeo.com/%s' % name) + _BASE_URL_TEMPL = 'https://vimeo.com/%s' class VimeoAlbumIE(VimeoChannelIE): @@ -939,7 +951,7 @@ class VimeoAlbumIE(VimeoChannelIE): def _fetch_page(self, album_id, authorizaion, hashed_pass, page): api_page = page + 1 query = { - 'fields': 'link', + 'fields': 'link,uri', 'page': api_page, 'per_page': self._PAGE_SIZE, } @@ -954,7 +966,9 @@ class VimeoAlbumIE(VimeoChannelIE): link = video.get('link') if not link: continue - yield self.url_result(link, VimeoIE.ie_key(), VimeoIE._match_id(link)) + uri = video.get('uri') + video_id = self._search_regex(r'/videos/(\d+)', uri, 'video_id', default=None) if uri else None + yield self.url_result(link, VimeoIE.ie_key(), video_id) def _real_extract(self, url): album_id = self._match_id(url) @@ -968,25 +982,18 @@ class VimeoAlbumIE(VimeoChannelIE): r'\s*(.+?)(?:\s+on Vimeo)?', webpage, 'title', fatal=False)) -class VimeoGroupsIE(VimeoAlbumIE): +class VimeoGroupsIE(VimeoChannelIE): IE_NAME = 'vimeo:group' - _VALID_URL = r'https://vimeo\.com/groups/(?P[^/]+)(?:/(?!videos?/\d+)|$)' + _VALID_URL = r'https://vimeo\.com/groups/(?P[^/]+)(?:/(?!videos?/\d+)|$)' _TESTS = [{ - 'url': 'https://vimeo.com/groups/rolexawards', + 'url': 'https://vimeo.com/groups/kattykay', 'info_dict': { - 'id': 'rolexawards', - 'title': 'Rolex Awards for Enterprise', + 'id': 'kattykay', + 'title': 'Katty Kay', }, - 'playlist_mincount': 73, + 'playlist_mincount': 27, }] - - def _extract_list_title(self, webpage): - return self._og_search_title(webpage, fatal=False) - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - name = mobj.group('name') - return self._extract_videos(name, 'https://vimeo.com/groups/%s' % name) + _BASE_URL_TEMPL = 'https://vimeo.com/groups/%s' class VimeoReviewIE(VimeoBaseInfoExtractor): @@ -1002,7 +1009,9 @@ class VimeoReviewIE(VimeoBaseInfoExtractor): 'title': "DICK HARDWICK 'Comedian'", 'uploader': 'Richard Hardwick', 'uploader_id': 'user21297594', - } + 'description': "Comedian Dick Hardwick's five minute demo filmed in front of a live theater audience.\nEdit by Doug Mattocks", + }, + 'expected_warnings': ['Unable to download JSON metadata'], }, { 'note': 'video player needs Referer', 'url': 'https://vimeo.com/user22258446/review/91613211/13f927e053', @@ -1015,7 +1024,8 @@ class VimeoReviewIE(VimeoBaseInfoExtractor): 'duration': 2773, 'thumbnail': r're:^https?://.*\.jpg$', 'uploader_id': 'user22258446', - } + }, + 'skip': 'video gone', }, { 'note': 'Password protected', 'url': 'https://vimeo.com/user37284429/review/138823582/c4d865efde', @@ -1035,33 +1045,20 @@ class VimeoReviewIE(VimeoBaseInfoExtractor): def _real_initialize(self): self._login() - def _get_config_url(self, webpage_url, video_id, video_password_verified=False): - webpage = self._download_webpage(webpage_url, video_id) - config_url = self._html_search_regex( - r'data-config-url=(["\'])(?P(?:(?!\1).)+)\1', webpage, - 'config URL', default=None, group='url') - if not config_url: - data = self._parse_json(self._search_regex( - r'window\s*=\s*_extend\(window,\s*({.+?})\);', webpage, 'data', - default=NO_DEFAULT if video_password_verified else '{}'), video_id) - config = data.get('vimeo_esi', {}).get('config', {}) - config_url = config.get('configUrl') or try_get(config, lambda x: x['clipData']['configUrl']) - if config_url is None: - self._verify_video_password(webpage_url, video_id, webpage) - config_url = self._get_config_url( - webpage_url, video_id, video_password_verified=True) - return config_url - def _real_extract(self, url): page_url, video_id = re.match(self._VALID_URL, url).groups() - config_url = self._get_config_url(url, video_id) + clip_data = self._download_json( + page_url.replace('/review/', '/review/data/'), + video_id)['clipData'] + config_url = clip_data['configUrl'] config = self._download_json(config_url, video_id) info_dict = self._parse_config(config, video_id) - source_format = self._extract_original_format(page_url, video_id) + source_format = self._extract_original_format( + page_url + '/action', video_id) if source_format: info_dict['formats'].append(source_format) self._vimeo_sort_formats(info_dict['formats']) - info_dict['id'] = video_id + info_dict['description'] = clean_html(clip_data.get('description')) return info_dict @@ -1115,94 +1112,17 @@ class VimeoLikesIE(VimeoChannelIE): return self._extract_videos(user_id, 'https://vimeo.com/%s/likes' % user_id) -class VHXEmbedIE(InfoExtractor): +class VHXEmbedIE(VimeoBaseInfoExtractor): IE_NAME = 'vhx:embed' _VALID_URL = r'https?://embed\.vhx\.tv/videos/(?P\d+)' - def _call_api(self, video_id, access_token, path='', query=None): - return self._download_json( - 'https://api.vhx.tv/videos/' + video_id + path, video_id, headers={ - 'Authorization': 'Bearer ' + access_token, - }, query=query) - def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - credentials = self._parse_json(self._search_regex( - r'(?s)credentials\s*:\s*({.+?}),', webpage, - 'config'), video_id, js_to_json) - access_token = credentials['access_token'] - - query = {} - for k, v in credentials.items(): - if k in ('authorization', 'authUserToken', 'ticket') and v and v != 'undefined': - if k == 'authUserToken': - query['auth_user_token'] = v - else: - query[k] = v - files = self._call_api(video_id, access_token, '/files', query) - - formats = [] - for f in files: - href = try_get(f, lambda x: x['_links']['source']['href']) - if not href: - continue - method = f.get('method') - if method == 'hls': - formats.extend(self._extract_m3u8_formats( - href, video_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) - elif method == 'dash': - formats.extend(self._extract_mpd_formats( - href, video_id, mpd_id='dash', fatal=False)) - else: - fmt = { - 'filesize': int_or_none(try_get(f, lambda x: x['size']['bytes'])), - 'format_id': 'http', - 'preference': 1, - 'url': href, - 'vcodec': f.get('codec'), - } - quality = f.get('quality') - if quality: - fmt.update({ - 'format_id': 'http-' + quality, - 'height': int_or_none(self._search_regex(r'(\d+)p', quality, 'height', default=None)), - }) - formats.append(fmt) - self._sort_formats(formats) - - video_data = self._call_api(video_id, access_token) - title = video_data.get('title') or video_data['name'] - - subtitles = {} - for subtitle in try_get(video_data, lambda x: x['tracks']['subtitles'], list) or []: - lang = subtitle.get('srclang') or subtitle.get('label') - for _link in subtitle.get('_links', {}).values(): - href = _link.get('href') - if not href: - continue - subtitles.setdefault(lang, []).append({ - 'url': href, - }) - - q = qualities(['small', 'medium', 'large', 'source']) - thumbnails = [] - for thumbnail_id, thumbnail_url in video_data.get('thumbnail', {}).items(): - thumbnails.append({ - 'id': thumbnail_id, - 'url': thumbnail_url, - 'preference': q(thumbnail_id), - }) - - return { - 'id': video_id, - 'title': title, - 'description': video_data.get('description'), - 'duration': int_or_none(try_get(video_data, lambda x: x['duration']['seconds'])), - 'formats': formats, - 'subtitles': subtitles, - 'thumbnails': thumbnails, - 'timestamp': unified_timestamp(video_data.get('created_at')), - 'view_count': int_or_none(video_data.get('plays_count')), - } + config_url = self._parse_json(self._search_regex( + r'window\.OTTData\s*=\s*({.+})', webpage, + 'ott data'), video_id, js_to_json)['config_url'] + config = self._download_json(config_url, video_id) + info = self._parse_config(config, video_id) + self._vimeo_sort_formats(info['formats']) + return info diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index f57ed2288..00ec006c4 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals import collections +import functools import re from .common import InfoExtractor @@ -11,8 +12,8 @@ from ..utils import ( ExtractorError, get_element_by_class, int_or_none, + OnDemandPagedList, orderedSet, - remove_start, str_or_none, str_to_int, unescapeHTML, @@ -21,6 +22,7 @@ from ..utils import ( urlencode_postdata, ) from .dailymotion import DailymotionIE +from .odnoklassniki import OdnoklassnikiIE from .pladform import PladformIE from .vimeo import VimeoIE from .youtube import YoutubeIE @@ -60,6 +62,18 @@ class VKBaseIE(InfoExtractor): def _real_initialize(self): self._login() + def _download_payload(self, path, video_id, data, fatal=True): + data['al'] = 1 + code, payload = self._download_json( + 'https://vk.com/%s.php' % path, video_id, + data=urlencode_postdata(data), fatal=fatal, + headers={'X-Requested-With': 'XMLHttpRequest'})['payload'] + if code == '3': + self.raise_login_required() + elif code == '8': + raise ExtractorError(clean_html(payload[0][1:-1]), expected=True) + return payload + class VKIE(VKBaseIE): IE_NAME = 'vk' @@ -96,7 +110,6 @@ class VKIE(VKBaseIE): }, { 'url': 'http://vk.com/video205387401_165548505', - 'md5': '6c0aeb2e90396ba97035b9cbde548700', 'info_dict': { 'id': '205387401_165548505', 'ext': 'mp4', @@ -110,18 +123,18 @@ class VKIE(VKBaseIE): }, { 'note': 'Embedded video', - 'url': 'http://vk.com/video_ext.php?oid=32194266&id=162925554&hash=7d8c2e0d5e05aeaa&hd=1', - 'md5': 'c7ce8f1f87bec05b3de07fdeafe21a0a', + 'url': 'https://vk.com/video_ext.php?oid=-77521&id=162222515&hash=87b046504ccd8bfa', + 'md5': '7babad3b85ea2e91948005b1b8b0cb84', 'info_dict': { - 'id': '32194266_162925554', + 'id': '-77521_162222515', 'ext': 'mp4', - 'uploader': 'Vladimir Gavrin', - 'title': 'Lin Dan', - 'duration': 101, - 'upload_date': '20120730', - 'view_count': int, + 'uploader': 're:(?:Noize MC|Alexander Ilyashenko).*', + 'title': 'ProtivoGunz - Хуёвая песня', + 'duration': 195, + 'upload_date': '20120212', + 'timestamp': 1329049880, + 'uploader_id': '-77521', }, - 'skip': 'This video has been removed from public access.', }, { # VIDEO NOW REMOVED @@ -138,18 +151,19 @@ class VKIE(VKBaseIE): 'upload_date': '20121218', 'view_count': int, }, - 'skip': 'Requires vk account credentials', + 'skip': 'Removed', }, { 'url': 'http://vk.com/hd_kino_mania?z=video-43215063_168067957%2F15c66b9b533119788d', - 'md5': '4d7a5ef8cf114dfa09577e57b2993202', 'info_dict': { 'id': '-43215063_168067957', 'ext': 'mp4', - 'uploader': 'Киномания - лучшее из мира кино', + 'uploader': 'Bro Mazter', 'title': ' ', 'duration': 7291, 'upload_date': '20140328', + 'uploader_id': '223413403', + 'timestamp': 1396018030, }, 'skip': 'Requires vk account credentials', }, @@ -165,7 +179,7 @@ class VKIE(VKBaseIE): 'upload_date': '20140626', 'view_count': int, }, - 'skip': 'Only works from Russia', + 'skip': 'Removed', }, { # video (removed?) only available with list id @@ -204,8 +218,7 @@ class VKIE(VKBaseIE): 'id': 'k3lz2cmXyRuJQSjGHUv', 'ext': 'mp4', 'title': 'md5:d52606645c20b0ddbb21655adaa4f56f', - # TODO: fix test by fixing dailymotion description extraction - 'description': 'md5:c651358f03c56f1150b555c26d90a0fd', + 'description': 'md5:424b8e88cc873217f520e582ba28bb36', 'uploader': 'AniLibria.Tv', 'upload_date': '20160914', 'uploader_id': 'x1p5vl5', @@ -247,6 +260,9 @@ class VKIE(VKBaseIE): 'uploader_id': '-387766', 'timestamp': 1475137527, }, + 'params': { + 'skip_download': True, + }, }, { # live stream, hls and rtmp links, most likely already finished live @@ -288,80 +304,94 @@ class VKIE(VKBaseIE): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('videoid') + mv_data = {} if video_id: - info_url = 'https://vk.com/al_video.php?act=show_inline&al=1&video=' + video_id + data = { + 'act': 'show_inline', + 'video': video_id, + } # Some videos (removed?) can only be downloaded with list id specified list_id = mobj.group('list_id') if list_id: - info_url += '&list=%s' % list_id + data['list'] = list_id + + payload = self._download_payload('al_video', video_id, data) + info_page = payload[1] + opts = payload[-1] + mv_data = opts.get('mvData') or {} + player = opts.get('player') or {} else: - info_url = 'http://vk.com/video_ext.php?' + mobj.group('embed_query') video_id = '%s_%s' % (mobj.group('oid'), mobj.group('id')) - info_page = self._download_webpage(info_url, video_id) + info_page = self._download_webpage( + 'http://vk.com/video_ext.php?' + mobj.group('embed_query'), video_id) - error_message = self._html_search_regex( - [r'(?s)]+class="video_layer_message"[^>]*>(.+?)', - r'(?s)]+id="video_ext_msg"[^>]*>(.+?)'], - info_page, 'error message', default=None) - if error_message: - raise ExtractorError(error_message, expected=True) + error_message = self._html_search_regex( + [r'(?s)]+class="video_layer_message"[^>]*>(.+?)', + r'(?s)]+id="video_ext_msg"[^>]*>(.+?)'], + info_page, 'error message', default=None) + if error_message: + raise ExtractorError(error_message, expected=True) - if re.search(r'/login\.php\?.*\bact=security_check', info_page): - raise ExtractorError( - 'You are trying to log in from an unusual location. You should confirm ownership at vk.com to log in with this IP.', - expected=True) + if re.search(r'/login\.php\?.*\bact=security_check', info_page): + raise ExtractorError( + 'You are trying to log in from an unusual location. You should confirm ownership at vk.com to log in with this IP.', + expected=True) - ERROR_COPYRIGHT = 'Video %s has been removed from public access due to rightholder complaint.' + ERROR_COPYRIGHT = 'Video %s has been removed from public access due to rightholder complaint.' - ERRORS = { - r'>Видеозапись .*? была изъята из публичного доступа в связи с обращением правообладателя.<': - ERROR_COPYRIGHT, + ERRORS = { + r'>Видеозапись .*? была изъята из публичного доступа в связи с обращением правообладателя.<': + ERROR_COPYRIGHT, - r'>The video .*? was removed from public access by request of the copyright holder.<': - ERROR_COPYRIGHT, + r'>The video .*? was removed from public access by request of the copyright holder.<': + ERROR_COPYRIGHT, - r'Please log in or <': - 'Video %s is only available for registered users, ' - 'use --username and --password options to provide account credentials.', + r'Please log in or <': + 'Video %s is only available for registered users, ' + 'use --username and --password options to provide account credentials.', - r'Unknown error': - 'Video %s does not exist.', + r'Unknown error': + 'Video %s does not exist.', - r'Видео временно недоступно': - 'Video %s is temporarily unavailable.', + r'Видео временно недоступно': + 'Video %s is temporarily unavailable.', - r'Access denied': - 'Access denied to video %s.', + r'Access denied': + 'Access denied to video %s.', - r'Видеозапись недоступна, так как её автор был заблокирован.': - 'Video %s is no longer available, because its author has been blocked.', + r'Видеозапись недоступна, так как её автор был заблокирован.': + 'Video %s is no longer available, because its author has been blocked.', - r'This video is no longer available, because its author has been blocked.': - 'Video %s is no longer available, because its author has been blocked.', + r'This video is no longer available, because its author has been blocked.': + 'Video %s is no longer available, because its author has been blocked.', - r'This video is no longer available, because it has been deleted.': - 'Video %s is no longer available, because it has been deleted.', + r'This video is no longer available, because it has been deleted.': + 'Video %s is no longer available, because it has been deleted.', - r'The video .+? is not available in your region.': - 'Video %s is not available in your region.', - } + r'The video .+? is not available in your region.': + 'Video %s is not available in your region.', + } - for error_re, error_msg in ERRORS.items(): - if re.search(error_re, info_page): - raise ExtractorError(error_msg % video_id, expected=True) + for error_re, error_msg in ERRORS.items(): + if re.search(error_re, info_page): + raise ExtractorError(error_msg % video_id, expected=True) + + player = self._parse_json(self._search_regex( + r'var\s+playerParams\s*=\s*({.+?})\s*;\s*\n', + info_page, 'player params'), video_id) youtube_url = YoutubeIE._extract_url(info_page) if youtube_url: - return self.url_result(youtube_url, ie=YoutubeIE.ie_key()) + return self.url_result(youtube_url, YoutubeIE.ie_key()) vimeo_url = VimeoIE._extract_url(url, info_page) if vimeo_url is not None: - return self.url_result(vimeo_url) + return self.url_result(vimeo_url, VimeoIE.ie_key()) pladform_url = PladformIE._extract_url(info_page) if pladform_url: - return self.url_result(pladform_url) + return self.url_result(pladform_url, PladformIE.ie_key()) m_rutube = re.search( r'\ssrc="((?:https?:)?//rutube\.ru\\?/(?:video|play)\\?/embed(?:.*?))\\?"', info_page) @@ -374,6 +404,10 @@ class VKIE(VKBaseIE): if dailymotion_urls: return self.url_result(dailymotion_urls[0], DailymotionIE.ie_key()) + odnoklassniki_url = OdnoklassnikiIE._extract_url(info_page) + if odnoklassniki_url: + return self.url_result(odnoklassniki_url, OdnoklassnikiIE.ie_key()) + m_opts = re.search(r'(?s)var\s+opts\s*=\s*({.+?});', info_page) if m_opts: m_opts_url = re.search(r"url\s*:\s*'((?!/\b)[^']+)", m_opts.group(1)) @@ -383,29 +417,7 @@ class VKIE(VKBaseIE): opts_url = 'http:' + opts_url return self.url_result(opts_url) - # vars does not look to be served anymore since 24.10.2016 - data = self._parse_json( - self._search_regex( - r'var\s+vars\s*=\s*({.+?});', info_page, 'vars', default='{}'), - video_id, fatal=False) - - # is served instead - if not data: - data = self._parse_json( - self._search_regex( - [r'\s*({.+?})\s*', r'\s*({.+})'], - info_page, 'json', default='{}'), - video_id) - if data: - data = data['player']['params'][0] - - if not data: - data = self._parse_json( - self._search_regex( - r'var\s+playerParams\s*=\s*({.+?})\s*;\s*\n', info_page, - 'player params'), - video_id)['params'][0] - + data = player['params'][0] title = unescapeHTML(data['md_title']) # 2 = live @@ -454,12 +466,12 @@ class VKIE(VKBaseIE): 'title': title, 'thumbnail': data.get('jpg'), 'uploader': data.get('md_author'), - 'uploader_id': str_or_none(data.get('author_id')), - 'duration': data.get('duration'), + 'uploader_id': str_or_none(data.get('author_id') or mv_data.get('authorId')), + 'duration': int_or_none(data.get('duration') or mv_data.get('duration')), 'timestamp': timestamp, 'view_count': view_count, - 'like_count': int_or_none(data.get('liked')), - 'dislike_count': int_or_none(data.get('nolikes')), + 'like_count': int_or_none(mv_data.get('likes')), + 'comment_count': int_or_none(mv_data.get('commcount')), 'is_live': is_live, } @@ -467,15 +479,23 @@ class VKIE(VKBaseIE): class VKUserVideosIE(VKBaseIE): IE_NAME = 'vk:uservideos' IE_DESC = "VK - User's Videos" - _VALID_URL = r'https?://(?:(?:m|new)\.)?vk\.com/videos(?P-?[0-9]+)(?!\?.*\bz=video)(?:[/?#&]|$)' + _VALID_URL = r'https?://(?:(?:m|new)\.)?vk\.com/videos(?P-?[0-9]+)(?!\?.*\bz=video)(?:[/?#&](?:.*?\bsection=(?P
\w+))?|$)' _TEMPLATE_URL = 'https://vk.com/videos' _TESTS = [{ - 'url': 'http://vk.com/videos205387401', + 'url': 'https://vk.com/videos-767561', 'info_dict': { - 'id': '205387401', - 'title': "Tom Cruise's Videos", + 'id': '-767561_all', }, - 'playlist_mincount': 4, + 'playlist_mincount': 1150, + }, { + 'url': 'https://vk.com/videos-767561?section=uploaded', + 'info_dict': { + 'id': '-767561_uploaded', + }, + 'playlist_mincount': 425, + }, { + 'url': 'http://vk.com/videos205387401', + 'only_matching': True, }, { 'url': 'http://vk.com/videos-77521', 'only_matching': True, @@ -489,22 +509,33 @@ class VKUserVideosIE(VKBaseIE): 'url': 'http://new.vk.com/videos205387401', 'only_matching': True, }] + _PAGE_SIZE = 1000 + _VIDEO = collections.namedtuple('Video', ['owner_id', 'id']) + + def _fetch_page(self, page_id, section, page): + l = self._download_payload('al_video', page_id, { + 'act': 'load_videos_silent', + 'offset': page * self._PAGE_SIZE, + 'oid': page_id, + 'section': section, + })[0][section]['list'] + + for video in l: + v = self._VIDEO._make(video[:2]) + video_id = '%d_%d' % (v.owner_id, v.id) + yield self.url_result( + 'http://vk.com/video' + video_id, VKIE.ie_key(), video_id) def _real_extract(self, url): - page_id = self._match_id(url) + page_id, section = re.match(self._VALID_URL, url).groups() + if not section: + section = 'all' - webpage = self._download_webpage(url, page_id) + entries = OnDemandPagedList( + functools.partial(self._fetch_page, page_id, section), + self._PAGE_SIZE) - entries = [ - self.url_result( - 'http://vk.com/video' + video_id, 'VK', video_id=video_id) - for video_id in orderedSet(re.findall(r'href="/video(-?[0-9_]+)"', webpage))] - - title = unescapeHTML(self._search_regex( - r'\s*([^<]+?)\s+\|\s+\d+\s+videos', - webpage, 'title', default=page_id)) - - return self.playlist_result(entries, page_id, title) + return self.playlist_result(entries, '%s_%s' % (page_id, section)) class VKWallPostIE(VKBaseIE): @@ -514,15 +545,15 @@ class VKWallPostIE(VKBaseIE): # public page URL, audio playlist 'url': 'https://vk.com/bs.official?w=wall-23538238_35', 'info_dict': { - 'id': '23538238_35', - 'title': 'Black Shadow - Wall post 23538238_35', + 'id': '-23538238_35', + 'title': 'Black Shadow - Wall post -23538238_35', 'description': 'md5:3f84b9c4f9ef499731cf1ced9998cc0c', }, 'playlist': [{ 'md5': '5ba93864ec5b85f7ce19a9af4af080f6', 'info_dict': { 'id': '135220665_111806521', - 'ext': 'mp3', + 'ext': 'mp4', 'title': 'Black Shadow - Слепое Верование', 'duration': 370, 'uploader': 'Black Shadow', @@ -533,18 +564,16 @@ class VKWallPostIE(VKBaseIE): 'md5': '4cc7e804579122b17ea95af7834c9233', 'info_dict': { 'id': '135220665_111802303', - 'ext': 'mp3', + 'ext': 'mp4', 'title': 'Black Shadow - Война - Негасимое Бездны Пламя!', 'duration': 423, 'uploader': 'Black Shadow', 'artist': 'Black Shadow', 'track': 'Война - Негасимое Бездны Пламя!', }, - 'params': { - 'skip_download': True, - }, }], 'params': { + 'skip_download': True, 'usenetrc': True, }, 'skip': 'Requires vk account credentials', @@ -553,7 +582,7 @@ class VKWallPostIE(VKBaseIE): 'url': 'https://vk.com/wall85155021_6319', 'info_dict': { 'id': '85155021_6319', - 'title': 'Sergey Gorbunov - Wall post 85155021_6319', + 'title': 'Сергей Горбунов - Wall post 85155021_6319', }, 'playlist_count': 1, 'params': { @@ -569,58 +598,72 @@ class VKWallPostIE(VKBaseIE): 'url': 'https://m.vk.com/wall-23538238_35', 'only_matching': True, }] + _BASE64_CHARS = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMN0PQRSTUVWXYZO123456789+/=' + _AUDIO = collections.namedtuple('Audio', ['id', 'owner_id', 'url', 'title', 'performer', 'duration', 'album_id', 'unk', 'author_link', 'lyrics', 'flags', 'context', 'extra', 'hashes', 'cover_url', 'ads']) + + def _decode(self, enc): + dec = '' + e = n = 0 + for c in enc: + r = self._BASE64_CHARS.index(c) + cond = n % 4 + e = 64 * e + r if cond else r + n += 1 + if cond: + dec += chr(255 & e >> (-2 * n & 6)) + return dec + + def _unmask_url(self, mask_url, vk_id): + if 'audio_api_unavailable' in mask_url: + extra = mask_url.split('?extra=')[1].split('#') + func, base = self._decode(extra[1]).split(chr(11)) + mask_url = list(self._decode(extra[0])) + url_len = len(mask_url) + indexes = [None] * url_len + index = int(base) ^ vk_id + for n in range(url_len - 1, -1, -1): + index = (url_len * (n + 1) ^ index + n) % url_len + indexes[n] = index + for n in range(1, url_len): + c = mask_url[n] + index = indexes[url_len - 1 - n] + mask_url[n] = mask_url[index] + mask_url[index] = c + mask_url = ''.join(mask_url) + return mask_url def _real_extract(self, url): post_id = self._match_id(url) - wall_url = 'https://vk.com/wall%s' % post_id - - post_id = remove_start(post_id, '-') - - webpage = self._download_webpage(wall_url, post_id) - - error = self._html_search_regex( - r'>Error</div>\s*<div[^>]+class=["\']body["\'][^>]*>([^<]+)', - webpage, 'error', default=None) - if error: - raise ExtractorError('VK said: %s' % error, expected=True) + webpage = self._download_payload('wkview', post_id, { + 'act': 'show', + 'w': 'wall' + post_id, + })[1] description = clean_html(get_element_by_class('wall_post_text', webpage)) uploader = clean_html(get_element_by_class('author', webpage)) - thumbnail = self._og_search_thumbnail(webpage) entries = [] - audio_ids = re.findall(r'data-full-id=["\'](\d+_\d+)', webpage) - if audio_ids: - al_audio = self._download_webpage( - 'https://vk.com/al_audio.php', post_id, - note='Downloading audio info', fatal=False, - data=urlencode_postdata({ - 'act': 'reload_audio', - 'al': '1', - 'ids': ','.join(audio_ids) - })) - if al_audio: - Audio = collections.namedtuple( - 'Audio', ['id', 'user_id', 'url', 'track', 'artist', 'duration']) - audios = self._parse_json( - self._search_regex( - r'<!json>(.+?)<!>', al_audio, 'audios', default='[]'), - post_id, fatal=False, transform_source=unescapeHTML) - if isinstance(audios, list): - for audio in audios: - a = Audio._make(audio[:6]) - entries.append({ - 'id': '%s_%s' % (a.user_id, a.id), - 'url': a.url, - 'title': '%s - %s' % (a.artist, a.track) if a.artist and a.track else a.id, - 'thumbnail': thumbnail, - 'duration': a.duration, - 'uploader': uploader, - 'artist': a.artist, - 'track': a.track, - }) + for audio in re.findall(r'data-audio="([^"]+)', webpage): + audio = self._parse_json(unescapeHTML(audio), post_id) + a = self._AUDIO._make(audio[:16]) + if not a.url: + continue + title = unescapeHTML(a.title) + performer = unescapeHTML(a.performer) + entries.append({ + 'id': '%s_%s' % (a.owner_id, a.id), + 'url': self._unmask_url(a.url, a.ads['vk_id']), + 'title': '%s - %s' % (performer, title) if performer else title, + 'thumbnails': [{'url': c_url} for c_url in a.cover_url.split(',')] if a.cover_url else None, + 'duration': int_or_none(a.duration), + 'uploader': uploader, + 'artist': performer, + 'track': title, + 'ext': 'mp4', + 'protocol': 'm3u8', + }) for video in re.finditer( r'<a[^>]+href=(["\'])(?P<url>/video(?:-?[\d_]+).*?)\1', webpage): diff --git a/youtube_dl/extractor/vzaar.py b/youtube_dl/extractor/vzaar.py index 3336e6c15..b7d02fca3 100644 --- a/youtube_dl/extractor/vzaar.py +++ b/youtube_dl/extractor/vzaar.py @@ -32,6 +32,18 @@ class VzaarIE(InfoExtractor): 'ext': 'mp3', 'title': 'MP3', }, + }, { + # hlsAes = true + 'url': 'https://view.vzaar.com/11379930/player', + 'info_dict': { + 'id': '11379930', + 'ext': 'mp4', + 'title': 'Videoaula', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, }, { # with null videoTitle 'url': 'https://view.vzaar.com/20313539/download', @@ -58,6 +70,7 @@ class VzaarIE(InfoExtractor): f = { 'url': source_url, 'format_id': 'http', + 'preference': 1, } if 'audio' in source_url: f.update({ @@ -75,13 +88,17 @@ class VzaarIE(InfoExtractor): video_guid = video_data.get('guid') usp = video_data.get('usp') - if isinstance(video_guid, compat_str) and isinstance(usp, dict): - m3u8_url = ('http://fable.vzaar.com/v4/usp/%s/%s.ism/.m3u8?' - % (video_guid, video_id)) + '&'.join( - '%s=%s' % (k, v) for k, v in usp.items()) - formats.extend(self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False)) + if video_data.get('uspEnabled') and isinstance(video_guid, compat_str) and isinstance(usp, dict): + hls_aes = video_data.get('hlsAes') + qs = '&'.join('%s=%s' % (k, v) for k, v in usp.items()) + url_templ = 'http://%%s.vzaar.com/v5/usp%s/%s/%s.ism%%s?' % ('aes' if hls_aes else '', video_guid, video_id) + m3u8_formats = self._extract_m3u8_formats( + url_templ % ('fable', '/.m3u8') + qs, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False) + if hls_aes: + for f in m3u8_formats: + f['_decryption_key_url'] = url_templ % ('goose', '') + qs + formats.extend(m3u8_formats) self._sort_formats(formats) diff --git a/youtube_dl/extractor/wimp.py b/youtube_dl/extractor/wimp.py deleted file mode 100644 index ea234e3c5..000000000 --- a/youtube_dl/extractor/wimp.py +++ /dev/null @@ -1,54 +0,0 @@ -from __future__ import unicode_literals - -from .common import InfoExtractor -from .youtube import YoutubeIE - - -class WimpIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?wimp\.com/(?P<id>[^/]+)' - _TESTS = [{ - 'url': 'http://www.wimp.com/maru-is-exhausted/', - 'md5': 'ee21217ffd66d058e8b16be340b74883', - 'info_dict': { - 'id': 'maru-is-exhausted', - 'ext': 'mp4', - 'title': 'Maru is exhausted.', - 'description': 'md5:57e099e857c0a4ea312542b684a869b8', - } - }, { - 'url': 'http://www.wimp.com/clowncar/', - 'md5': '5c31ad862a90dc5b1f023956faec13fe', - 'info_dict': { - 'id': 'cG4CEr2aiSg', - 'ext': 'webm', - 'title': 'Basset hound clown car...incredible!', - 'description': '5 of my Bassets crawled in this dog loo! www.bellinghambassets.com\n\nFor licensing/usage please contact: licensing(at)jukinmediadotcom', - 'upload_date': '20140303', - 'uploader': 'Gretchen Hoey', - 'uploader_id': 'gretchenandjeff1', - }, - 'add_ie': ['Youtube'], - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - youtube_id = self._search_regex( - (r"videoId\s*:\s*[\"']([0-9A-Za-z_-]{11})[\"']", - r'data-id=["\']([0-9A-Za-z_-]{11})'), - webpage, 'video URL', default=None) - if youtube_id: - return self.url_result(youtube_id, YoutubeIE.ie_key()) - - info_dict = self._extract_jwplayer_data( - webpage, video_id, require_title=False) - - info_dict.update({ - 'id': video_id, - 'title': self._og_search_title(webpage), - 'description': self._og_search_description(webpage), - }) - - return info_dict diff --git a/youtube_dl/extractor/wistia.py b/youtube_dl/extractor/wistia.py index fa142b974..085514d47 100644 --- a/youtube_dl/extractor/wistia.py +++ b/youtube_dl/extractor/wistia.py @@ -12,9 +12,8 @@ from ..utils import ( class WistiaIE(InfoExtractor): - _VALID_URL = r'(?:wistia:|https?://(?:fast\.)?wistia\.(?:net|com)/embed/(?:iframe|medias)/)(?P<id>[a-z0-9]+)' - _API_URL = 'http://fast.wistia.com/embed/medias/%s.json' - _IFRAME_URL = 'http://fast.wistia.net/embed/iframe/%s' + _VALID_URL = r'(?:wistia:|https?://(?:fast\.)?wistia\.(?:net|com)/embed/(?:iframe|medias)/)(?P<id>[a-z0-9]{10})' + _EMBED_BASE_URL = 'http://fast.wistia.com/embed/' _TESTS = [{ 'url': 'http://fast.wistia.net/embed/iframe/sh7fpupwlt', @@ -43,33 +42,34 @@ class WistiaIE(InfoExtractor): 'only_matching': True, }] + # https://wistia.com/support/embed-and-share/video-on-your-website @staticmethod def _extract_url(webpage): match = re.search( - r'<(?:meta[^>]+?content|iframe[^>]+?src)=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.(?:net|com)/embed/iframe/.+?)\1', webpage) + r'<(?:meta[^>]+?content|(?:iframe|script)[^>]+?src)=["\'](?P<url>(?:https?:)?//(?:fast\.)?wistia\.(?:net|com)/embed/(?:iframe|medias)/[a-z0-9]{10})', webpage) if match: return unescapeHTML(match.group('url')) - match = re.search(r'(?:id=["\']wistia_|data-wistia-?id=["\']|Wistia\.embed\(["\'])(?P<id>[^"\']+)', webpage) - if match: - return 'wistia:%s' % match.group('id') - match = re.search( r'''(?sx) <script[^>]+src=(["'])(?:https?:)?//fast\.wistia\.com/assets/external/E-v1\.js\1[^>]*>.*? - <div[^>]+class=(["']).*?\bwistia_async_(?P<id>[a-z0-9]+)\b.*?\2 + <div[^>]+class=(["']).*?\bwistia_async_(?P<id>[a-z0-9]{10})\b.*?\2 ''', webpage) if match: return 'wistia:%s' % match.group('id') + match = re.search(r'(?:data-wistia-?id=["\']|Wistia\.embed\(["\']|id=["\']wistia_)(?P<id>[a-z0-9]{10})', webpage) + if match: + return 'wistia:%s' % match.group('id') + def _real_extract(self, url): video_id = self._match_id(url) data_json = self._download_json( - self._API_URL % video_id, video_id, + self._EMBED_BASE_URL + 'medias/%s.json' % video_id, video_id, # Some videos require this. headers={ - 'Referer': url if url.startswith('http') else self._IFRAME_URL % video_id, + 'Referer': url if url.startswith('http') else self._EMBED_BASE_URL + 'iframe/' + video_id, }) if data_json.get('error'): @@ -94,27 +94,61 @@ class WistiaIE(InfoExtractor): 'url': aurl, 'width': int_or_none(a.get('width')), 'height': int_or_none(a.get('height')), + 'filesize': int_or_none(a.get('size')), }) else: aext = a.get('ext') - is_m3u8 = a.get('container') == 'm3u8' or aext == 'm3u8' - formats.append({ - 'format_id': atype, + display_name = a.get('display_name') + format_id = atype + if atype and atype.endswith('_video') and display_name: + format_id = '%s-%s' % (atype[:-6], display_name) + f = { + 'format_id': format_id, 'url': aurl, - 'tbr': int_or_none(a.get('bitrate')), - 'vbr': int_or_none(a.get('opt_vbitrate')), - 'width': int_or_none(a.get('width')), - 'height': int_or_none(a.get('height')), - 'filesize': int_or_none(a.get('size')), - 'vcodec': a.get('codec'), - 'container': a.get('container'), - 'ext': 'mp4' if is_m3u8 else aext, - 'protocol': 'm3u8' if is_m3u8 else None, + 'tbr': int_or_none(a.get('bitrate')) or None, 'preference': 1 if atype == 'original' else None, - }) + } + if display_name == 'Audio': + f.update({ + 'vcodec': 'none', + }) + else: + f.update({ + 'width': int_or_none(a.get('width')), + 'height': int_or_none(a.get('height')), + 'vcodec': a.get('codec'), + }) + if a.get('container') == 'm3u8' or aext == 'm3u8': + ts_f = f.copy() + ts_f.update({ + 'ext': 'ts', + 'format_id': f['format_id'].replace('hls-', 'ts-'), + 'url': f['url'].replace('.bin', '.ts'), + }) + formats.append(ts_f) + f.update({ + 'ext': 'mp4', + 'protocol': 'm3u8_native', + }) + else: + f.update({ + 'container': a.get('container'), + 'ext': aext, + 'filesize': int_or_none(a.get('size')), + }) + formats.append(f) self._sort_formats(formats) + subtitles = {} + for caption in data.get('captions', []): + language = caption.get('language') + if not language: + continue + subtitles[language] = [{ + 'url': self._EMBED_BASE_URL + 'captions/' + video_id + '.vtt?language=' + language, + }] + return { 'id': video_id, 'title': title, @@ -123,4 +157,5 @@ class WistiaIE(InfoExtractor): 'thumbnails': thumbnails, 'duration': float_or_none(data.get('duration')), 'timestamp': int_or_none(data.get('createdAt')), + 'subtitles': subtitles, } diff --git a/youtube_dl/extractor/xfileshare.py b/youtube_dl/extractor/xfileshare.py index b38c7a7b3..48ef07ed1 100644 --- a/youtube_dl/extractor/xfileshare.py +++ b/youtube_dl/extractor/xfileshare.py @@ -4,37 +4,64 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_chr from ..utils import ( decode_packed_codes, determine_ext, ExtractorError, int_or_none, - NO_DEFAULT, + js_to_json, urlencode_postdata, ) +# based on openload_decode from 2bfeee69b976fe049761dd3012e30b637ee05a58 +def aa_decode(aa_code): + symbol_table = [ + ('7', '((゚ー゚) + (o^_^o))'), + ('6', '((o^_^o) +(o^_^o))'), + ('5', '((゚ー゚) + (゚Θ゚))'), + ('2', '((o^_^o) - (゚Θ゚))'), + ('4', '(゚ー゚)'), + ('3', '(o^_^o)'), + ('1', '(゚Θ゚)'), + ('0', '(c^_^o)'), + ] + delim = '(゚Д゚)[゚ε゚]+' + ret = '' + for aa_char in aa_code.split(delim): + for val, pat in symbol_table: + aa_char = aa_char.replace(pat, val) + aa_char = aa_char.replace('+ ', '') + m = re.match(r'^\d+', aa_char) + if m: + ret += compat_chr(int(m.group(0), 8)) + else: + m = re.match(r'^u([\da-f]+)', aa_char) + if m: + ret += compat_chr(int(m.group(1), 16)) + return ret + + class XFileShareIE(InfoExtractor): _SITES = ( - (r'daclips\.(?:in|com)', 'DaClips'), - (r'filehoot\.com', 'FileHoot'), - (r'gorillavid\.(?:in|com)', 'GorillaVid'), - (r'movpod\.in', 'MovPod'), - (r'powerwatch\.pw', 'PowerWatch'), - (r'rapidvideo\.ws', 'Rapidvideo.ws'), + (r'clipwatching\.com', 'ClipWatching'), + (r'gounlimited\.to', 'GoUnlimited'), + (r'govid\.me', 'GoVid'), + (r'holavid\.com', 'HolaVid'), + (r'streamty\.com', 'Streamty'), (r'thevideobee\.to', 'TheVideoBee'), - (r'vidto\.(?:me|se)', 'Vidto'), - (r'streamin\.to', 'Streamin.To'), - (r'xvidstage\.com', 'XVIDSTAGE'), - (r'vidabc\.com', 'Vid ABC'), + (r'uqload\.com', 'Uqload'), (r'vidbom\.com', 'VidBom'), (r'vidlo\.us', 'vidlo'), - (r'rapidvideo\.(?:cool|org)', 'RapidVideo.TV'), - (r'fastvideo\.me', 'FastVideo.me'), + (r'vidlocker\.xyz', 'VidLocker'), + (r'vidshare\.tv', 'VidShare'), + (r'vup\.to', 'VUp'), + (r'xvideosharing\.com', 'XVideoSharing'), ) IE_DESC = 'XFileShare based sites: %s' % ', '.join(list(zip(*_SITES))[1]) - _VALID_URL = (r'https?://(?P<host>(?:www\.)?(?:%s))/(?:embed-)?(?P<id>[0-9a-zA-Z]+)' + _VALID_URL = (r'https?://(?:www\.)?(?P<host>%s)/(?:embed-)?(?P<id>[0-9a-zA-Z]+)' % '|'.join(site for site in list(zip(*_SITES))[0])) _FILE_NOT_FOUND_REGEXES = ( @@ -43,82 +70,14 @@ class XFileShareIE(InfoExtractor): ) _TESTS = [{ - 'url': 'http://gorillavid.in/06y9juieqpmi', - 'md5': '5ae4a3580620380619678ee4875893ba', + 'url': 'http://xvideosharing.com/fq65f94nd2ve', + 'md5': '4181f63957e8fe90ac836fa58dc3c8a6', 'info_dict': { - 'id': '06y9juieqpmi', + 'id': 'fq65f94nd2ve', 'ext': 'mp4', - 'title': 'Rebecca Black My Moment Official Music Video Reaction-6GK87Rc8bzQ', + 'title': 'sample', 'thumbnail': r're:http://.*\.jpg', }, - }, { - 'url': 'http://gorillavid.in/embed-z08zf8le23c6-960x480.html', - 'only_matching': True, - }, { - 'url': 'http://daclips.in/3rso4kdn6f9m', - 'md5': '1ad8fd39bb976eeb66004d3a4895f106', - 'info_dict': { - 'id': '3rso4kdn6f9m', - 'ext': 'mp4', - 'title': 'Micro Pig piglets ready on 16th July 2009-bG0PdrCdxUc', - 'thumbnail': r're:http://.*\.jpg', - } - }, { - 'url': 'http://movpod.in/0wguyyxi1yca', - 'only_matching': True, - }, { - 'url': 'http://filehoot.com/3ivfabn7573c.html', - 'info_dict': { - 'id': '3ivfabn7573c', - 'ext': 'mp4', - 'title': 'youtube-dl test video \'äBaW_jenozKc.mp4.mp4', - 'thumbnail': r're:http://.*\.jpg', - }, - 'skip': 'Video removed', - }, { - 'url': 'http://vidto.me/ku5glz52nqe1.html', - 'info_dict': { - 'id': 'ku5glz52nqe1', - 'ext': 'mp4', - 'title': 'test' - } - }, { - 'url': 'http://powerwatch.pw/duecjibvicbu', - 'info_dict': { - 'id': 'duecjibvicbu', - 'ext': 'mp4', - 'title': 'Big Buck Bunny trailer', - }, - }, { - 'url': 'http://xvidstage.com/e0qcnl03co6z', - 'info_dict': { - 'id': 'e0qcnl03co6z', - 'ext': 'mp4', - 'title': 'Chucky Prank 2015.mp4', - }, - }, { - # removed by administrator - 'url': 'http://xvidstage.com/amfy7atlkx25', - 'only_matching': True, - }, { - 'url': 'http://vidabc.com/i8ybqscrphfv', - 'info_dict': { - 'id': 'i8ybqscrphfv', - 'ext': 'mp4', - 'title': 're:Beauty and the Beast 2017', - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'http://www.rapidvideo.cool/b667kprndr8w', - 'only_matching': True, - }, { - 'url': 'http://www.fastvideo.me/k8604r8nk8sn/FAST_FURIOUS_8_-_Trailer_italiano_ufficiale.mp4.html', - 'only_matching': True, - }, { - 'url': 'http://vidto.se/1tx1pf6t12cg.html', - 'only_matching': True, }] @staticmethod @@ -131,10 +90,9 @@ class XFileShareIE(InfoExtractor): webpage)] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + host, video_id = re.match(self._VALID_URL, url).groups() - url = 'http://%s/%s' % (mobj.group('host'), video_id) + url = 'https://%s/' % host + ('embed-%s.html' % video_id if host in ('govid.me', 'vidlo.us') else video_id) webpage = self._download_webpage(url, video_id) if any(re.search(p, webpage) for p in self._FILE_NOT_FOUND_REGEXES): @@ -142,7 +100,7 @@ class XFileShareIE(InfoExtractor): fields = self._hidden_inputs(webpage) - if fields['op'] == 'download1': + if fields.get('op') == 'download1': countdown = int_or_none(self._search_regex( r'<span id="countdown_str">(?:[Ww]ait)?\s*<span id="cxc">(\d+)</span>\s*(?:seconds?)?</span>', webpage, 'countdown', default=None)) @@ -160,13 +118,37 @@ class XFileShareIE(InfoExtractor): (r'style="z-index: [0-9]+;">([^<]+)</span>', r'<td nowrap>([^<]+)</td>', r'h4-fine[^>]*>([^<]+)<', - r'>Watch (.+) ', + r'>Watch (.+)[ <]', r'<h2 class="video-page-head">([^<]+)</h2>', - r'<h2 style="[^"]*color:#403f3d[^"]*"[^>]*>([^<]+)<'), # streamin.to + r'<h2 style="[^"]*color:#403f3d[^"]*"[^>]*>([^<]+)<', # streamin.to + r'title\s*:\s*"([^"]+)"'), # govid.me webpage, 'title', default=None) or self._og_search_title( webpage, default=None) or video_id).strip() - def extract_formats(default=NO_DEFAULT): + for regex, func in ( + (r'(eval\(function\(p,a,c,k,e,d\){.+)', decode_packed_codes), + (r'(゚.+)', aa_decode)): + obf_code = self._search_regex(regex, webpage, 'obfuscated code', default=None) + if obf_code: + webpage = webpage.replace(obf_code, func(obf_code)) + + formats = [] + + jwplayer_data = self._search_regex( + [ + r'jwplayer\("[^"]+"\)\.load\(\[({.+?})\]\);', + r'jwplayer\("[^"]+"\)\.setup\(({.+?})\);', + ], webpage, + 'jwplayer data', default=None) + if jwplayer_data: + jwplayer_data = self._parse_json( + jwplayer_data.replace(r"\'", "'"), video_id, js_to_json) + if jwplayer_data: + formats = self._parse_jwplayer_data( + jwplayer_data, video_id, False, + m3u8_id='hls', mpd_id='dash')['formats'] + + if not formats: urls = [] for regex in ( r'(?:file|src)\s*:\s*(["\'])(?P<url>http(?:(?!\1).)+\.(?:m3u8|mp4|flv)(?:(?!\1).)*)\1', @@ -177,6 +159,12 @@ class XFileShareIE(InfoExtractor): video_url = mobj.group('url') if video_url not in urls: urls.append(video_url) + + sources = self._search_regex( + r'sources\s*:\s*(\[(?!{)[^\]]+\])', webpage, 'sources', default=None) + if sources: + urls.extend(self._parse_json(sources, video_id)) + formats = [] for video_url in urls: if determine_ext(video_url) == 'm3u8': @@ -189,21 +177,13 @@ class XFileShareIE(InfoExtractor): 'url': video_url, 'format_id': 'sd', }) - if not formats and default is not NO_DEFAULT: - return default - self._sort_formats(formats) - return formats - - formats = extract_formats(default=None) - - if not formats: - webpage = decode_packed_codes(self._search_regex( - r"(}\('(.+)',(\d+),(\d+),'[^']*\b(?:file|embed)\b[^']*'\.split\('\|'\))", - webpage, 'packed code')) - formats = extract_formats() + self._sort_formats(formats) thumbnail = self._search_regex( - r'image\s*:\s*["\'](http[^"\']+)["\'],', webpage, 'thumbnail', default=None) + [ + r'<video[^>]+poster="([^"]+)"', + r'(?:image|poster)\s*:\s*["\'](http[^"\']+)["\'],', + ], webpage, 'thumbnail', default=None) return { 'id': video_id, diff --git a/youtube_dl/extractor/xvideos.py b/youtube_dl/extractor/xvideos.py index 166bcf443..8fc64914c 100644 --- a/youtube_dl/extractor/xvideos.py +++ b/youtube_dl/extractor/xvideos.py @@ -17,7 +17,8 @@ class XVideosIE(InfoExtractor): _VALID_URL = r'''(?x) https?:// (?: - (?:www\.)?xvideos\.com/video| + (?:[^/]+\.)?xvideos2?\.com/video| + (?:www\.)?xvideos\.es/video| flashservice\.xvideos\.com/embedframe/| static-hw\.xvideos\.com/swf/xv-player\.swf\?.*?\bid_video= ) @@ -39,6 +40,42 @@ class XVideosIE(InfoExtractor): }, { 'url': 'http://static-hw.xvideos.com/swf/xv-player.swf?id_video=4588838', 'only_matching': True, + }, { + 'url': 'http://xvideos.com/video4588838/biker_takes_his_girl', + 'only_matching': True + }, { + 'url': 'https://xvideos.com/video4588838/biker_takes_his_girl', + 'only_matching': True + }, { + 'url': 'https://xvideos.es/video4588838/biker_takes_his_girl', + 'only_matching': True + }, { + 'url': 'https://www.xvideos.es/video4588838/biker_takes_his_girl', + 'only_matching': True + }, { + 'url': 'http://xvideos.es/video4588838/biker_takes_his_girl', + 'only_matching': True + }, { + 'url': 'http://www.xvideos.es/video4588838/biker_takes_his_girl', + 'only_matching': True + }, { + 'url': 'http://fr.xvideos.com/video4588838/biker_takes_his_girl', + 'only_matching': True + }, { + 'url': 'https://fr.xvideos.com/video4588838/biker_takes_his_girl', + 'only_matching': True + }, { + 'url': 'http://it.xvideos.com/video4588838/biker_takes_his_girl', + 'only_matching': True + }, { + 'url': 'https://it.xvideos.com/video4588838/biker_takes_his_girl', + 'only_matching': True + }, { + 'url': 'http://de.xvideos.com/video4588838/biker_takes_his_girl', + 'only_matching': True + }, { + 'url': 'https://de.xvideos.com/video4588838/biker_takes_his_girl', + 'only_matching': True }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py index e5ebdd180..238d9cea0 100644 --- a/youtube_dl/extractor/yahoo.py +++ b/youtube_dl/extractor/yahoo.py @@ -3,453 +3,313 @@ from __future__ import unicode_literals import hashlib import itertools -import json import re from .common import InfoExtractor, SearchInfoExtractor from ..compat import ( compat_str, compat_urllib_parse, - compat_urlparse, ) from ..utils import ( clean_html, - determine_ext, - ExtractorError, - extract_attributes, int_or_none, mimetype2ext, + parse_iso8601, smuggle_url, try_get, - unescapeHTML, url_or_none, ) -from .brightcove import ( - BrightcoveLegacyIE, - BrightcoveNewIE, -) -from .nbc import NBCSportsVPlayerIE +from .brightcove import BrightcoveNewIE class YahooIE(InfoExtractor): IE_DESC = 'Yahoo screen and movies' - _VALID_URL = r'(?P<host>https?://(?:(?P<country>[a-zA-Z]{2})\.)?[\da-zA-Z_-]+\.yahoo\.com)/(?:[^/]+/)*(?:(?P<display_id>.+)?-)?(?P<id>[0-9]+)(?:-[a-z]+)?(?:\.html)?' - _TESTS = [ - { - 'url': 'http://screen.yahoo.com/julian-smith-travis-legg-watch-214727115.html', - 'info_dict': { - 'id': '2d25e626-2378-391f-ada0-ddaf1417e588', - 'ext': 'mp4', - 'title': 'Julian Smith & Travis Legg Watch Julian Smith', - 'description': 'Julian and Travis watch Julian Smith', - 'duration': 6863, - }, + _VALID_URL = r'(?P<url>https?://(?:(?P<country>[a-zA-Z]{2}(?:-[a-zA-Z]{2})?|malaysia)\.)?(?:[\da-zA-Z_-]+\.)?yahoo\.com/(?:[^/]+/)*(?P<id>[^?&#]*-[0-9]+(?:-[a-z]+)?)\.html)' + _TESTS = [{ + 'url': 'http://screen.yahoo.com/julian-smith-travis-legg-watch-214727115.html', + 'info_dict': { + 'id': '2d25e626-2378-391f-ada0-ddaf1417e588', + 'ext': 'mp4', + 'title': 'Julian Smith & Travis Legg Watch Julian Smith', + 'description': 'Julian and Travis watch Julian Smith', + 'duration': 6863, + 'timestamp': 1369812016, + 'upload_date': '20130529', }, - { - 'url': 'http://screen.yahoo.com/wired/codefellas-s1-ep12-cougar-lies-103000935.html', - 'md5': '251af144a19ebc4a033e8ba91ac726bb', - 'info_dict': { - 'id': 'd1dedf8c-d58c-38c3-8963-e899929ae0a9', - 'ext': 'mp4', - 'title': 'Codefellas - The Cougar Lies with Spanish Moss', - 'description': 'md5:66b627ab0a282b26352136ca96ce73c1', - 'duration': 151, - }, - 'skip': 'HTTP Error 404', + }, { + 'url': 'https://screen.yahoo.com/community/community-sizzle-reel-203225340.html?format=embed', + 'md5': '7993e572fac98e044588d0b5260f4352', + 'info_dict': { + 'id': '4fe78544-8d48-39d8-97cd-13f205d9fcdb', + 'ext': 'mp4', + 'title': "Yahoo Saves 'Community'", + 'description': 'md5:4d4145af2fd3de00cbb6c1d664105053', + 'duration': 170, + 'timestamp': 1406838636, + 'upload_date': '20140731', }, - { - 'url': 'https://screen.yahoo.com/community/community-sizzle-reel-203225340.html?format=embed', - 'md5': '7993e572fac98e044588d0b5260f4352', - 'info_dict': { - 'id': '4fe78544-8d48-39d8-97cd-13f205d9fcdb', - 'ext': 'mp4', - 'title': "Yahoo Saves 'Community'", - 'description': 'md5:4d4145af2fd3de00cbb6c1d664105053', - 'duration': 170, - } - }, - { - 'url': 'https://tw.news.yahoo.com/%E6%95%A2%E5%95%8F%E5%B8%82%E9%95%B7%20%E9%BB%83%E7%A7%80%E9%9C%9C%E6%89%B9%E8%B3%B4%E6%B8%85%E5%BE%B7%20%E9%9D%9E%E5%B8%B8%E9%AB%98%E5%82%B2-034024051.html', - 'md5': '45c024bad51e63e9b6f6fad7a43a8c23', - 'info_dict': { - 'id': 'cac903b3-fcf4-3c14-b632-643ab541712f', - 'ext': 'mp4', - 'title': '敢問市長/黃秀霜批賴清德「非常高傲」', - 'description': '直言台南沒捷運 交通居五都之末', - 'duration': 396, - }, - }, - { - 'url': 'https://uk.screen.yahoo.com/editor-picks/cute-raccoon-freed-drain-using-091756545.html', - 'md5': '71298482f7c64cbb7fa064e4553ff1c1', - 'info_dict': { - 'id': 'b3affa53-2e14-3590-852b-0e0db6cd1a58', - 'ext': 'webm', - 'title': 'Cute Raccoon Freed From Drain\u00a0Using Angle Grinder', - 'description': 'md5:f66c890e1490f4910a9953c941dee944', - 'duration': 97, - } - }, - { - 'url': 'https://ca.sports.yahoo.com/video/program-makes-hockey-more-affordable-013127711.html', - 'md5': '57e06440778b1828a6079d2f744212c4', - 'info_dict': { - 'id': 'c9fa2a36-0d4d-3937-b8f6-cc0fb1881e73', - 'ext': 'mp4', - 'title': 'Program that makes hockey more affordable not offered in Manitoba', - 'description': 'md5:c54a609f4c078d92b74ffb9bf1f496f4', - 'duration': 121, - }, - 'skip': 'Video gone', - }, { - 'url': 'https://ca.finance.yahoo.com/news/hackers-sony-more-trouble-well-154609075.html', - 'info_dict': { - 'id': '154609075', - }, - 'playlist': [{ - 'md5': '000887d0dc609bc3a47c974151a40fb8', - 'info_dict': { - 'id': 'e624c4bc-3389-34de-9dfc-025f74943409', - 'ext': 'mp4', - 'title': '\'The Interview\' TV Spot: War', - 'description': 'The Interview', - 'duration': 30, - }, - }, { - 'md5': '81bc74faf10750fe36e4542f9a184c66', - 'info_dict': { - 'id': '1fc8ada0-718e-3abe-a450-bf31f246d1a9', - 'ext': 'mp4', - 'title': '\'The Interview\' TV Spot: Guys', - 'description': 'The Interview', - 'duration': 30, - }, - }], - }, { - 'url': 'http://news.yahoo.com/video/china-moses-crazy-blues-104538833.html', - 'md5': '88e209b417f173d86186bef6e4d1f160', - 'info_dict': { - 'id': 'f885cf7f-43d4-3450-9fac-46ac30ece521', - 'ext': 'mp4', - 'title': 'China Moses Is Crazy About the Blues', - 'description': 'md5:9900ab8cd5808175c7b3fe55b979bed0', - 'duration': 128, - } - }, { - 'url': 'https://in.lifestyle.yahoo.com/video/connect-dots-dark-side-virgo-090247395.html', - 'md5': 'd9a083ccf1379127bf25699d67e4791b', - 'info_dict': { - 'id': '52aeeaa3-b3d1-30d8-9ef8-5d0cf05efb7c', - 'ext': 'mp4', - 'title': 'Connect the Dots: Dark Side of Virgo', - 'description': 'md5:1428185051cfd1949807ad4ff6d3686a', - 'duration': 201, - }, - 'skip': 'Domain name in.lifestyle.yahoo.com gone', - }, { - 'url': 'https://www.yahoo.com/movies/v/true-story-trailer-173000497.html', - 'md5': '989396ae73d20c6f057746fb226aa215', - 'info_dict': { - 'id': '071c4013-ce30-3a93-a5b2-e0413cd4a9d1', - 'ext': 'mp4', - 'title': '\'True Story\' Trailer', - 'description': 'True Story', - 'duration': 150, - }, - }, { - 'url': 'https://gma.yahoo.com/pizza-delivery-man-surprised-huge-tip-college-kids-195200785.html', - 'only_matching': True, - }, { - 'note': 'NBC Sports embeds', - 'url': 'http://sports.yahoo.com/blogs/ncaab-the-dagger/tyler-kalinoski-s-buzzer-beater-caps-davidson-s-comeback-win-185609842.html?guid=nbc_cbk_davidsonbuzzerbeater_150313', - 'info_dict': { - 'id': '9CsDKds0kvHI', - 'ext': 'flv', - 'description': 'md5:df390f70a9ba7c95ff1daace988f0d8d', - 'title': 'Tyler Kalinoski hits buzzer-beater to lift Davidson', - 'upload_date': '20150313', - 'uploader': 'NBCU-SPORTS', - 'timestamp': 1426270238, - } - }, { - 'url': 'https://tw.news.yahoo.com/-100120367.html', - 'only_matching': True, - }, { - # Query result is embedded in webpage, but explicit request to video API fails with geo restriction - 'url': 'https://screen.yahoo.com/community/communitary-community-episode-1-ladders-154501237.html', - 'md5': '4fbafb9c9b6f07aa8f870629f6671b35', - 'info_dict': { - 'id': '1f32853c-a271-3eef-8cb6-f6d6872cb504', - 'ext': 'mp4', - 'title': 'Communitary - Community Episode 1: Ladders', - 'description': 'md5:8fc39608213295748e1e289807838c97', - 'duration': 1646, - }, - }, { - # it uses an alias to get the video_id - 'url': 'https://www.yahoo.com/movies/the-stars-of-daddys-home-have-very-different-212843197.html', - 'info_dict': { - 'id': '40eda9c8-8e5f-3552-8745-830f67d0c737', - 'ext': 'mp4', - 'title': 'Will Ferrell & Mark Wahlberg Are Pro-Spanking', - 'description': 'While they play feuding fathers in \'Daddy\'s Home,\' star Will Ferrell & Mark Wahlberg share their true feelings on parenthood.', - }, - }, - { - # config['models']['applet_model']['data']['sapi'] has no query - 'url': 'https://www.yahoo.com/music/livenation/event/galactic-2016', - 'md5': 'dac0c72d502bc5facda80c9e6d5c98db', - 'info_dict': { - 'id': 'a6015640-e9e5-3efb-bb60-05589a183919', - 'ext': 'mp4', - 'description': 'Galactic', - 'title': 'Dolla Diva (feat. Maggie Koerner)', - }, - 'skip': 'redirect to https://www.yahoo.com/music', - }, - { - # yahoo://article/ - 'url': 'https://www.yahoo.com/movies/video/true-story-trailer-173000497.html', - 'info_dict': { - 'id': '071c4013-ce30-3a93-a5b2-e0413cd4a9d1', - 'ext': 'mp4', - 'title': "'True Story' Trailer", - 'description': 'True Story', - }, - 'params': { - 'skip_download': True, - }, - }, - { - # ytwnews://cavideo/ - 'url': 'https://tw.video.yahoo.com/movie-tw/單車天使-中文版預-092316541.html', - 'info_dict': { - 'id': 'ba133ff2-0793-3510-b636-59dfe9ff6cff', - 'ext': 'mp4', - 'title': '單車天使 - 中文版預', - 'description': '中文版預', - }, - 'params': { - 'skip_download': True, - }, - }, - { - # custom brightcove - 'url': 'https://au.tv.yahoo.com/plus7/sunrise/-/watch/37083565/clown-entertainers-say-it-is-hurting-their-business/', - 'info_dict': { - 'id': '5575377707001', - 'ext': 'mp4', - 'title': "Clown entertainers say 'It' is hurting their business", - 'description': 'Stephen King s horror film has much to answer for. Jelby and Mr Loopy the Clowns join us.', - 'timestamp': 1505341164, - 'upload_date': '20170913', - 'uploader_id': '2376984109001', - }, - 'params': { - 'skip_download': True, - }, - }, - { - # custom brightcove, geo-restricted to Australia, bypassable - 'url': 'https://au.tv.yahoo.com/plus7/sunrise/-/watch/37263964/sunrise-episode-wed-27-sep/', - 'only_matching': True, + }, { + 'url': 'https://uk.screen.yahoo.com/editor-picks/cute-raccoon-freed-drain-using-091756545.html', + 'md5': '71298482f7c64cbb7fa064e4553ff1c1', + 'info_dict': { + 'id': 'b3affa53-2e14-3590-852b-0e0db6cd1a58', + 'ext': 'webm', + 'title': 'Cute Raccoon Freed From Drain\u00a0Using Angle Grinder', + 'description': 'md5:f66c890e1490f4910a9953c941dee944', + 'duration': 97, + 'timestamp': 1414489862, + 'upload_date': '20141028', } - ] + }, { + 'url': 'http://news.yahoo.com/video/china-moses-crazy-blues-104538833.html', + 'md5': '88e209b417f173d86186bef6e4d1f160', + 'info_dict': { + 'id': 'f885cf7f-43d4-3450-9fac-46ac30ece521', + 'ext': 'mp4', + 'title': 'China Moses Is Crazy About the Blues', + 'description': 'md5:9900ab8cd5808175c7b3fe55b979bed0', + 'duration': 128, + 'timestamp': 1385722202, + 'upload_date': '20131129', + } + }, { + 'url': 'https://www.yahoo.com/movies/v/true-story-trailer-173000497.html', + 'md5': '2a9752f74cb898af5d1083ea9f661b58', + 'info_dict': { + 'id': '071c4013-ce30-3a93-a5b2-e0413cd4a9d1', + 'ext': 'mp4', + 'title': '\'True Story\' Trailer', + 'description': 'True Story', + 'duration': 150, + 'timestamp': 1418919206, + 'upload_date': '20141218', + }, + }, { + 'url': 'https://gma.yahoo.com/pizza-delivery-man-surprised-huge-tip-college-kids-195200785.html', + 'only_matching': True, + }, { + 'note': 'NBC Sports embeds', + 'url': 'http://sports.yahoo.com/blogs/ncaab-the-dagger/tyler-kalinoski-s-buzzer-beater-caps-davidson-s-comeback-win-185609842.html?guid=nbc_cbk_davidsonbuzzerbeater_150313', + 'info_dict': { + 'id': '9CsDKds0kvHI', + 'ext': 'flv', + 'description': 'md5:df390f70a9ba7c95ff1daace988f0d8d', + 'title': 'Tyler Kalinoski hits buzzer-beater to lift Davidson', + 'upload_date': '20150313', + 'uploader': 'NBCU-SPORTS', + 'timestamp': 1426270238, + }, + }, { + 'url': 'https://tw.news.yahoo.com/-100120367.html', + 'only_matching': True, + }, { + # Query result is embedded in webpage, but explicit request to video API fails with geo restriction + 'url': 'https://screen.yahoo.com/community/communitary-community-episode-1-ladders-154501237.html', + 'md5': '4fbafb9c9b6f07aa8f870629f6671b35', + 'info_dict': { + 'id': '1f32853c-a271-3eef-8cb6-f6d6872cb504', + 'ext': 'mp4', + 'title': 'Communitary - Community Episode 1: Ladders', + 'description': 'md5:8fc39608213295748e1e289807838c97', + 'duration': 1646, + 'timestamp': 1440436550, + 'upload_date': '20150824', + 'series': 'Communitary', + 'season_number': 6, + 'episode_number': 1, + }, + }, { + # ytwnews://cavideo/ + 'url': 'https://tw.video.yahoo.com/movie-tw/單車天使-中文版預-092316541.html', + 'info_dict': { + 'id': 'ba133ff2-0793-3510-b636-59dfe9ff6cff', + 'ext': 'mp4', + 'title': '單車天使 - 中文版預', + 'description': '中文版預', + 'timestamp': 1476696196, + 'upload_date': '20161017', + }, + 'params': { + 'skip_download': True, + }, + }, { + # Contains both a Yahoo hosted video and multiple Youtube embeds + 'url': 'https://www.yahoo.com/entertainment/gwen-stefani-reveals-the-pop-hit-she-passed-on-assigns-it-to-her-voice-contestant-instead-033045672.html', + 'info_dict': { + 'id': '46c5d95a-528f-3d03-b732-732fcadd51de', + 'title': 'Gwen Stefani reveals the pop hit she passed on, assigns it to her \'Voice\' contestant instead', + 'description': 'Gwen decided not to record this hit herself, but she decided it was the perfect fit for Kyndall Inskeep.', + }, + 'playlist': [{ + 'info_dict': { + 'id': '966d4262-4fd1-3aaa-b45b-049ca6e38ba6', + 'ext': 'mp4', + 'title': 'Gwen Stefani reveals she turned down one of Sia\'s best songs', + 'description': 'On "The Voice" Tuesday, Gwen Stefani told Taylor Swift which Sia hit was almost hers.', + 'timestamp': 1572406500, + 'upload_date': '20191030', + }, + }, { + 'info_dict': { + 'id': '352CFDOQrKg', + 'ext': 'mp4', + 'title': 'Kyndal Inskeep "Performs the Hell Out of" Sia\'s "Elastic Heart" - The Voice Knockouts 2019', + 'description': 'md5:35b61e94c2ae214bc965ff4245f80d11', + 'uploader': 'The Voice', + 'uploader_id': 'NBCTheVoice', + 'upload_date': '20191029', + }, + }], + 'params': { + 'playlistend': 2, + }, + 'expected_warnings': ['HTTP Error 404'], + }, { + 'url': 'https://malaysia.news.yahoo.com/video/bystanders-help-ontario-policeman-bust-190932818.html', + 'only_matching': True, + }, { + 'url': 'https://es-us.noticias.yahoo.com/es-la-puerta-irrompible-que-110539379.html', + 'only_matching': True, + }, { + 'url': 'https://www.yahoo.com/entertainment/v/longtime-cbs-news-60-minutes-032036500-cbs.html', + 'only_matching': True, + }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - page_id = mobj.group('id') - display_id = mobj.group('display_id') or page_id - host = mobj.group('host') - webpage, urlh = self._download_webpage_handle(url, display_id) - if 'err=404' in urlh.geturl(): - raise ExtractorError('Video gone', expected=True) - - # Look for iframed media first - entries = [] - iframe_urls = re.findall(r'<iframe[^>]+src="(/video/.+?-\d+\.html\?format=embed.*?)"', webpage) - for idx, iframe_url in enumerate(iframe_urls): - entries.append(self.url_result(host + iframe_url, 'Yahoo')) - if entries: - return self.playlist_result(entries, page_id) - - # Look for NBCSports iframes - nbc_sports_url = NBCSportsVPlayerIE._extract_url(webpage) - if nbc_sports_url: - return self.url_result(nbc_sports_url, NBCSportsVPlayerIE.ie_key()) - - # Look for Brightcove Legacy Studio embeds - bc_url = BrightcoveLegacyIE._extract_brightcove_url(webpage) - if bc_url: - return self.url_result(bc_url, BrightcoveLegacyIE.ie_key()) - - def brightcove_url_result(bc_url): - return self.url_result( - smuggle_url(bc_url, {'geo_countries': [mobj.group('country')]}), - BrightcoveNewIE.ie_key()) - - # Look for Brightcove New Studio embeds - bc_url = BrightcoveNewIE._extract_url(self, webpage) - if bc_url: - return brightcove_url_result(bc_url) - - brightcove_iframe = self._search_regex( - r'(<iframe[^>]+data-video-id=["\']\d+[^>]+>)', webpage, - 'brightcove iframe', default=None) - if brightcove_iframe: - attr = extract_attributes(brightcove_iframe) - src = attr.get('src') - if src: - parsed_src = compat_urlparse.urlparse(src) - qs = compat_urlparse.parse_qs(parsed_src.query) - account_id = qs.get('accountId', ['2376984109001'])[0] - brightcove_id = attr.get('data-video-id') or qs.get('videoId', [None])[0] - if account_id and brightcove_id: - return brightcove_url_result( - 'http://players.brightcove.net/%s/default_default/index.html?videoId=%s' - % (account_id, brightcove_id)) - - # Query result is often embedded in webpage as JSON. Sometimes explicit requests - # to video API results in a failure with geo restriction reason therefore using - # embedded query result when present sounds reasonable. - config_json = self._search_regex( - r'window\.Af\.bootstrap\[[^\]]+\]\s*=\s*({.*?"applet_type"\s*:\s*"td-applet-videoplayer".*?});(?:</script>|$)', - webpage, 'videoplayer applet', default=None) - if config_json: - config = self._parse_json(config_json, display_id, fatal=False) - if config: - sapi = config.get('models', {}).get('applet_model', {}).get('data', {}).get('sapi') - if sapi and 'query' in sapi: - info = self._extract_info(display_id, sapi, webpage) - self._sort_formats(info['formats']) - return info - - items_json = self._search_regex( - r'mediaItems: ({.*?})$', webpage, 'items', flags=re.MULTILINE, - default=None) - if items_json is None: - alias = self._search_regex( - r'"aliases":{"video":"(.*?)"', webpage, 'alias', default=None) - if alias is not None: - alias_info = self._download_json( - 'https://www.yahoo.com/_td/api/resource/VideoService.videos;video_aliases=["%s"]' % alias, - display_id, 'Downloading alias info') - video_id = alias_info[0]['id'] - else: - CONTENT_ID_REGEXES = [ - r'YUI\.namespace\("Media"\)\.CONTENT_ID\s*=\s*"([^"]+)"', - r'root\.App\.Cache\.context\.videoCache\.curVideo = \{"([^"]+)"', - r'"first_videoid"\s*:\s*"([^"]+)"', - r'%s[^}]*"ccm_id"\s*:\s*"([^"]+)"' % re.escape(page_id), - r'<article[^>]data-uuid=["\']([^"\']+)', - r'<meta[^<>]+yahoo://article/view\?.*\buuid=([^&"\']+)', - r'<meta[^<>]+["\']ytwnews://cavideo/(?:[^/]+/)+([\da-fA-F-]+)[&"\']', - ] - video_id = self._search_regex( - CONTENT_ID_REGEXES, webpage, 'content ID') + url, country, display_id = re.match(self._VALID_URL, url).groups() + if not country: + country = 'us' else: - items = json.loads(items_json) - info = items['mediaItems']['query']['results']['mediaObj'][0] - # The 'meta' field is not always in the video webpage, we request it - # from another page - video_id = info['id'] - return self._get_info(video_id, display_id, webpage) + country = country.split('-')[0] + api_base = 'https://%s.yahoo.com/_td/api/resource/' % country - def _extract_info(self, display_id, query, webpage): - info = query['query']['results']['mediaObj'][0] - meta = info.get('meta') - video_id = info.get('id') + for i, uuid in enumerate(['url=' + url, 'ymedia-alias=' + display_id]): + content = self._download_json( + api_base + 'content;getDetailView=true;uuids=["%s"]' % uuid, + display_id, 'Downloading content JSON metadata', fatal=i == 1) + if content: + item = content['items'][0] + break - if not meta: - msg = info['status'].get('msg') - if msg: - raise ExtractorError( - '%s returned error: %s' % (self.IE_NAME, msg), expected=True) - raise ExtractorError('Unable to extract media object meta') + if item.get('type') != 'video': + entries = [] + cover = item.get('cover') or {} + if cover.get('type') == 'yvideo': + cover_url = cover.get('url') + if cover_url: + entries.append(self.url_result( + cover_url, 'Yahoo', cover.get('uuid'))) + + for e in item.get('body', []): + if e.get('type') == 'videoIframe': + iframe_url = e.get('url') + if not iframe_url: + continue + entries.append(self.url_result(iframe_url)) + + return self.playlist_result( + entries, item.get('uuid'), + item.get('title'), item.get('summary')) + + video_id = item['uuid'] + video = self._download_json( + api_base + 'VideoService.videos;view=full;video_ids=["%s"]' % video_id, + video_id, 'Downloading video JSON metadata')[0] + title = video['title'] + + if country == 'malaysia': + country = 'my' + + is_live = video.get('live_state') == 'live' + fmts = ('m3u8',) if is_live else ('webm', 'mp4') + + urls = [] formats = [] - for s in info['streams']: - tbr = int_or_none(s.get('bitrate')) - format_info = { - 'width': int_or_none(s.get('width')), - 'height': int_or_none(s.get('height')), - 'tbr': tbr, - } - - host = s['host'] - path = s['path'] - if host.startswith('rtmp'): - fmt = 'rtmp' - format_info.update({ - 'url': host, - 'play_path': path, - 'ext': 'flv', - }) - else: - if s.get('format') == 'm3u8_playlist': - fmt = 'hls' - format_info.update({ - 'protocol': 'm3u8_native', - 'ext': 'mp4', - }) - else: - fmt = format_info['ext'] = determine_ext(path) - format_url = compat_urlparse.urljoin(host, path) - format_info['url'] = format_url - format_info['format_id'] = fmt + ('-%d' % tbr if tbr else '') - formats.append(format_info) - - closed_captions = self._html_search_regex( - r'"closedcaptions":(\[[^\]]+\])', webpage, 'closed captions', - default='[]') - - cc_json = self._parse_json(closed_captions, video_id, fatal=False) subtitles = {} - if cc_json: - for closed_caption in cc_json: - lang = closed_caption['lang'] - if lang not in subtitles: - subtitles[lang] = [] - subtitles[lang].append({ - 'url': closed_caption['url'], - 'ext': mimetype2ext(closed_caption['content_type']), + for fmt in fmts: + media_obj = self._download_json( + 'https://video-api.yql.yahoo.com/v1/video/sapi/streams/' + video_id, + video_id, 'Downloading %s JSON metadata' % fmt, + headers=self.geo_verification_headers(), query={ + 'format': fmt, + 'region': country.upper(), + })['query']['results']['mediaObj'][0] + msg = media_obj.get('status', {}).get('msg') + + for s in media_obj.get('streams', []): + host = s.get('host') + path = s.get('path') + if not host or not path: + continue + s_url = host + path + if s.get('format') == 'm3u8': + formats.extend(self._extract_m3u8_formats( + s_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) + continue + tbr = int_or_none(s.get('bitrate')) + formats.append({ + 'url': s_url, + 'format_id': fmt + ('-%d' % tbr if tbr else ''), + 'width': int_or_none(s.get('width')), + 'height': int_or_none(s.get('height')), + 'tbr': tbr, + 'fps': int_or_none(s.get('framerate')), }) + for cc in media_obj.get('closedcaptions', []): + cc_url = cc.get('url') + if not cc_url or cc_url in urls: + continue + urls.append(cc_url) + subtitles.setdefault(cc.get('lang') or 'en-US', []).append({ + 'url': cc_url, + 'ext': mimetype2ext(cc.get('content_type')), + }) + + streaming_url = video.get('streaming_url') + if streaming_url and not is_live: + formats.extend(self._extract_m3u8_formats( + streaming_url, video_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False)) + + if not formats and msg == 'geo restricted': + self.raise_geo_restricted() + + self._sort_formats(formats) + + thumbnails = [] + for thumb in video.get('thumbnails', []): + thumb_url = thumb.get('url') + if not thumb_url: + continue + thumbnails.append({ + 'id': thumb.get('tag'), + 'url': thumb.get('url'), + 'width': int_or_none(thumb.get('width')), + 'height': int_or_none(thumb.get('height')), + }) + + series_info = video.get('series_info') or {} + return { 'id': video_id, - 'display_id': display_id, - 'title': unescapeHTML(meta['title']), + 'title': self._live_title(title) if is_live else title, 'formats': formats, - 'description': clean_html(meta['description']), - 'thumbnail': meta['thumbnail'] if meta.get('thumbnail') else self._og_search_thumbnail(webpage), - 'duration': int_or_none(meta.get('duration')), + 'display_id': display_id, + 'thumbnails': thumbnails, + 'description': clean_html(video.get('description')), + 'timestamp': parse_iso8601(video.get('publish_time')), 'subtitles': subtitles, + 'duration': int_or_none(video.get('duration')), + 'view_count': int_or_none(video.get('view_count')), + 'is_live': is_live, + 'series': video.get('show_name'), + 'season_number': int_or_none(series_info.get('season_number')), + 'episode_number': int_or_none(series_info.get('episode_number')), } - def _get_info(self, video_id, display_id, webpage): - region = self._search_regex( - r'\\?"region\\?"\s*:\s*\\?"([^"]+?)\\?"', - webpage, 'region', fatal=False, default='US').upper() - formats = [] - info = {} - for fmt in ('webm', 'mp4'): - query_result = self._download_json( - 'https://video.media.yql.yahoo.com/v1/video/sapi/streams/' + video_id, - display_id, 'Downloading %s video info' % fmt, query={ - 'protocol': 'http', - 'region': region, - 'format': fmt, - }) - info = self._extract_info(display_id, query_result, webpage) - formats.extend(info['formats']) - formats.extend(self._extract_m3u8_formats( - 'http://video.media.yql.yahoo.com/v1/hls/%s?region=%s' % (video_id, region), - video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) - self._sort_formats(formats) - info['formats'] = formats - return info - class YahooSearchIE(SearchInfoExtractor): IE_DESC = 'Yahoo screen search' @@ -523,7 +383,7 @@ class YahooGyaOPlayerIE(InfoExtractor): 'id': video_id, 'title': video['title'], 'url': smuggle_url( - 'http://players.brightcove.net/4235717419001/default_default/index.html?videoId=' + video['videoId'], + 'http://players.brightcove.net/4235717419001/SyG5P0gjb_default/index.html?videoId=' + video['videoId'], {'geo_countries': ['JP']}), 'description': video.get('longDescription'), 'ie_key': BrightcoveNewIE.ie_key(), diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 25d056b3c..b913d07a6 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -41,7 +41,6 @@ from ..utils import ( orderedSet, parse_codecs, parse_duration, - qualities, remove_quotes, remove_start, smuggle_url, @@ -70,7 +69,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): # If True it will raise an error if no login info is provided _LOGIN_REQUIRED = False - _PLAYLIST_ID_RE = r'(?:PL|LL|EC|UU|FL|RD|UL|TL|OLAK5uy_)[0-9A-Za-z-_]{10,}' + _PLAYLIST_ID_RE = r'(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}' def _set_language(self): self._set_cookie( @@ -373,7 +372,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): _VALID_URL = r"""(?x)^ ( (?:https?://|//) # http(s):// or protocol-independent URL - (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/| + (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie|kids)?\.com/| (?:www\.)?deturl\.com/www\.youtube\.com/| (?:www\.)?pwnyoutube\.com/| (?:www\.)?hooktube\.com/| @@ -384,13 +383,21 @@ class YoutubeIE(YoutubeBaseInfoExtractor): (?:(?:www|no)\.)?invidiou\.sh/| (?:(?:www|fi|de)\.)?invidious\.snopyta\.org/| (?:www\.)?invidious\.kabi\.tk/| - (?:www\.)?invidious\.enkirton\.net/| (?:www\.)?invidious\.13ad\.de/| (?:www\.)?invidious\.mastodon\.host/| (?:www\.)?invidious\.nixnet\.xyz/| + (?:www\.)?invidious\.drycat\.fr/| (?:www\.)?tube\.poal\.co/| (?:www\.)?vid\.wxzm\.sx/| (?:www\.)?yt\.elukerio\.org/| + (?:www\.)?yt\.lelux\.fi/| + (?:www\.)?kgg2m7yk5aybusll\.onion/| + (?:www\.)?qklhadlycap4cnod\.onion/| + (?:www\.)?axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid\.onion/| + (?:www\.)?c7hqkpkpemu6e7emz5b4vyz7idjgdvgaaa3dyimmeojqbgpea3xqjoid\.onion/| + (?:www\.)?fz253lmuao3strwbfbmx46yu7acac2jz27iwtorgmbqlkurlclmancad\.onion/| + (?:www\.)?invidious\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\.onion/| + (?:www\.)?owxfohz4kjyv25fvlqilyxast7inivgiktls3th44jhk3ej3i7ya\.b32\.i2p/| youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains (?:.*?\#/)? # handle anchor (#/) redirect urls (?: # the various things that can precede the ID: @@ -1217,6 +1224,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'skip_download': True, }, }, + { + 'url': 'https://www.youtubekids.com/watch?v=3b8nCWDgZ6Q', + 'only_matching': True, + }, ] def __init__(self, *args, **kwargs): @@ -1909,6 +1920,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): return int_or_none(self._search_regex( r'\bclen[=/](\d+)', media_url, 'filesize', default=None)) + streaming_formats = try_get(player_response, lambda x: x['streamingData']['formats'], list) or [] + streaming_formats.extend(try_get(player_response, lambda x: x['streamingData']['adaptiveFormats'], list) or []) + if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'): self.report_rtmp_download() formats = [{ @@ -1917,10 +1931,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'url': video_info['conn'][0], 'player_url': player_url, }] - elif not is_live and (len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1): + elif not is_live and (streaming_formats or len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1): encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0] if 'rtmpe%3Dyes' in encoded_url_map: raise ExtractorError('rtmpe downloads are not supported, see https://github.com/ytdl-org/youtube-dl/issues/343 for more information.', expected=True) + formats = [] formats_spec = {} fmt_list = video_info.get('fmt_list', [''])[0] if fmt_list: @@ -1934,91 +1949,104 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'width': int_or_none(width_height[0]), 'height': int_or_none(width_height[1]), } - q = qualities(['small', 'medium', 'hd720']) - streaming_formats = try_get(player_response, lambda x: x['streamingData']['formats'], list) - if streaming_formats: - for fmt in streaming_formats: - itag = str_or_none(fmt.get('itag')) - if not itag: - continue - quality = fmt.get('quality') - quality_label = fmt.get('qualityLabel') or quality - formats_spec[itag] = { - 'asr': int_or_none(fmt.get('audioSampleRate')), - 'filesize': int_or_none(fmt.get('contentLength')), - 'format_note': quality_label, - 'fps': int_or_none(fmt.get('fps')), - 'height': int_or_none(fmt.get('height')), - 'quality': q(quality), - # bitrate for itag 43 is always 2147483647 - 'tbr': float_or_none(fmt.get('averageBitrate') or fmt.get('bitrate'), 1000) if itag != '43' else None, - 'width': int_or_none(fmt.get('width')), - } - formats = [] - for url_data_str in encoded_url_map.split(','): - url_data = compat_parse_qs(url_data_str) - if 'itag' not in url_data or 'url' not in url_data or url_data.get('drm_families'): + for fmt in streaming_formats: + itag = str_or_none(fmt.get('itag')) + if not itag: continue + quality = fmt.get('quality') + quality_label = fmt.get('qualityLabel') or quality + formats_spec[itag] = { + 'asr': int_or_none(fmt.get('audioSampleRate')), + 'filesize': int_or_none(fmt.get('contentLength')), + 'format_note': quality_label, + 'fps': int_or_none(fmt.get('fps')), + 'height': int_or_none(fmt.get('height')), + # bitrate for itag 43 is always 2147483647 + 'tbr': float_or_none(fmt.get('averageBitrate') or fmt.get('bitrate'), 1000) if itag != '43' else None, + 'width': int_or_none(fmt.get('width')), + } + + for fmt in streaming_formats: + if fmt.get('drm_families'): + continue + url = url_or_none(fmt.get('url')) + + if not url: + cipher = fmt.get('cipher') + if not cipher: + continue + url_data = compat_parse_qs(cipher) + url = url_or_none(try_get(url_data, lambda x: x['url'][0], compat_str)) + if not url: + continue + else: + cipher = None + url_data = compat_parse_qs(compat_urllib_parse_urlparse(url).query) + stream_type = int_or_none(try_get(url_data, lambda x: x['stream_type'][0])) # Unsupported FORMAT_STREAM_TYPE_OTF if stream_type == 3: continue - format_id = url_data['itag'][0] - url = url_data['url'][0] - if 's' in url_data or self._downloader.params.get('youtube_include_dash_manifest', True): - ASSETS_RE = r'"assets":.+?"js":\s*("[^"]+")' - jsplayer_url_json = self._search_regex( - ASSETS_RE, - embed_webpage if age_gate else video_webpage, - 'JS player URL (1)', default=None) - if not jsplayer_url_json and not age_gate: - # We need the embed website after all - if embed_webpage is None: - embed_url = proto + '://www.youtube.com/embed/%s' % video_id - embed_webpage = self._download_webpage( - embed_url, video_id, 'Downloading embed webpage') + format_id = fmt.get('itag') or url_data['itag'][0] + if not format_id: + continue + format_id = compat_str(format_id) + + if cipher: + if 's' in url_data or self._downloader.params.get('youtube_include_dash_manifest', True): + ASSETS_RE = r'"assets":.+?"js":\s*("[^"]+")' jsplayer_url_json = self._search_regex( - ASSETS_RE, embed_webpage, 'JS player URL') + ASSETS_RE, + embed_webpage if age_gate else video_webpage, + 'JS player URL (1)', default=None) + if not jsplayer_url_json and not age_gate: + # We need the embed website after all + if embed_webpage is None: + embed_url = proto + '://www.youtube.com/embed/%s' % video_id + embed_webpage = self._download_webpage( + embed_url, video_id, 'Downloading embed webpage') + jsplayer_url_json = self._search_regex( + ASSETS_RE, embed_webpage, 'JS player URL') - player_url = json.loads(jsplayer_url_json) - if player_url is None: - player_url_json = self._search_regex( - r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")', - video_webpage, 'age gate player URL') - player_url = json.loads(player_url_json) - - if 'sig' in url_data: - url += '&signature=' + url_data['sig'][0] - elif 's' in url_data: - encrypted_sig = url_data['s'][0] - - if self._downloader.params.get('verbose'): + player_url = json.loads(jsplayer_url_json) if player_url is None: - player_version = 'unknown' - player_desc = 'unknown' - else: - if player_url.endswith('swf'): - player_version = self._search_regex( - r'-(.+?)(?:/watch_as3)?\.swf$', player_url, - 'flash player', fatal=False) - player_desc = 'flash player %s' % player_version + player_url_json = self._search_regex( + r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")', + video_webpage, 'age gate player URL') + player_url = json.loads(player_url_json) + + if 'sig' in url_data: + url += '&signature=' + url_data['sig'][0] + elif 's' in url_data: + encrypted_sig = url_data['s'][0] + + if self._downloader.params.get('verbose'): + if player_url is None: + player_version = 'unknown' + player_desc = 'unknown' else: - player_version = self._search_regex( - [r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js', - r'(?:www|player(?:_ias)?)-([^/]+)(?:/[a-z]{2,3}_[A-Z]{2})?/base\.js'], - player_url, - 'html5 player', fatal=False) - player_desc = 'html5 player %s' % player_version + if player_url.endswith('swf'): + player_version = self._search_regex( + r'-(.+?)(?:/watch_as3)?\.swf$', player_url, + 'flash player', fatal=False) + player_desc = 'flash player %s' % player_version + else: + player_version = self._search_regex( + [r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js', + r'(?:www|player(?:_ias)?)-([^/]+)(?:/[a-z]{2,3}_[A-Z]{2})?/base\.js'], + player_url, + 'html5 player', fatal=False) + player_desc = 'html5 player %s' % player_version - parts_sizes = self._signature_cache_id(encrypted_sig) - self.to_screen('{%s} signature length %s, %s' % - (format_id, parts_sizes, player_desc)) + parts_sizes = self._signature_cache_id(encrypted_sig) + self.to_screen('{%s} signature length %s, %s' % + (format_id, parts_sizes, player_desc)) - signature = self._decrypt_signature( - encrypted_sig, video_id, player_url, age_gate) - sp = try_get(url_data, lambda x: x['sp'][0], compat_str) or 'signature' - url += '&%s=%s' % (sp, signature) + signature = self._decrypt_signature( + encrypted_sig, video_id, player_url, age_gate) + sp = try_get(url_data, lambda x: x['sp'][0], compat_str) or 'signature' + url += '&%s=%s' % (sp, signature) if 'ratebypass' not in url: url += '&ratebypass=yes' @@ -2038,24 +2066,33 @@ class YoutubeIE(YoutubeBaseInfoExtractor): mobj = re.search(r'^(?P<width>\d+)[xX](?P<height>\d+)$', url_data.get('size', [''])[0]) width, height = (int(mobj.group('width')), int(mobj.group('height'))) if mobj else (None, None) + if width is None: + width = int_or_none(fmt.get('width')) + if height is None: + height = int_or_none(fmt.get('height')) + filesize = int_or_none(url_data.get( 'clen', [None])[0]) or _extract_filesize(url) - quality = url_data.get('quality', [None])[0] + quality = url_data.get('quality', [None])[0] or fmt.get('quality') + quality_label = url_data.get('quality_label', [None])[0] or fmt.get('qualityLabel') + + tbr = (float_or_none(url_data.get('bitrate', [None])[0], 1000) + or float_or_none(fmt.get('bitrate'), 1000)) if format_id != '43' else None + fps = int_or_none(url_data.get('fps', [None])[0]) or int_or_none(fmt.get('fps')) more_fields = { 'filesize': filesize, - 'tbr': float_or_none(url_data.get('bitrate', [None])[0], 1000), + 'tbr': tbr, 'width': width, 'height': height, - 'fps': int_or_none(url_data.get('fps', [None])[0]), - 'format_note': url_data.get('quality_label', [None])[0] or quality, - 'quality': q(quality), + 'fps': fps, + 'format_note': quality_label or quality, } for key, value in more_fields.items(): if value: dct[key] = value - type_ = url_data.get('type', [None])[0] + type_ = url_data.get('type', [None])[0] or fmt.get('mimeType') if type_: type_split = type_.split(';') kind_ext = type_split[0].split('/') @@ -2432,7 +2469,7 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): (?:\w+\.)? (?: (?: - youtube\.com| + youtube(?:kids)?\.com| invidio\.us ) / @@ -2444,7 +2481,7 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): youtu\.be/[0-9A-Za-z_-]{11}\?.*?\blist= ) ( - (?:PL|LL|EC|UU|FL|RD|UL|TL|OLAK5uy_)?[0-9A-Za-z-_]{10,} + (?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)?[0-9A-Za-z-_]{10,} # Top tracks, they can also include dots |(?:MC)[\w\.]* ) @@ -2614,6 +2651,9 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): }, { 'url': 'https://invidio.us/playlist?list=PLDIoUOhQQPlXr63I_vwF9GD8sAKh77dWU', 'only_matching': True, + }, { + 'url': 'https://www.youtubekids.com/watch?v=Agk7R8I8o5U&list=PUZ6jURNr1WQZCNHF0ao-c0g', + 'only_matching': True, }] def _real_initialize(self): @@ -2709,7 +2749,7 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): page, 'title', default=None) _UPLOADER_BASE = r'class=["\']pl-header-details[^>]+>\s*<li>\s*<a[^>]+\bhref=' - uploader = self._search_regex( + uploader = self._html_search_regex( r'%s["\']/(?:user|channel)/[^>]+>([^<]+)' % _UPLOADER_BASE, page, 'uploader', default=None) mobj = re.search( @@ -2784,7 +2824,7 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor): IE_DESC = 'YouTube.com channels' - _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com|(?:www\.)?invidio\.us)/channel/(?P<id>[0-9A-Za-z_-]+)' + _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie|kids)?\.com|(?:www\.)?invidio\.us)/channel/(?P<id>[0-9A-Za-z_-]+)' _TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos' _VIDEO_RE = r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?' IE_NAME = 'youtube:channel' @@ -2812,6 +2852,9 @@ class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor): }, { 'url': 'https://invidio.us/channel/UC23qupoDRn9YOAVzeoxjOQA', 'only_matching': True, + }, { + 'url': 'https://www.youtubekids.com/channel/UCyu8StPfZWapR6rfW_JgqcA', + 'only_matching': True, }] @classmethod diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py index afa3f6c47..145c123a4 100644 --- a/youtube_dl/extractor/zdf.py +++ b/youtube_dl/extractor/zdf.py @@ -41,6 +41,7 @@ class ZDFBaseIE(InfoExtractor): class ZDFIE(ZDFBaseIE): _VALID_URL = r'https?://www\.zdf\.de/(?:[^/]+/)*(?P<id>[^/?]+)\.html' _QUALITIES = ('auto', 'low', 'med', 'high', 'veryhigh') + _GEO_COUNTRIES = ['DE'] _TESTS = [{ 'url': 'https://www.zdf.de/dokumentation/terra-x/die-magie-der-farben-von-koenigspurpur-und-jeansblau-100.html', diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py index 70416c25e..fd3f921a8 100644 --- a/youtube_dl/postprocessor/ffmpeg.py +++ b/youtube_dl/postprocessor/ffmpeg.py @@ -393,7 +393,7 @@ class FFmpegEmbedSubtitlePP(FFmpegPostProcessor): sub_ext = sub_info['ext'] if ext != 'webm' or ext == 'webm' and sub_ext == 'vtt': sub_langs.append(lang) - sub_filenames.append(subtitles_filename(filename, lang, sub_ext)) + sub_filenames.append(subtitles_filename(filename, lang, sub_ext, ext)) else: if not webm_vtt_warn and ext == 'webm' and sub_ext != 'vtt': webm_vtt_warn = True @@ -606,9 +606,9 @@ class FFmpegSubtitlesConvertorPP(FFmpegPostProcessor): self._downloader.to_screen( '[ffmpeg] Subtitle file for %s is already in the requested format' % new_ext) continue - old_file = subtitles_filename(filename, lang, ext) + old_file = subtitles_filename(filename, lang, ext, info.get('ext')) sub_filenames.append(old_file) - new_file = subtitles_filename(filename, lang, new_ext) + new_file = subtitles_filename(filename, lang, new_ext, info.get('ext')) if ext in ('dfxp', 'ttml', 'tt'): self._downloader.report_warning( @@ -616,7 +616,7 @@ class FFmpegSubtitlesConvertorPP(FFmpegPostProcessor): 'which results in style information loss') dfxp_file = old_file - srt_file = subtitles_filename(filename, lang, 'srt') + srt_file = subtitles_filename(filename, lang, 'srt', info.get('ext')) with open(dfxp_file, 'rb') as f: srt_data = dfxp2srt(f.read()) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 798757241..f6204692a 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -46,6 +46,7 @@ from .compat import ( compat_html_entities, compat_html_entities_html5, compat_http_client, + compat_integer_types, compat_kwargs, compat_os_name, compat_parse_qs, @@ -1718,13 +1719,16 @@ DATE_FORMATS = ( '%B %d %Y', '%B %dst %Y', '%B %dnd %Y', + '%B %drd %Y', '%B %dth %Y', '%b %d %Y', '%b %dst %Y', '%b %dnd %Y', + '%b %drd %Y', '%b %dth %Y', '%b %dst %Y %I:%M', '%b %dnd %Y %I:%M', + '%b %drd %Y %I:%M', '%b %dth %Y %I:%M', '%Y %m %d', '%Y-%m-%d', @@ -2906,8 +2910,8 @@ def determine_ext(url, default_ext='unknown_video'): return default_ext -def subtitles_filename(filename, sub_lang, sub_format): - return filename.rsplit('.', 1)[0] + '.' + sub_lang + '.' + sub_format +def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None): + return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext) def date_from_str(date_str): @@ -3516,10 +3520,11 @@ def str_or_none(v, default=None): def str_to_int(int_str): """ A more relaxed version of int_or_none """ - if int_str is None: - return None - int_str = re.sub(r'[,\.\+]', '', int_str) - return int(int_str) + if isinstance(int_str, compat_integer_types): + return int_str + elif isinstance(int_str, compat_str): + int_str = re.sub(r'[,\.\+]', '', int_str) + return int_or_none(int_str) def float_or_none(v, scale=1, invscale=1, default=None): @@ -4979,7 +4984,7 @@ class ISO3166Utils(object): class GeoUtils(object): # Major IPv4 address blocks per country _country_ip_map = { - 'AD': '85.94.160.0/19', + 'AD': '46.172.224.0/19', 'AE': '94.200.0.0/13', 'AF': '149.54.0.0/17', 'AG': '209.59.64.0/18', @@ -4987,28 +4992,30 @@ class GeoUtils(object): 'AL': '46.99.0.0/16', 'AM': '46.70.0.0/15', 'AO': '105.168.0.0/13', - 'AP': '159.117.192.0/21', + 'AP': '182.50.184.0/21', + 'AQ': '23.154.160.0/24', 'AR': '181.0.0.0/12', 'AS': '202.70.112.0/20', - 'AT': '84.112.0.0/13', + 'AT': '77.116.0.0/14', 'AU': '1.128.0.0/11', 'AW': '181.41.0.0/18', - 'AZ': '5.191.0.0/16', + 'AX': '185.217.4.0/22', + 'AZ': '5.197.0.0/16', 'BA': '31.176.128.0/17', 'BB': '65.48.128.0/17', 'BD': '114.130.0.0/16', 'BE': '57.0.0.0/8', - 'BF': '129.45.128.0/17', + 'BF': '102.178.0.0/15', 'BG': '95.42.0.0/15', 'BH': '37.131.0.0/17', 'BI': '154.117.192.0/18', 'BJ': '137.255.0.0/16', - 'BL': '192.131.134.0/24', + 'BL': '185.212.72.0/23', 'BM': '196.12.64.0/18', 'BN': '156.31.0.0/16', 'BO': '161.56.0.0/16', 'BQ': '161.0.80.0/20', - 'BR': '152.240.0.0/12', + 'BR': '191.128.0.0/12', 'BS': '24.51.64.0/18', 'BT': '119.2.96.0/19', 'BW': '168.167.0.0/16', @@ -5016,20 +5023,20 @@ class GeoUtils(object): 'BZ': '179.42.192.0/18', 'CA': '99.224.0.0/11', 'CD': '41.243.0.0/16', - 'CF': '196.32.200.0/21', - 'CG': '197.214.128.0/17', + 'CF': '197.242.176.0/21', + 'CG': '160.113.0.0/16', 'CH': '85.0.0.0/13', - 'CI': '154.232.0.0/14', + 'CI': '102.136.0.0/14', 'CK': '202.65.32.0/19', 'CL': '152.172.0.0/14', - 'CM': '165.210.0.0/15', + 'CM': '102.244.0.0/14', 'CN': '36.128.0.0/10', 'CO': '181.240.0.0/12', 'CR': '201.192.0.0/12', 'CU': '152.206.0.0/15', 'CV': '165.90.96.0/19', 'CW': '190.88.128.0/17', - 'CY': '46.198.0.0/15', + 'CY': '31.153.0.0/16', 'CZ': '88.100.0.0/14', 'DE': '53.0.0.0/8', 'DJ': '197.241.0.0/17', @@ -5046,6 +5053,7 @@ class GeoUtils(object): 'EU': '2.16.0.0/13', 'FI': '91.152.0.0/13', 'FJ': '144.120.0.0/16', + 'FK': '80.73.208.0/21', 'FM': '119.252.112.0/20', 'FO': '88.85.32.0/19', 'FR': '90.0.0.0/9', @@ -5055,8 +5063,8 @@ class GeoUtils(object): 'GE': '31.146.0.0/16', 'GF': '161.22.64.0/18', 'GG': '62.68.160.0/19', - 'GH': '45.208.0.0/14', - 'GI': '85.115.128.0/19', + 'GH': '154.160.0.0/12', + 'GI': '95.164.0.0/16', 'GL': '88.83.0.0/19', 'GM': '160.182.0.0/15', 'GN': '197.149.192.0/18', @@ -5085,13 +5093,13 @@ class GeoUtils(object): 'JE': '87.244.64.0/18', 'JM': '72.27.0.0/17', 'JO': '176.29.0.0/16', - 'JP': '126.0.0.0/8', + 'JP': '133.0.0.0/8', 'KE': '105.48.0.0/12', 'KG': '158.181.128.0/17', 'KH': '36.37.128.0/17', 'KI': '103.25.140.0/22', 'KM': '197.255.224.0/20', - 'KN': '198.32.32.0/19', + 'KN': '198.167.192.0/19', 'KP': '175.45.176.0/22', 'KR': '175.192.0.0/10', 'KW': '37.36.0.0/14', @@ -5099,10 +5107,10 @@ class GeoUtils(object): 'KZ': '2.72.0.0/13', 'LA': '115.84.64.0/18', 'LB': '178.135.0.0/16', - 'LC': '192.147.231.0/24', + 'LC': '24.92.144.0/20', 'LI': '82.117.0.0/19', 'LK': '112.134.0.0/15', - 'LR': '41.86.0.0/19', + 'LR': '102.183.0.0/16', 'LS': '129.232.0.0/17', 'LT': '78.56.0.0/13', 'LU': '188.42.0.0/16', @@ -5127,7 +5135,7 @@ class GeoUtils(object): 'MT': '46.11.0.0/16', 'MU': '105.16.0.0/12', 'MV': '27.114.128.0/18', - 'MW': '105.234.0.0/16', + 'MW': '102.70.0.0/15', 'MX': '187.192.0.0/11', 'MY': '175.136.0.0/13', 'MZ': '197.218.0.0/15', @@ -5158,23 +5166,23 @@ class GeoUtils(object): 'PW': '202.124.224.0/20', 'PY': '181.120.0.0/14', 'QA': '37.210.0.0/15', - 'RE': '139.26.0.0/16', + 'RE': '102.35.0.0/16', 'RO': '79.112.0.0/13', - 'RS': '178.220.0.0/14', + 'RS': '93.86.0.0/15', 'RU': '5.136.0.0/13', - 'RW': '105.178.0.0/15', + 'RW': '41.186.0.0/16', 'SA': '188.48.0.0/13', 'SB': '202.1.160.0/19', 'SC': '154.192.0.0/11', - 'SD': '154.96.0.0/13', + 'SD': '102.120.0.0/13', 'SE': '78.64.0.0/12', - 'SG': '152.56.0.0/14', + 'SG': '8.128.0.0/10', 'SI': '188.196.0.0/14', 'SK': '78.98.0.0/15', - 'SL': '197.215.0.0/17', + 'SL': '102.143.0.0/17', 'SM': '89.186.32.0/19', 'SN': '41.82.0.0/15', - 'SO': '197.220.64.0/19', + 'SO': '154.115.192.0/18', 'SR': '186.179.128.0/17', 'SS': '105.235.208.0/21', 'ST': '197.159.160.0/19', @@ -5197,15 +5205,15 @@ class GeoUtils(object): 'TV': '202.2.96.0/19', 'TW': '120.96.0.0/11', 'TZ': '156.156.0.0/14', - 'UA': '93.72.0.0/13', - 'UG': '154.224.0.0/13', - 'US': '3.0.0.0/8', + 'UA': '37.52.0.0/14', + 'UG': '102.80.0.0/13', + 'US': '6.0.0.0/8', 'UY': '167.56.0.0/13', - 'UZ': '82.215.64.0/18', + 'UZ': '84.54.64.0/18', 'VA': '212.77.0.0/19', - 'VC': '24.92.144.0/20', + 'VC': '207.191.240.0/21', 'VE': '186.88.0.0/13', - 'VG': '172.103.64.0/18', + 'VG': '66.81.192.0/20', 'VI': '146.226.0.0/16', 'VN': '14.160.0.0/11', 'VU': '202.80.32.0/20', @@ -5214,8 +5222,8 @@ class GeoUtils(object): 'YE': '134.35.0.0/16', 'YT': '41.242.116.0/22', 'ZA': '41.0.0.0/11', - 'ZM': '165.56.0.0/13', - 'ZW': '41.85.192.0/19', + 'ZM': '102.144.0.0/13', + 'ZW': '102.177.192.0/18', } @classmethod @@ -5377,6 +5385,19 @@ def decode_packed_codes(code): obfucasted_code) +def caesar(s, alphabet, shift): + if shift == 0: + return s + l = len(alphabet) + return ''.join( + alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c + for c in s) + + +def rot47(s): + return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47) + + def parse_m3u8_attributes(attrib): info = {} for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib): diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 98fa32286..8ad2df674 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2019.09.01' +__version__ = '2020.01.01'