diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index b2bfa9ec5..20433e915 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.05.09*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.05.09** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.08.04*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.08.04** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2018.05.09 +[debug] youtube-dl version 2018.08.04 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.gitignore b/.gitignore index fbf7cecb2..c4870a6ba 100644 --- a/.gitignore +++ b/.gitignore @@ -47,3 +47,7 @@ youtube-dl.zsh *.iml tmp/ +venv/ + +# VS Code related files +.vscode diff --git a/AUTHORS b/AUTHORS index eaf96d79d..b507cb8df 100644 --- a/AUTHORS +++ b/AUTHORS @@ -239,3 +239,10 @@ Martin Weinelt Surya Oktafendri TingPing Alexandre Macabies +Bastian de Groot +Niklas Haas +András Veres-Szentkirályi +Enes Solak +Nathan Rossi +Thomas van der Berg +Luca Cherubin diff --git a/ChangeLog b/ChangeLog index ef6cc3850..665503827 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,283 @@ +version 2018.08.04 + +Extractors +* [funk:channel] Improve byChannelAlias extraction (#17142) +* [twitch] Fix authentication (#17024, #17126) +* [twitch:vod] Improve URL regular expression (#17135) +* [watchbox] Fix extraction (#17107) +* [pbs] Fix extraction (#17109) +* [theplatform] Relax URL regular expression (#16181, #17097) ++ [viqeo] Add support for viqeo.tv (#17066) + + +version 2018.07.29 + +Extractors +* [crunchyroll:playlist] Restrict URL regular expression (#17069, #17076) ++ [pornhub] Add support for subtitles (#16924, #17088) +* [ceskatelevize] Use https for API call (#16997, #16999) +* [dailymotion:playlist] Fix extraction (#16894) +* [ted] Improve extraction +* [ted] Fix extraction for videos without nativeDownloads (#16756, #17085) +* [telecinco] Fix extraction (#17080) +* [mitele] Reduce number of requests +* [rai] Return non HTTP relinker URL intact (#17055) +* [vk] Fix extraction for inline only videos (#16923) +* [streamcloud] Fix extraction (#17054) +* [facebook] Fix tahoe player extraction with authentication (#16655) ++ [puhutv] Add support for puhutv.com (#12712, #16010, #16269) + + +version 2018.07.21 + +Core ++ [utils] Introduce url_or_none +* [utils] 
Allow JSONP without function name (#17028) ++ [extractor/common] Extract DASH and MSS formats from SMIL manifests + +Extractors ++ [bbc] Add support for BBC Radio Play pages (#17022) +* [iwara] Fix download URLs (#17026) +* [vrtnu] Relax title extraction and extract JSON-LD (#17018) ++ [viu] Pass Referer and Origin headers and area id (#16992) ++ [vimeo] Add another config regular expression (#17013) ++ [facebook] Extract view count (#16942) +* [dailymotion] Improve description extraction (#16984) +* [slutload] Fix and improve extraction (#17001) +* [mediaset] Fix extraction (#16977) ++ [theplatform] Add support for theplatform TLD customization (#16977) +* [imgur] Relax URL regular expression (#16987) +* [pornhub] Improve extraction and extract all formats (#12166, #15891, #16262, + #16959) + + +version 2018.07.10 + +Core +* [utils] Share JSON-LD regular expression +* [downloader/dash] Improve error handling (#16927) + +Extractors ++ [nrktv] Add support for new season and serie URL schema ++ [nrktv] Add support for new episode URL schema (#16909) ++ [frontendmasters] Add support for frontendmasters.com (#3661, #16328) +* [funk] Fix extraction (#16918) +* [watchbox] Fix extraction (#16904) +* [dplayit] Sort formats +* [dplayit] Fix extraction (#16901) +* [youtube] Improve login error handling (#13822) + + +version 2018.07.04 + +Core +* [extractor/common] Properly escape % in MPD templates (#16867) +* [extractor/common] Use source URL as Referer for HTML5 entries (16849) +* Prefer ffmpeg over avconv by default (#8622) + +Extractors +* [pluralsight] Switch to graphql (#16889, #16895, #16896, #16899) +* [lynda] Simplify login and improve error capturing (#16891) ++ [go90] Add support for embed URLs (#16873) +* [go90] Detect geo restriction error and pass geo verification headers + (#16874) +* [vlive] Fix live streams extraction (#16871) +* [npo] Fix typo (#16872) ++ [mediaset] Add support for new videos and extract all formats (#16568) +* [dctptv] Restore extraction based on REST API (#16850) +* [svt] Improve extraction and add support for pages (#16802) +* [porncom] Fix extraction (#16808) + + +version 2018.06.25 + +Extractors +* [joj] Relax URL regular expression (#16771) +* [brightcove] Workaround sonyliv DRM protected videos (#16807) +* [motherless] Fix extraction (#16786) +* [itv] Make SOAP request non fatal and extract metadata from webpage (#16780) +- [foxnews:insider] Remove extractor (#15810) ++ [foxnews] Add support for iframe embeds (#15810, #16711) + + +version 2018.06.19 + +Core ++ [extractor/common] Introduce expected_status in _download_* methods + for convenient accept of HTTP requests failed with non 2xx status codes ++ [compat] Introduce compat_integer_types + +Extractors +* [peertube] Improve generic support (#16733) ++ [6play] Use geo verification headers +* [rtbf] Fix extraction for python 3.2 +* [vgtv] Improve HLS formats extraction ++ [vgtv] Add support for www.aftonbladet.se/tv URLs +* [bbccouk] Use expected_status +* [markiza] Expect 500 HTTP status code +* [tvnow] Try all clear manifest URLs (#15361) + + +version 2018.06.18 + +Core +* [downloader/rtmp] Fix downloading in verbose mode (#16736) + +Extractors ++ [markiza] Add support for markiza.sk (#16750) +* [wat] Try all supported adaptive URLs ++ [6play] Add support for rtlplay.be and extract hd usp formats ++ [rtbf] Add support for audio and live streams (#9638, #11923) ++ [rtbf] Extract HLS, DASH and all HTTP formats ++ [rtbf] Extract subtitles ++ [rtbf] Fixup specific HTTP URLs (#16101) ++ [expressen] Add support 
for expressen.se +* [vidzi] Fix extraction (#16678) +* [pbs] Improve extraction (#16623, #16684) +* [bilibili] Restrict cid regular expression (#16638, #16734) + + +version 2018.06.14 + +Core +* [downloader/http] Fix retry on error when streaming to stdout (#16699) + +Extractors ++ [discoverynetworks] Add support for disco-api videos (#16724) ++ [dailymotion] Add support for password protected videos (#9789) ++ [abc:iview] Add support for livestreams (#12354) +* [abc:iview] Fix extraction (#16704) ++ [crackle] Add support for sonycrackle.com (#16698) ++ [tvnet] Add support for tvnet.gov.vn (#15462) +* [nrk] Update API hosts and try all previously known ones (#16690) +* [wimp] Fix Youtube embeds extraction + + +version 2018.06.11 + +Extractors +* [npo] Extend URL regular expression and add support for npostart.nl (#16682) ++ [inc] Add support for another embed schema (#16666) +* [tv4] Fix format extraction (#16650) ++ [nexx] Add support for free cdn (#16538) ++ [pbs] Add another cove id pattern (#15373) ++ [rbmaradio] Add support for 192k format (#16631) + + +version 2018.06.04 + +Extractors ++ [camtube] Add support for camtube.co ++ [twitter:card] Extract guest token (#16609) ++ [chaturbate] Use geo verification headers ++ [bbc] Add support for bbcthree (#16612) +* [youtube] Move metadata extraction after video availability check ++ [youtube] Extract track and artist ++ [safari] Add support for new URL schema (#16614) +* [adn] Fix extraction + + +version 2018.06.02 + +Core +* [utils] Improve determine_ext + +Extractors ++ [facebook] Add support for tahoe player videos (#15441, #16554) +* [cbc] Improve extraction (#16583, #16593) +* [openload] Improve ext extraction (#16595) ++ [twitter:card] Add support for another endpoint (#16586) ++ [openload] Add support for oload.win and oload.download (#16592) +* [audimedia] Fix extraction (#15309) ++ [francetv] Add support for sport.francetvinfo.fr (#15645) +* [mlb] Improve extraction (#16587) +- [nhl] Remove old extractors +* [rbmaradio] Check formats availability (#16585) + + +version 2018.05.30 + +Core +* [downloader/rtmp] Generalize download messages and report time elapsed + on finish +* [downloader/rtmp] Gracefully handle live streams interrupted by user + +Extractors +* [teamcoco] Fix extraction for full episodes (#16573) +* [spiegel] Fix info extraction (#16538) ++ [apa] Add support for apa.at (#15041, #15672) ++ [bellmedia] Add support for bnnbloomberg.ca (#16560) ++ [9c9media] Extract MPD formats and subtitles +* [cammodels] Use geo verification headers ++ [ufctv] Add support for authentication (#16542) ++ [cammodels] Add support for cammodels.com (#14499) +* [utils] Fix style id extraction for namespaced id attribute in dfxp2srt + (#16551) +* [soundcloud] Detect format extension (#16549) +* [cbc] Fix playlist title extraction (#16502) ++ [tumblr] Detect and report sensitive media (#13829) ++ [tumblr] Add support for authentication (#15133) + + +version 2018.05.26 + +Core +* [utils] Improve parse_age_limit + +Extractors +* [audiomack] Stringify video id (#15310) +* [izlesene] Fix extraction (#16233, #16271, #16407) ++ [indavideo] Add support for generic embeds (#11989) +* [indavideo] Fix extraction (#11221) +* [indavideo] Sign download URLs (#16174) ++ [peertube] Add support for PeerTube based sites (#16301, #16329) +* [imgur] Fix extraction (#16537) ++ [hidive] Add support for authentication (#16534) ++ [nbc] Add support for stream.nbcsports.com (#13911) ++ [viewlift] Add support for hoichoi.tv (#16536) +* [go90] Extract age limit and 
detect DRM protection(#10127) +* [viewlift] fix extraction for snagfilms.com (#15766) +* [globo] Improve extraction (#4189) + * Add support for authentication + * Simplify URL signing + * Extract DASH and MSS formats +* [leeco] Fix extraction (#16464) +* [teamcoco] Add fallback for format extraction (#16484) +* [teamcoco] Improve URL regular expression (#16484) +* [imdb] Improve extraction (#4085, #14557) + + +version 2018.05.18 + +Extractors +* [vimeo:likes] Relax URL regular expression and fix single page likes + extraction (#16475) +* [pluralsight] Fix clip id extraction (#16460) ++ [mychannels] Add support for mychannels.com (#15334) +- [moniker] Remove extractor (#15336) +* [pbs] Fix embed data extraction (#16474) ++ [mtv] Add support for paramountnetwork.com and bellator.com (#15418) +* [youtube] Fix hd720 format position +* [dailymotion] Remove fragment part from m3u8 URLs (#8915) +* [3sat] Improve extraction (#15350) + * Extract all formats + * Extract more format metadata + * Improve format sorting + * Use hls native downloader + * Detect and bypass geo-restriction ++ [dtube] Add support for d.tube (#15201) +* [options] Fix typo (#16450) +* [youtube] Improve format filesize extraction (#16453) +* [youtube] Make uploader extraction non fatal (#16444) +* [youtube] Fix extraction for embed restricted live streams (#16433) +* [nbc] Improve info extraction (#16440) +* [twitch:clips] Fix extraction (#16429) +* [redditr] Relax URL regular expression (#16426, #16427) +* [mixcloud] Bypass throttling for HTTP formats (#12579, #16424) ++ [nick] Add support for nickjr.de (#13230) +* [teamcoco] Fix extraction (#16374) + + version 2018.05.09 Core diff --git a/README.md b/README.md index dbdea055f..e8ddd55b5 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ Forked version that could work on android # INSTALLATION -To install it right away for all UNIX users (Linux, OS X, etc.), type: +To install it right away for all UNIX users (Linux, macOS, etc.), type: sudo curl -L https://yt-dl.org/downloads/latest/youtube-dl -o /usr/local/bin/youtube-dl sudo chmod a+rx /usr/local/bin/youtube-dl @@ -36,7 +36,7 @@ You can also use pip: This command will update youtube-dl if you have already installed it. See the [pypi page](https://pypi.python.org/pypi/youtube_dl) for more information. -OS X users can install youtube-dl with [Homebrew](https://brew.sh/): +macOS users can install youtube-dl with [Homebrew](https://brew.sh/): brew install youtube-dl @@ -94,8 +94,8 @@ Alternatively, refer to the [developer instructions](#developer-instructions) fo ## Network Options: --proxy URL Use the specified HTTP/HTTPS/SOCKS proxy. - To enable experimental SOCKS proxy, specify - a proper scheme. For example + To enable SOCKS proxy, specify a proper + scheme. For example socks5://127.0.0.1:1080/. Pass in an empty string (--proxy "") for direct connection --socket-timeout SECONDS Time to wait before giving up, in seconds @@ -107,19 +107,18 @@ Alternatively, refer to the [developer instructions](#developer-instructions) fo --geo-verification-proxy URL Use this proxy to verify the IP address for some geo-restricted sites. The default proxy specified by --proxy (or none, if the - options is not present) is used for the + option is not present) is used for the actual downloading. 
--geo-bypass Bypass geographic restriction via faking - X-Forwarded-For HTTP header (experimental) + X-Forwarded-For HTTP header --no-geo-bypass Do not bypass geographic restriction via faking X-Forwarded-For HTTP header - (experimental) --geo-bypass-country CODE Force bypass geographic restriction with explicitly provided two-letter ISO 3166-2 - country code (experimental) + country code --geo-bypass-ip-block IP_BLOCK Force bypass geographic restriction with explicitly provided IP block in CIDR - notation (experimental) + notation ## Video Selection: --playlist-start NUMBER Playlist video to start at (default is 1) @@ -210,7 +209,7 @@ Alternatively, refer to the [developer instructions](#developer-instructions) fo --playlist-reverse Download playlist videos in reverse order --playlist-random Download playlist videos in random order --xattr-set-filesize Set file xattribute ytdl.filesize with - expected file size (experimental) + expected file size --hls-prefer-native Use the native HLS downloader instead of ffmpeg --hls-prefer-ffmpeg Use ffmpeg instead of the native HLS @@ -429,9 +428,9 @@ Alternatively, refer to the [developer instructions](#developer-instructions) fo default; fix file if we can, warn otherwise) --prefer-avconv Prefer avconv over ffmpeg for running the - postprocessors (default) - --prefer-ffmpeg Prefer ffmpeg over avconv for running the postprocessors + --prefer-ffmpeg Prefer ffmpeg over avconv for running the + postprocessors (default) --ffmpeg-location PATH Location of the ffmpeg/avconv binary; either the path to the binary or its containing directory. @@ -444,7 +443,7 @@ Alternatively, refer to the [developer instructions](#developer-instructions) fo # CONFIGURATION -You can configure youtube-dl by placing any supported command line option to a configuration file. On Linux and OS X, the system wide configuration file is located at `/etc/youtube-dl.conf` and the user wide configuration file at `~/.config/youtube-dl/config`. On Windows, the user wide configuration file locations are `%APPDATA%\youtube-dl\config.txt` or `C:\Users\\youtube-dl.conf`. Note that by default configuration file may not exist so you may need to create it yourself. +You can configure youtube-dl by placing any supported command line option to a configuration file. On Linux and macOS, the system wide configuration file is located at `/etc/youtube-dl.conf` and the user wide configuration file at `~/.config/youtube-dl/config`. On Windows, the user wide configuration file locations are `%APPDATA%\youtube-dl\config.txt` or `C:\Users\\youtube-dl.conf`. Note that by default configuration file may not exist so you may need to create it yourself. For example, with the following configuration file youtube-dl will always extract the audio, not copy the mtime, use a proxy and save all videos under `Movies` directory in your home directory: ``` @@ -872,7 +871,7 @@ Either prepend `https://www.youtube.com/watch?v=` or separate the ID from the op Use the `--cookies` option, for example `--cookies /path/to/cookies/file.txt`. -In order to extract cookies from browser use any conforming browser extension for exporting cookies. For example, [cookies.txt](https://chrome.google.com/webstore/detail/cookiestxt/njabckikapfpffapmjgojcnbfjonfjfg) (for Chrome) or [Export Cookies](https://addons.mozilla.org/en-US/firefox/addon/export-cookies/) (for Firefox). +In order to extract cookies from browser use any conforming browser extension for exporting cookies. 
For example, [cookies.txt](https://chrome.google.com/webstore/detail/cookiestxt/njabckikapfpffapmjgojcnbfjonfjfg) (for Chrome) or [cookies.txt](https://addons.mozilla.org/en-US/firefox/addon/cookies-txt/) (for Firefox). Note that the cookies file must be in Mozilla/Netscape format and the first line of the cookies file must be either `# HTTP Cookie File` or `# Netscape HTTP Cookie File`. Make sure you have correct [newline format](https://en.wikipedia.org/wiki/Newline) in the cookies file and convert newlines if necessary to correspond with your OS, namely `CRLF` (`\r\n`) for Windows and `LF` (`\n`) for Unix and Unix-like systems (Linux, macOS, etc.). `HTTP Error 400: Bad Request` when using `--cookies` is a good sign of invalid newline format. diff --git a/devscripts/gh-pages/update-copyright.py b/devscripts/gh-pages/update-copyright.py index e6c3abc8d..61487f925 100755 --- a/devscripts/gh-pages/update-copyright.py +++ b/devscripts/gh-pages/update-copyright.py @@ -13,7 +13,7 @@ year = str(datetime.datetime.now().year) for fn in glob.glob('*.html*'): with io.open(fn, encoding='utf-8') as f: content = f.read() - newc = re.sub(r'(?PCopyright © 2006-)(?P[0-9]{4})', 'Copyright © 2006-' + year, content) + newc = re.sub(r'(?PCopyright © 2011-)(?P[0-9]{4})', 'Copyright © 2011-' + year, content) if content != newc: tmpFn = fn + '.part' with io.open(tmpFn, 'wt', encoding='utf-8') as outf: diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 88fac6e90..4bf2ec81b 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -15,7 +15,6 @@ - **8tracks** - **91porn** - **9c9media** - - **9c9media:stack** - **9gag** - **9now.com.au** - **abc.net.au** @@ -48,6 +47,7 @@ - **anitube.se** - **Anvato** - **AnySex** + - **APA** - **Aparat** - **AppleConnect** - **AppleDaily**: 臺灣蘋果日報 @@ -100,6 +100,7 @@ - **Beatport** - **Beeg** - **BehindKink** + - **Bellator** - **BellMedia** - **Bet** - **Bigflix** @@ -127,6 +128,8 @@ - **BYUtv** - **Camdemy** - **CamdemyFolder** + - **CamModels** + - **CamTube** - **CamWithHer** - **canalc2.tv** - **Canalplus**: mycanal.fr and piwiplus.fr @@ -234,6 +237,7 @@ - **DrTuber** - **drtv** - **drtv:live** + - **DTube** - **Dumpert** - **dvtv**: http://video.aktualne.cz/ - **dw** @@ -262,6 +266,7 @@ - **Europa** - **EveryonesMixtape** - **ExpoTV** + - **Expressen** - **ExtremeTube** - **EyedoTV** - **facebook** @@ -285,7 +290,6 @@ - **Foxgay** - **foxnews**: Fox News and Fox Business Video - **foxnews:article** - - **foxnews:insider** - **FoxSports** - **france2.fr:generation-what** - **FranceCulture** @@ -298,6 +302,9 @@ - **Freesound** - **freespeech.org** - **FreshLive** + - **FrontendMasters** + - **FrontendMastersCourse** + - **FrontendMastersLesson** - **Funimation** - **FunkChannel** - **FunkMix** @@ -363,7 +370,6 @@ - **ImgurAlbum** - **Ina** - **Inc** - - **Indavideo** - **IndavideoEmbed** - **InfoQ** - **Instagram** @@ -448,11 +454,12 @@ - **mailru**: Видео@Mail.Ru - **mailru:music**: Музыка@Mail.Ru - **mailru:music:search**: Музыка@Mail.Ru - - **MakersChannel** - **MakerTV** - **mangomolo:live** - **mangomolo:video** - **ManyVids** + - **Markiza** + - **MarkizaPage** - **massengeschmack.tv** - **MatchTV** - **MDR**: MDR.DE and KiKA @@ -486,7 +493,6 @@ - **MoeVideo**: LetitBit video services: moevideo.net, playreplay.net and videochart.net - **Mofosex** - **Mojvideo** - - **Moniker**: allmyvideos.net and vidspot.net - **Morningstar**: morningstar.com - **Motherless** - **MotherlessGroup** @@ -508,6 +514,7 @@ - **mva:course**: Microsoft Virtual 
Academy courses - **Mwave** - **MwaveMeetGreet** + - **MyChannels** - **MySpace** - **MySpace:album** - **MySpass** @@ -525,6 +532,7 @@ - **nbcolympics** - **nbcolympics:stream** - **NBCSports** + - **NBCSportsStream** - **NBCSportsVPlayer** - **ndr**: NDR.de - Norddeutscher Rundfunk - **ndr:embed** @@ -551,9 +559,6 @@ - **nfl.com** - **NhkVod** - **nhl.com** - - **nhl.com:news**: NHL news - - **nhl.com:videocenter** - - **nhl.com:videocenter:category**: NHL videocenter category - **nick.com** - **nick.de** - **nickelodeon:br** @@ -587,7 +592,9 @@ - **NRKSkole**: NRK Skole - **NRKTV**: NRK TV and NRK Radio - **NRKTVDirekte**: NRK TV Direkte and NRK Radio Direkte + - **NRKTVEpisode** - **NRKTVEpisodes** + - **NRKTVSeason** - **NRKTVSeries** - **ntv.ru** - **Nuvid** @@ -618,11 +625,13 @@ - **PacktPubCourse** - **PandaTV**: 熊猫TV - **pandora.tv**: 판도라TV + - **ParamountNetwork** - **parliamentlive.tv**: UK parliament videos - **Patreon** - **pbs**: Public Broadcasting Service (PBS) and member stations: PBS: Public Broadcasting Service, APT - Alabama Public Television (WBIQ), GPB/Georgia Public Broadcasting (WGTV), Mississippi Public Broadcasting (WMPN), Nashville Public Television (WNPT), WFSU-TV (WFSU), WSRE (WSRE), WTCI (WTCI), WPBA/Channel 30 (WPBA), Alaska Public Media (KAKM), Arizona PBS (KAET), KNME-TV/Channel 5 (KNME), Vegas PBS (KLVX), AETN/ARKANSAS ETV NETWORK (KETS), KET (WKLE), WKNO/Channel 10 (WKNO), LPB/LOUISIANA PUBLIC BROADCASTING (WLPB), OETA (KETA), Ozarks Public Television (KOZK), WSIU Public Broadcasting (WSIU), KEET TV (KEET), KIXE/Channel 9 (KIXE), KPBS San Diego (KPBS), KQED (KQED), KVIE Public Television (KVIE), PBS SoCal/KOCE (KOCE), ValleyPBS (KVPT), CONNECTICUT PUBLIC TELEVISION (WEDH), KNPB Channel 5 (KNPB), SOPTV (KSYS), Rocky Mountain PBS (KRMA), KENW-TV3 (KENW), KUED Channel 7 (KUED), Wyoming PBS (KCWC), Colorado Public Television / KBDI 12 (KBDI), KBYU-TV (KBYU), Thirteen/WNET New York (WNET), WGBH/Channel 2 (WGBH), WGBY (WGBY), NJTV Public Media NJ (WNJT), WLIW21 (WLIW), mpt/Maryland Public Television (WMPB), WETA Television and Radio (WETA), WHYY (WHYY), PBS 39 (WLVT), WVPT - Your Source for PBS and More! 
(WVPT), Howard University Television (WHUT), WEDU PBS (WEDU), WGCU Public Media (WGCU), WPBT2 (WPBT), WUCF TV (WUCF), WUFT/Channel 5 (WUFT), WXEL/Channel 42 (WXEL), WLRN/Channel 17 (WLRN), WUSF Public Broadcasting (WUSF), ETV (WRLK), UNC-TV (WUNC), PBS Hawaii - Oceanic Cable Channel 10 (KHET), Idaho Public Television (KAID), KSPS (KSPS), OPB (KOPB), KWSU/Channel 10 & KTNW/Channel 31 (KWSU), WILL-TV (WILL), Network Knowledge - WSEC/Springfield (WSEC), WTTW11 (WTTW), Iowa Public Television/IPTV (KDIN), Nine Network (KETC), PBS39 Fort Wayne (WFWA), WFYI Indianapolis (WFYI), Milwaukee Public Television (WMVS), WNIN (WNIN), WNIT Public Television (WNIT), WPT (WPNE), WVUT/Channel 22 (WVUT), WEIU/Channel 51 (WEIU), WQPT-TV (WQPT), WYCC PBS Chicago (WYCC), WIPB-TV (WIPB), WTIU (WTIU), CET (WCET), ThinkTVNetwork (WPTD), WBGU-TV (WBGU), WGVU TV (WGVU), NET1 (KUON), Pioneer Public Television (KWCM), SDPB Television (KUSD), TPT (KTCA), KSMQ (KSMQ), KPTS/Channel 8 (KPTS), KTWU/Channel 11 (KTWU), East Tennessee PBS (WSJK), WCTE-TV (WCTE), WLJT, Channel 11 (WLJT), WOSU TV (WOSU), WOUB/WOUC (WOUB), WVPB (WVPB), WKYU-PBS (WKYU), KERA 13 (KERA), MPBN (WCBB), Mountain Lake PBS (WCFE), NHPTV (WENH), Vermont PBS (WETK), witf (WITF), WQED Multimedia (WQED), WMHT Educational Telecommunications (WMHT), Q-TV (WDCQ), WTVS Detroit Public TV (WTVS), CMU Public Television (WCMU), WKAR-TV (WKAR), WNMU-TV Public TV 13 (WNMU), WDSE - WRPT (WDSE), WGTE TV (WGTE), Lakeland Public Television (KAWE), KMOS-TV - Channels 6.1, 6.2 and 6.3 (KMOS), MontanaPBS (KUSM), KRWG/Channel 22 (KRWG), KACV (KACV), KCOS/Channel 13 (KCOS), WCNY/Channel 24 (WCNY), WNED (WNED), WPBS (WPBS), WSKG Public TV (WSKG), WXXI (WXXI), WPSU (WPSU), WVIA Public Media Studios (WVIA), WTVI (WTVI), Western Reserve PBS (WNEO), WVIZ/PBS ideastream (WVIZ), KCTS 9 (KCTS), Basin PBS (KPBT), KUHT / Channel 8 (KUHT), KLRN (KLRN), KLRU (KLRU), WTJX Channel 12 (WTJX), WCVE PBS (WCVE), KBTC Public Television (KBTC) - **pcmag** - **PearVideo** + - **PeerTube** - **People** - **PerformGroup** - **periscope**: Periscope @@ -663,6 +672,8 @@ - **PrimeShareTV** - **PromptFile** - **prosiebensat1**: ProSiebenSat.1 Digital + - **puhutv** + - **puhutv:serie** - **Puls4** - **Pyvideo** - **qqmusic**: QQ音乐 @@ -789,7 +800,7 @@ - **Spiegel** - **Spiegel:Article**: Articles on spiegel.de - **Spiegeltv** - - **Spike** + - **sport.francetvinfo.fr** - **Sport5** - **SportBoxEmbed** - **SportDeutschland** @@ -809,6 +820,7 @@ - **StretchInternet** - **SunPorno** - **SVT** + - **SVTPage** - **SVTPlay**: SVT Play and Öppet arkiv - **SVTSeries** - **SWRMediathek** @@ -891,6 +903,7 @@ - **tvigle**: Интернет-телевидение Tvigle.ru - **tvland.com** - **TVN24** + - **TVNet** - **TVNoe** - **TVNow** - **TVNowList** @@ -988,6 +1001,7 @@ - **Vimple**: Vimple - one-click video hosting - **Vine** - **vine:user** + - **Viqeo** - **Viu** - **viu:ott** - **viu:playlist** diff --git a/setup.cfg b/setup.cfg index 5208f7ae2..af9a554c6 100644 --- a/setup.cfg +++ b/setup.cfg @@ -2,5 +2,5 @@ universal = True [flake8] -exclude = youtube_dl/extractor/__init__.py,devscripts/buildserver.py,devscripts/lazy_load_template.py,devscripts/make_issue_template.py,setup.py,build,.git +exclude = youtube_dl/extractor/__init__.py,devscripts/buildserver.py,devscripts/lazy_load_template.py,devscripts/make_issue_template.py,setup.py,build,.git,venv ignore = E402,E501,E731,E741 diff --git a/test/test_utils.py b/test/test_utils.py index 14503ab53..8da5ccc56 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -78,6 +78,7 
@@ from youtube_dl.utils import ( uppercase_escape, lowercase_escape, url_basename, + url_or_none, base_url, urljoin, urlencode_postdata, @@ -361,6 +362,7 @@ class TestUtil(unittest.TestCase): self.assertEqual(determine_ext('http://example.com/foo/bar.nonext/?download', None), None) self.assertEqual(determine_ext('http://example.com/foo/bar/mp4?download', None), None) self.assertEqual(determine_ext('http://example.com/foo/bar.m3u8//?download'), 'm3u8') + self.assertEqual(determine_ext('foobar', None), None) def test_find_xpath_attr(self): testxml = ''' @@ -506,6 +508,16 @@ class TestUtil(unittest.TestCase): self.assertEqual(urljoin('http://foo.de/', ['foobar']), None) self.assertEqual(urljoin('http://foo.de/a/b/c.txt', '.././../d.txt'), 'http://foo.de/d.txt') + def test_url_or_none(self): + self.assertEqual(url_or_none(None), None) + self.assertEqual(url_or_none(''), None) + self.assertEqual(url_or_none('foo'), None) + self.assertEqual(url_or_none('http://foo.de'), 'http://foo.de') + self.assertEqual(url_or_none('https://foo.de'), 'https://foo.de') + self.assertEqual(url_or_none('http$://foo.de'), None) + self.assertEqual(url_or_none('http://foo.de'), 'http://foo.de') + self.assertEqual(url_or_none('//foo.de'), '//foo.de') + def test_parse_age_limit(self): self.assertEqual(parse_age_limit(None), None) self.assertEqual(parse_age_limit(False), None) @@ -519,6 +531,8 @@ class TestUtil(unittest.TestCase): self.assertEqual(parse_age_limit('PG-13'), 13) self.assertEqual(parse_age_limit('TV-14'), 14) self.assertEqual(parse_age_limit('TV-MA'), 17) + self.assertEqual(parse_age_limit('TV14'), 14) + self.assertEqual(parse_age_limit('TV_G'), 0) def test_parse_duration(self): self.assertEqual(parse_duration(None), None) @@ -714,6 +728,10 @@ class TestUtil(unittest.TestCase): d = json.loads(stripped) self.assertEqual(d, {'status': 'success'}) + stripped = strip_jsonp('({"status": "success"});') + d = json.loads(stripped) + self.assertEqual(d, {'status': 'success'}) + def test_uppercase_escape(self): self.assertEqual(uppercase_escape('aä'), 'aä') self.assertEqual(uppercase_escape('\\U0001d550'), '𝕐') diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index a342e6b81..623c4db01 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -211,7 +211,7 @@ class YoutubeDL(object): At the moment, this is only supported by YouTube. proxy: URL of the proxy server to use geo_verification_proxy: URL of the proxy to use for IP address verification - on geo-restricted sites. (Experimental) + on geo-restricted sites. socket_timeout: Time to wait for unresponsive hosts, in seconds bidi_workaround: Work around buggy terminals without bidirectional text support, using fridibi @@ -259,7 +259,7 @@ class YoutubeDL(object): - "warn": only emit a warning - "detect_or_warn": check whether we can do anything about it, warn otherwise (default) - source_address: (Experimental) Client-side IP address to bind to. + source_address: Client-side IP address to bind to. call_home: Boolean, true iff we are allowed to contact the youtube-dl servers for debugging. sleep_interval: Number of seconds to sleep before each download when @@ -281,14 +281,14 @@ class YoutubeDL(object): match_filter_func in utils.py is one example for this. no_color: Do not emit color codes in output. 
geo_bypass: Bypass geographic restriction via faking X-Forwarded-For - HTTP header (experimental) + HTTP header geo_bypass_country: Two-letter ISO 3166-2 country code that will be used for explicit geographic restriction bypassing via faking - X-Forwarded-For HTTP header (experimental) + X-Forwarded-For HTTP header geo_bypass_ip_block: IP range in CIDR notation that will be used similarly to - geo_bypass_country (experimental) + geo_bypass_country The following options determine which downloader is picked: external_downloader: Executable of the external downloader to call. @@ -305,8 +305,8 @@ class YoutubeDL(object): http_chunk_size. The following options are used by the post processors: - prefer_ffmpeg: If True, use ffmpeg instead of avconv if both are available, - otherwise prefer avconv. + prefer_ffmpeg: If False, use avconv instead of ffmpeg if both are available, + otherwise prefer ffmpeg. postprocessor_args: A list of additional command-line arguments for the postprocessor. diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index 4bb8e8e2e..255758e28 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -2787,6 +2787,12 @@ except NameError: # Python 3 compat_numeric_types = (int, float, complex) +try: + compat_integer_types = (int, long) +except NameError: # Python 3 + compat_integer_types = (int, ) + + if sys.version_info < (2, 7): def compat_socket_create_connection(address, timeout, source_address=None): host, port = address @@ -2974,6 +2980,7 @@ __all__ = [ 'compat_http_client', 'compat_http_server', 'compat_input', + 'compat_integer_types', 'compat_itertools_count', 'compat_kwargs', 'compat_numeric_types', diff --git a/youtube_dl/downloader/common.py b/youtube_dl/downloader/common.py index dc82e9596..6cd0cb1fd 100644 --- a/youtube_dl/downloader/common.py +++ b/youtube_dl/downloader/common.py @@ -45,7 +45,6 @@ class FileDownloader(object): min_filesize: Skip files smaller than this size max_filesize: Skip files larger than this size xattr_set_filesize: Set ytdl.filesize user xattribute with expected size. - (experimental) external_downloader_args: A list of additional command-line arguments for the external downloader. hls_use_mpegts: Use the mpegts container for HLS videos. 
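
The url_or_none() helper that the extractor hunks below switch to is introduced in youtube_dl/utils.py, whose own hunk is not part of this excerpt. A minimal sketch, assuming only the behaviour pinned down by the new test_url_or_none assertions in test/test_utils.py above (absolute and protocol-relative URLs pass, everything else becomes None):

    import re

    from youtube_dl.compat import compat_str


    def url_or_none(url):
        # Sketch consistent with the new tests; the actual utils.py
        # implementation may differ in detail.
        if not url or not isinstance(url, compat_str):
            return None
        url = url.strip()
        # 'http://foo.de' and '//foo.de' pass; 'foo' and 'http$://foo.de'
        # fail, matching the assertions added to test_utils.py.
        return url if re.match(r'^(?:[a-zA-Z][\da-zA-Z.+-]*:)?//', url) else None

Extractors can then collapse manual string checks into one call, e.g. thumbnail_url = url_or_none(thumbnail.get('url')), as the amp.py, aparat.py and bandcamp.py hunks below do.
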
diff --git a/youtube_dl/downloader/dash.py b/youtube_dl/downloader/dash.py index 576ece6db..eaa7adf7c 100644 --- a/youtube_dl/downloader/dash.py +++ b/youtube_dl/downloader/dash.py @@ -2,7 +2,10 @@ from __future__ import unicode_literals from .fragment import FragmentFD from ..compat import compat_urllib_error -from ..utils import urljoin +from ..utils import ( + DownloadError, + urljoin, +) class DashSegmentsFD(FragmentFD): @@ -57,6 +60,14 @@ class DashSegmentsFD(FragmentFD): count += 1 if count <= fragment_retries: self.report_retry_fragment(err, frag_index, count, fragment_retries) + except DownloadError: + # Don't retry fragment if error occurred during HTTP downloading + # itself since it has own retry settings + if not fatal: + self.report_skip_fragment(frag_index) + break + raise + if count > fragment_retries: if not fatal: self.report_skip_fragment(frag_index) diff --git a/youtube_dl/downloader/http.py b/youtube_dl/downloader/http.py index a22875f69..5b1e96013 100644 --- a/youtube_dl/downloader/http.py +++ b/youtube_dl/downloader/http.py @@ -217,10 +217,11 @@ class HttpFD(FileDownloader): before = start # start measuring def retry(e): - if ctx.tmpfilename != '-': + to_stdout = ctx.tmpfilename == '-' + if not to_stdout: ctx.stream.close() ctx.stream = None - ctx.resume_len = os.path.getsize(encodeFilename(ctx.tmpfilename)) + ctx.resume_len = byte_counter if to_stdout else os.path.getsize(encodeFilename(ctx.tmpfilename)) raise RetryDownload(e) while True: diff --git a/youtube_dl/downloader/rtmp.py b/youtube_dl/downloader/rtmp.py index b823b5171..fbb7f51b0 100644 --- a/youtube_dl/downloader/rtmp.py +++ b/youtube_dl/downloader/rtmp.py @@ -29,66 +29,68 @@ class RtmpFD(FileDownloader): proc = subprocess.Popen(args, stderr=subprocess.PIPE) cursor_in_new_line = True proc_stderr_closed = False - while not proc_stderr_closed: - # read line from stderr - line = '' - while True: - char = proc.stderr.read(1) - if not char: - proc_stderr_closed = True - break - if char in [b'\r', b'\n']: - break - line += char.decode('ascii', 'replace') - if not line: - # proc_stderr_closed is True - continue - mobj = re.search(r'([0-9]+\.[0-9]{3}) kB / [0-9]+\.[0-9]{2} sec \(([0-9]{1,2}\.[0-9])%\)', line) - if mobj: - downloaded_data_len = int(float(mobj.group(1)) * 1024) - percent = float(mobj.group(2)) - if not resume_percent: - resume_percent = percent - resume_downloaded_data_len = downloaded_data_len - time_now = time.time() - eta = self.calc_eta(start, time_now, 100 - resume_percent, percent - resume_percent) - speed = self.calc_speed(start, time_now, downloaded_data_len - resume_downloaded_data_len) - data_len = None - if percent > 0: - data_len = int(downloaded_data_len * 100 / percent) - self._hook_progress({ - 'status': 'downloading', - 'downloaded_bytes': downloaded_data_len, - 'total_bytes_estimate': data_len, - 'tmpfilename': tmpfilename, - 'filename': filename, - 'eta': eta, - 'elapsed': time_now - start, - 'speed': speed, - }) - cursor_in_new_line = False - else: - # no percent for live streams - mobj = re.search(r'([0-9]+\.[0-9]{3}) kB / [0-9]+\.[0-9]{2} sec', line) + try: + while not proc_stderr_closed: + # read line from stderr + line = '' + while True: + char = proc.stderr.read(1) + if not char: + proc_stderr_closed = True + break + if char in [b'\r', b'\n']: + break + line += char.decode('ascii', 'replace') + if not line: + # proc_stderr_closed is True + continue + mobj = re.search(r'([0-9]+\.[0-9]{3}) kB / [0-9]+\.[0-9]{2} sec \(([0-9]{1,2}\.[0-9])%\)', line) if mobj: downloaded_data_len = 
int(float(mobj.group(1)) * 1024) + percent = float(mobj.group(2)) + if not resume_percent: + resume_percent = percent + resume_downloaded_data_len = downloaded_data_len time_now = time.time() - speed = self.calc_speed(start, time_now, downloaded_data_len) + eta = self.calc_eta(start, time_now, 100 - resume_percent, percent - resume_percent) + speed = self.calc_speed(start, time_now, downloaded_data_len - resume_downloaded_data_len) + data_len = None + if percent > 0: + data_len = int(downloaded_data_len * 100 / percent) self._hook_progress({ + 'status': 'downloading', 'downloaded_bytes': downloaded_data_len, + 'total_bytes_estimate': data_len, 'tmpfilename': tmpfilename, 'filename': filename, - 'status': 'downloading', + 'eta': eta, 'elapsed': time_now - start, 'speed': speed, }) cursor_in_new_line = False - elif self.params.get('verbose', False): - if not cursor_in_new_line: - self.to_screen('') - cursor_in_new_line = True - self.to_screen('[rtmpdump] ' + line) - proc.wait() + else: + # no percent for live streams + mobj = re.search(r'([0-9]+\.[0-9]{3}) kB / [0-9]+\.[0-9]{2} sec', line) + if mobj: + downloaded_data_len = int(float(mobj.group(1)) * 1024) + time_now = time.time() + speed = self.calc_speed(start, time_now, downloaded_data_len) + self._hook_progress({ + 'downloaded_bytes': downloaded_data_len, + 'tmpfilename': tmpfilename, + 'filename': filename, + 'status': 'downloading', + 'elapsed': time_now - start, + 'speed': speed, + }) + cursor_in_new_line = False + elif self.params.get('verbose', False): + if not cursor_in_new_line: + self.to_screen('') + cursor_in_new_line = True + self.to_screen('[rtmpdump] ' + line) + finally: + proc.wait() if not cursor_in_new_line: self.to_screen('') return proc.returncode @@ -163,7 +165,15 @@ class RtmpFD(FileDownloader): RD_INCOMPLETE = 2 RD_NO_CONNECT = 3 - retval = run_rtmpdump(args) + started = time.time() + + try: + retval = run_rtmpdump(args) + except KeyboardInterrupt: + if not info_dict.get('is_live'): + raise + retval = RD_SUCCESS + self.to_screen('\n[rtmpdump] Interrupted by user') if retval == RD_NO_CONNECT: self.report_error('[rtmpdump] Could not connect to RTMP server.') @@ -171,7 +181,7 @@ class RtmpFD(FileDownloader): while retval in (RD_INCOMPLETE, RD_FAILED) and not test and not live: prevsize = os.path.getsize(encodeFilename(tmpfilename)) - self.to_screen('[rtmpdump] %s bytes' % prevsize) + self.to_screen('[rtmpdump] Downloaded %s bytes' % prevsize) time.sleep(5.0) # This seems to be needed args = basic_args + ['--resume'] if retval == RD_FAILED: @@ -188,13 +198,14 @@ class RtmpFD(FileDownloader): break if retval == RD_SUCCESS or (test and retval == RD_INCOMPLETE): fsize = os.path.getsize(encodeFilename(tmpfilename)) - self.to_screen('[rtmpdump] %s bytes' % fsize) + self.to_screen('[rtmpdump] Downloaded %s bytes' % fsize) self.try_rename(tmpfilename, filename) self._hook_progress({ 'downloaded_bytes': fsize, 'total_bytes': fsize, 'filename': filename, 'status': 'finished', + 'elapsed': time.time() - started, }) return True else: diff --git a/youtube_dl/extractor/abc.py b/youtube_dl/extractor/abc.py index 512f04684..4ac323bf6 100644 --- a/youtube_dl/extractor/abc.py +++ b/youtube_dl/extractor/abc.py @@ -105,22 +105,22 @@ class ABCIE(InfoExtractor): class ABCIViewIE(InfoExtractor): IE_NAME = 'abc.net.au:iview' - _VALID_URL = r'https?://iview\.abc\.net\.au/programs/[^/]+/(?P[^/?#]+)' + _VALID_URL = r'https?://iview\.abc\.net\.au/(?:[^/]+/)*video/(?P[^/?#]+)' _GEO_COUNTRIES = ['AU'] # ABC iview programs are normally available for 
14 days only. _TESTS = [{ - 'url': 'https://iview.abc.net.au/programs/ben-and-hollys-little-kingdom/ZY9247A021S00', + 'url': 'https://iview.abc.net.au/show/ben-and-hollys-little-kingdom/series/0/video/ZX9371A050S00', 'md5': 'cde42d728b3b7c2b32b1b94b4a548afc', 'info_dict': { - 'id': 'ZY9247A021S00', + 'id': 'ZX9371A050S00', 'ext': 'mp4', - 'title': "Gaston's Visit", + 'title': "Gaston's Birthday", 'series': "Ben And Holly's Little Kingdom", - 'description': 'md5:18db170ad71cf161e006a4c688e33155', - 'upload_date': '20180318', + 'description': 'md5:f9de914d02f226968f598ac76f105bcf', + 'upload_date': '20180604', 'uploader_id': 'abc4kids', - 'timestamp': 1521400959, + 'timestamp': 1528140219, }, 'params': { 'skip_download': True, @@ -129,17 +129,16 @@ class ABCIViewIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - video_params = self._parse_json(self._search_regex( - r'videoParams\s*=\s*({.+?});', webpage, 'video params'), video_id) - title = video_params.get('title') or video_params['seriesTitle'] - stream = next(s for s in video_params['playlist'] if s.get('type') == 'program') + video_params = self._download_json( + 'https://iview.abc.net.au/api/programs/' + video_id, video_id) + title = unescapeHTML(video_params.get('title') or video_params['seriesTitle']) + stream = next(s for s in video_params['playlist'] if s.get('type') in ('program', 'livestream')) - house_number = video_params.get('episodeHouseNumber') - path = '/auth/hls/sign?ts={0}&hn={1}&d=android-mobile'.format( + house_number = video_params.get('episodeHouseNumber') or video_id + path = '/auth/hls/sign?ts={0}&hn={1}&d=android-tablet'.format( int(time.time()), house_number) sig = hmac.new( - 'android.content.res.Resources'.encode('utf-8'), + b'android.content.res.Resources', path.encode('utf-8'), hashlib.sha256).hexdigest() token = self._download_webpage( 'http://iview.abc.net.au{0}&sig={1}'.format(path, sig), video_id) @@ -169,18 +168,26 @@ class ABCIViewIE(InfoExtractor): 'ext': 'vtt', }] + is_live = video_params.get('livestream') == '1' + if is_live: + title = self._live_title(title) + return { 'id': video_id, - 'title': unescapeHTML(title), - 'description': self._html_search_meta(['og:description', 'twitter:description'], webpage), - 'thumbnail': self._html_search_meta(['og:image', 'twitter:image:src'], webpage), + 'title': title, + 'description': video_params.get('description'), + 'thumbnail': video_params.get('thumbnail'), 'duration': int_or_none(video_params.get('eventDuration')), 'timestamp': parse_iso8601(video_params.get('pubDate'), ' '), 'series': unescapeHTML(video_params.get('seriesTitle')), 'series_id': video_params.get('seriesHouseNumber') or video_id[:7], - 'episode_number': int_or_none(self._html_search_meta('episodeNumber', webpage, default=None)), - 'episode': self._html_search_meta('episode_title', webpage, default=None), + 'season_number': int_or_none(self._search_regex( + r'\bSeries\s+(\d+)\b', title, 'season number', default=None)), + 'episode_number': int_or_none(self._search_regex( + r'\bEp\s+(\d+)\b', title, 'episode number', default=None)), + 'episode_id': house_number, 'uploader_id': video_params.get('channel'), 'formats': formats, 'subtitles': subtitles, + 'is_live': is_live, } diff --git a/youtube_dl/extractor/adn.py b/youtube_dl/extractor/adn.py index 041c61aff..1eb99c39a 100644 --- a/youtube_dl/extractor/adn.py +++ b/youtube_dl/extractor/adn.py @@ -1,8 +1,11 @@ # coding: utf-8 from __future__ import unicode_literals 
+import base64 +import binascii import json import os +import random from .common import InfoExtractor from ..aes import aes_cbc_decrypt @@ -12,9 +15,12 @@ from ..compat import ( ) from ..utils import ( bytes_to_intlist, + bytes_to_long, ExtractorError, float_or_none, intlist_to_bytes, + long_to_bytes, + pkcs1pad, srt_subtitles_timecode, strip_or_none, urljoin, @@ -35,6 +41,7 @@ class ADNIE(InfoExtractor): } } _BASE_URL = 'http://animedigitalnetwork.fr' + _RSA_KEY = (0xc35ae1e4356b65a73b551493da94b8cb443491c0aa092a357a5aee57ffc14dda85326f42d716e539a34542a0d3f363adf16c5ec222d713d5997194030ee2e4f0d1fb328c01a81cf6868c090d50de8e169c6b13d1675b9eeed1cbc51e1fffca9b38af07f37abd790924cd3bee59d0257cfda4fe5f3f0534877e21ce5821447d1b, 65537) def _get_subtitles(self, sub_path, video_id): if not sub_path: @@ -42,16 +49,14 @@ class ADNIE(InfoExtractor): enc_subtitles = self._download_webpage( urljoin(self._BASE_URL, sub_path), - video_id, fatal=False, headers={ - 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:53.0) Gecko/20100101 Firefox/53.0', - }) + video_id, fatal=False) if not enc_subtitles: return None # http://animedigitalnetwork.fr/components/com_vodvideo/videojs/adn-vjs.min.js dec_subtitles = intlist_to_bytes(aes_cbc_decrypt( bytes_to_intlist(compat_b64decode(enc_subtitles[24:])), - bytes_to_intlist(b'\xc8\x6e\x06\xbc\xbe\xc6\x49\xf5\x88\x0d\xc8\x47\xc4\x27\x0c\x60'), + bytes_to_intlist(binascii.unhexlify(self._K + '9032ad7083106400')), bytes_to_intlist(compat_b64decode(enc_subtitles[:24])) )) subtitles_json = self._parse_json( @@ -112,11 +117,24 @@ class ADNIE(InfoExtractor): error = None if not links: links_url = player_config.get('linksurl') or options['videoUrl'] - links_data = self._download_json(urljoin( - self._BASE_URL, links_url), video_id) + token = options['token'] + self._K = ''.join([random.choice('0123456789abcdef') for _ in range(16)]) + message = bytes_to_intlist(json.dumps({ + 'k': self._K, + 'e': 60, + 't': token, + })) + padded_message = intlist_to_bytes(pkcs1pad(message, 128)) + n, e = self._RSA_KEY + encrypted_message = long_to_bytes(pow(bytes_to_long(padded_message), e, n)) + authorization = base64.b64encode(encrypted_message).decode() + links_data = self._download_json( + urljoin(self._BASE_URL, links_url), video_id, headers={ + 'Authorization': 'Bearer ' + authorization, + }) links = links_data.get('links') or {} metas = metas or links_data.get('meta') or {} - sub_path = sub_path or links_data.get('subtitles') + sub_path = (sub_path or links_data.get('subtitles')) + '&token=' + token error = links_data.get('error') title = metas.get('title') or video_info['title'] diff --git a/youtube_dl/extractor/adultswim.py b/youtube_dl/extractor/adultswim.py index acc4ce38d..88c96a950 100644 --- a/youtube_dl/extractor/adultswim.py +++ b/youtube_dl/extractor/adultswim.py @@ -7,6 +7,7 @@ from .turner import TurnerBaseIE from ..utils import ( int_or_none, strip_or_none, + url_or_none, ) @@ -98,7 +99,7 @@ class AdultSwimIE(TurnerBaseIE): if not video_id: entries = [] for episode in video_data.get('archiveEpisodes', []): - episode_url = episode.get('url') + episode_url = url_or_none(episode.get('url')) if not episode_url: continue entries.append(self.url_result( diff --git a/youtube_dl/extractor/afreecatv.py b/youtube_dl/extractor/afreecatv.py index 4b3d97136..6275e5209 100644 --- a/youtube_dl/extractor/afreecatv.py +++ b/youtube_dl/extractor/afreecatv.py @@ -9,6 +9,7 @@ from ..utils import ( determine_ext, ExtractorError, int_or_none, + url_or_none, urlencode_postdata, xpath_text, ) @@ 
-304,7 +305,7 @@ class AfreecaTVIE(InfoExtractor): file_elements = video_element.findall(compat_xpath('./file')) one = len(file_elements) == 1 for file_num, file_element in enumerate(file_elements, start=1): - file_url = file_element.text + file_url = url_or_none(file_element.text) if not file_url: continue key = file_element.get('key', '') diff --git a/youtube_dl/extractor/amp.py b/youtube_dl/extractor/amp.py index fde1a8ff7..7ff098cfa 100644 --- a/youtube_dl/extractor/amp.py +++ b/youtube_dl/extractor/amp.py @@ -3,11 +3,12 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( - int_or_none, - parse_iso8601, - mimetype2ext, determine_ext, ExtractorError, + int_or_none, + mimetype2ext, + parse_iso8601, + url_or_none, ) @@ -35,7 +36,7 @@ class AMPIE(InfoExtractor): media_thumbnail = [media_thumbnail] for thumbnail_data in media_thumbnail: thumbnail = thumbnail_data.get('@attributes', {}) - thumbnail_url = thumbnail.get('url') + thumbnail_url = url_or_none(thumbnail.get('url')) if not thumbnail_url: continue thumbnails.append({ @@ -51,7 +52,7 @@ class AMPIE(InfoExtractor): media_subtitle = [media_subtitle] for subtitle_data in media_subtitle: subtitle = subtitle_data.get('@attributes', {}) - subtitle_href = subtitle.get('href') + subtitle_href = url_or_none(subtitle.get('href')) if not subtitle_href: continue subtitles.setdefault(subtitle.get('lang') or 'en', []).append({ @@ -65,7 +66,7 @@ class AMPIE(InfoExtractor): media_content = [media_content] for media_data in media_content: media = media_data.get('@attributes', {}) - media_url = media.get('url') + media_url = url_or_none(media.get('url')) if not media_url: continue ext = mimetype2ext(media.get('type')) or determine_ext(media_url) @@ -79,7 +80,7 @@ class AMPIE(InfoExtractor): else: formats.append({ 'format_id': media_data.get('media-category', {}).get('@attributes', {}).get('label'), - 'url': media['url'], + 'url': media_url, 'tbr': int_or_none(media.get('bitrate')), 'filesize': int_or_none(media.get('fileSize')), 'ext': ext, diff --git a/youtube_dl/extractor/animeondemand.py b/youtube_dl/extractor/animeondemand.py index e4fa72f46..00ce684d1 100644 --- a/youtube_dl/extractor/animeondemand.py +++ b/youtube_dl/extractor/animeondemand.py @@ -8,6 +8,7 @@ from ..utils import ( determine_ext, extract_attributes, ExtractorError, + url_or_none, urlencode_postdata, urljoin, ) @@ -52,7 +53,7 @@ class AnimeOnDemandIE(InfoExtractor): }] def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: return @@ -165,7 +166,7 @@ class AnimeOnDemandIE(InfoExtractor): }, fatal=False) if not playlist: continue - stream_url = playlist.get('streamurl') + stream_url = url_or_none(playlist.get('streamurl')) if stream_url: rtmp = re.search( r'^(?Prtmpe?://(?P[^/]+)/(?P.+/))(?Pmp[34]:.+)', diff --git a/youtube_dl/extractor/aol.py b/youtube_dl/extractor/aol.py index b50f454ee..cb9279193 100644 --- a/youtube_dl/extractor/aol.py +++ b/youtube_dl/extractor/aol.py @@ -7,6 +7,7 @@ from .common import InfoExtractor from ..utils import ( ExtractorError, int_or_none, + url_or_none, ) @@ -77,7 +78,7 @@ class AolIE(InfoExtractor): formats.extend(self._extract_m3u8_formats( m3u8_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) for rendition in video_data.get('renditions', []): - video_url = rendition.get('url') + video_url = url_or_none(rendition.get('url')) if not video_url: continue ext = rendition.get('format') diff --git a/youtube_dl/extractor/apa.py 
b/youtube_dl/extractor/apa.py new file mode 100644 index 000000000..98ccdaa4a --- /dev/null +++ b/youtube_dl/extractor/apa.py @@ -0,0 +1,94 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + js_to_json, + url_or_none, +) + + +class APAIE(InfoExtractor): + _VALID_URL = r'https?://[^/]+\.apa\.at/embed/(?P[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' + _TESTS = [{ + 'url': 'http://uvp.apa.at/embed/293f6d17-692a-44e3-9fd5-7b178f3a1029', + 'md5': '2b12292faeb0a7d930c778c7a5b4759b', + 'info_dict': { + 'id': 'jjv85FdZ', + 'ext': 'mp4', + 'title': '"Blau ist mysteriös": Die Blue Man Group im Interview', + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 254, + 'timestamp': 1519211149, + 'upload_date': '20180221', + }, + }, { + 'url': 'https://uvp-apapublisher.sf.apa.at/embed/2f94e9e6-d945-4db2-9548-f9a41ebf7b78', + 'only_matching': True, + }, { + 'url': 'http://uvp-rma.sf.apa.at/embed/70404cca-2f47-4855-bbb8-20b1fae58f76', + 'only_matching': True, + }, { + 'url': 'http://uvp-kleinezeitung.sf.apa.at/embed/f1c44979-dba2-4ebf-b021-e4cf2cac3c81', + 'only_matching': True, + }] + + @staticmethod + def _extract_urls(webpage): + return [ + mobj.group('url') + for mobj in re.finditer( + r']+\bsrc=(["\'])(?P(?:https?:)?//[^/]+\.apa\.at/embed/[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}.*?)\1', + webpage)] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + jwplatform_id = self._search_regex( + r'media[iI]d\s*:\s*["\'](?P[a-zA-Z0-9]{8})', webpage, + 'jwplatform id', default=None) + + if jwplatform_id: + return self.url_result( + 'jwplatform:' + jwplatform_id, ie='JWPlatform', + video_id=video_id) + + sources = self._parse_json( + self._search_regex( + r'sources\s*=\s*(\[.+?\])\s*;', webpage, 'sources'), + video_id, transform_source=js_to_json) + + formats = [] + for source in sources: + if not isinstance(source, dict): + continue + source_url = url_or_none(source.get('file')) + if not source_url: + continue + ext = determine_ext(source_url) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + source_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + else: + formats.append({ + 'url': source_url, + }) + self._sort_formats(formats) + + thumbnail = self._search_regex( + r'image\s*:\s*(["\'])(?P(?:(?!\1).)+)\1', webpage, + 'thumbnail', fatal=False, group='url') + + return { + 'id': video_id, + 'title': video_id, + 'thumbnail': thumbnail, + 'formats': formats, + } diff --git a/youtube_dl/extractor/aparat.py b/youtube_dl/extractor/aparat.py index e394cb661..6eb8bbb6e 100644 --- a/youtube_dl/extractor/aparat.py +++ b/youtube_dl/extractor/aparat.py @@ -5,6 +5,7 @@ from .common import InfoExtractor from ..utils import ( int_or_none, mimetype2ext, + url_or_none, ) @@ -43,7 +44,7 @@ class AparatIE(InfoExtractor): formats = [] for item in file_list[0]: - file_url = item.get('file') + file_url = url_or_none(item.get('file')) if not file_url: continue ext = mimetype2ext(item.get('type')) diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index 86951d975..23f574d36 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -5,7 +5,6 @@ import re from .common import InfoExtractor from .generic import GenericIE -from ..compat import compat_str from ..utils import ( determine_ext, 
ExtractorError, @@ -15,6 +14,7 @@ from ..utils import ( unified_strdate, xpath_text, update_url_query, + url_or_none, ) from ..compat import compat_etree_fromstring @@ -100,7 +100,7 @@ class ARDMediathekIE(InfoExtractor): quality = stream.get('_quality') server = stream.get('_server') for stream_url in stream_urls: - if not isinstance(stream_url, compat_str) or '//' not in stream_url: + if not url_or_none(stream_url): continue ext = determine_ext(stream_url) if quality != 'auto' and ext in ('f4m', 'm3u8'): diff --git a/youtube_dl/extractor/atresplayer.py b/youtube_dl/extractor/atresplayer.py index 1a31ebe08..ae1c09427 100644 --- a/youtube_dl/extractor/atresplayer.py +++ b/youtube_dl/extractor/atresplayer.py @@ -74,7 +74,7 @@ class AtresPlayerIE(InfoExtractor): self._login() def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: return diff --git a/youtube_dl/extractor/audimedia.py b/youtube_dl/extractor/audimedia.py index aa6925623..6bd48ef15 100644 --- a/youtube_dl/extractor/audimedia.py +++ b/youtube_dl/extractor/audimedia.py @@ -5,13 +5,12 @@ from .common import InfoExtractor from ..utils import ( int_or_none, parse_iso8601, - sanitized_Request, ) class AudiMediaIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?audi-mediacenter\.com/(?:en|de)/audimediatv/(?P[^/?#]+)' - _TEST = { + _VALID_URL = r'https?://(?:www\.)?audi-mediacenter\.com/(?:en|de)/audimediatv/(?:video/)?(?P[^/?#]+)' + _TESTS = [{ 'url': 'https://www.audi-mediacenter.com/en/audimediatv/60-seconds-of-audi-sport-104-2015-wec-bahrain-rookie-test-1467', 'md5': '79a8b71c46d49042609795ab59779b66', 'info_dict': { @@ -24,41 +23,46 @@ class AudiMediaIE(InfoExtractor): 'duration': 74022, 'view_count': int, } - } - # extracted from https://audimedia.tv/assets/embed/embedded-player.js (dataSourceAuthToken) - _AUTH_TOKEN = 'e25b42847dba18c6c8816d5d8ce94c326e06823ebf0859ed164b3ba169be97f2' + }, { + 'url': 'https://www.audi-mediacenter.com/en/audimediatv/video/60-seconds-of-audi-sport-104-2015-wec-bahrain-rookie-test-2991', + 'only_matching': True, + }] def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) raw_payload = self._search_regex([ - r'class="amtv-embed"[^>]+id="([^"]+)"', - r'class=\\"amtv-embed\\"[^>]+id=\\"([^"]+)\\"', + r'class="amtv-embed"[^>]+id="([0-9a-z-]+)"', + r'id="([0-9a-z-]+)"[^>]+class="amtv-embed"', + r'class=\\"amtv-embed\\"[^>]+id=\\"([0-9a-z-]+)\\"', + r'id=\\"([0-9a-z-]+)\\"[^>]+class=\\"amtv-embed\\"', + r'id=(?:\\)?"(amtve-[a-z]-\d+-[a-z]{2})', ], webpage, 'raw payload') - _, stage_mode, video_id, lang = raw_payload.split('-') + _, stage_mode, video_id, _ = raw_payload.split('-') # TODO: handle s and e stage_mode (live streams and ended live streams) if stage_mode not in ('s', 'e'): - request = sanitized_Request( - 'https://audimedia.tv/api/video/v1/videos/%s?embed[]=video_versions&embed[]=thumbnail_image&where[content_language_iso]=%s' % (video_id, lang), - headers={'X-Auth-Token': self._AUTH_TOKEN}) - json_data = self._download_json(request, video_id)['results'] + video_data = self._download_json( + 'https://www.audimedia.tv/api/video/v1/videos/' + video_id, + video_id, query={ + 'embed[]': ['video_versions', 'thumbnail_image'], + })['results'] formats = [] - stream_url_hls = json_data.get('stream_url_hls') + stream_url_hls = video_data.get('stream_url_hls') if stream_url_hls: formats.extend(self._extract_m3u8_formats( stream_url_hls, video_id, 'mp4', 
entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) - stream_url_hds = json_data.get('stream_url_hds') + stream_url_hds = video_data.get('stream_url_hds') if stream_url_hds: formats.extend(self._extract_f4m_formats( stream_url_hds + '?hdcore=3.4.0', video_id, f4m_id='hds', fatal=False)) - for video_version in json_data.get('video_versions'): + for video_version in video_data.get('video_versions', []): video_version_url = video_version.get('download_url') or video_version.get('stream_url') if not video_version_url: continue @@ -79,11 +83,11 @@ class AudiMediaIE(InfoExtractor): return { 'id': video_id, - 'title': json_data['title'], - 'description': json_data.get('subtitle'), - 'thumbnail': json_data.get('thumbnail_image', {}).get('file'), - 'timestamp': parse_iso8601(json_data.get('publication_date')), - 'duration': int_or_none(json_data.get('duration')), - 'view_count': int_or_none(json_data.get('view_count')), + 'title': video_data['title'], + 'description': video_data.get('subtitle'), + 'thumbnail': video_data.get('thumbnail_image', {}).get('file'), + 'timestamp': parse_iso8601(video_data.get('publication_date')), + 'duration': int_or_none(video_data.get('duration')), + 'view_count': int_or_none(video_data.get('view_count')), 'formats': formats, } diff --git a/youtube_dl/extractor/audiomack.py b/youtube_dl/extractor/audiomack.py index f3bd4d444..62049b921 100644 --- a/youtube_dl/extractor/audiomack.py +++ b/youtube_dl/extractor/audiomack.py @@ -65,7 +65,7 @@ class AudiomackIE(InfoExtractor): return {'_type': 'url', 'url': api_response['url'], 'ie_key': 'Soundcloud'} return { - 'id': api_response.get('id', album_url_tag), + 'id': compat_str(api_response.get('id', album_url_tag)), 'uploader': api_response.get('artist'), 'title': api_response.get('title'), 'url': api_response['url'], diff --git a/youtube_dl/extractor/bambuser.py b/youtube_dl/extractor/bambuser.py index 633c57553..34f1b3d83 100644 --- a/youtube_dl/extractor/bambuser.py +++ b/youtube_dl/extractor/bambuser.py @@ -44,7 +44,7 @@ class BambuserIE(InfoExtractor): } def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: return diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py index be41bd5a2..b8514734d 100644 --- a/youtube_dl/extractor/bandcamp.py +++ b/youtube_dl/extractor/bandcamp.py @@ -19,6 +19,7 @@ from ..utils import ( unescapeHTML, update_url_query, unified_strdate, + url_or_none, ) @@ -131,8 +132,8 @@ class BandcampIE(InfoExtractor): fatal=False) if not stat: continue - retry_url = stat.get('retry_url') - if not isinstance(retry_url, compat_str): + retry_url = url_or_none(stat.get('retry_url')) + if not retry_url: continue formats.append({ 'url': self._proto_relative_url(retry_url, 'http:'), @@ -306,7 +307,7 @@ class BandcampWeeklyIE(InfoExtractor): formats = [] for format_id, format_url in show['audio_stream'].items(): - if not isinstance(format_url, compat_str): + if not url_or_none(format_url): continue for known_ext in KNOWN_EXTENSIONS: if known_ext in format_id: diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 8b20c03d6..abcfa301d 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -12,6 +12,7 @@ from ..utils import ( float_or_none, get_element_by_class, int_or_none, + js_to_json, parse_duration, parse_iso8601, try_get, @@ -20,7 +21,6 @@ from ..utils import ( urljoin, ) from ..compat import ( - compat_etree_fromstring, compat_HTTPError, 
compat_urlparse, ) @@ -29,7 +29,7 @@ from ..compat import ( class BBCCoUkIE(InfoExtractor): IE_NAME = 'bbc.co.uk' IE_DESC = 'BBC iPlayer' - _ID_REGEX = r'[pbw][\da-z]{7}' + _ID_REGEX = r'(?:[pbm][\da-z]{7}|w[\da-z]{7,14})' _VALID_URL = r'''(?x) https?:// (?:www\.)?bbc\.co\.uk/ @@ -236,6 +236,12 @@ class BBCCoUkIE(InfoExtractor): }, { 'url': 'http://www.bbc.co.uk/programmes/w3csv1y9', 'only_matching': True, + }, { + 'url': 'https://www.bbc.co.uk/programmes/m00005xn', + 'only_matching': True, + }, { + 'url': 'https://www.bbc.co.uk/programmes/w172w4dww1jqt5s', + 'only_matching': True, }] _USP_RE = r'/([^/]+?)\.ism(?:\.hlsv2\.ism)?/[^/]+\.m3u8' @@ -333,14 +339,9 @@ class BBCCoUkIE(InfoExtractor): self._raise_extractor_error(last_exception) def _download_media_selector_url(self, url, programme_id=None): - try: - media_selection = self._download_xml( - url, programme_id, 'Downloading media selection XML') - except ExtractorError as ee: - if isinstance(ee.cause, compat_HTTPError) and ee.cause.code in (403, 404): - media_selection = compat_etree_fromstring(ee.cause.read().decode('utf-8')) - else: - raise + media_selection = self._download_xml( + url, programme_id, 'Downloading media selection XML', + expected_status=(403, 404)) return self._process_media_selector(media_selection, programme_id) def _process_media_selector(self, media_selection, programme_id): @@ -772,6 +773,28 @@ class BBCIE(BBCCoUkIE): # single video article embedded with data-media-vpid 'url': 'http://www.bbc.co.uk/sport/rowing/35908187', 'only_matching': True, + }, { + 'url': 'https://www.bbc.co.uk/bbcthree/clip/73d0bbd0-abc3-4cea-b3c0-cdae21905eb1', + 'info_dict': { + 'id': 'p06556y7', + 'ext': 'mp4', + 'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?', + 'description': 'md5:4b7dfd063d5a789a1512e99662be3ddd', + }, + 'params': { + 'skip_download': True, + } + }, { + # window.__PRELOADED_STATE__ + 'url': 'https://www.bbc.co.uk/radio/play/b0b9z4yl', + 'info_dict': { + 'id': 'b0b9z4vz', + 'ext': 'mp4', + 'title': 'Prom 6: An American in Paris and Turangalila', + 'description': 'md5:51cf7d6f5c8553f197e58203bc78dff8', + 'uploader': 'Radio 3', + 'uploader_id': 'bbc_radio_three', + }, }] @classmethod @@ -994,6 +1017,66 @@ class BBCIE(BBCCoUkIE): 'subtitles': subtitles, } + preload_state = self._parse_json(self._search_regex( + r'window\.__PRELOADED_STATE__\s*=\s*({.+?});', webpage, + 'preload state', default='{}'), playlist_id, fatal=False) + if preload_state: + current_programme = preload_state.get('programmes', {}).get('current') or {} + programme_id = current_programme.get('id') + if current_programme and programme_id and current_programme.get('type') == 'playable_item': + title = current_programme.get('titles', {}).get('tertiary') or playlist_title + formats, subtitles = self._download_media_selector(programme_id) + self._sort_formats(formats) + synopses = current_programme.get('synopses') or {} + network = current_programme.get('network') or {} + duration = int_or_none( + current_programme.get('duration', {}).get('value')) + thumbnail = None + image_url = current_programme.get('image_url') + if image_url: + thumbnail = image_url.replace('{recipe}', '1920x1920') + return { + 'id': programme_id, + 'title': title, + 'description': dict_get(synopses, ('long', 'medium', 'short')), + 'thumbnail': thumbnail, + 'duration': duration, + 'uploader': network.get('short_title'), + 'uploader_id': network.get('id'), + 'formats': formats, + 'subtitles': subtitles, + } + + bbc3_config = self._parse_json( + self._search_regex( + 
r'(?s)bbcthreeConfig\s*=\s*({.+?})\s*;\s*<', webpage, + 'bbcthree config', default='{}'), + playlist_id, transform_source=js_to_json, fatal=False) + if bbc3_config: + bbc3_playlist = try_get( + bbc3_config, lambda x: x['payload']['content']['bbcMedia']['playlist'], + dict) + if bbc3_playlist: + playlist_title = bbc3_playlist.get('title') or playlist_title + thumbnail = bbc3_playlist.get('holdingImageURL') + entries = [] + for bbc3_item in bbc3_playlist['items']: + programme_id = bbc3_item.get('versionID') + if not programme_id: + continue + formats, subtitles = self._download_media_selector(programme_id) + self._sort_formats(formats) + entries.append({ + 'id': programme_id, + 'title': playlist_title, + 'thumbnail': thumbnail, + 'timestamp': timestamp, + 'formats': formats, + 'subtitles': subtitles, + }) + return self.playlist_result( + entries, playlist_id, playlist_title, playlist_description) + def extract_all(pattern): return list(filter(None, map( lambda s: self._parse_json(s, playlist_id, fatal=False), diff --git a/youtube_dl/extractor/bellmedia.py b/youtube_dl/extractor/bellmedia.py index 8820a3914..f36a2452d 100644 --- a/youtube_dl/extractor/bellmedia.py +++ b/youtube_dl/extractor/bellmedia.py @@ -12,7 +12,7 @@ class BellMediaIE(InfoExtractor): (?: ctv| tsn| - bnn| + bnn(?:bloomberg)?| thecomedynetwork| discovery| discoveryvelocity| @@ -27,17 +27,16 @@ class BellMediaIE(InfoExtractor): much\.com )/.*?(?:\bvid(?:eoid)?=|-vid|~|%7E|/(?:episode)?)(?P[0-9]{6,})''' _TESTS = [{ - 'url': 'http://www.ctv.ca/video/player?vid=706966', - 'md5': 'ff2ebbeae0aa2dcc32a830c3fd69b7b0', + 'url': 'https://www.bnnbloomberg.ca/video/david-cockfield-s-top-picks~1403070', + 'md5': '36d3ef559cfe8af8efe15922cd3ce950', 'info_dict': { - 'id': '706966', - 'ext': 'mp4', - 'title': 'Larry Day and Richard Jutras on the TIFF red carpet of \'Stonewall\'', - 'description': 'etalk catches up with Larry Day and Richard Jutras on the TIFF red carpet of "Stonewall”.', - 'upload_date': '20150919', - 'timestamp': 1442624700, + 'id': '1403070', + 'ext': 'flv', + 'title': 'David Cockfield\'s Top Picks', + 'description': 'md5:810f7f8c6a83ad5b48677c3f8e5bb2c3', + 'upload_date': '20180525', + 'timestamp': 1527288600, }, - 'expected_warnings': ['HTTP Error 404'], }, { 'url': 'http://www.thecomedynetwork.ca/video/player?vid=923582', 'only_matching': True, @@ -70,6 +69,7 @@ class BellMediaIE(InfoExtractor): 'investigationdiscovery': 'invdisc', 'animalplanet': 'aniplan', 'etalk': 'ctv', + 'bnnbloomberg': 'bnn', } def _real_extract(self, url): diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py index 3e3348ef5..4d6b051fe 100644 --- a/youtube_dl/extractor/bilibili.py +++ b/youtube_dl/extractor/bilibili.py @@ -114,7 +114,7 @@ class BiliBiliIE(InfoExtractor): if 'anime/' not in url: cid = self._search_regex( - r'cid(?:["\']:|=)(\d+)', webpage, 'cid', + r'\bcid(?:["\']:|=)(\d+)', webpage, 'cid', default=None ) or compat_parse_qs(self._search_regex( [r'EmbedPlayer\([^)]+,\s*"([^"]+)"\)', diff --git a/youtube_dl/extractor/bitchute.py b/youtube_dl/extractor/bitchute.py new file mode 100644 index 000000000..da263714a --- /dev/null +++ b/youtube_dl/extractor/bitchute.py @@ -0,0 +1,118 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import itertools +import re + +from .common import InfoExtractor +from ..utils import urlencode_postdata + + +class BitChuteIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?bitchute\.com/(?:video|embed|torrent/[^/]+)/(?P[^/?#&]+)' + _TESTS = [{ + 'url': 
'https://www.bitchute.com/video/szoMrox2JEI/', + 'md5': '66c4a70e6bfc40dcb6be3eb1d74939eb', + 'info_dict': { + 'id': 'szoMrox2JEI', + 'ext': 'mp4', + 'title': 'Fuck bitches get money', + 'description': 'md5:3f21f6fb5b1d17c3dee9cf6b5fe60b3a', + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': 'Victoria X Rave', + }, + }, { + 'url': 'https://www.bitchute.com/embed/lbb5G1hjPhw/', + 'only_matching': True, + }, { + 'url': 'https://www.bitchute.com/torrent/Zee5BE49045h/szoMrox2JEI.webtorrent', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage( + 'https://www.bitchute.com/video/%s' % video_id, video_id) + + title = self._search_regex( + (r'<[^>]+\bid=["\']video-title[^>]+>([^<]+)', r'([^<]+)'), + webpage, 'title', default=None) or self._html_search_meta( + 'description', webpage, 'title', + default=None) or self._og_search_description(webpage) + + formats = [ + {'url': mobj.group('url')} + for mobj in re.finditer( + r'addWebSeed\s*\(\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage)] + self._sort_formats(formats) + + description = self._html_search_regex( + r'(?s)<div\b[^>]+\bclass=["\']full hidden[^>]+>(.+?)</div>', + webpage, 'description', fatal=False) + thumbnail = self._og_search_thumbnail( + webpage, default=None) or self._html_search_meta( + 'twitter:image:src', webpage, 'thumbnail') + uploader = self._html_search_regex( + r'(?s)<p\b[^>]+\bclass=["\']video-author[^>]+>(.+?)</p>', webpage, + 'uploader', fatal=False) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'uploader': uploader, + 'formats': formats, + } + + +class BitChuteChannelIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?bitchute\.com/channel/(?P<id>[^/?#&]+)' + _TEST = { + 'url': 'https://www.bitchute.com/channel/victoriaxrave/', + 'playlist_mincount': 185, + 'info_dict': { + 'id': 'victoriaxrave', + }, + } + + _TOKEN = 'zyG6tQcGPE5swyAEFLqKUwMuMMuF6IO2DZ6ZDQjGfsL0e4dcTLwqkTTul05Jdve7' + + def _entries(self, channel_id): + channel_url = 'https://www.bitchute.com/channel/%s/' % channel_id + offset = 0 + for page_num in itertools.count(1): + data = self._download_json( + '%sextend/' % channel_url, channel_id, + 'Downloading channel page %d' % page_num, + data=urlencode_postdata({ + 'csrfmiddlewaretoken': self._TOKEN, + 'name': '', + 'offset': offset, + }), headers={ + 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', + 'Referer': channel_url, + 'X-Requested-With': 'XMLHttpRequest', + 'Cookie': 'csrftoken=%s' % self._TOKEN, + }) + if data.get('success') is False: + break + html = data.get('html') + if not html: + break + video_ids = re.findall( + r'class=["\']channel-videos-image-container[^>]+>\s*<a\b[^>]+\bhref=["\']/video/([^"\'/]+)', + html) + if not video_ids: + break + offset += len(video_ids) + for video_id in video_ids: + yield self.url_result( + 'https://www.bitchute.com/video/%s' % video_id, + ie=BitChuteIE.ie_key(), video_id=video_id) + + def _real_extract(self, url): + channel_id = self._match_id(url) + return self.playlist_result( + self._entries(channel_id), playlist_id=channel_id) diff --git a/youtube_dl/extractor/breakcom.py b/youtube_dl/extractor/breakcom.py index 70d16767f..68c7cf2bb 100644 --- a/youtube_dl/extractor/breakcom.py +++ b/youtube_dl/extractor/breakcom.py @@ -4,8 +4,10 @@ import re from .common import InfoExtractor from .youtube import YoutubeIE -from ..compat import compat_str -from ..utils import int_or_none +from ..utils import ( + 
int_or_none, + url_or_none, +) class BreakIE(InfoExtractor): @@ -55,8 +57,8 @@ class BreakIE(InfoExtractor): formats = [] for video in content: - video_url = video.get('url') - if not video_url or not isinstance(video_url, compat_str): + video_url = url_or_none(video.get('url')) + if not video_url: continue bitrate = int_or_none(self._search_regex( r'(\d+)_kbps', video_url, 'tbr', default=None)) diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index ab62e54d6..14f9a14ed 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -572,7 +572,8 @@ class BrightcoveNewIE(AdobePassIE): container = source.get('container') ext = mimetype2ext(source.get('type')) src = source.get('src') - if ext == 'ism' or container == 'WVM': + # https://support.brightcove.com/playback-api-video-fields-reference#key_systems_object + if ext == 'ism' or container == 'WVM' or source.get('key_systems'): continue elif ext == 'm3u8' or container == 'M2TS': if not src: @@ -629,6 +630,14 @@ class BrightcoveNewIE(AdobePassIE): 'format_id': build_format_id('rtmp'), }) formats.append(f) + if not formats: + # for sonyliv.com DRM protected videos + s3_source_url = json_data.get('custom_fields', {}).get('s3sourceurl') + if s3_source_url: + formats.append({ + 'url': s3_source_url, + 'format_id': 'source', + }) errors = json_data.get('errors') if not formats and errors: diff --git a/youtube_dl/extractor/cammodels.py b/youtube_dl/extractor/cammodels.py new file mode 100644 index 000000000..79350817f --- /dev/null +++ b/youtube_dl/extractor/cammodels.py @@ -0,0 +1,96 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + int_or_none, + url_or_none, +) + + +class CamModelsIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?cammodels\.com/cam/(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://www.cammodels.com/cam/AutumnKnight/', + 'only_matching': True, + }] + + def _real_extract(self, url): + user_id = self._match_id(url) + + webpage = self._download_webpage( + url, user_id, headers=self.geo_verification_headers()) + + manifest_root = self._html_search_regex( + r'manifestUrlRoot=([^&\']+)', webpage, 'manifest', default=None) + + if not manifest_root: + ERRORS = ( + ("I'm offline, but let's stay connected", 'This user is currently offline'), + ('in a private show', 'This user is in a private show'), + ('is currently performing LIVE', 'This model is currently performing live'), + ) + for pattern, message in ERRORS: + if pattern in webpage: + error = message + expected = True + break + else: + error = 'Unable to find manifest URL root' + expected = False + raise ExtractorError(error, expected=expected) + + manifest = self._download_json( + '%s%s.json' % (manifest_root, user_id), user_id) + + formats = [] + for format_id, format_dict in manifest['formats'].items(): + if not isinstance(format_dict, dict): + continue + encodings = format_dict.get('encodings') + if not isinstance(encodings, list): + continue + vcodec = format_dict.get('videoCodec') + acodec = format_dict.get('audioCodec') + for media in encodings: + if not isinstance(media, dict): + continue + media_url = url_or_none(media.get('location')) + if not media_url: + continue + + format_id_list = [format_id] + height = int_or_none(media.get('videoHeight')) + if height is not None: + format_id_list.append('%dp' % height) + f = { + 'url': media_url, + 'format_id': '-'.join(format_id_list), + 'width': 
int_or_none(media.get('videoWidth')), + 'height': height, + 'vbr': int_or_none(media.get('videoKbps')), + 'abr': int_or_none(media.get('audioKbps')), + 'fps': int_or_none(media.get('fps')), + 'vcodec': vcodec, + 'acodec': acodec, + } + if 'rtmp' in format_id: + f['ext'] = 'flv' + elif 'hls' in format_id: + f.update({ + 'ext': 'mp4', + # hls skips fragments, preferring rtmp + 'preference': -1, + }) + else: + continue + formats.append(f) + self._sort_formats(formats) + + return { + 'id': user_id, + 'title': self._live_title(user_id), + 'is_live': True, + 'formats': formats, + } diff --git a/youtube_dl/extractor/camtube.py b/youtube_dl/extractor/camtube.py new file mode 100644 index 000000000..c7d40f849 --- /dev/null +++ b/youtube_dl/extractor/camtube.py @@ -0,0 +1,69 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + unified_timestamp, +) + + +class CamTubeIE(InfoExtractor): + _VALID_URL = r'https?://(?:(?:www|api)\.)?camtube\.co/recordings?/(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://camtube.co/recording/minafay-030618-1136-chaturbate-female', + 'info_dict': { + 'id': '42ad3956-dd5b-445a-8313-803ea6079fac', + 'display_id': 'minafay-030618-1136-chaturbate-female', + 'ext': 'mp4', + 'title': 'minafay-030618-1136-chaturbate-female', + 'duration': 1274, + 'timestamp': 1528018608, + 'upload_date': '20180603', + }, + 'params': { + 'skip_download': True, + }, + }] + + _API_BASE = 'https://api.camtube.co' + + def _real_extract(self, url): + display_id = self._match_id(url) + + token = self._download_json( + '%s/rpc/session/new' % self._API_BASE, display_id, + 'Downloading session token')['token'] + + self._set_cookie('api.camtube.co', 'session', token) + + video = self._download_json( + '%s/recordings/%s' % (self._API_BASE, display_id), display_id, + headers={'Referer': url}) + + video_id = video['uuid'] + timestamp = unified_timestamp(video.get('createdAt')) + duration = int_or_none(video.get('duration')) + view_count = int_or_none(video.get('viewCount')) + like_count = int_or_none(video.get('likeCount')) + creator = video.get('stageName') + + formats = [{ + 'url': '%s/recordings/%s/manifest.m3u8' + % (self._API_BASE, video_id), + 'format_id': 'hls', + 'ext': 'mp4', + 'protocol': 'm3u8_native', + }] + + return { + 'id': video_id, + 'display_id': display_id, + 'title': display_id, + 'timestamp': timestamp, + 'duration': duration, + 'view_count': view_count, + 'like_count': like_count, + 'creator': creator, + 'formats': formats, + } diff --git a/youtube_dl/extractor/canvas.py b/youtube_dl/extractor/canvas.py index 8ac62c1a6..174fd9e2b 100644 --- a/youtube_dl/extractor/canvas.py +++ b/youtube_dl/extractor/canvas.py @@ -11,6 +11,7 @@ from ..utils import ( strip_or_none, float_or_none, int_or_none, + merge_dicts, parse_iso8601, ) @@ -248,9 +249,13 @@ class VrtNUIE(GigyaBaseIE): webpage, urlh = self._download_webpage_handle(url, display_id) - title = self._html_search_regex( + info = self._search_json_ld(webpage, display_id, default={}) + + # title is optional here since it may be extracted by extractor + # that is delegated from here + title = strip_or_none(self._html_search_regex( r'(?ms)<h1 class="content__heading">(.+?)</h1>', - webpage, 'title').strip() + webpage, 'title', default=None)) description = self._html_search_regex( r'(?ms)<div class="content__description">(.+?)</div>', @@ -295,7 +300,7 @@ class VrtNUIE(GigyaBaseIE): # the first one video_id = list(video.values())[0].get('videoid') - return { + 
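# The merge_dicts call introduced just below seeds the result from the
# JSON-LD info dict and lets the regex-scraped fields fill any gaps. A
# small sketch of the precedence this relies on, assuming youtube-dl's
# merge_dicts utility (earlier dicts win, None never overwrites, and an
# empty string can be back-filled by a later value):
#
#     from youtube_dl.utils import merge_dicts
#
#     json_ld = {'title': 'Episode 1', 'description': ''}
#     scraped = {'title': 'Fallback title', 'description': 'From the page'}
#     info = merge_dicts(json_ld, scraped)
#     assert info['title'] == 'Episode 1'            # first non-empty wins
#     assert info['description'] == 'From the page'  # empty back-filled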
return merge_dicts(info, {
             '_type': 'url_transparent',
             'url': 'https://mediazone.vrt.be/api/v1/vrtvideo/assets/%s' % video_id,
             'ie_key': CanvasIE.ie_key(),
@@ -307,4 +312,4 @@ class VrtNUIE(GigyaBaseIE):
             'season_number': season_number,
             'episode_number': episode_number,
             'release_date': release_date,
-        }
+        })
diff --git a/youtube_dl/extractor/cbc.py b/youtube_dl/extractor/cbc.py
index 54b4b9be9..43f95c739 100644
--- a/youtube_dl/extractor/cbc.py
+++ b/youtube_dl/extractor/cbc.py
@@ -17,9 +17,11 @@ from ..utils import (
     xpath_element,
     xpath_with_ns,
     find_xpath_attr,
+    orderedSet,
     parse_duration,
     parse_iso8601,
     parse_age_limit,
+    strip_or_none,
     int_or_none,
     ExtractorError,
 )
@@ -129,15 +131,23 @@ class CBCIE(InfoExtractor):
     def _real_extract(self, url):
         display_id = self._match_id(url)
         webpage = self._download_webpage(url, display_id)
+        title = self._og_search_title(webpage, default=None) or self._html_search_meta(
+            'twitter:title', webpage, 'title', default=None) or self._html_search_regex(
+            r'<title>([^<]+)</title>', webpage, 'title', fatal=False)
         entries = [
             self._extract_player_init(player_init, display_id)
             for player_init in re.findall(r'CBC\.APP\.Caffeine\.initInstance\(({.+?})\);', webpage)]
+        media_ids = []
+        for media_id_re in (
+                r'<iframe[^>]+src="[^"]+?mediaId=(\d+)"',
+                r'<div[^>]+\bid=["\']player-(\d+)',
+                r'guid["\']\s*:\s*["\'](\d+)'):
+            media_ids.extend(re.findall(media_id_re, webpage))
         entries.extend([
             self.url_result('cbcplayer:%s' % media_id, 'CBCPlayer', media_id)
-            for media_id in re.findall(r'<iframe[^>]+src="[^"]+?mediaId=(\d+)"', webpage)])
+            for media_id in orderedSet(media_ids)])
         return self.playlist_result(
-            entries, display_id,
-            self._og_search_title(webpage, fatal=False),
+            entries, display_id, strip_or_none(title),
             self._og_search_description(webpage))
diff --git a/youtube_dl/extractor/ccma.py b/youtube_dl/extractor/ccma.py
index 07f5206c1..544647f92 100644
--- a/youtube_dl/extractor/ccma.py
+++ b/youtube_dl/extractor/ccma.py
@@ -4,13 +4,13 @@ from __future__ import unicode_literals
 
 import re
 
 from .common import InfoExtractor
-from ..compat import compat_str
 from ..utils import (
     clean_html,
     int_or_none,
     parse_duration,
     parse_iso8601,
     parse_resolution,
+    url_or_none,
 )
@@ -53,8 +53,8 @@ class CCMAIE(InfoExtractor):
         media_url = media['media']['url']
         if isinstance(media_url, list):
             for format_ in media_url:
-                format_url = format_.get('file')
-                if not format_url or not isinstance(format_url, compat_str):
+                format_url = url_or_none(format_.get('file'))
+                if not format_url:
                     continue
                 label = format_.get('label')
                 f = parse_resolution(label)
diff --git a/youtube_dl/extractor/ceskatelevize.py b/youtube_dl/extractor/ceskatelevize.py
index 6bad90859..46380430f 100644
--- a/youtube_dl/extractor/ceskatelevize.py
+++ b/youtube_dl/extractor/ceskatelevize.py
@@ -108,7 +108,7 @@ class CeskaTelevizeIE(InfoExtractor):
 
         for user_agent in (None, USER_AGENTS['Safari']):
             req = sanitized_Request(
-                'http://www.ceskatelevize.cz/ivysilani/ajax/get-client-playlist',
+                'https://www.ceskatelevize.cz/ivysilani/ajax/get-client-playlist',
                 data=urlencode_postdata(data))
 
             req.add_header('Content-type', 'application/x-www-form-urlencoded')
diff --git a/youtube_dl/extractor/chaturbate.py b/youtube_dl/extractor/chaturbate.py
index e3eba4be9..e2b828d8a 100644
--- a/youtube_dl/extractor/chaturbate.py
+++ b/youtube_dl/extractor/chaturbate.py
@@ -31,7 +31,8 @@ class ChaturbateIE(InfoExtractor):
     def _real_extract(self, url):
         video_id = self._match_id(url)
 
-        webpage = self._download_webpage(url, video_id)
+        webpage =
self._download_webpage( + url, video_id, headers=self.geo_verification_headers()) m3u8_urls = [] diff --git a/youtube_dl/extractor/clyp.py b/youtube_dl/extractor/clyp.py index 57e643799..06d04de13 100644 --- a/youtube_dl/extractor/clyp.py +++ b/youtube_dl/extractor/clyp.py @@ -1,15 +1,19 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..compat import ( + compat_parse_qs, + compat_urllib_parse_urlparse, +) from ..utils import ( float_or_none, - parse_iso8601, + unified_timestamp, ) class ClypIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?clyp\.it/(?P[a-z0-9]+)' - _TEST = { + _TESTS = [{ 'url': 'https://clyp.it/ojz2wfah', 'md5': '1d4961036c41247ecfdcc439c0cddcbb', 'info_dict': { @@ -21,13 +25,34 @@ class ClypIE(InfoExtractor): 'timestamp': 1443515251, 'upload_date': '20150929', }, - } + }, { + 'url': 'https://clyp.it/b04p1odi?token=b0078e077e15835845c528a44417719d', + 'info_dict': { + 'id': 'b04p1odi', + 'ext': 'mp3', + 'title': 'GJ! (Reward Edit)', + 'description': 'Metal Resistance (THE ONE edition)', + 'duration': 177.789, + 'timestamp': 1528241278, + 'upload_date': '20180605', + }, + 'params': { + 'skip_download': True, + }, + }] def _real_extract(self, url): audio_id = self._match_id(url) + qs = compat_parse_qs(compat_urllib_parse_urlparse(url).query) + token = qs.get('token', [None])[0] + + query = {} + if token: + query['token'] = token + metadata = self._download_json( - 'https://api.clyp.it/%s' % audio_id, audio_id) + 'https://api.clyp.it/%s' % audio_id, audio_id, query=query) formats = [] for secure in ('', 'Secure'): @@ -45,7 +70,7 @@ class ClypIE(InfoExtractor): title = metadata['Title'] description = metadata.get('Description') duration = float_or_none(metadata.get('Duration')) - timestamp = parse_iso8601(metadata.get('DateCreated')) + timestamp = unified_timestamp(metadata.get('DateCreated')) return { 'id': audio_id, diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index e3e1dbd87..926e70692 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -19,6 +19,7 @@ from ..compat import ( compat_cookies, compat_etree_fromstring, compat_getpass, + compat_integer_types, compat_http_client, compat_os_name, compat_str, @@ -51,6 +52,7 @@ from ..utils import ( GeoUtils, int_or_none, js_to_json, + JSON_LD_RE, mimetype2ext, orderedSet, parse_codecs, @@ -339,20 +341,17 @@ class InfoExtractor(object): _GEO_BYPASS attribute may be set to False in order to disable geo restriction bypass mechanisms for a particular extractor. Though it won't disable explicit geo restriction bypass based on - country code provided with geo_bypass_country. (experimental) + country code provided with geo_bypass_country. _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted countries for this extractor. One of these countries will be used by geo restriction bypass mechanism right away in order to bypass - geo restriction, of course, if the mechanism is not disabled. (experimental) + geo restriction, of course, if the mechanism is not disabled. _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted IP blocks in CIDR notation for this extractor. One of these IP blocks will be used by geo restriction bypass mechanism similarly - to _GEO_COUNTRIES. (experimental) - - NB: both these geo attributes are experimental and may change in future - or be completely removed. + to _GEO_COUNTRIES. 
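    For illustration, a hypothetical extractor would opt in to the bypass
    machinery roughly like this (FooTV, its URL and its API are invented
    for this sketch and do not exist in youtube-dl):

        class FooTVIE(InfoExtractor):
            _VALID_URL = r'https?://(?:www\.)?footv\.example/watch/(?P<id>\d+)'
            _GEO_COUNTRIES = ['DE', 'AT']    # bypass fakes an IP from one of these
            _GEO_IP_BLOCKS = ['53.0.0.0/8']  # or from this CIDR block

            def _real_extract(self, url):
                video_id = self._match_id(url)
                info = self._download_json(
                    'https://footv.example/api/videos/%s' % video_id, video_id)
                if info.get('geoblocked'):
                    # reports the countries that would work to the user
                    self.raise_geo_restricted(countries=self._GEO_COUNTRIES)
                return {
                    'id': video_id,
                    'title': info['title'],
                    'url': info['stream_url'],
                }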
Finally, the _WORKING attribute should be set to False for broken IEs in order to warn the users and skip the tests. @@ -551,8 +550,26 @@ class InfoExtractor(object): def IE_NAME(self): return compat_str(type(self).__name__[:-2]) - def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}): - """ Returns the response handle """ + @staticmethod + def __can_accept_status_code(err, expected_status): + assert isinstance(err, compat_urllib_error.HTTPError) + if expected_status is None: + return False + if isinstance(expected_status, compat_integer_types): + return err.code == expected_status + elif isinstance(expected_status, (list, tuple)): + return err.code in expected_status + elif callable(expected_status): + return expected_status(err.code) is True + else: + assert False + + def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None): + """ + Return the response handle. + + See _download_webpage docstring for arguments specification. + """ if note is None: self.report_download_webpage(video_id) elif note is not False: @@ -581,6 +598,10 @@ class InfoExtractor(object): try: return self._downloader.urlopen(url_or_request) except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + if isinstance(err, compat_urllib_error.HTTPError): + if self.__can_accept_status_code(err, expected_status): + return err.fp + if errnote is False: return False if errnote is None: @@ -593,13 +614,17 @@ class InfoExtractor(object): self._downloader.report_warning(errmsg) return False - def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}): - """ Returns a tuple (page content as string, URL handle) """ + def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None): + """ + Return a tuple (page content as string, URL handle). + + See _download_webpage docstring for arguments specification. + """ # Strip hashes from the URL (#1038) if isinstance(url_or_request, (compat_str, str)): url_or_request = url_or_request.partition('#')[0] - urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query) + urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status) if urlh is False: assert not fatal return False @@ -688,13 +713,52 @@ class InfoExtractor(object): return content - def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None, data=None, headers={}, query={}): - """ Returns the data of the page as a string """ + def _download_webpage( + self, url_or_request, video_id, note=None, errnote=None, + fatal=True, tries=1, timeout=5, encoding=None, data=None, + headers={}, query={}, expected_status=None): + """ + Return the data of the page as a string. + + Arguments: + url_or_request -- plain text URL as a string or + a compat_urllib_request.Requestobject + video_id -- Video/playlist/item identifier (string) + + Keyword arguments: + note -- note printed before downloading (string) + errnote -- note printed in case of an error (string) + fatal -- flag denoting whether error should be considered fatal, + i.e. 
whether it should cause ExtractionError to be raised, + otherwise a warning will be reported and extraction continued + tries -- number of tries + timeout -- sleep interval between tries + encoding -- encoding for a page content decoding, guessed automatically + when not explicitly specified + data -- POST data (bytes) + headers -- HTTP headers (dict) + query -- URL query (dict) + expected_status -- allows to accept failed HTTP requests (non 2xx + status code) by explicitly specifying a set of accepted status + codes. Can be any of the following entities: + - an integer type specifying an exact failed status code to + accept + - a list or a tuple of integer types specifying a list of + failed status codes to accept + - a callable accepting an actual failed status code and + returning True if it should be accepted + Note that this argument does not affect success status codes (2xx) + which are always accepted. + """ + success = False try_count = 0 while success is False: try: - res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding, data=data, headers=headers, query=query) + res = self._download_webpage_handle( + url_or_request, video_id, note, errnote, fatal, + encoding=encoding, data=data, headers=headers, query=query, + expected_status=expected_status) success = True except compat_http_client.IncompleteRead as e: try_count += 1 @@ -710,11 +774,17 @@ class InfoExtractor(object): def _download_xml_handle( self, url_or_request, video_id, note='Downloading XML', errnote='Unable to download XML', transform_source=None, - fatal=True, encoding=None, data=None, headers={}, query={}): - """Return a tuple (xml as an xml.etree.ElementTree.Element, URL handle)""" + fatal=True, encoding=None, data=None, headers={}, query={}, + expected_status=None): + """ + Return a tuple (xml as an xml.etree.ElementTree.Element, URL handle). + + See _download_webpage docstring for arguments specification. + """ res = self._download_webpage_handle( url_or_request, video_id, note, errnote, fatal=fatal, - encoding=encoding, data=data, headers=headers, query=query) + encoding=encoding, data=data, headers=headers, query=query, + expected_status=expected_status) if res is False: return res xml_string, urlh = res @@ -722,15 +792,21 @@ class InfoExtractor(object): xml_string, video_id, transform_source=transform_source, fatal=fatal), urlh - def _download_xml(self, url_or_request, video_id, - note='Downloading XML', errnote='Unable to download XML', - transform_source=None, fatal=True, encoding=None, - data=None, headers={}, query={}): - """Return the xml as an xml.etree.ElementTree.Element""" + def _download_xml( + self, url_or_request, video_id, + note='Downloading XML', errnote='Unable to download XML', + transform_source=None, fatal=True, encoding=None, + data=None, headers={}, query={}, expected_status=None): + """ + Return the xml as an xml.etree.ElementTree.Element. + + See _download_webpage docstring for arguments specification. 
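    As a usage sketch of expected_status (the endpoint and error payload
    below are invented; the bbc.py hunk above passes
    expected_status=(403, 404) in exactly this way):

        data = self._download_json(
            'https://api.example.com/videos/%s' % video_id, video_id,
            expected_status=404)  # a 404 body is parsed, not raised
        if data.get('error'):
            raise ExtractorError(data['error'], expected=True)

    A callable also works, e.g. expected_status=lambda x: 500 <= x < 600.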
+ """ res = self._download_xml_handle( url_or_request, video_id, note=note, errnote=errnote, transform_source=transform_source, fatal=fatal, encoding=encoding, - data=data, headers=headers, query=query) + data=data, headers=headers, query=query, + expected_status=expected_status) return res if res is False else res[0] def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True): @@ -748,11 +824,17 @@ class InfoExtractor(object): def _download_json_handle( self, url_or_request, video_id, note='Downloading JSON metadata', errnote='Unable to download JSON metadata', transform_source=None, - fatal=True, encoding=None, data=None, headers={}, query={}): - """Return a tuple (JSON object, URL handle)""" + fatal=True, encoding=None, data=None, headers={}, query={}, + expected_status=None): + """ + Return a tuple (JSON object, URL handle). + + See _download_webpage docstring for arguments specification. + """ res = self._download_webpage_handle( url_or_request, video_id, note, errnote, fatal=fatal, - encoding=encoding, data=data, headers=headers, query=query) + encoding=encoding, data=data, headers=headers, query=query, + expected_status=expected_status) if res is False: return res json_string, urlh = res @@ -763,11 +845,18 @@ class InfoExtractor(object): def _download_json( self, url_or_request, video_id, note='Downloading JSON metadata', errnote='Unable to download JSON metadata', transform_source=None, - fatal=True, encoding=None, data=None, headers={}, query={}): + fatal=True, encoding=None, data=None, headers={}, query={}, + expected_status=None): + """ + Return the JSON object as a dict. + + See _download_webpage docstring for arguments specification. + """ res = self._download_json_handle( url_or_request, video_id, note=note, errnote=errnote, transform_source=transform_source, fatal=fatal, encoding=encoding, - data=data, headers=headers, query=query) + data=data, headers=headers, query=query, + expected_status=expected_status) return res if res is False else res[0] def _parse_json(self, json_string, video_id, transform_source=None, fatal=True): @@ -1058,8 +1147,7 @@ class InfoExtractor(object): def _search_json_ld(self, html, video_id, expected_type=None, **kwargs): json_ld = self._search_regex( - r'(?s)]+type=(["\'])application/ld\+json\1[^>]*>(?P.+?)', - html, 'JSON-LD', group='json_ld', **kwargs) + JSON_LD_RE, html, 'JSON-LD', group='json_ld', **kwargs) default = kwargs.get('default', NO_DEFAULT) if not json_ld: return default if default is not NO_DEFAULT else {} @@ -1768,9 +1856,7 @@ class InfoExtractor(object): 'height': height, }) formats.extend(m3u8_formats) - continue - - if src_ext == 'f4m': + elif src_ext == 'f4m': f4m_url = src_url if not f4m_params: f4m_params = { @@ -1780,9 +1866,13 @@ class InfoExtractor(object): f4m_url += '&' if '?' in f4m_url else '?' 
f4m_url += compat_urllib_parse_urlencode(f4m_params) formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False)) - continue - - if src_url.startswith('http') and self._is_valid_url(src, video_id): + elif src_ext == 'mpd': + formats.extend(self._extract_mpd_formats( + src_url, video_id, mpd_id='dash', fatal=False)) + elif re.search(r'\.ism/[Mm]anifest', src_url): + formats.extend(self._extract_ism_formats( + src_url, video_id, ism_id='mss', fatal=False)) + elif src_url.startswith('http') and self._is_valid_url(src, video_id): http_count += 1 formats.append({ 'url': src_url, @@ -1793,7 +1883,6 @@ class InfoExtractor(object): 'width': width, 'height': height, }) - continue return formats @@ -2015,7 +2104,21 @@ class InfoExtractor(object): representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info) def prepare_template(template_name, identifiers): - t = representation_ms_info[template_name] + tmpl = representation_ms_info[template_name] + # First of, % characters outside $...$ templates + # must be escaped by doubling for proper processing + # by % operator string formatting used further (see + # https://github.com/rg3/youtube-dl/issues/16867). + t = '' + in_template = False + for c in tmpl: + t += c + if c == '$': + in_template = not in_template + elif c == '%' and not in_template: + t += c + # Next, $...$ templates are translated to their + # %(...) counterparts to be used with % operator t = t.replace('$RepresentationID$', representation_id) t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t) t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t) @@ -2346,6 +2449,8 @@ class InfoExtractor(object): media_info['subtitles'].setdefault(lang, []).append({ 'url': absolute_url(src), }) + for f in media_info['formats']: + f.setdefault('http_headers', {})['Referer'] = base_url if media_info['formats'] or media_info['subtitles']: entries.append(media_info) return entries diff --git a/youtube_dl/extractor/crackle.py b/youtube_dl/extractor/crackle.py index fc014f8b5..8dd9d6687 100644 --- a/youtube_dl/extractor/crackle.py +++ b/youtube_dl/extractor/crackle.py @@ -4,23 +4,21 @@ from __future__ import unicode_literals, division import re from .common import InfoExtractor -from ..compat import ( - compat_str, - compat_HTTPError, -) +from ..compat import compat_HTTPError from ..utils import ( determine_ext, float_or_none, int_or_none, parse_age_limit, parse_duration, + url_or_none, ExtractorError ) class CrackleIE(InfoExtractor): - _VALID_URL = r'(?:crackle:|https?://(?:(?:www|m)\.)?crackle\.com/(?:playlist/\d+/|(?:[^/]+/)+))(?P\d+)' - _TEST = { + _VALID_URL = r'(?:crackle:|https?://(?:(?:www|m)\.)?(?:sony)?crackle\.com/(?:playlist/\d+/|(?:[^/]+/)+))(?P\d+)' + _TESTS = [{ # geo restricted to CA 'url': 'https://www.crackle.com/andromeda/2502343', 'info_dict': { @@ -45,7 +43,10 @@ class CrackleIE(InfoExtractor): # m3u8 download 'skip_download': True, } - } + }, { + 'url': 'https://www.sonycrackle.com/andromeda/2502343', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) @@ -83,8 +84,8 @@ class CrackleIE(InfoExtractor): for e in media['MediaURLs']: if e.get('UseDRM') is True: continue - format_url = e.get('Path') - if not format_url or not isinstance(format_url, compat_str): + format_url = url_or_none(e.get('Path')) + if not format_url: continue ext = determine_ext(format_url) if ext == 'm3u8': @@ -121,8 +122,8 @@ class CrackleIE(InfoExtractor): for cc_file in cc_files: if not 
isinstance(cc_file, dict):
                     continue
-                cc_url = cc_file.get('Path')
-                if not cc_url or not isinstance(cc_url, compat_str):
+                cc_url = url_or_none(cc_file.get('Path'))
+                if not cc_url:
                     continue
                 lang = cc_file.get('Locale') or 'en'
                 subtitles.setdefault(lang, []).append({'url': cc_url})
diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py
index 3efdc8c21..463f995c7 100644
--- a/youtube_dl/extractor/crunchyroll.py
+++ b/youtube_dl/extractor/crunchyroll.py
@@ -49,7 +49,7 @@ class CrunchyrollBaseIE(InfoExtractor):
         })
 
     def _login(self):
-        (username, password) = self._get_login_info()
+        username, password = self._get_login_info()
         if username is None:
             return
@@ -262,6 +262,9 @@ class CrunchyrollIE(CrunchyrollBaseIE):
             # Just test metadata extraction
             'skip_download': True,
         },
+    }, {
+        'url': 'http://www.crunchyroll.com/media-723735',
+        'only_matching': True,
     }]
 
     _FORMAT_IDS = {
@@ -580,7 +583,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
 
 class CrunchyrollShowPlaylistIE(CrunchyrollBaseIE):
     IE_NAME = 'crunchyroll:playlist'
-    _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.com/(?!(?:news|anime-news|library|forum|launchcalendar|lineup|store|comics|freetrial|login))(?P<id>[\w\-]+))/?(?:\?|$)'
+    _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.com/(?!(?:news|anime-news|library|forum|launchcalendar|lineup|store|comics|freetrial|login|media-\d+))(?P<id>[\w\-]+))/?(?:\?|$)'
 
     _TESTS = [{
         'url': 'http://www.crunchyroll.com/a-bridge-to-the-starry-skies-hoshizora-e-kakaru-hashi',
diff --git a/youtube_dl/extractor/ctvnews.py b/youtube_dl/extractor/ctvnews.py
index 55a127b76..03f8cefb7 100644
--- a/youtube_dl/extractor/ctvnews.py
+++ b/youtube_dl/extractor/ctvnews.py
@@ -11,10 +11,10 @@ class CTVNewsIE(InfoExtractor):
     _VALID_URL = r'https?://(?:.+?\.)?ctvnews\.ca/(?:video\?(?:clip|playlist|bin)Id=|.*?)(?P<id>[0-9.]+)'
     _TESTS = [{
         'url': 'http://www.ctvnews.ca/video?clipId=901995',
-        'md5': '10deb320dc0ccb8d01d34d12fc2ea672',
+        'md5': '9b8624ba66351a23e0b6e1391971f9af',
         'info_dict': {
             'id': '901995',
-            'ext': 'mp4',
+            'ext': 'flv',
             'title': 'Extended: \'That person cannot be me\' Johnson says',
             'description': 'md5:958dd3b4f5bbbf0ed4d045c790d89285',
             'timestamp': 1467286284,
diff --git a/youtube_dl/extractor/curiositystream.py b/youtube_dl/extractor/curiositystream.py
index 8e45923e3..35b1e7a34 100644
--- a/youtube_dl/extractor/curiositystream.py
+++ b/youtube_dl/extractor/curiositystream.py
@@ -35,7 +35,7 @@ class CuriosityStreamBaseIE(InfoExtractor):
         return result['data']
 
     def _real_initialize(self):
-        (email, password) = self._get_login_info()
+        email, password = self._get_login_info()
         if email is None:
             return
         result = self._download_json(
diff --git a/youtube_dl/extractor/cwtv.py b/youtube_dl/extractor/cwtv.py
index f4cf0f1c5..224a1fb5d 100644
--- a/youtube_dl/extractor/cwtv.py
+++ b/youtube_dl/extractor/cwtv.py
@@ -4,7 +4,10 @@ from __future__ import unicode_literals
 
 from .common import InfoExtractor
 from ..utils import (
     int_or_none,
+    parse_age_limit,
     parse_iso8601,
+    smuggle_url,
+    str_or_none,
 )
@@ -40,10 +43,15 @@ class CWTVIE(InfoExtractor):
             'duration': 1263,
             'series': 'Whose Line Is It Anyway?',
             'season_number': 11,
-            'season': '11',
             'episode_number': 20,
             'upload_date': '20151006',
             'timestamp': 1444107300,
+            'age_limit': 14,
+            'uploader': 'CWTV',
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
         },
     }, {
         'url': 'http://cwtv.com/thecw/chroniclesofcisco/?play=8adebe35-f447-465f-ab52-e863506ff6d6',
@@ -58,60 +66,28 @@ class
CWTVIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - video_data = None - formats = [] - for partner in (154, 213): - vdata = self._download_json( - 'http://metaframe.digitalsmiths.tv/v2/CWtv/assets/%s/partner/%d?format=json' % (video_id, partner), video_id, fatal=False) - if not vdata: - continue - video_data = vdata - for quality, quality_data in vdata.get('videos', {}).items(): - quality_url = quality_data.get('uri') - if not quality_url: - continue - if quality == 'variantplaylist': - formats.extend(self._extract_m3u8_formats( - quality_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) - else: - tbr = int_or_none(quality_data.get('bitrate')) - format_id = 'http' + ('-%d' % tbr if tbr else '') - if self._is_valid_url(quality_url, video_id, format_id): - formats.append({ - 'format_id': format_id, - 'url': quality_url, - 'tbr': tbr, - }) - video_metadata = video_data['assetFields'] - ism_url = video_metadata.get('smoothStreamingUrl') - if ism_url: - formats.extend(self._extract_ism_formats( - ism_url, video_id, ism_id='mss', fatal=False)) - self._sort_formats(formats) + video_data = self._download_json( + 'http://images.cwtv.com/feed/mobileapp/video-meta/apiversion_8/guid_' + video_id, + video_id)['video'] + title = video_data['title'] + mpx_url = video_data.get('mpx_url') or 'http://link.theplatform.com/s/cwtv/media/guid/2703454149/%s?formats=M3U' % video_id - thumbnails = [{ - 'url': image['uri'], - 'width': image.get('width'), - 'height': image.get('height'), - } for image_id, image in video_data['images'].items() if image.get('uri')] if video_data.get('images') else None - - subtitles = { - 'en': [{ - 'url': video_metadata['UnicornCcUrl'], - }], - } if video_metadata.get('UnicornCcUrl') else None + season = str_or_none(video_data.get('season')) + episode = str_or_none(video_data.get('episode')) + if episode and season: + episode = episode.lstrip(season) return { + '_type': 'url_transparent', 'id': video_id, - 'title': video_metadata['title'], - 'description': video_metadata.get('description'), - 'duration': int_or_none(video_metadata.get('duration')), - 'series': video_metadata.get('seriesName'), - 'season_number': int_or_none(video_metadata.get('seasonNumber')), - 'season': video_metadata.get('seasonName'), - 'episode_number': int_or_none(video_metadata.get('episodeNumber')), - 'timestamp': parse_iso8601(video_data.get('startTime')), - 'thumbnails': thumbnails, - 'formats': formats, - 'subtitles': subtitles, + 'title': title, + 'url': smuggle_url(mpx_url, {'force_smil_url': True}), + 'description': video_data.get('description_long'), + 'duration': int_or_none(video_data.get('duration_secs')), + 'series': video_data.get('series_name'), + 'season_number': int_or_none(season), + 'episode_number': int_or_none(episode), + 'timestamp': parse_iso8601(video_data.get('start_time')), + 'age_limit': parse_age_limit(video_data.get('rating')), + 'ie_key': 'ThePlatform', } diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 0e7d587dd..040f0bd02 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -1,22 +1,29 @@ # coding: utf-8 from __future__ import unicode_literals -import re -import json +import base64 +import functools +import hashlib import itertools +import json +import random +import re +import string from .common import InfoExtractor - +from ..compat import compat_struct_pack from ..utils import ( determine_ext, error_to_compat_str, ExtractorError, int_or_none, + mimetype2ext, 
+ OnDemandPagedList, parse_iso8601, sanitized_Request, str_to_int, unescapeHTML, - mimetype2ext, + urlencode_postdata, ) @@ -64,7 +71,6 @@ class DailymotionIE(DailymotionBaseInfoExtractor): 'uploader': 'Deadline', 'uploader_id': 'x1xm8ri', 'age_limit': 0, - 'view_count': int, }, }, { 'url': 'https://www.dailymotion.com/video/x2iuewm_steam-machine-models-pricing-listed-on-steam-store-ign-news_videogames', @@ -141,7 +147,8 @@ class DailymotionIE(DailymotionBaseInfoExtractor): age_limit = self._rta_search(webpage) - description = self._og_search_description(webpage) or self._html_search_meta( + description = self._og_search_description( + webpage, default=None) or self._html_search_meta( 'description', webpage, 'description') view_count_str = self._search_regex( @@ -167,6 +174,17 @@ class DailymotionIE(DailymotionBaseInfoExtractor): player = self._parse_json(player_v5, video_id) metadata = player['metadata'] + if metadata.get('error', {}).get('type') == 'password_protected': + password = self._downloader.params.get('videopassword') + if password: + r = int(metadata['id'][1:], 36) + us64e = lambda x: base64.urlsafe_b64encode(x).decode().strip('=') + t = ''.join(random.choice(string.ascii_letters) for i in range(10)) + n = us64e(compat_struct_pack('I', r)) + i = us64e(hashlib.md5(('%s%d%s' % (password, r, t)).encode()).digest()) + metadata = self._download_json( + 'http://www.dailymotion.com/player/metadata/video/p' + i + t + n, video_id) + self._check_error(metadata) formats = [] @@ -180,9 +198,12 @@ class DailymotionIE(DailymotionBaseInfoExtractor): continue ext = mimetype2ext(type_) or determine_ext(media_url) if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( + m3u8_formats = self._extract_m3u8_formats( media_url, video_id, 'mp4', preference=-1, - m3u8_id='hls', fatal=False)) + m3u8_id='hls', fatal=False) + for f in m3u8_formats: + f['url'] = f['url'].split('#')[0] + formats.append(f) elif ext == 'f4m': formats.extend(self._extract_f4m_formats( media_url, video_id, preference=-1, f4m_id='hds', fatal=False)) @@ -299,8 +320,8 @@ class DailymotionIE(DailymotionBaseInfoExtractor): def _check_error(self, info): error = info.get('error') - if info.get('error') is not None: - title = error['title'] + if error: + title = error.get('title') or error['message'] # See https://developer.dailymotion.com/api#access-error if error.get('code') == 'DM007': self.raise_geo_restricted(msg=title) @@ -325,17 +346,93 @@ class DailymotionIE(DailymotionBaseInfoExtractor): class DailymotionPlaylistIE(DailymotionBaseInfoExtractor): IE_NAME = 'dailymotion:playlist' - _VALID_URL = r'(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/playlist/(?P[^/?#&]+)' - _MORE_PAGES_INDICATOR = r'(?s)
.*?x[0-9a-z]+)' _TESTS = [{ 'url': 'http://www.dailymotion.com/playlist/xv4bw_nqtv_sport/1#video=xl8v3q', 'info_dict': { 'title': 'SPORT', - 'id': 'xv4bw_nqtv_sport', + 'id': 'xv4bw', }, 'playlist_mincount': 20, }] + _PAGE_SIZE = 100 + + def _fetch_page(self, playlist_id, authorizaion, page): + page += 1 + videos = self._download_json( + 'https://graphql.api.dailymotion.com', + playlist_id, 'Downloading page %d' % page, + data=json.dumps({ + 'query': '''{ + collection(xid: "%s") { + videos(first: %d, page: %d) { + pageInfo { + hasNextPage + nextPage + } + edges { + node { + xid + url + } + } + } + } +}''' % (playlist_id, self._PAGE_SIZE, page) + }).encode(), headers={ + 'Authorization': authorizaion, + 'Origin': 'https://www.dailymotion.com', + })['data']['collection']['videos'] + for edge in videos['edges']: + node = edge['node'] + yield self.url_result( + node['url'], DailymotionIE.ie_key(), node['xid']) + + def _real_extract(self, url): + playlist_id = self._match_id(url) + webpage = self._download_webpage(url, playlist_id) + api = self._parse_json(self._search_regex( + r'__PLAYER_CONFIG__\s*=\s*({.+?});', + webpage, 'player config'), playlist_id)['context']['api'] + auth = self._download_json( + api.get('auth_url', 'https://graphql.api.dailymotion.com/oauth/token'), + playlist_id, data=urlencode_postdata({ + 'client_id': api.get('client_id', 'f1a362d288c1b98099c7'), + 'client_secret': api.get('client_secret', 'eea605b96e01c796ff369935357eca920c5da4c5'), + 'grant_type': 'client_credentials', + })) + authorizaion = '%s %s' % (auth.get('token_type', 'Bearer'), auth['access_token']) + entries = OnDemandPagedList(functools.partial( + self._fetch_page, playlist_id, authorizaion), self._PAGE_SIZE) + return self.playlist_result( + entries, playlist_id, + self._og_search_title(webpage)) + + +class DailymotionUserIE(DailymotionBaseInfoExtractor): + IE_NAME = 'dailymotion:user' + _VALID_URL = r'https?://(?:www\.)?dailymotion\.[a-z]{2,3}/(?!(?:embed|swf|#|video|playlist)/)(?:(?:old/)?user/)?(?P[^/]+)' + _MORE_PAGES_INDICATOR = r'(?s)
.*?[^/]+)' - _PAGE_TEMPLATE = 'http://www.dailymotion.com/user/%s/%s' - _TESTS = [{ - 'url': 'https://www.dailymotion.com/user/nqtv', - 'info_dict': { - 'id': 'nqtv', - 'title': 'Rémi Gaillard', - }, - 'playlist_mincount': 100, - }, { - 'url': 'http://www.dailymotion.com/user/UnderProject', - 'info_dict': { - 'id': 'UnderProject', - 'title': 'UnderProject', - }, - 'playlist_mincount': 1800, - 'expected_warnings': [ - 'Stopped at duplicated page', - ], - 'skip': 'Takes too long time', - }] - def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) user = mobj.group('user') diff --git a/youtube_dl/extractor/dctp.py b/youtube_dl/extractor/dctp.py index 3a6d0560e..769a219df 100644 --- a/youtube_dl/extractor/dctp.py +++ b/youtube_dl/extractor/dctp.py @@ -5,13 +5,16 @@ from .common import InfoExtractor from ..compat import compat_str from ..utils import ( float_or_none, - unified_strdate, + int_or_none, + unified_timestamp, + url_or_none, ) class DctpTvIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?dctp\.tv/(?:#/)?filme/(?P[^/?#&]+)' - _TEST = { + _TESTS = [{ + # 4x3 'url': 'http://www.dctp.tv/filme/videoinstallation-fuer-eine-kaufhausfassade/', 'info_dict': { 'id': '95eaa4f33dad413aa17b4ee613cccc6c', @@ -19,37 +22,55 @@ class DctpTvIE(InfoExtractor): 'ext': 'flv', 'title': 'Videoinstallation für eine Kaufhausfassade', 'description': 'Kurzfilm', - 'upload_date': '20110407', 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 71.24, + 'timestamp': 1302172322, + 'upload_date': '20110407', }, 'params': { # rtmp download 'skip_download': True, }, - } + }, { + # 16x9 + 'url': 'http://www.dctp.tv/filme/sind-youtuber-die-besseren-lehrer/', + 'only_matching': True, + }] + + _BASE_URL = 'http://dctp-ivms2-restapi.s3.amazonaws.com' def _real_extract(self, url): display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) + version = self._download_json( + '%s/version.json' % self._BASE_URL, display_id, + 'Downloading version JSON') - video_id = self._html_search_meta( - 'DC.identifier', webpage, 'video id', - default=None) or self._search_regex( - r'id=["\']uuid[^>]+>([^<]+)<', webpage, 'video id') + restapi_base = '%s/%s/restapi' % ( + self._BASE_URL, version['version_name']) - title = self._og_search_title(webpage) + info = self._download_json( + '%s/slugs/%s.json' % (restapi_base, display_id), display_id, + 'Downloading video info JSON') + + media = self._download_json( + '%s/media/%s.json' % (restapi_base, compat_str(info['object_id'])), + display_id, 'Downloading media JSON') + + uuid = media['uuid'] + title = media['title'] + ratio = '16x9' if media.get('is_wide') else '4x3' + play_path = 'mp4:%s_dctp_0500_%s.m4v' % (uuid, ratio) servers = self._download_json( 'http://www.dctp.tv/streaming_servers/', display_id, - note='Downloading server list', fatal=False) + note='Downloading server list JSON', fatal=False) if servers: endpoint = next( server['endpoint'] for server in servers - if isinstance(server.get('endpoint'), compat_str) and + if url_or_none(server.get('endpoint')) and 'cloudfront' in server['endpoint']) else: endpoint = 'rtmpe://s2pqqn4u96e4j8.cloudfront.net/cfx/st/' @@ -60,27 +81,35 @@ class DctpTvIE(InfoExtractor): formats = [{ 'url': endpoint, 'app': app, - 'play_path': 'mp4:%s_dctp_0500_4x3.m4v' % video_id, + 'play_path': play_path, 'page_url': url, - 'player_url': 'http://svm-prod-dctptv-static.s3.amazonaws.com/dctptv-relaunch2012-109.swf', + 'player_url': 'http://svm-prod-dctptv-static.s3.amazonaws.com/dctptv-relaunch2012-110.swf', 'ext': 'flv', 
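# The Dailymotion playlist rewrite above drives its GraphQL pager through
# OnDemandPagedList, so a page is only fetched once its entries are
# actually consumed. A self-contained sketch of that pattern (the ITEMS
# list here is invented and stands in for remote API responses):
#
#     import functools
#     from youtube_dl.utils import OnDemandPagedList
#
#     PAGE_SIZE = 2
#     ITEMS = ['a', 'b', 'c', 'd', 'e']
#
#     def fetch_page(items, page):
#         # called lazily with a 0-based page number, once per needed page
#         start = page * PAGE_SIZE
#         for item in items[start:start + PAGE_SIZE]:
#             yield {'id': item}
#
#     entries = OnDemandPagedList(
#         functools.partial(fetch_page, ITEMS), PAGE_SIZE)
#     entries.getslice(0, 3)  # fetches only pages 0 and 1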
}] - description = self._html_search_meta('DC.description', webpage) - upload_date = unified_strdate( - self._html_search_meta('DC.date.created', webpage)) - thumbnail = self._og_search_thumbnail(webpage) - duration = float_or_none(self._search_regex( - r'id=["\']duration_in_ms[^+]>(\d+)', webpage, 'duration', - default=None), scale=1000) + thumbnails = [] + images = media.get('images') + if isinstance(images, list): + for image in images: + if not isinstance(image, dict): + continue + image_url = url_or_none(image.get('url')) + if not image_url: + continue + thumbnails.append({ + 'url': image_url, + 'width': int_or_none(image.get('width')), + 'height': int_or_none(image.get('height')), + }) return { - 'id': video_id, - 'title': title, - 'formats': formats, + 'id': uuid, 'display_id': display_id, - 'description': description, - 'upload_date': upload_date, - 'thumbnail': thumbnail, - 'duration': duration, + 'title': title, + 'alt_title': media.get('subtitle'), + 'description': media.get('description') or media.get('teaser'), + 'timestamp': unified_timestamp(media.get('created')), + 'duration': float_or_none(media.get('duration_in_ms'), scale=1000), + 'thumbnails': thumbnails, + 'formats': formats, } diff --git a/youtube_dl/extractor/discoverygo.py b/youtube_dl/extractor/discoverygo.py index 3368c4c07..9e7b14a7d 100644 --- a/youtube_dl/extractor/discoverygo.py +++ b/youtube_dl/extractor/discoverygo.py @@ -3,7 +3,6 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( determine_ext, extract_attributes, @@ -12,6 +11,7 @@ from ..utils import ( parse_age_limit, remove_end, unescapeHTML, + url_or_none, ) @@ -69,9 +69,8 @@ class DiscoveryGoBaseIE(InfoExtractor): captions = stream.get('captions') if isinstance(captions, list): for caption in captions: - subtitle_url = caption.get('fileUrl') - if (not subtitle_url or not isinstance(subtitle_url, compat_str) or - not subtitle_url.startswith('http')): + subtitle_url = url_or_none(caption.get('fileUrl')) + if not subtitle_url or not subtitle_url.startswith('http'): continue lang = caption.get('fileLang', 'en') ext = determine_ext(subtitle_url) diff --git a/youtube_dl/extractor/discoverynetworks.py b/youtube_dl/extractor/discoverynetworks.py index b6653784c..fba1ef221 100644 --- a/youtube_dl/extractor/discoverynetworks.py +++ b/youtube_dl/extractor/discoverynetworks.py @@ -3,8 +3,8 @@ from __future__ import unicode_literals import re -from .common import InfoExtractor from .brightcove import BrightcoveLegacyIE +from .dplay import DPlayIE from ..compat import ( compat_parse_qs, compat_urlparse, @@ -12,8 +12,13 @@ from ..compat import ( from ..utils import smuggle_url -class DiscoveryNetworksDeIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?:discovery|tlc|animalplanet|dmax)\.de/(?:.*#(?P\d+)|(?:[^/]+/)*videos/(?P[^/?#]+))' +class DiscoveryNetworksDeIE(DPlayIE): + _VALID_URL = r'''(?x)https?://(?:www\.)?(?P<site>discovery|tlc|animalplanet|dmax)\.de/ + (?: + .*\#(?P<id>\d+)| + (?:[^/]+/)*videos/(?P<display_id>[^/?#]+)| + programme/(?P<programme>[^/]+)/video/(?P<alternate_id>[^/]+) + )''' _TESTS = [{ 'url': 'http://www.tlc.de/sendungen/breaking-amish/videos/#3235167922001', @@ -40,6 +45,14 @@ class DiscoveryNetworksDeIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) + alternate_id = mobj.group('alternate_id') + if alternate_id: + self._initialize_geo_bypass({ + 'countries': ['DE'], + }) + return self._get_disco_api_info( + 
url, '%s/%s' % (mobj.group('programme'), alternate_id), + 'sonic-eu1-prod.disco-api.com', mobj.group('site') + 'de') brightcove_id = mobj.group('id') if not brightcove_id: title = mobj.group('title') diff --git a/youtube_dl/extractor/dplay.py b/youtube_dl/extractor/dplay.py index 8e0374320..ebf59512c 100644 --- a/youtube_dl/extractor/dplay.py +++ b/youtube_dl/extractor/dplay.py @@ -21,6 +21,7 @@ from ..utils import ( unified_strdate, unified_timestamp, update_url_query, + urljoin, USER_AGENTS, ) @@ -97,6 +98,75 @@ class DPlayIE(InfoExtractor): 'only_matching': True, }] + def _get_disco_api_info(self, url, display_id, disco_host, realm): + disco_base = 'https://' + disco_host + token = self._download_json( + '%s/token' % disco_base, display_id, 'Downloading token', + query={ + 'realm': realm, + })['data']['attributes']['token'] + headers = { + 'Referer': url, + 'Authorization': 'Bearer ' + token, + } + video = self._download_json( + '%s/content/videos/%s' % (disco_base, display_id), display_id, + headers=headers, query={ + 'include': 'show' + }) + video_id = video['data']['id'] + info = video['data']['attributes'] + title = info['name'] + formats = [] + for format_id, format_dict in self._download_json( + '%s/playback/videoPlaybackInfo/%s' % (disco_base, video_id), + display_id, headers=headers)['data']['attributes']['streaming'].items(): + if not isinstance(format_dict, dict): + continue + format_url = format_dict.get('url') + if not format_url: + continue + ext = determine_ext(format_url) + if format_id == 'dash' or ext == 'mpd': + formats.extend(self._extract_mpd_formats( + format_url, display_id, mpd_id='dash', fatal=False)) + elif format_id == 'hls' or ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, display_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls', + fatal=False)) + else: + formats.append({ + 'url': format_url, + 'format_id': format_id, + }) + self._sort_formats(formats) + + series = None + try: + included = video.get('included') + if isinstance(included, list): + show = next(e for e in included if e.get('type') == 'show') + series = try_get( + show, lambda x: x['attributes']['name'], compat_str) + except StopIteration: + pass + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': info.get('description'), + 'duration': float_or_none( + info.get('videoDuration'), scale=1000), + 'timestamp': unified_timestamp(info.get('publishStart')), + 'series': series, + 'season_number': int_or_none(info.get('seasonNumber')), + 'episode_number': int_or_none(info.get('episodeNumber')), + 'age_limit': int_or_none(info.get('minimum_age')), + 'formats': formats, + } + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) display_id = mobj.group('id') @@ -113,72 +183,8 @@ class DPlayIE(InfoExtractor): if not video_id: host = mobj.group('host') - disco_base = 'https://disco-api.%s' % host - self._download_json( - '%s/token' % disco_base, display_id, 'Downloading token', - query={ - 'realm': host.replace('.', ''), - }) - video = self._download_json( - '%s/content/videos/%s' % (disco_base, display_id), display_id, - headers={ - 'Referer': url, - 'x-disco-client': 'WEB:UNKNOWN:dplay-client:0.0.1', - }, query={ - 'include': 'show' - }) - video_id = video['data']['id'] - info = video['data']['attributes'] - title = info['name'] - formats = [] - for format_id, format_dict in self._download_json( - '%s/playback/videoPlaybackInfo/%s' % (disco_base, video_id), - display_id)['data']['attributes']['streaming'].items(): - if 
not isinstance(format_dict, dict): - continue - format_url = format_dict.get('url') - if not format_url: - continue - ext = determine_ext(format_url) - if format_id == 'dash' or ext == 'mpd': - formats.extend(self._extract_mpd_formats( - format_url, display_id, mpd_id='dash', fatal=False)) - elif format_id == 'hls' or ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - format_url, display_id, 'mp4', - entry_protocol='m3u8_native', m3u8_id='hls', - fatal=False)) - else: - formats.append({ - 'url': format_url, - 'format_id': format_id, - }) - self._sort_formats(formats) - - series = None - try: - included = video.get('included') - if isinstance(included, list): - show = next(e for e in included if e.get('type') == 'show') - series = try_get( - show, lambda x: x['attributes']['name'], compat_str) - except StopIteration: - pass - - return { - 'id': video_id, - 'display_id': display_id, - 'title': title, - 'description': info.get('description'), - 'duration': float_or_none( - info.get('videoDuration'), scale=1000), - 'timestamp': unified_timestamp(info.get('publishStart')), - 'series': series, - 'season_number': int_or_none(info.get('seasonNumber')), - 'episode_number': int_or_none(info.get('episodeNumber')), - 'age_limit': int_or_none(info.get('minimum_age')), - 'formats': formats, - } + return self._get_disco_api_info( + url, display_id, 'disco-api.' + host, host.replace('.', '')) info = self._download_json( 'http://%s/api/v2/ajax/videos?video_id=%s' % (domain, video_id), @@ -305,9 +311,11 @@ class DPlayItIE(InfoExtractor): if not info: info_url = self._search_regex( - r'url\s*[:=]\s*["\']((?:https?:)?//[^/]+/playback/videoPlaybackInfo/\d+)', - webpage, 'info url') + (r'playback_json_url\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', + r'url\s*[:=]\s*["\'](?P<url>(?:https?:)?//[^/]+/playback/videoPlaybackInfo/\d+)'), + webpage, 'info url', group='url') + info_url = urljoin(url, info_url) video_id = info_url.rpartition('/')[-1] try: @@ -317,6 +325,8 @@ class DPlayItIE(InfoExtractor): 'dplayit_token').value, 'Referer': url, }) + if isinstance(info, compat_str): + info = self._parse_json(info, display_id) except ExtractorError as e: if isinstance(e.cause, compat_HTTPError) and e.cause.code in (400, 403): info = self._parse_json(e.cause.read().decode('utf-8'), display_id) @@ -332,6 +342,7 @@ class DPlayItIE(InfoExtractor): formats = self._extract_m3u8_formats( hls_url, display_id, ext='mp4', entry_protocol='m3u8_native', m3u8_id='hls') + self._sort_formats(formats) series = self._html_search_regex( r'(?s)<h1[^>]+class=["\'].*?\bshow_title\b.*?["\'][^>]*>(.+?)</h1>', diff --git a/youtube_dl/extractor/dramafever.py b/youtube_dl/extractor/dramafever.py index ffbd2623d..db1de699f 100644 --- a/youtube_dl/extractor/dramafever.py +++ b/youtube_dl/extractor/dramafever.py @@ -7,7 +7,6 @@ import json from .common import InfoExtractor from ..compat import ( compat_HTTPError, - compat_str, compat_urlparse, ) from ..utils import ( @@ -17,6 +16,7 @@ from ..utils import ( parse_age_limit, parse_duration, unified_timestamp, + url_or_none, ) @@ -42,7 +42,7 @@ class DramaFeverBaseIE(InfoExtractor): self._login() def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: return @@ -139,8 +139,8 @@ class DramaFeverIE(DramaFeverBaseIE): for sub in subs: if not isinstance(sub, dict): continue - sub_url = sub.get('url') - if not sub_url or not isinstance(sub_url, compat_str): + sub_url = url_or_none(sub.get('url')) + if not sub_url: continue 
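# The recurring cleanup in this patch: ad-hoc isinstance(..., compat_str)
# URL checks give way to utils.url_or_none. A sketch of the semantics the
# call sites above rely on (Python 3 str for brevity; the real helper also
# handles compat_str on Python 2): keep the value only when it is a string
# that looks like an absolute or protocol-relative URL.
import re

def url_or_none_sketch(url):
    if not url or not isinstance(url, str):
        return None
    return url if re.match(r'^(?:[a-zA-Z][\da-zA-Z.+-]*:)?//', url) else None

# Call sites that need http(s) specifically, such as the discoverygo caption
# and eporner source loops, still add an explicit startswith('http') check.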
subtitles.setdefault( sub.get('code') or sub.get('language') or 'en', []).append({ @@ -163,8 +163,8 @@ class DramaFeverIE(DramaFeverBaseIE): for format_id, format_dict in download_assets.items(): if not isinstance(format_dict, dict): continue - format_url = format_dict.get('url') - if not format_url or not isinstance(format_url, compat_str): + format_url = url_or_none(format_dict.get('url')) + if not format_url: continue formats.append({ 'url': format_url, diff --git a/youtube_dl/extractor/dreisat.py b/youtube_dl/extractor/dreisat.py index f138025d5..8d31258c1 100644 --- a/youtube_dl/extractor/dreisat.py +++ b/youtube_dl/extractor/dreisat.py @@ -8,7 +8,6 @@ from ..utils import ( unified_strdate, xpath_text, determine_ext, - qualities, float_or_none, ExtractorError, ) @@ -16,7 +15,8 @@ from ..utils import ( class DreiSatIE(InfoExtractor): IE_NAME = '3sat' - _VALID_URL = r'(?:https?://)?(?:www\.)?3sat\.de/mediathek/(?:index\.php|mediathek\.php)?\?(?:(?:mode|display)=[^&]+&)*obj=(?P<id>[0-9]+)$' + _GEO_COUNTRIES = ['DE'] + _VALID_URL = r'https?://(?:www\.)?3sat\.de/mediathek/(?:(?:index|mediathek)\.php)?\?(?:(?:mode|display)=[^&]+&)*obj=(?P<id>[0-9]+)' _TESTS = [ { 'url': 'http://www.3sat.de/mediathek/index.php?mode=play&obj=45918', @@ -43,7 +43,8 @@ class DreiSatIE(InfoExtractor): def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None): param_groups = {} for param_group in smil.findall(self._xpath_ns('./head/paramGroup', namespace)): - group_id = param_group.attrib.get(self._xpath_ns('id', 'http://www.w3.org/XML/1998/namespace')) + group_id = param_group.get(self._xpath_ns( + 'id', 'http://www.w3.org/XML/1998/namespace')) params = {} for param in param_group: params[param.get('name')] = param.get('value') @@ -54,7 +55,7 @@ class DreiSatIE(InfoExtractor): src = video.get('src') if not src: continue - bitrate = float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000) + bitrate = int_or_none(self._search_regex(r'_(\d+)k', src, 'bitrate', None)) or float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000) group_id = video.get('paramGroup') param_group = param_groups[group_id] for proto in param_group['protocols'].split(','): @@ -75,66 +76,36 @@ class DreiSatIE(InfoExtractor): note='Downloading video info', errnote='Failed to download video info') - status_code = doc.find('./status/statuscode') - if status_code is not None and status_code.text != 'ok': - code = status_code.text - if code == 'notVisibleAnymore': + status_code = xpath_text(doc, './status/statuscode') + if status_code and status_code != 'ok': + if status_code == 'notVisibleAnymore': message = 'Video %s is not available' % video_id else: - message = '%s returned error: %s' % (self.IE_NAME, code) + message = '%s returned error: %s' % (self.IE_NAME, status_code) raise ExtractorError(message, expected=True) - title = doc.find('.//information/title').text - description = xpath_text(doc, './/information/detail', 'description') - duration = int_or_none(xpath_text(doc, './/details/lengthSec', 'duration')) - uploader = xpath_text(doc, './/details/originChannelTitle', 'uploader') - uploader_id = xpath_text(doc, './/details/originChannelId', 'uploader id') - upload_date = unified_strdate(xpath_text(doc, './/details/airtime', 'upload date')) + title = xpath_text(doc, './/information/title', 'title', True) - def xml_to_thumbnails(fnode): - thumbnails = [] - for node in fnode: - thumbnail_url = node.text - if not thumbnail_url: - continue - 
thumbnail = { - 'url': thumbnail_url, - } - if 'key' in node.attrib: - m = re.match('^([0-9]+)x([0-9]+)$', node.attrib['key']) - if m: - thumbnail['width'] = int(m.group(1)) - thumbnail['height'] = int(m.group(2)) - thumbnails.append(thumbnail) - return thumbnails - - thumbnails = xml_to_thumbnails(doc.findall('.//teaserimages/teaserimage')) - - format_nodes = doc.findall('.//formitaeten/formitaet') - quality = qualities(['veryhigh', 'high', 'med', 'low']) - - def get_quality(elem): - return quality(xpath_text(elem, 'quality')) - format_nodes.sort(key=get_quality) - format_ids = [] + urls = [] formats = [] - for fnode in format_nodes: - video_url = fnode.find('url').text - is_available = 'http://www.metafilegenerator' not in video_url - if not is_available: + for fnode in doc.findall('.//formitaeten/formitaet'): + video_url = xpath_text(fnode, 'url') + if not video_url or video_url in urls: continue + urls.append(video_url) + + is_available = 'http://www.metafilegenerator' not in video_url + geoloced = 'static_geoloced_online' in video_url + if not is_available or geoloced: + continue + format_id = fnode.attrib['basetype'] - quality = xpath_text(fnode, './quality', 'quality') format_m = re.match(r'''(?x) (?P<vcodec>[^_]+)_(?P<acodec>[^_]+)_(?P<container>[^_]+)_ (?P<proto>[^_]+)_(?P<index>[^_]+)_(?P<indexproto>[^_]+) ''', format_id) ext = determine_ext(video_url, None) or format_m.group('container') - if ext not in ('smil', 'f4m', 'm3u8'): - format_id = format_id + '-' + quality - if format_id in format_ids: - continue if ext == 'meta': continue @@ -147,24 +118,23 @@ class DreiSatIE(InfoExtractor): if video_url.startswith('https://'): continue formats.extend(self._extract_m3u8_formats( - video_url, video_id, 'mp4', m3u8_id=format_id, fatal=False)) + video_url, video_id, 'mp4', 'm3u8_native', + m3u8_id=format_id, fatal=False)) elif ext == 'f4m': formats.extend(self._extract_f4m_formats( video_url, video_id, f4m_id=format_id, fatal=False)) else: - proto = format_m.group('proto').lower() + quality = xpath_text(fnode, './quality') + if quality: + format_id += '-' + quality - abr = int_or_none(xpath_text(fnode, './audioBitrate', 'abr'), 1000) - vbr = int_or_none(xpath_text(fnode, './videoBitrate', 'vbr'), 1000) + abr = int_or_none(xpath_text(fnode, './audioBitrate'), 1000) + vbr = int_or_none(xpath_text(fnode, './videoBitrate'), 1000) - width = int_or_none(xpath_text(fnode, './width', 'width')) - height = int_or_none(xpath_text(fnode, './height', 'height')) - - filesize = int_or_none(xpath_text(fnode, './filesize', 'filesize')) - - format_note = '' - if not format_note: - format_note = None + tbr = int_or_none(self._search_regex( + r'_(\d+)k', video_url, 'bitrate', None)) + if tbr and vbr and not abr: + abr = tbr - vbr formats.append({ 'format_id': format_id, @@ -174,31 +144,50 @@ class DreiSatIE(InfoExtractor): 'vcodec': format_m.group('vcodec'), 'abr': abr, 'vbr': vbr, - 'width': width, - 'height': height, - 'filesize': filesize, - 'format_note': format_note, - 'protocol': proto, - '_available': is_available, + 'tbr': tbr, + 'width': int_or_none(xpath_text(fnode, './width')), + 'height': int_or_none(xpath_text(fnode, './height')), + 'filesize': int_or_none(xpath_text(fnode, './filesize')), + 'protocol': format_m.group('proto').lower(), }) - format_ids.append(format_id) + + geolocation = xpath_text(doc, './/details/geolocation') + if not formats and geolocation and geolocation != 'none': + self.raise_geo_restricted(countries=self._GEO_COUNTRIES) self._sort_formats(formats) + thumbnails = [] + 
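# A worked example of the bitrate heuristic used a few lines above: 3sat
# media URLs carry the total bitrate as a "_<kbps>k" suffix, and when only
# the video bitrate is known the audio bitrate is approximated as the
# difference. guess_bitrates and the sample URL are illustrative.
import re

def guess_bitrates(video_url, vbr=None, abr=None):
    m = re.search(r'_(\d+)k', video_url)
    tbr = int(m.group(1)) if m else None
    if tbr and vbr and not abr:
        abr = tbr - vbr
    return tbr, vbr, abr

# guess_bitrates('http://example.invalid/clip_1496k.mp4', vbr=1368)
# -> (1496, 1368, 128)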
for node in doc.findall('.//teaserimages/teaserimage'): + thumbnail_url = node.text + if not thumbnail_url: + continue + thumbnail = { + 'url': thumbnail_url, + } + thumbnail_key = node.get('key') + if thumbnail_key: + m = re.match('^([0-9]+)x([0-9]+)$', thumbnail_key) + if m: + thumbnail['width'] = int(m.group(1)) + thumbnail['height'] = int(m.group(2)) + thumbnails.append(thumbnail) + + upload_date = unified_strdate(xpath_text(doc, './/details/airtime')) + return { 'id': video_id, 'title': title, - 'description': description, - 'duration': duration, + 'description': xpath_text(doc, './/information/detail'), + 'duration': int_or_none(xpath_text(doc, './/details/lengthSec')), 'thumbnails': thumbnails, - 'uploader': uploader, - 'uploader_id': uploader_id, + 'uploader': xpath_text(doc, './/details/originChannelTitle'), + 'uploader_id': xpath_text(doc, './/details/originChannelId'), 'upload_date': upload_date, 'formats': formats, } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - details_url = 'http://www.3sat.de/mediathek/xmlservice/web/beitragsDetails?ak=web&id=%s' % video_id + video_id = self._match_id(url) + details_url = 'http://www.3sat.de/mediathek/xmlservice/web/beitragsDetails?id=%s' % video_id return self.extract_from_xml_url(video_id, details_url) diff --git a/youtube_dl/extractor/dtube.py b/youtube_dl/extractor/dtube.py new file mode 100644 index 000000000..4ca97f860 --- /dev/null +++ b/youtube_dl/extractor/dtube.py @@ -0,0 +1,83 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json +import re +from socket import timeout + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_iso8601, +) + + +class DTubeIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?d\.tube/(?:#!/)?v/(?P<uploader_id>[0-9a-z.-]+)/(?P<id>[0-9a-z]{8})' + _TEST = { + 'url': 'https://d.tube/#!/v/benswann/zqd630em', + 'md5': 'a03eaa186618ffa7a3145945543a251e', + 'info_dict': { + 'id': 'zqd630em', + 'ext': 'mp4', + 'title': 'Reality Check: FDA\'s Disinformation Campaign on Kratom', + 'description': 'md5:700d164e066b87f9eac057949e4227c2', + 'uploader_id': 'benswann', + 'upload_date': '20180222', + 'timestamp': 1519328958, + }, + 'params': { + 'format': '480p', + }, + } + + def _real_extract(self, url): + uploader_id, video_id = re.match(self._VALID_URL, url).groups() + result = self._download_json('https://api.steemit.com/', video_id, data=json.dumps({ + 'jsonrpc': '2.0', + 'method': 'get_content', + 'params': [uploader_id, video_id], + }).encode())['result'] + + metadata = json.loads(result['json_metadata']) + video = metadata['video'] + content = video['content'] + info = video.get('info', {}) + title = info.get('title') or result['title'] + + def canonical_url(h): + if not h: + return None + return 'https://ipfs.io/ipfs/' + h + + formats = [] + for q in ('240', '480', '720', '1080', ''): + video_url = canonical_url(content.get('video%shash' % q)) + if not video_url: + continue + format_id = (q + 'p') if q else 'Source' + try: + self.to_screen('%s: Checking %s video format URL' % (video_id, format_id)) + self._downloader._opener.open(video_url, timeout=5).close() + except timeout as e: + self.to_screen( + '%s: %s URL is invalid, skipping' % (video_id, format_id)) + continue + formats.append({ + 'format_id': format_id, + 'url': video_url, + 'height': int_or_none(q), + 'ext': 'mp4', + }) + + return { + 'id': video_id, + 'title': title, + 'description': content.get('description'), + 'thumbnail': 
canonical_url(info.get('snaphash')), + 'tags': content.get('tags') or metadata.get('tags'), + 'duration': info.get('duration'), + 'formats': formats, + 'timestamp': parse_iso8601(result.get('created')), + 'uploader_id': uploader_id, + } diff --git a/youtube_dl/extractor/dvtv.py b/youtube_dl/extractor/dvtv.py index 3f760888e..20996962a 100644 --- a/youtube_dl/extractor/dvtv.py +++ b/youtube_dl/extractor/dvtv.py @@ -91,17 +91,6 @@ class DVTVIE(InfoExtractor): }, { 'url': 'http://video.aktualne.cz/v-cechach-poprve-zazni-zelenkova-zrestaurovana-mse/r~45b4b00483ec11e4883b002590604f2e/', 'only_matching': True, - }, { - 'url': 'https://video.aktualne.cz/dvtv/babis-a-zeman-nesou-vinu-za-to-ze-nemame-jasno-v-tom-kdo-bud/r~026afb54fad711e79704ac1f6b220ee8/', - 'md5': '87defe16681b1429c91f7a74809823c6', - 'info_dict': { - 'id': 'f5ae72f6fad611e794dbac1f6b220ee8', - 'ext': 'mp4', - 'title': 'Babiš a Zeman nesou vinu za to, že nemáme jasno v tom, kdo bude vládnout, říká Pekarová Adamová', - }, - 'params': { - 'skip_download': True, - }, }] def _parse_video_metadata(self, js, video_id, live_js=None): diff --git a/youtube_dl/extractor/eagleplatform.py b/youtube_dl/extractor/eagleplatform.py index 42789278e..36fef07b7 100644 --- a/youtube_dl/extractor/eagleplatform.py +++ b/youtube_dl/extractor/eagleplatform.py @@ -4,14 +4,12 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import ( - compat_HTTPError, - compat_str, -) +from ..compat import compat_HTTPError from ..utils import ( ExtractorError, int_or_none, unsmuggle_url, + url_or_none, ) @@ -177,7 +175,7 @@ class EaglePlatformIE(InfoExtractor): video_id, 'Downloading mp4 JSON', fatal=False) if mp4_data: for format_id, format_url in mp4_data.get('data', {}).items(): - if not isinstance(format_url, compat_str): + if not url_or_none(format_url): continue height = int_or_none(format_id) if height is not None and m3u8_formats_dict.get(height): diff --git a/youtube_dl/extractor/egghead.py b/youtube_dl/extractor/egghead.py index edabaafe6..df11dc206 100644 --- a/youtube_dl/extractor/egghead.py +++ b/youtube_dl/extractor/egghead.py @@ -8,6 +8,7 @@ from ..utils import ( int_or_none, try_get, unified_timestamp, + url_or_none, ) @@ -34,8 +35,8 @@ class EggheadCourseIE(InfoExtractor): entries = [] for lesson in lessons: - lesson_url = lesson.get('http_url') - if not lesson_url or not isinstance(lesson_url, compat_str): + lesson_url = url_or_none(lesson.get('http_url')) + if not lesson_url: continue lesson_id = lesson.get('id') if lesson_id: @@ -95,7 +96,8 @@ class EggheadLessonIE(InfoExtractor): formats = [] for _, format_url in lesson['media_urls'].items(): - if not format_url or not isinstance(format_url, compat_str): + format_url = url_or_none(format_url) + if not format_url: continue ext = determine_ext(format_url) if ext == 'm3u8': diff --git a/youtube_dl/extractor/eporner.py b/youtube_dl/extractor/eporner.py index 81f2e2ee1..6d03d7095 100644 --- a/youtube_dl/extractor/eporner.py +++ b/youtube_dl/extractor/eporner.py @@ -11,6 +11,7 @@ from ..utils import ( int_or_none, parse_duration, str_to_int, + url_or_none, ) @@ -82,8 +83,8 @@ class EpornerIE(InfoExtractor): for format_id, format_dict in formats_dict.items(): if not isinstance(format_dict, dict): continue - src = format_dict.get('src') - if not isinstance(src, compat_str) or not src.startswith('http'): + src = url_or_none(format_dict.get('src')) + if not src or not src.startswith('http'): continue if kind == 'hls': 
formats.extend(self._extract_m3u8_formats( diff --git a/youtube_dl/extractor/expressen.py b/youtube_dl/extractor/expressen.py new file mode 100644 index 000000000..934571472 --- /dev/null +++ b/youtube_dl/extractor/expressen.py @@ -0,0 +1,98 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + int_or_none, + unescapeHTML, + unified_timestamp, +) + + +class ExpressenIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?:// + (?:www\.)?expressen\.se/ + (?:(?:tvspelare/video|videoplayer/embed)/)? + tv/(?:[^/]+/)* + (?P<id>[^/?#&]+) + ''' + _TESTS = [{ + 'url': 'https://www.expressen.se/tv/ledare/ledarsnack/ledarsnack-om-arbetslosheten-bland-kvinnor-i-speciellt-utsatta-omraden/', + 'md5': '2fbbe3ca14392a6b1b36941858d33a45', + 'info_dict': { + 'id': '8690962', + 'ext': 'mp4', + 'title': 'Ledarsnack: Om arbetslösheten bland kvinnor i speciellt utsatta områden', + 'description': 'md5:f38c81ff69f3de4d269bbda012fcbbba', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 788, + 'timestamp': 1526639109, + 'upload_date': '20180518', + }, + }, { + 'url': 'https://www.expressen.se/tv/kultur/kulturdebatt-med-expressens-karin-olsson/', + 'only_matching': True, + }, { + 'url': 'https://www.expressen.se/tvspelare/video/tv/ditv/ekonomistudion/experterna-har-ar-fragorna-som-avgor-valet/?embed=true&external=true&autoplay=true&startVolume=0&partnerId=di', + 'only_matching': True, + }, { + 'url': 'https://www.expressen.se/videoplayer/embed/tv/ditv/ekonomistudion/experterna-har-ar-fragorna-som-avgor-valet/?embed=true&external=true&autoplay=true&startVolume=0&partnerId=di', + 'only_matching': True, + }] + + @staticmethod + def _extract_urls(webpage): + return [ + mobj.group('url') for mobj in re.finditer( + r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//(?:www\.)?expressen\.se/(?:tvspelare/video|videoplayer/embed)/tv/.+?)\1', + webpage)] + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + def extract_data(name): + return self._parse_json( + self._search_regex( + r'data-%s=(["\'])(?P<value>(?:(?!\1).)+)\1' % name, + webpage, 'info', group='value'), + display_id, transform_source=unescapeHTML) + + info = extract_data('video-tracking-info') + video_id = info['videoId'] + + data = extract_data('article-data') + stream = data['stream'] + + if determine_ext(stream) == 'm3u8': + formats = self._extract_m3u8_formats( + stream, display_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls') + else: + formats = [{ + 'url': stream, + }] + self._sort_formats(formats) + + title = info.get('titleRaw') or data['title'] + description = info.get('descriptionRaw') + thumbnail = info.get('socialMediaImage') or data.get('image') + duration = int_or_none(info.get('videoTotalSecondsDuration') or + data.get('totalSecondsDuration')) + timestamp = unified_timestamp(info.get('publishDate')) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'timestamp': timestamp, + 'formats': formats, + } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index f03f98a6c..23ad42da9 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -44,6 +44,7 @@ from .anysex import AnySexIE from .aol import AolIE from .allocine import AllocineIE from .aliexpress import AliExpressLiveIE +from .apa import APAIE from 
.aparat import AparatIE from .appleconnect import AppleConnectIE from .appletrailers import ( @@ -117,6 +118,10 @@ from .bilibili import ( BiliBiliBangumiIE, ) from .biobiochiletv import BioBioChileTVIE +from .bitchute import ( + BitChuteIE, + BitChuteChannelIE, +) from .biqle import BIQLEIE from .bleacherreport import ( BleacherReportIE, @@ -145,6 +150,8 @@ from .camdemy import ( CamdemyIE, CamdemyFolderIE ) +from .cammodels import CamModelsIE +from .camtube import CamTubeIE from .camwithher import CamWithHerIE from .canalplus import CanalplusIE from .canalc2 import Canalc2IE @@ -283,6 +290,7 @@ from .drtv import ( DRTVIE, DRTVLiveIE, ) +from .dtube import DTubeIE from .dvtv import DVTVIE from .dumpert import DumpertIE from .defense import DefenseGouvFrIE @@ -331,6 +339,7 @@ from .esri import EsriVideoIE from .europa import EuropaIE from .everyonesmixtape import EveryonesMixtapeIE from .expotv import ExpoTVIE +from .expressen import ExpressenIE from .extremetube import ExtremeTubeIE from .eyedotv import EyedoTVIE from .facebook import ( @@ -368,7 +377,6 @@ from .foxgay import FoxgayIE from .foxnews import ( FoxNewsIE, FoxNewsArticleIE, - FoxNewsInsiderIE, ) from .foxsports import FoxSportsIE from .franceculture import FranceCultureIE @@ -378,6 +386,7 @@ from .francetv import ( FranceTVSiteIE, FranceTVEmbedIE, FranceTVInfoIE, + FranceTVInfoSportIE, FranceTVJeunesseIE, GenerationWhatIE, CultureboxIE, @@ -385,6 +394,11 @@ from .francetv import ( from .freesound import FreesoundIE from .freespeech import FreespeechIE from .freshlive import FreshLiveIE +from .frontendmasters import ( + FrontendMastersIE, + FrontendMastersLessonIE, + FrontendMastersCourseIE +) from .funimation import FunimationIE from .funk import ( FunkMixIE, @@ -468,10 +482,7 @@ from .imgur import ( ) from .ina import InaIE from .inc import IncIE -from .indavideo import ( - IndavideoIE, - IndavideoEmbedIE, -) +from .indavideo import IndavideoEmbedIE from .infoq import InfoQIE from .instagram import InstagramIE, InstagramUserIE from .internazionale import InternazionaleIE @@ -581,13 +592,16 @@ from .mailru import ( MailRuMusicIE, MailRuMusicSearchIE, ) -from .makerschannel import MakersChannelIE from .makertv import MakerTVIE from .mangomolo import ( MangomoloVideoIE, MangomoloLiveIE, ) from .manyvids import ManyVidsIE +from .markiza import ( + MarkizaIE, + MarkizaPageIE, +) from .massengeschmacktv import MassengeschmackTVIE from .matchtv import MatchTVIE from .mdr import MDRIE @@ -624,7 +638,6 @@ from .mnet import MnetIE from .moevideo import MoeVideoIE from .mofosex import MofosexIE from .mojvideo import MojvideoIE -from .moniker import MonikerIE from .morningstar import MorningstarIE from .motherless import ( MotherlessIE, @@ -645,6 +658,7 @@ from .mtv import ( from .muenchentv import MuenchenTVIE from .musicplayon import MusicPlayOnIE from .mwave import MwaveIE, MwaveMeetGreetIE +from .mychannels import MyChannelsIE from .myspace import MySpaceIE, MySpaceAlbumIE from .myspass import MySpassIE from .myvi import ( @@ -666,6 +680,7 @@ from .nbc import ( NBCOlympicsIE, NBCOlympicsStreamIE, NBCSportsIE, + NBCSportsStreamIE, NBCSportsVPlayerIE, ) from .ndr import ( @@ -705,12 +720,7 @@ from .nexx import ( from .nfb import NFBIE from .nfl import NFLIE from .nhk import NhkVodIE -from .nhl import ( - NHLVideocenterIE, - NHLNewsIE, - NHLVideocenterCategoryIE, - NHLIE, -) +from .nhl import NHLIE from .nick import ( NickIE, NickBrIE, @@ -719,10 +729,7 @@ from .nick import ( NickRuIE, ) from .niconico import NiconicoIE, 
NiconicoPlaylistIE -from .ninecninemedia import ( - NineCNineMediaStackIE, - NineCNineMediaIE, -) +from .ninecninemedia import NineCNineMediaIE from .ninegag import NineGagIE from .ninenow import NineNowIE from .nintendo import NintendoIE @@ -765,7 +772,9 @@ from .nrk import ( NRKSkoleIE, NRKTVIE, NRKTVDirekteIE, + NRKTVEpisodeIE, NRKTVEpisodesIE, + NRKTVSeasonIE, NRKTVSeriesIE, ) from .ntvde import NTVDeIE @@ -810,6 +819,7 @@ from .parliamentliveuk import ParliamentLiveUKIE from .patreon import PatreonIE from .pbs import PBSIE from .pearvideo import PearVideoIE +from .peertube import PeerTubeIE from .people import PeopleIE from .performgroup import PerformGroupIE from .periscope import ( @@ -854,6 +864,10 @@ from .pornhub import ( from .pornotube import PornotubeIE from .pornovoisines import PornoVoisinesIE from .pornoxo import PornoXOIE +from .puhutv import ( + PuhuTVIE, + PuhuTVSerieIE, +) from .presstv import PressTVIE from .primesharetv import PrimeShareTVIE from .promptfile import PromptFileIE @@ -885,7 +899,10 @@ from .rai import ( RaiPlayPlaylistIE, RaiIE, ) -from .raywenderlich import RayWenderlichIE +from .raywenderlich import ( + RayWenderlichIE, + RayWenderlichCourseIE, +) from .rbmaradio import RBMARadioIE from .rds import RDSIE from .redbulltv import RedBullTVIE @@ -1015,7 +1032,10 @@ from .spankbang import SpankBangIE from .spankwire import SpankwireIE from .spiegel import SpiegelIE, SpiegelArticleIE from .spiegeltv import SpiegeltvIE -from .spike import SpikeIE +from .spike import ( + BellatorIE, + ParamountNetworkIE, +) from .stitcher import StitcherIE from .sport5 import Sport5IE from .sportbox import SportBoxEmbedIE @@ -1038,6 +1058,7 @@ from .stretchinternet import StretchInternetIE from .sunporno import SunPornoIE from .svt import ( SVTIE, + SVTPageIE, SVTPlayIE, SVTSeriesIE, ) @@ -1141,6 +1162,7 @@ from .tvc import ( from .tvigle import TvigleIE from .tvland import TVLandIE from .tvn24 import TVN24IE +from .tvnet import TVNetIE from .tvnoe import TVNoeIE from .tvnow import ( TVNowIE, @@ -1276,6 +1298,7 @@ from .viki import ( VikiIE, VikiChannelIE, ) +from .viqeo import ViqeoIE from .viu import ( ViuIE, ViuPlaylistIE, diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 220ada3a6..97cfe0fc3 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -20,6 +20,7 @@ from ..utils import ( int_or_none, js_to_json, limit_length, + parse_count, sanitized_Request, try_get, urlencode_postdata, @@ -56,6 +57,7 @@ class FacebookIE(InfoExtractor): _CHROME_USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.97 Safari/537.36' _VIDEO_PAGE_TEMPLATE = 'https://www.facebook.com/video/video.php?v=%s' + _VIDEO_PAGE_TAHOE_TEMPLATE = 'https://www.facebook.com/video/tahoe/async/%s/?chain=true&isvideo=true' _TESTS = [{ 'url': 'https://www.facebook.com/video.php?v=637842556329505&fref=nf', @@ -74,7 +76,7 @@ class FacebookIE(InfoExtractor): 'info_dict': { 'id': '274175099429670', 'ext': 'mp4', - 'title': 'Asif Nawab Butt posted a video to his Timeline.', + 'title': 're:^Asif Nawab Butt posted a video', 'uploader': 'Asif Nawab Butt', 'upload_date': '20140506', 'timestamp': 1399398998, @@ -132,7 +134,7 @@ class FacebookIE(InfoExtractor): }, { # have 1080P, but only up to 720p in swf params 'url': 'https://www.facebook.com/cnn/videos/10155529876156509/', - 'md5': '0d9813160b146b3bc8744e006027fcc6', + 'md5': '9571fae53d4165bbbadb17a94651dcdc', 'info_dict': { 'id': '10155529876156509', 
'ext': 'mp4', @@ -141,6 +143,7 @@ class FacebookIE(InfoExtractor): 'upload_date': '20161030', 'uploader': 'CNN', 'thumbnail': r're:^https?://.*', + 'view_count': int, }, }, { # bigPipe.onPageletArrive ... onPageletArrive pagelet_group_mall @@ -148,7 +151,7 @@ class FacebookIE(InfoExtractor): 'info_dict': { 'id': '1417995061575415', 'ext': 'mp4', - 'title': 'md5:a7b86ca673f51800cd54687b7f4012fe', + 'title': 'md5:1db063d6a8c13faa8da727817339c857', 'timestamp': 1486648217, 'upload_date': '20170209', 'uploader': 'Yaroslav Korpan', @@ -175,7 +178,7 @@ class FacebookIE(InfoExtractor): 'info_dict': { 'id': '1396382447100162', 'ext': 'mp4', - 'title': 'md5:e2d2700afdf84e121f5d0f999bad13a3', + 'title': 'md5:19a428bbde91364e3de815383b54a235', 'timestamp': 1486035494, 'upload_date': '20170202', 'uploader': 'Elisabeth Ahtn', @@ -208,6 +211,17 @@ class FacebookIE(InfoExtractor): # no title 'url': 'https://www.facebook.com/onlycleverentertainment/videos/1947995502095005/', 'only_matching': True, + }, { + 'url': 'https://www.facebook.com/WatchESLOne/videos/359649331226507/', + 'info_dict': { + 'id': '359649331226507', + 'ext': 'mp4', + 'title': '#ESLOne VoD - Birmingham Finals Day#1 Fnatic vs. @Evil Geniuses', + 'uploader': 'ESL One Dota 2', + }, + 'params': { + 'skip_download': True, + }, }] @staticmethod @@ -226,7 +240,7 @@ class FacebookIE(InfoExtractor): return urls def _login(self): - (useremail, password) = self._get_login_info() + useremail, password = self._get_login_info() if useremail is None: return @@ -312,16 +326,18 @@ class FacebookIE(InfoExtractor): if server_js_data: video_data = extract_video_data(server_js_data.get('instances', [])) + def extract_from_jsmods_instances(js_data): + if js_data: + return extract_video_data(try_get( + js_data, lambda x: x['jsmods']['instances'], list) or []) + if not video_data: server_js_data = self._parse_json( self._search_regex( r'bigPipe\.onPageletArrive\(({.+?})\)\s*;\s*}\s*\)\s*,\s*["\']onPageletArrive\s+(?:stream_pagelet|pagelet_group_mall|permalink_video_pagelet)', webpage, 'js data', default='{}'), video_id, transform_source=js_to_json, fatal=False) - if server_js_data: - video_data = extract_video_data(try_get( - server_js_data, lambda x: x['jsmods']['instances'], - list) or []) + video_data = extract_from_jsmods_instances(server_js_data) if not video_data: if not fatal_if_no_video: @@ -333,8 +349,35 @@ class FacebookIE(InfoExtractor): expected=True) elif '>You must log in to continue' in webpage: self.raise_login_required() - else: - raise ExtractorError('Cannot parse data') + + # Video info not in first request, do a secondary request using + # tahoe player specific URL + tahoe_data = self._download_webpage( + self._VIDEO_PAGE_TAHOE_TEMPLATE % video_id, video_id, + data=urlencode_postdata({ + '__a': 1, + '__pc': self._search_regex( + r'pkg_cohort["\']\s*:\s*["\'](.+?)["\']', webpage, + 'pkg cohort', default='PHASED:DEFAULT'), + '__rev': self._search_regex( + r'client_revision["\']\s*:\s*(\d+),', webpage, + 'client revision', default='3944515'), + 'fb_dtsg': self._search_regex( + r'"DTSGInitialData"\s*,\s*\[\]\s*,\s*{\s*"token"\s*:\s*"([^"]+)"', + webpage, 'dtsg token', default=''), + }), + headers={ + 'Content-Type': 'application/x-www-form-urlencoded', + }) + tahoe_js_data = self._parse_json( + self._search_regex( + r'for\s+\(\s*;\s*;\s*\)\s*;(.+)', tahoe_data, + 'tahoe js data', default='{}'), + video_id, fatal=False) + video_data = extract_from_jsmods_instances(tahoe_js_data) + + if not video_data: + raise ExtractorError('Cannot parse data') 
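# The tahoe fallback above receives a response wrapped in Facebook's
# "for (;;);" anti-JSON-hijacking guard, which the
# r'for\s+\(\s*;\s*;\s*\)\s*;(.+)' regex strips before _parse_json. A
# standalone sketch of that unwrapping step; parse_tahoe_body is
# illustrative, not a youtube-dl helper.
import json
import re

def parse_tahoe_body(body):
    # everything after the infinite-loop guard is ordinary JSON
    mobj = re.search(r'for\s+\(\s*;\s*;\s*\)\s*;(.+)', body, re.DOTALL)
    return json.loads(mobj.group(1)) if mobj else None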
formats = [] for f in video_data: @@ -380,12 +423,17 @@ class FacebookIE(InfoExtractor): video_title = 'Facebook video #%s' % video_id uploader = clean_html(get_element_by_id( 'fbPhotoPageAuthorName', webpage)) or self._search_regex( - r'ownerName\s*:\s*"([^"]+)"', webpage, 'uploader', fatal=False) + r'ownerName\s*:\s*"([^"]+)"', webpage, 'uploader', + fatal=False) or self._og_search_title(webpage, fatal=False) timestamp = int_or_none(self._search_regex( r'<abbr[^>]+data-utime=["\'](\d+)', webpage, 'timestamp', default=None)) thumbnail = self._og_search_thumbnail(webpage) + view_count = parse_count(self._search_regex( + r'\bviewCount\s*:\s*["\']([\d,.]+)', webpage, 'view count', + default=None)) + info_dict = { 'id': video_id, 'title': video_title, @@ -393,6 +441,7 @@ class FacebookIE(InfoExtractor): 'uploader': uploader, 'timestamp': timestamp, 'thumbnail': thumbnail, + 'view_count': view_count, } return webpage, info_dict diff --git a/youtube_dl/extractor/fc2.py b/youtube_dl/extractor/fc2.py index 448647d72..435561147 100644 --- a/youtube_dl/extractor/fc2.py +++ b/youtube_dl/extractor/fc2.py @@ -46,7 +46,7 @@ class FC2IE(InfoExtractor): }] def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None or password is None: return False diff --git a/youtube_dl/extractor/firsttv.py b/youtube_dl/extractor/firsttv.py index 4803a22c8..28617d83c 100644 --- a/youtube_dl/extractor/firsttv.py +++ b/youtube_dl/extractor/firsttv.py @@ -10,6 +10,7 @@ from ..utils import ( int_or_none, qualities, unified_strdate, + url_or_none, ) @@ -88,8 +89,8 @@ class FirstTVIE(InfoExtractor): formats = [] path = None for f in item.get('mbr', []): - src = f.get('src') - if not src or not isinstance(src, compat_str): + src = url_or_none(f.get('src')) + if not src: continue tbr = int_or_none(self._search_regex( r'_(\d{3,})\.mp4', src, 'tbr', default=None)) diff --git a/youtube_dl/extractor/foxnews.py b/youtube_dl/extractor/foxnews.py index dc0662f74..63613cb85 100644 --- a/youtube_dl/extractor/foxnews.py +++ b/youtube_dl/extractor/foxnews.py @@ -58,6 +58,14 @@ class FoxNewsIE(AMPIE): }, ] + @staticmethod + def _extract_urls(webpage): + return [ + mobj.group('url') + for mobj in re.finditer( + r'<(?:amp-)?iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//video\.foxnews\.com/v/video-embed\.html?.*?\bvideo_id=\d+.*?)\1', + webpage)] + def _real_extract(self, url): host, video_id = re.match(self._VALID_URL, url).groups() @@ -68,21 +76,41 @@ class FoxNewsIE(AMPIE): class FoxNewsArticleIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?foxnews\.com/(?!v)([^/]+/)+(?P<id>[a-z-]+)' + _VALID_URL = r'https?://(?:www\.)?(?:insider\.)?foxnews\.com/(?!v)([^/]+/)+(?P<id>[a-z-]+)' IE_NAME = 'foxnews:article' - _TEST = { + _TESTS = [{ + # data-video-id 'url': 'http://www.foxnews.com/politics/2016/09/08/buzz-about-bud-clinton-camp-denies-claims-wore-earpiece-at-forum.html', - 'md5': '62aa5a781b308fdee212ebb6f33ae7ef', + 'md5': '83d44e1aff1433e7a29a7b537d1700b5', 'info_dict': { 'id': '5116295019001', 'ext': 'mp4', 'title': 'Trump and Clinton asked to defend positions on Iraq War', 'description': 'Veterans react on \'The Kelly File\'', - 'timestamp': 1473299755, + 'timestamp': 1473301045, 'upload_date': '20160908', }, - } + }, { + # iframe embed + 'url': 'http://www.foxnews.com/us/2018/03/09/parkland-survivor-kyle-kashuv-on-meeting-trump-his-app-to-prevent-another-school-shooting.amp.html?__twitter_impression=true', + 'info_dict': { + 'id': '5748266721001', + 'ext': 
'flv', + 'title': 'Kyle Kashuv has a positive message for the Trump White House', + 'description': 'Marjory Stoneman Douglas student disagrees with classmates.', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 229, + 'timestamp': 1520594670, + 'upload_date': '20180309', + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://insider.foxnews.com/2016/08/25/univ-wisconsin-student-group-pushing-silence-certain-words', + 'only_matching': True, + }] def _real_extract(self, url): display_id = self._match_id(url) @@ -90,51 +118,10 @@ class FoxNewsArticleIE(InfoExtractor): video_id = self._html_search_regex( r'data-video-id=([\'"])(?P<id>[^\'"]+)\1', - webpage, 'video ID', group='id') + webpage, 'video ID', group='id', default=None) + if video_id: + return self.url_result( + 'http://video.foxnews.com/v/' + video_id, FoxNewsIE.ie_key()) + return self.url_result( - 'http://video.foxnews.com/v/' + video_id, - FoxNewsIE.ie_key()) - - -class FoxNewsInsiderIE(InfoExtractor): - _VALID_URL = r'https?://insider\.foxnews\.com/([^/]+/)+(?P<id>[a-z-]+)' - IE_NAME = 'foxnews:insider' - - _TEST = { - 'url': 'http://insider.foxnews.com/2016/08/25/univ-wisconsin-student-group-pushing-silence-certain-words', - 'md5': 'a10c755e582d28120c62749b4feb4c0c', - 'info_dict': { - 'id': '5099377331001', - 'display_id': 'univ-wisconsin-student-group-pushing-silence-certain-words', - 'ext': 'mp4', - 'title': 'Student Group: Saying \'Politically Correct,\' \'Trash\' and \'Lame\' Is Offensive', - 'description': 'Is campus censorship getting out of control?', - 'timestamp': 1472168725, - 'upload_date': '20160825', - 'thumbnail': r're:^https?://.*\.jpg$', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - 'add_ie': [FoxNewsIE.ie_key()], - } - - def _real_extract(self, url): - display_id = self._match_id(url) - - webpage = self._download_webpage(url, display_id) - - embed_url = self._html_search_meta('embedUrl', webpage, 'embed URL') - - title = self._og_search_title(webpage) - description = self._og_search_description(webpage) - - return { - '_type': 'url_transparent', - 'ie_key': FoxNewsIE.ie_key(), - 'url': embed_url, - 'display_id': display_id, - 'title': title, - 'description': description, - } + FoxNewsIE._extract_urls(webpage)[0], FoxNewsIE.ie_key()) diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index c02cd03de..2ffe83a78 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -16,6 +16,7 @@ from ..utils import ( int_or_none, parse_duration, try_get, + url_or_none, ) from .dailymotion import DailymotionIE @@ -115,14 +116,13 @@ class FranceTVIE(InfoExtractor): def sign(manifest_url, manifest_id): for host in ('hdfauthftv-a.akamaihd.net', 'hdfauth.francetv.fr'): - signed_url = self._download_webpage( + signed_url = url_or_none(self._download_webpage( 'https://%s/esi/TA' % host, video_id, 'Downloading signed %s manifest URL' % manifest_id, fatal=False, query={ 'url': manifest_url, - }) - if (signed_url and isinstance(signed_url, compat_str) and - re.search(r'^(?:https?:)?//', signed_url)): + })) + if signed_url: return signed_url return manifest_url @@ -379,6 +379,31 @@ class FranceTVInfoIE(FranceTVBaseInfoExtractor): return self._make_url_result(video_id, catalogue) +class FranceTVInfoSportIE(FranceTVBaseInfoExtractor): + IE_NAME = 'sport.francetvinfo.fr' + _VALID_URL = r'https?://sport\.francetvinfo\.fr/(?:[^/]+/)*(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 
'https://sport.francetvinfo.fr/les-jeux-olympiques/retour-sur-les-meilleurs-moments-de-pyeongchang-2018', + 'info_dict': { + 'id': '6e49080e-3f45-11e8-b459-000d3a2439ea', + 'ext': 'mp4', + 'title': 'Retour sur les meilleurs moments de Pyeongchang 2018', + 'timestamp': 1523639962, + 'upload_date': '20180413', + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': [FranceTVIE.ie_key()], + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + video_id = self._search_regex(r'data-video="([^"]+)"', webpage, 'video_id') + return self._make_url_result(video_id, 'Sport-web') + + class GenerationWhatIE(InfoExtractor): IE_NAME = 'france2.fr:generation-what' _VALID_URL = r'https?://generation-what\.francetv\.fr/[^/]+/video/(?P<id>[^/?#&]+)' diff --git a/youtube_dl/extractor/frontendmasters.py b/youtube_dl/extractor/frontendmasters.py new file mode 100644 index 000000000..cb57ba007 --- /dev/null +++ b/youtube_dl/extractor/frontendmasters.py @@ -0,0 +1,263 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import ( + compat_str, + compat_urlparse, +) +from ..utils import ( + ExtractorError, + parse_duration, + url_or_none, + urlencode_postdata, +) + + +class FrontendMastersBaseIE(InfoExtractor): + _API_BASE = 'https://api.frontendmasters.com/v1/kabuki' + _LOGIN_URL = 'https://frontendmasters.com/login/' + + _NETRC_MACHINE = 'frontendmasters' + + _QUALITIES = { + 'low': {'width': 480, 'height': 360}, + 'mid': {'width': 1280, 'height': 720}, + 'high': {'width': 1920, 'height': 1080} + } + + def _real_initialize(self): + self._login() + + def _login(self): + (username, password) = self._get_login_info() + if username is None: + return + + login_page = self._download_webpage( + self._LOGIN_URL, None, 'Downloading login page') + + login_form = self._hidden_inputs(login_page) + + login_form.update({ + 'username': username, + 'password': password + }) + + post_url = self._search_regex( + r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page, + 'post_url', default=self._LOGIN_URL, group='url') + + if not post_url.startswith('http'): + post_url = compat_urlparse.urljoin(self._LOGIN_URL, post_url) + + response = self._download_webpage( + post_url, None, 'Logging in', data=urlencode_postdata(login_form), + headers={'Content-Type': 'application/x-www-form-urlencoded'}) + + # Successful login + if any(p in response for p in ( + 'wp-login.php?action=logout', '>Logout')): + return + + error = self._html_search_regex( + r'class=(["\'])(?:(?!\1).)*\bMessageAlert\b(?:(?!\1).)*\1[^>]*>(?P<error>[^<]+)<', + response, 'error message', default=None, group='error') + if error: + raise ExtractorError('Unable to login: %s' % error, expected=True) + raise ExtractorError('Unable to log in') + + +class FrontendMastersPageBaseIE(FrontendMastersBaseIE): + def _download_course(self, course_name, url): + return self._download_json( + '%s/courses/%s' % (self._API_BASE, course_name), course_name, + 'Downloading course JSON', headers={'Referer': url}) + + @staticmethod + def _extract_chapters(course): + chapters = [] + lesson_elements = course.get('lessonElements') + if isinstance(lesson_elements, list): + chapters = [url_or_none(e) for e in lesson_elements if url_or_none(e)] + return chapters + + @staticmethod + def _extract_lesson(chapters, lesson_id, lesson): + title = lesson.get('title') or lesson_id + display_id = lesson.get('slug') + description = lesson.get('description') + 
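# The duration computed just below subtracts the two ends of a lesson
# timestamp range such as "0:01:14 - 0:09:31" (sample value and to_seconds
# are illustrative; the extractor itself uses utils.parse_duration).
import re

def lesson_duration(timestamp):
    def to_seconds(ts):
        hours, minutes, seconds = (int(x) for x in ts.strip().split(':'))
        return hours * 3600 + minutes * 60 + seconds
    mobj = re.search(
        r'(?P<start>\d{1,2}:\d{1,2}:\d{1,2})\s*-\s*(?P<end>\d{1,2}:\d{1,2}:\d{1,2})',
        timestamp or '')
    if not mobj:
        return None
    return to_seconds(mobj.group('end')) - to_seconds(mobj.group('start'))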
thumbnail = lesson.get('thumbnail') + + chapter_number = None + index = lesson.get('index') + element_index = lesson.get('elementIndex') + if (isinstance(index, int) and isinstance(element_index, int) and + index < element_index): + chapter_number = element_index - index + chapter = (chapters[chapter_number - 1] + if chapter_number - 1 < len(chapters) else None) + + duration = None + timestamp = lesson.get('timestamp') + if isinstance(timestamp, compat_str): + mobj = re.search( + r'(?P<start>\d{1,2}:\d{1,2}:\d{1,2})\s*-(?P<end>\s*\d{1,2}:\d{1,2}:\d{1,2})', + timestamp) + if mobj: + duration = parse_duration(mobj.group('end')) - parse_duration( + mobj.group('start')) + + return { + '_type': 'url_transparent', + 'url': 'frontendmasters:%s' % lesson_id, + 'ie_key': FrontendMastersIE.ie_key(), + 'id': lesson_id, + 'display_id': display_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'chapter': chapter, + 'chapter_number': chapter_number, + } + + +class FrontendMastersIE(FrontendMastersBaseIE): + _VALID_URL = r'(?:frontendmasters:|https?://api\.frontendmasters\.com/v\d+/kabuki/video/)(?P<id>[^/]+)' + _TESTS = [{ + 'url': 'https://api.frontendmasters.com/v1/kabuki/video/a2qogef6ba', + 'md5': '7f161159710d6b7016a4f4af6fcb05e2', + 'info_dict': { + 'id': 'a2qogef6ba', + 'ext': 'mp4', + 'title': 'a2qogef6ba', + }, + 'skip': 'Requires FrontendMasters account credentials', + }, { + 'url': 'frontendmasters:a2qogef6ba', + 'only_matching': True, + }] + + def _real_extract(self, url): + lesson_id = self._match_id(url) + + source_url = '%s/video/%s/source' % (self._API_BASE, lesson_id) + + formats = [] + for ext in ('webm', 'mp4'): + for quality in ('low', 'mid', 'high'): + resolution = self._QUALITIES[quality].copy() + format_id = '%s-%s' % (ext, quality) + format_url = self._download_json( + source_url, lesson_id, + 'Downloading %s source JSON' % format_id, query={ + 'f': ext, + 'r': resolution['height'], + }, headers={ + 'Referer': url, + }, fatal=False)['url'] + + if not format_url: + continue + + f = resolution.copy() + f.update({ + 'url': format_url, + 'ext': ext, + 'format_id': format_id, + }) + formats.append(f) + self._sort_formats(formats) + + subtitles = { + 'en': [{ + 'url': '%s/transcripts/%s.vtt' % (self._API_BASE, lesson_id), + }] + } + + return { + 'id': lesson_id, + 'title': lesson_id, + 'formats': formats, + 'subtitles': subtitles + } + + +class FrontendMastersLessonIE(FrontendMastersPageBaseIE): + _VALID_URL = r'https?://(?:www\.)?frontendmasters\.com/courses/(?P<course_name>[^/]+)/(?P<lesson_name>[^/]+)' + _TEST = { + 'url': 'https://frontendmasters.com/courses/web-development/tools', + 'info_dict': { + 'id': 'a2qogef6ba', + 'display_id': 'tools', + 'ext': 'mp4', + 'title': 'Tools', + 'description': 'md5:82c1ea6472e88ed5acd1829fe992e4f7', + 'thumbnail': r're:^https?://.*\.jpg$', + 'chapter': 'Introduction', + 'chapter_number': 1, + }, + 'params': { + 'skip_download': True, + }, + 'skip': 'Requires FrontendMasters account credentials', + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + course_name, lesson_name = mobj.group('course_name', 'lesson_name') + + course = self._download_course(course_name, url) + + lesson_id, lesson = next( + (video_id, data) + for video_id, data in course['lessonData'].items() + if data.get('slug') == lesson_name) + + chapters = self._extract_chapters(course) + return self._extract_lesson(chapters, lesson_id, lesson) + + +class 
FrontendMastersCourseIE(FrontendMastersPageBaseIE): + _VALID_URL = r'https?://(?:www\.)?frontendmasters\.com/courses/(?P<id>[^/]+)' + _TEST = { + 'url': 'https://frontendmasters.com/courses/web-development/', + 'info_dict': { + 'id': 'web-development', + 'title': 'Introduction to Web Development', + 'description': 'md5:9317e6e842098bf725d62360e52d49a6', + }, + 'playlist_count': 81, + 'skip': 'Requires FrontendMasters account credentials', + } + + @classmethod + def suitable(cls, url): + return False if FrontendMastersLessonIE.suitable(url) else super( + FrontendMastersBaseIE, cls).suitable(url) + + def _real_extract(self, url): + course_name = self._match_id(url) + + course = self._download_course(course_name, url) + + chapters = self._extract_chapters(course) + + lessons = sorted( + course['lessonData'].values(), key=lambda data: data['index']) + + entries = [] + for lesson in lessons: + lesson_name = lesson.get('slug') + if not lesson_name: + continue + lesson_id = lesson.get('hash') or lesson.get('statsId') + entries.append(self._extract_lesson(chapters, lesson_id, lesson)) + + title = course.get('title') + description = course.get('description') + + return self.playlist_result(entries, course_name, title, description) diff --git a/youtube_dl/extractor/funimation.py b/youtube_dl/extractor/funimation.py index 107f658ba..07d01caec 100644 --- a/youtube_dl/extractor/funimation.py +++ b/youtube_dl/extractor/funimation.py @@ -51,7 +51,7 @@ class FunimationIE(InfoExtractor): }] def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: return try: diff --git a/youtube_dl/extractor/funk.py b/youtube_dl/extractor/funk.py index 0ff058619..7e1af95e0 100644 --- a/youtube_dl/extractor/funk.py +++ b/youtube_dl/extractor/funk.py @@ -1,10 +1,12 @@ # coding: utf-8 from __future__ import unicode_literals +import itertools import re from .common import InfoExtractor from .nexx import NexxIE +from ..compat import compat_str from ..utils import ( int_or_none, try_get, @@ -12,6 +14,19 @@ from ..utils import ( class FunkBaseIE(InfoExtractor): + _HEADERS = { + 'Accept': '*/*', + 'Accept-Language': 'en-US,en;q=0.9,ru;q=0.8', + 'authorization': 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJjbGllbnROYW1lIjoid2ViYXBwLXYzMSIsInNjb3BlIjoic3RhdGljLWNvbnRlbnQtYXBpLGN1cmF0aW9uLWFwaSxuZXh4LWNvbnRlbnQtYXBpLXYzMSx3ZWJhcHAtYXBpIn0.mbuG9wS9Yf5q6PqgR4fiaRFIagiHk9JhwoKES7ksVX4', + } + _AUTH = 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJjbGllbnROYW1lIjoid2ViYXBwLXYzMSIsInNjb3BlIjoic3RhdGljLWNvbnRlbnQtYXBpLGN1cmF0aW9uLWFwaSxuZXh4LWNvbnRlbnQtYXBpLXYzMSx3ZWJhcHAtYXBpIn0.mbuG9wS9Yf5q6PqgR4fiaRFIagiHk9JhwoKES7ksVX4' + + @staticmethod + def _make_headers(referer): + headers = FunkBaseIE._HEADERS.copy() + headers['Referer'] = referer + return headers + def _make_url_result(self, video): return { '_type': 'url_transparent', @@ -48,19 +63,19 @@ class FunkMixIE(FunkBaseIE): lists = self._download_json( 'https://www.funk.net/api/v3.1/curation/curatedLists/', - mix_id, headers={ - 'authorization': 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJjbGllbnROYW1lIjoiY3VyYXRpb24tdG9vbC12Mi4wIiwic2NvcGUiOiJzdGF0aWMtY29udGVudC1hcGksY3VyYXRpb24tc2VydmljZSxzZWFyY2gtYXBpIn0.SGCC1IXHLtZYoo8PvRKlU2gXH1su8YSu47sB3S4iXBI', - 'Referer': url, - }, query={ + mix_id, headers=self._make_headers(url), query={ 'size': 100, - })['result']['lists'] + })['_embedded']['curatedListList'] metas = next( l for l in lists if mix_id in (l.get('entityId'), l.get('alias')))['videoMetas'] video = next( 
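# A condensed sketch of the paginated byChannelAlias lookup FunkChannelIE
# performs further below: walk numbered result pages with itertools.count()
# and stop on a missing or empty page, a matching alias, or the absence of
# a "_links.next" link. fetch_page is an illustrative stand-in for the
# corresponding _download_json call.
import itertools

def find_channel_video(fetch_page, alias):
    for page_num in itertools.count():
        page = fetch_page(page_num)
        if not page:
            return None
        videos = (page.get('_embedded') or {}).get('videoList')
        if not videos:
            return None
        for video in videos:
            if video.get('alias') == alias:
                return video
        if not (page.get('_links') or {}).get('next'):
            return None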
meta['videoDataDelegate'] - for meta in metas if meta.get('alias') == alias) + for meta in metas + if try_get( + meta, lambda x: x['videoDataDelegate']['alias'], + compat_str) == alias) return self._make_url_result(video) @@ -104,25 +119,53 @@ class FunkChannelIE(FunkBaseIE): channel_id = mobj.group('id') alias = mobj.group('alias') - headers = { - 'authorization': 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJjbGllbnROYW1lIjoiY3VyYXRpb24tdG9vbCIsInNjb3BlIjoic3RhdGljLWNvbnRlbnQtYXBpLGN1cmF0aW9uLWFwaSxzZWFyY2gtYXBpIn0.q4Y2xZG8PFHai24-4Pjx2gym9RmJejtmK6lMXP5wAgc', - 'Referer': url, - } + headers = self._make_headers(url) video = None - by_id_list = self._download_json( - 'https://www.funk.net/api/v3.0/content/videos/byIdList', channel_id, - headers=headers, query={ - 'ids': alias, - }, fatal=False) - if by_id_list: - video = try_get(by_id_list, lambda x: x['result'][0], dict) + # Id-based channels are currently broken on their side: webplayer + # tries to process them via byChannelAlias endpoint and fails + # predictably. + for page_num in itertools.count(): + by_channel_alias = self._download_json( + 'https://www.funk.net/api/v3.1/webapp/videos/byChannelAlias/%s' + % channel_id, + 'Downloading byChannelAlias JSON page %d' % (page_num + 1), + headers=headers, query={ + 'filterFsk': 'false', + 'sort': 'creationDate,desc', + 'size': 100, + 'page': page_num, + }, fatal=False) + if not by_channel_alias: + break + video_list = try_get( + by_channel_alias, lambda x: x['_embedded']['videoList'], list) + if not video_list: + break + try: + video = next(r for r in video_list if r.get('alias') == alias) + break + except StopIteration: + pass + if not try_get( + by_channel_alias, lambda x: x['_links']['next']): + break + + if not video: + by_id_list = self._download_json( + 'https://www.funk.net/api/v3.0/content/videos/byIdList', + channel_id, 'Downloading byIdList JSON', headers=headers, + query={ + 'ids': alias, + }, fatal=False) + if by_id_list: + video = try_get(by_id_list, lambda x: x['result'][0], dict) if not video: results = self._download_json( - 'https://www.funk.net/api/v3.0/content/videos/filter', channel_id, - headers=headers, query={ + 'https://www.funk.net/api/v3.0/content/videos/filter', + channel_id, 'Downloading filter JSON', headers=headers, query={ 'channelId': channel_id, 'size': 100, })['result'] diff --git a/youtube_dl/extractor/gdcvault.py b/youtube_dl/extractor/gdcvault.py index f71d9092e..8806dc48a 100644 --- a/youtube_dl/extractor/gdcvault.py +++ b/youtube_dl/extractor/gdcvault.py @@ -91,7 +91,7 @@ class GDCVaultIE(InfoExtractor): ] def _login(self, webpage_url, display_id): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None or password is None: self.report_warning('It looks like ' + webpage_url + ' requires a login. 
Try specifying a username and password and try again.') return None diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 76852f9dc..83a31f3d3 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -32,6 +32,7 @@ from ..utils import ( unified_strdate, unsmuggle_url, UnsupportedError, + url_or_none, xpath_text, ) from .commonprotocols import RtmpIE @@ -108,6 +109,12 @@ from .yapfiles import YapFilesIE from .vice import ViceIE from .xfileshare import XFileShareIE from .cloudflarestream import CloudflareStreamIE +from .peertube import PeerTubeIE +from .indavideo import IndavideoEmbedIE +from .apa import APAIE +from .foxnews import FoxNewsIE +from .viqeo import ViqeoIE +from .expressen import ExpressenIE class GenericIE(InfoExtractor): @@ -1391,17 +1398,6 @@ class GenericIE(InfoExtractor): 'skip_download': True, }, }, - # SVT embed - { - 'url': 'http://www.svt.se/sport/ishockey/jagr-tacklar-giroux-under-intervjun', - 'info_dict': { - 'id': '2900353', - 'ext': 'flv', - 'title': 'Här trycker Jagr till Giroux (under SVT-intervjun)', - 'duration': 27, - 'age_limit': 0, - }, - }, # Crooks and Liars embed { 'url': 'http://crooksandliars.com/2015/04/fox-friends-says-protecting-atheists', @@ -2012,6 +2008,50 @@ class GenericIE(InfoExtractor): 'skip_download': True, }, }, + { + # PeerTube embed + 'url': 'https://joinpeertube.org/fr/home/', + 'info_dict': { + 'id': 'home', + 'title': 'Reprenez le contrôle de vos vidéos ! #JoinPeertube', + }, + 'playlist_count': 2, + }, + { + # Indavideo embed + 'url': 'https://streetkitchen.hu/receptek/igy_kell_otthon_hamburgert_sutni/', + 'info_dict': { + 'id': '1693903', + 'ext': 'mp4', + 'title': 'Így kell otthon hamburgert sütni', + 'description': 'md5:f5a730ecf900a5c852e1e00540bbb0f7', + 'timestamp': 1426330212, + 'upload_date': '20150314', + 'uploader': 'StreetKitchen', + 'uploader_id': '546363', + }, + 'add_ie': [IndavideoEmbedIE.ie_key()], + 'params': { + 'skip_download': True, + }, + }, + { + # APA embed via JWPlatform embed + 'url': 'http://www.vol.at/blue-man-group/5593454', + 'info_dict': { + 'id': 'jjv85FdZ', + 'ext': 'mp4', + 'title': '"Blau ist mysteriös": Die Blue Man Group im Interview', + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 254, + 'timestamp': 1519211149, + 'upload_date': '20180221', + }, + 'params': { + 'skip_download': True, + }, + }, { 'url': 'http://share-videos.se/auto/video/83645793?uid=13', 'md5': 'b68d276de422ab07ee1d49388103f457', @@ -2022,6 +2062,15 @@ class GenericIE(InfoExtractor): }, 'skip': 'TODO: fix nested playlists processing in tests', }, + { + # Viqeo embeds + 'url': 'https://viqeo.tv/', + 'info_dict': { + 'id': 'viqeo', + 'title': 'All-new video platform', + }, + 'playlist_count': 6, + }, # { # # TODO: find another test # # http://schema.org/VideoObject @@ -3029,6 +3078,26 @@ class GenericIE(InfoExtractor): return self.playlist_from_matches( cloudflarestream_urls, video_id, video_title, ie=CloudflareStreamIE.ie_key()) + peertube_urls = PeerTubeIE._extract_urls(webpage, url) + if peertube_urls: + return self.playlist_from_matches( + peertube_urls, video_id, video_title, ie=PeerTubeIE.ie_key()) + + indavideo_urls = IndavideoEmbedIE._extract_urls(webpage) + if indavideo_urls: + return self.playlist_from_matches( + indavideo_urls, video_id, video_title, ie=IndavideoEmbedIE.ie_key()) + + apa_urls = APAIE._extract_urls(webpage) + if apa_urls: + return self.playlist_from_matches( + apa_urls, video_id, 
video_title, ie=APAIE.ie_key()) + + foxnews_urls = FoxNewsIE._extract_urls(webpage) + if foxnews_urls: + return self.playlist_from_matches( + foxnews_urls, video_id, video_title, ie=FoxNewsIE.ie_key()) + sharevideos_urls = [mobj.group('url') for mobj in re.finditer( r'<iframe[^>]+?\bsrc\s*=\s*(["\'])(?P<url>(?:https?:)?//embed\.share-videos\.se/auto/embed/\d+\?.*?\buid=\d+.*?)\1', webpage)] @@ -3036,6 +3105,16 @@ class GenericIE(InfoExtractor): return self.playlist_from_matches( sharevideos_urls, video_id, video_title) + viqeo_urls = ViqeoIE._extract_urls(webpage) + if viqeo_urls: + return self.playlist_from_matches( + viqeo_urls, video_id, video_title, ie=ViqeoIE.ie_key()) + + expressen_urls = ExpressenIE._extract_urls(webpage) + if expressen_urls: + return self.playlist_from_matches( + expressen_urls, video_id, video_title, ie=ExpressenIE.ie_key()) + # Look for HTML5 media entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls') if entries: @@ -3073,8 +3152,8 @@ class GenericIE(InfoExtractor): sources = [sources] formats = [] for source in sources: - src = source.get('src') - if not src or not isinstance(src, compat_str): + src = url_or_none(source.get('src')) + if not src: continue src = compat_urlparse.urljoin(url, src) src_type = source.get('type') diff --git a/youtube_dl/extractor/globo.py b/youtube_dl/extractor/globo.py index dc7b2661c..c2140c362 100644 --- a/youtube_dl/extractor/globo.py +++ b/youtube_dl/extractor/globo.py @@ -1,15 +1,16 @@ # coding: utf-8 from __future__ import unicode_literals +import base64 +import hashlib +import json import random import re -import math from .common import InfoExtractor from ..compat import ( + compat_HTTPError, compat_str, - compat_chr, - compat_ord, ) from ..utils import ( ExtractorError, @@ -22,12 +23,7 @@ from ..utils import ( class GloboIE(InfoExtractor): _VALID_URL = r'(?:globo:|https?://.+?\.globo\.com/(?:[^/]+/)*(?:v/(?:[^/]+/)?|videos/))(?P<id>\d{7,})' - - _API_URL_TEMPLATE = 'http://api.globovideos.com/videos/%s/playlist' - _SECURITY_URL_TEMPLATE = 'http://security.video.globo.com/videos/%s/hash?player=flash&version=17.0.0.132&resource_id=%s' - - _RESIGN_EXPIRATION = 86400 - + _NETRC_MACHINE = 'globo' _TESTS = [{ 'url': 'http://g1.globo.com/carros/autoesporte/videos/t/exclusivos-do-g1/v/mercedes-benz-gla-passa-por-teste-de-colisao-na-europa/3607726/', 'md5': 'b3ccc801f75cd04a914d51dadb83a78d', @@ -70,287 +66,51 @@ class GloboIE(InfoExtractor): 'only_matching': True, }] - class MD5(object): - HEX_FORMAT_LOWERCASE = 0 - HEX_FORMAT_UPPERCASE = 1 - BASE64_PAD_CHARACTER_DEFAULT_COMPLIANCE = '' - BASE64_PAD_CHARACTER_RFC_COMPLIANCE = '=' - PADDING = '=0xFF01DD' - hexcase = 0 - b64pad = '' + def _real_initialize(self): + email, password = self._get_login_info() + if email is None: + return - def __init__(self): - pass - - class JSArray(list): - def __getitem__(self, y): - try: - return list.__getitem__(self, y) - except IndexError: - return 0 - - def __setitem__(self, i, y): - try: - return list.__setitem__(self, i, y) - except IndexError: - self.extend([0] * (i - len(self) + 1)) - self[-1] = y - - @classmethod - def hex_md5(cls, param1): - return cls.rstr2hex(cls.rstr_md5(cls.str2rstr_utf8(param1))) - - @classmethod - def b64_md5(cls, param1, param2=None): - return cls.rstr2b64(cls.rstr_md5(cls.str2rstr_utf8(param1, param2))) - - @classmethod - def any_md5(cls, param1, param2): - return cls.rstr2any(cls.rstr_md5(cls.str2rstr_utf8(param1)), param2) - - @classmethod - def rstr_md5(cls, param1): - return 
cls.binl2rstr(cls.binl_md5(cls.rstr2binl(param1), len(param1) * 8)) - - @classmethod - def rstr2hex(cls, param1): - _loc_2 = '0123456789ABCDEF' if cls.hexcase else '0123456789abcdef' - _loc_3 = '' - for _loc_5 in range(0, len(param1)): - _loc_4 = compat_ord(param1[_loc_5]) - _loc_3 += _loc_2[_loc_4 >> 4 & 15] + _loc_2[_loc_4 & 15] - return _loc_3 - - @classmethod - def rstr2b64(cls, param1): - _loc_2 = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_' - _loc_3 = '' - _loc_4 = len(param1) - for _loc_5 in range(0, _loc_4, 3): - _loc_6_1 = compat_ord(param1[_loc_5]) << 16 - _loc_6_2 = compat_ord(param1[_loc_5 + 1]) << 8 if _loc_5 + 1 < _loc_4 else 0 - _loc_6_3 = compat_ord(param1[_loc_5 + 2]) if _loc_5 + 2 < _loc_4 else 0 - _loc_6 = _loc_6_1 | _loc_6_2 | _loc_6_3 - for _loc_7 in range(0, 4): - if _loc_5 * 8 + _loc_7 * 6 > len(param1) * 8: - _loc_3 += cls.b64pad - else: - _loc_3 += _loc_2[_loc_6 >> 6 * (3 - _loc_7) & 63] - return _loc_3 - - @staticmethod - def rstr2any(param1, param2): - _loc_3 = len(param2) - _loc_4 = [] - _loc_9 = [0] * ((len(param1) >> 2) + 1) - for _loc_5 in range(0, len(_loc_9)): - _loc_9[_loc_5] = compat_ord(param1[_loc_5 * 2]) << 8 | compat_ord(param1[_loc_5 * 2 + 1]) - - while len(_loc_9) > 0: - _loc_8 = [] - _loc_7 = 0 - for _loc_5 in range(0, len(_loc_9)): - _loc_7 = (_loc_7 << 16) + _loc_9[_loc_5] - _loc_6 = math.floor(_loc_7 / _loc_3) - _loc_7 -= _loc_6 * _loc_3 - if len(_loc_8) > 0 or _loc_6 > 0: - _loc_8[len(_loc_8)] = _loc_6 - - _loc_4[len(_loc_4)] = _loc_7 - _loc_9 = _loc_8 - - _loc_10 = '' - _loc_5 = len(_loc_4) - 1 - while _loc_5 >= 0: - _loc_10 += param2[_loc_4[_loc_5]] - _loc_5 -= 1 - - return _loc_10 - - @classmethod - def str2rstr_utf8(cls, param1, param2=None): - _loc_3 = '' - _loc_4 = -1 - if not param2: - param2 = cls.PADDING - param1 = param1 + param2[1:9] - while True: - _loc_4 += 1 - if _loc_4 >= len(param1): - break - _loc_5 = compat_ord(param1[_loc_4]) - _loc_6 = compat_ord(param1[_loc_4 + 1]) if _loc_4 + 1 < len(param1) else 0 - if 55296 <= _loc_5 <= 56319 and 56320 <= _loc_6 <= 57343: - _loc_5 = 65536 + ((_loc_5 & 1023) << 10) + (_loc_6 & 1023) - _loc_4 += 1 - if _loc_5 <= 127: - _loc_3 += compat_chr(_loc_5) - continue - if _loc_5 <= 2047: - _loc_3 += compat_chr(192 | _loc_5 >> 6 & 31) + compat_chr(128 | _loc_5 & 63) - continue - if _loc_5 <= 65535: - _loc_3 += compat_chr(224 | _loc_5 >> 12 & 15) + compat_chr(128 | _loc_5 >> 6 & 63) + compat_chr( - 128 | _loc_5 & 63) - continue - if _loc_5 <= 2097151: - _loc_3 += compat_chr(240 | _loc_5 >> 18 & 7) + compat_chr(128 | _loc_5 >> 12 & 63) + compat_chr( - 128 | _loc_5 >> 6 & 63) + compat_chr(128 | _loc_5 & 63) - return _loc_3 - - @staticmethod - def rstr2binl(param1): - _loc_2 = [0] * ((len(param1) >> 2) + 1) - for _loc_3 in range(0, len(_loc_2)): - _loc_2[_loc_3] = 0 - for _loc_3 in range(0, len(param1) * 8, 8): - _loc_2[_loc_3 >> 5] |= (compat_ord(param1[_loc_3 // 8]) & 255) << _loc_3 % 32 - return _loc_2 - - @staticmethod - def binl2rstr(param1): - _loc_2 = '' - for _loc_3 in range(0, len(param1) * 32, 8): - _loc_2 += compat_chr(param1[_loc_3 >> 5] >> _loc_3 % 32 & 255) - return _loc_2 - - @classmethod - def binl_md5(cls, param1, param2): - param1 = cls.JSArray(param1) - param1[param2 >> 5] |= 128 << param2 % 32 - param1[(param2 + 64 >> 9 << 4) + 14] = param2 - _loc_3 = 1732584193 - _loc_4 = -271733879 - _loc_5 = -1732584194 - _loc_6 = 271733878 - for _loc_7 in range(0, len(param1), 16): - _loc_8 = _loc_3 - _loc_9 = _loc_4 - _loc_10 = _loc_5 - _loc_11 = _loc_6 - _loc_3 = 
cls.md5_ff(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 0], 7, -680876936) - _loc_6 = cls.md5_ff(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 1], 12, -389564586) - _loc_5 = cls.md5_ff(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 2], 17, 606105819) - _loc_4 = cls.md5_ff(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 3], 22, -1044525330) - _loc_3 = cls.md5_ff(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 4], 7, -176418897) - _loc_6 = cls.md5_ff(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 5], 12, 1200080426) - _loc_5 = cls.md5_ff(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 6], 17, -1473231341) - _loc_4 = cls.md5_ff(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 7], 22, -45705983) - _loc_3 = cls.md5_ff(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 8], 7, 1770035416) - _loc_6 = cls.md5_ff(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 9], 12, -1958414417) - _loc_5 = cls.md5_ff(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 10], 17, -42063) - _loc_4 = cls.md5_ff(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 11], 22, -1990404162) - _loc_3 = cls.md5_ff(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 12], 7, 1804603682) - _loc_6 = cls.md5_ff(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 13], 12, -40341101) - _loc_5 = cls.md5_ff(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 14], 17, -1502002290) - _loc_4 = cls.md5_ff(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 15], 22, 1236535329) - _loc_3 = cls.md5_gg(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 1], 5, -165796510) - _loc_6 = cls.md5_gg(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 6], 9, -1069501632) - _loc_5 = cls.md5_gg(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 11], 14, 643717713) - _loc_4 = cls.md5_gg(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 0], 20, -373897302) - _loc_3 = cls.md5_gg(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 5], 5, -701558691) - _loc_6 = cls.md5_gg(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 10], 9, 38016083) - _loc_5 = cls.md5_gg(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 15], 14, -660478335) - _loc_4 = cls.md5_gg(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 4], 20, -405537848) - _loc_3 = cls.md5_gg(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 9], 5, 568446438) - _loc_6 = cls.md5_gg(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 14], 9, -1019803690) - _loc_5 = cls.md5_gg(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 3], 14, -187363961) - _loc_4 = cls.md5_gg(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 8], 20, 1163531501) - _loc_3 = cls.md5_gg(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 13], 5, -1444681467) - _loc_6 = cls.md5_gg(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 2], 9, -51403784) - _loc_5 = cls.md5_gg(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 7], 14, 1735328473) - _loc_4 = cls.md5_gg(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 12], 20, -1926607734) - _loc_3 = cls.md5_hh(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 5], 4, -378558) - _loc_6 = cls.md5_hh(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 8], 11, -2022574463) - _loc_5 = cls.md5_hh(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 11], 16, 1839030562) - _loc_4 = cls.md5_hh(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 14], 23, -35309556) - _loc_3 = cls.md5_hh(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 1], 4, -1530992060) - _loc_6 = cls.md5_hh(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 4], 11, 1272893353) - _loc_5 = cls.md5_hh(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 7], 16, -155497632) - _loc_4 = cls.md5_hh(_loc_4, _loc_5, _loc_6, _loc_3, 
param1[_loc_7 + 10], 23, -1094730640) - _loc_3 = cls.md5_hh(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 13], 4, 681279174) - _loc_6 = cls.md5_hh(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 0], 11, -358537222) - _loc_5 = cls.md5_hh(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 3], 16, -722521979) - _loc_4 = cls.md5_hh(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 6], 23, 76029189) - _loc_3 = cls.md5_hh(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 9], 4, -640364487) - _loc_6 = cls.md5_hh(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 12], 11, -421815835) - _loc_5 = cls.md5_hh(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 15], 16, 530742520) - _loc_4 = cls.md5_hh(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 2], 23, -995338651) - _loc_3 = cls.md5_ii(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 0], 6, -198630844) - _loc_6 = cls.md5_ii(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 7], 10, 1126891415) - _loc_5 = cls.md5_ii(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 14], 15, -1416354905) - _loc_4 = cls.md5_ii(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 5], 21, -57434055) - _loc_3 = cls.md5_ii(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 12], 6, 1700485571) - _loc_6 = cls.md5_ii(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 3], 10, -1894986606) - _loc_5 = cls.md5_ii(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 10], 15, -1051523) - _loc_4 = cls.md5_ii(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 1], 21, -2054922799) - _loc_3 = cls.md5_ii(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 8], 6, 1873313359) - _loc_6 = cls.md5_ii(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 15], 10, -30611744) - _loc_5 = cls.md5_ii(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 6], 15, -1560198380) - _loc_4 = cls.md5_ii(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 13], 21, 1309151649) - _loc_3 = cls.md5_ii(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 4], 6, -145523070) - _loc_6 = cls.md5_ii(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 11], 10, -1120210379) - _loc_5 = cls.md5_ii(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 2], 15, 718787259) - _loc_4 = cls.md5_ii(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 9], 21, -343485551) - _loc_3 = cls.safe_add(_loc_3, _loc_8) - _loc_4 = cls.safe_add(_loc_4, _loc_9) - _loc_5 = cls.safe_add(_loc_5, _loc_10) - _loc_6 = cls.safe_add(_loc_6, _loc_11) - return [_loc_3, _loc_4, _loc_5, _loc_6] - - @classmethod - def md5_cmn(cls, param1, param2, param3, param4, param5, param6): - return cls.safe_add( - cls.bit_rol(cls.safe_add(cls.safe_add(param2, param1), cls.safe_add(param4, param6)), param5), param3) - - @classmethod - def md5_ff(cls, param1, param2, param3, param4, param5, param6, param7): - return cls.md5_cmn(param2 & param3 | ~param2 & param4, param1, param2, param5, param6, param7) - - @classmethod - def md5_gg(cls, param1, param2, param3, param4, param5, param6, param7): - return cls.md5_cmn(param2 & param4 | param3 & ~param4, param1, param2, param5, param6, param7) - - @classmethod - def md5_hh(cls, param1, param2, param3, param4, param5, param6, param7): - return cls.md5_cmn(param2 ^ param3 ^ param4, param1, param2, param5, param6, param7) - - @classmethod - def md5_ii(cls, param1, param2, param3, param4, param5, param6, param7): - return cls.md5_cmn(param3 ^ (param2 | ~param4), param1, param2, param5, param6, param7) - - @classmethod - def safe_add(cls, param1, param2): - _loc_3 = (param1 & 65535) + (param2 & 65535) - _loc_4 = (param1 >> 16) + (param2 >> 16) + (_loc_3 >> 16) - return cls.lshift(_loc_4, 16) | _loc_3 
& 65535 - - @classmethod - def bit_rol(cls, param1, param2): - return cls.lshift(param1, param2) | (param1 & 0xFFFFFFFF) >> (32 - param2) - - @staticmethod - def lshift(value, count): - r = (0xFFFFFFFF & value) << count - return -(~(r - 1) & 0xFFFFFFFF) if r > 0x7FFFFFFF else r + try: + self._download_json( + 'https://login.globo.com/api/authentication', None, data=json.dumps({ + 'payload': { + 'email': email, + 'password': password, + 'serviceId': 4654, + }, + }).encode(), headers={ + 'Content-Type': 'application/json; charset=utf-8', + }) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: + resp = self._parse_json(e.cause.read(), None) + raise ExtractorError(resp.get('userMessage') or resp['id'], expected=True) + raise def _real_extract(self, url): video_id = self._match_id(url) video = self._download_json( - self._API_URL_TEMPLATE % video_id, video_id)['videos'][0] + 'http://api.globovideos.com/videos/%s/playlist' % video_id, + video_id)['videos'][0] title = video['title'] formats = [] for resource in video['resources']: resource_id = resource.get('_id') - if not resource_id or resource_id.endswith('manifest'): + resource_url = resource.get('url') + if not resource_id or not resource_url: continue security = self._download_json( - self._SECURITY_URL_TEMPLATE % (video_id, resource_id), - video_id, 'Downloading security hash for %s' % resource_id) + 'http://security.video.globo.com/videos/%s/hash' % video_id, + video_id, 'Downloading security hash for %s' % resource_id, query={ + 'player': 'flash', + 'version': '17.0.0.132', + 'resource_id': resource_id, + }) security_hash = security.get('hash') if not security_hash: @@ -361,22 +121,28 @@ class GloboIE(InfoExtractor): continue hash_code = security_hash[:2] - received_time = int(security_hash[2:12]) + received_time = security_hash[2:12] received_random = security_hash[12:22] received_md5 = security_hash[22:] - sign_time = received_time + self._RESIGN_EXPIRATION + sign_time = compat_str(int(received_time) + 86400) padding = '%010d' % random.randint(1, 10000000000) - signed_md5 = self.MD5.b64_md5(received_md5 + compat_str(sign_time) + padding) - signed_hash = hash_code + compat_str(received_time) + received_random + compat_str(sign_time) + padding + signed_md5 + md5_data = (received_md5 + sign_time + padding + '0xFF01DD').encode() + signed_md5 = base64.urlsafe_b64encode(hashlib.md5(md5_data).digest()).decode().strip('=') + signed_hash = hash_code + received_time + received_random + sign_time + padding + signed_md5 - resource_url = resource['url'] signed_url = '%s?h=%s&k=%s' % (resource_url, signed_hash, 'flash') if resource_id.endswith('m3u8') or resource_url.endswith('.m3u8'): formats.extend(self._extract_m3u8_formats( signed_url, resource_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) + elif resource_id.endswith('mpd') or resource_url.endswith('.mpd'): + formats.extend(self._extract_mpd_formats( + signed_url, resource_id, mpd_id='dash', fatal=False)) + elif resource_id.endswith('manifest') or resource_url.endswith('/manifest'): + formats.extend(self._extract_ism_formats( + signed_url, resource_id, ism_id='mss', fatal=False)) else: formats.append({ 'url': signed_url, diff --git a/youtube_dl/extractor/go90.py b/youtube_dl/extractor/go90.py index 9b2e1c164..c3ea717bc 100644 --- a/youtube_dl/extractor/go90.py +++ b/youtube_dl/extractor/go90.py @@ -4,16 +4,19 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import 
compat_HTTPError from ..utils import ( determine_ext, + ExtractorError, int_or_none, + parse_age_limit, parse_iso8601, ) class Go90IE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?go90\.com/videos/(?P<id>[0-9a-zA-Z]+)' - _TEST = { + _VALID_URL = r'https?://(?:www\.)?go90\.com/(?:videos|embed)/(?P<id>[0-9a-zA-Z]+)' + _TESTS = [{ 'url': 'https://www.go90.com/videos/84BUqjLpf9D', 'md5': 'efa7670dbbbf21a7b07b360652b24a32', 'info_dict': { @@ -23,16 +26,35 @@ class Go90IE(InfoExtractor): 'description': 'VICE\'s Karley Sciortino meets with activists who discuss the state\'s strong anti-porn stance. Then, VICE Sports explains NFL contracts.', 'timestamp': 1491868800, 'upload_date': '20170411', + 'age_limit': 14, } - } + }, { + 'url': 'https://www.go90.com/embed/261MflWkD3N', + 'only_matching': True, + }] + _GEO_BYPASS = False def _real_extract(self, url): video_id = self._match_id(url) - video_data = self._download_json( - 'https://www.go90.com/api/view/items/' + video_id, - video_id, headers={ + + try: + headers = self.geo_verification_headers() + headers.update({ 'Content-Type': 'application/json; charset=utf-8', - }, data=b'{"client":"web","device_type":"pc"}') + }) + video_data = self._download_json( + 'https://www.go90.com/api/view/items/' + video_id, video_id, + headers=headers, data=b'{"client":"web","device_type":"pc"}') + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400: + message = self._parse_json(e.cause.read().decode(), None)['error']['message'] + if 'region unavailable' in message: + self.raise_geo_restricted(countries=['US']) + raise ExtractorError(message, expected=True) + raise + + if video_data.get('requires_drm'): + raise ExtractorError('This video is DRM protected.', expected=True) main_video_asset = video_data['main_video_asset'] episode_number = int_or_none(video_data.get('episode_number')) @@ -123,4 +145,5 @@ class Go90IE(InfoExtractor): 'season_number': season_number, 'episode_number': episode_number, 'subtitles': subtitles, + 'age_limit': parse_age_limit(video_data.get('rating')), } diff --git a/youtube_dl/extractor/hidive.py b/youtube_dl/extractor/hidive.py index eee517071..f26f80265 100644 --- a/youtube_dl/extractor/hidive.py +++ b/youtube_dl/extractor/hidive.py @@ -8,6 +8,7 @@ from ..compat import compat_str from ..utils import ( ExtractorError, int_or_none, + url_or_none, urlencode_postdata, ) @@ -17,6 +18,8 @@ class HiDiveIE(InfoExtractor): # Using X-Forwarded-For results in 403 HTTP error for HLS fragments, # so disabling geo bypass completely _GEO_BYPASS = False + _NETRC_MACHINE = 'hidive' + _LOGIN_URL = 'https://www.hidive.com/account/login' _TESTS = [{ 'url': 'https://www.hidive.com/stream/the-comic-artist-and-his-assistants/s01e001', @@ -31,8 +34,26 @@ class HiDiveIE(InfoExtractor): 'params': { 'skip_download': True, }, + 'skip': 'Requires Authentication', }] + def _real_initialize(self): + email, password = self._get_login_info() + if email is None: + return + + webpage = self._download_webpage(self._LOGIN_URL, None) + form = self._search_regex( + r'(?s)<form[^>]+action="/account/login"[^>]*>(.+?)</form>', + webpage, 'login form') + data = self._hidden_inputs(form) + data.update({ + 'Email': email, + 'Password': password, + }) + self._download_webpage( + self._LOGIN_URL, None, 'Logging in', data=urlencode_postdata(data)) + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) title, key = mobj.group('title', 'key') @@ -43,6 +64,7 @@ class HiDiveIE(InfoExtractor): data=urlencode_postdata({ 'Title': 
title, 'Key': key, + 'PlayerId': 'f4f895ce1ca713ba263b91caeb1daa2d08904783', })) restriction = settings.get('restrictionReason') @@ -59,8 +81,8 @@ class HiDiveIE(InfoExtractor): bitrates = rendition.get('bitrates') if not isinstance(bitrates, dict): continue - m3u8_url = bitrates.get('hls') - if not isinstance(m3u8_url, compat_str): + m3u8_url = url_or_none(bitrates.get('hls')) + if not m3u8_url: continue formats.extend(self._extract_m3u8_formats( m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native', @@ -72,13 +94,13 @@ class HiDiveIE(InfoExtractor): if not isinstance(cc_file, list) or len(cc_file) < 3: continue cc_lang = cc_file[0] - cc_url = cc_file[2] - if not isinstance(cc_lang, compat_str) or not isinstance( - cc_url, compat_str): + cc_url = url_or_none(cc_file[2]) + if not isinstance(cc_lang, compat_str) or not cc_url: continue subtitles.setdefault(cc_lang, []).append({ 'url': cc_url, }) + self._sort_formats(formats) season_number = int_or_none(self._search_regex( r's(\d+)', key, 'season number', default=None)) diff --git a/youtube_dl/extractor/hrti.py b/youtube_dl/extractor/hrti.py index 6424d34ac..9ba1aa703 100644 --- a/youtube_dl/extractor/hrti.py +++ b/youtube_dl/extractor/hrti.py @@ -66,7 +66,7 @@ class HRTiBaseIE(InfoExtractor): self._logout_url = modules['user']['resources']['logout']['uri'] def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() # TODO: figure out authentication with cookies if username is None or password is None: self.raise_login_required() diff --git a/youtube_dl/extractor/imdb.py b/youtube_dl/extractor/imdb.py index 425421968..436759da5 100644 --- a/youtube_dl/extractor/imdb.py +++ b/youtube_dl/extractor/imdb.py @@ -3,27 +3,27 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( determine_ext, mimetype2ext, + parse_duration, qualities, - remove_end, + url_or_none, ) class ImdbIE(InfoExtractor): IE_NAME = 'imdb' IE_DESC = 'Internet Movie Database trailers' - _VALID_URL = r'https?://(?:www|m)\.imdb\.com/(?:video|title).+?[/-]vi(?P<id>\d+)' + _VALID_URL = r'https?://(?:www|m)\.imdb\.com/(?:video|title|list).+?[/-]vi(?P<id>\d+)' _TESTS = [{ 'url': 'http://www.imdb.com/video/imdb/vi2524815897', 'info_dict': { 'id': '2524815897', 'ext': 'mp4', - 'title': 'Ice Age: Continental Drift Trailer (No. 2)', - 'description': 'md5:9061c2219254e5d14e03c25c98e96a81', + 'title': 'No. 
2 from Ice Age: Continental Drift (2012)', + 'description': 'md5:87bd0bdc61e351f21f20d2d7441cb4e7', } }, { 'url': 'http://www.imdb.com/video/_/vi2524815897', @@ -40,82 +40,68 @@ class ImdbIE(InfoExtractor): }, { 'url': 'http://www.imdb.com/title/tt4218696/videoplayer/vi2608641561', 'only_matching': True, + }, { + 'url': 'https://www.imdb.com/list/ls009921623/videoplayer/vi260482329', + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage('http://www.imdb.com/video/imdb/vi%s' % video_id, video_id) - descr = self._html_search_regex( - r'(?s)<span itemprop="description">(.*?)</span>', - webpage, 'description', fatal=False) - player_url = 'http://www.imdb.com/video/imdb/vi%s/imdb/single' % video_id - player_page = self._download_webpage( - player_url, video_id, 'Downloading player page') - # the player page contains the info for the default format, we have to - # fetch other pages for the rest of the formats - extra_formats = re.findall(r'href="(?P<url>%s.*?)".*?>(?P<name>.*?)<' % re.escape(player_url), player_page) - format_pages = [ - self._download_webpage( - f_url, video_id, 'Downloading info for %s format' % f_name) - for f_url, f_name in extra_formats] - format_pages.append(player_page) + webpage = self._download_webpage( + 'https://www.imdb.com/videoplayer/vi' + video_id, video_id) + video_metadata = self._parse_json(self._search_regex( + r'window\.IMDbReactInitialState\.push\(({.+?})\);', webpage, + 'video metadata'), video_id)['videos']['videoMetadata']['vi' + video_id] + title = self._html_search_meta( + ['og:title', 'twitter:title'], webpage) or self._html_search_regex( + r'<title>(.+?)</title>', webpage, 'title', fatal=False) or video_metadata['title'] quality = qualities(('SD', '480p', '720p', '1080p')) formats = [] - for format_page in format_pages: - json_data = self._search_regex( - r'<script[^>]+class="imdb-player-data"[^>]*?>(.*?)</script>', - format_page, 'json data', flags=re.DOTALL) - info = self._parse_json(json_data, video_id, fatal=False) - if not info: + for encoding in video_metadata.get('encodings', []): + if not encoding or not isinstance(encoding, dict): continue - format_info = info.get('videoPlayerObject', {}).get('video', {}) - if not format_info: + video_url = url_or_none(encoding.get('videoUrl')) + if not video_url: continue - video_info_list = format_info.get('videoInfoList') - if not video_info_list or not isinstance(video_info_list, list): + ext = mimetype2ext(encoding.get( + 'mimeType')) or determine_ext(video_url) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + video_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) continue - for video_info in video_info_list: - if not video_info or not isinstance(video_info, dict): - continue - video_url = video_info.get('videoUrl') - if not video_url or not isinstance(video_url, compat_str): - continue - if (video_info.get('videoMimeType') == 'application/x-mpegURL' or - determine_ext(video_url) == 'm3u8'): - formats.extend(self._extract_m3u8_formats( - video_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False)) - continue - format_id = format_info.get('ffname') - formats.append({ - 'format_id': format_id, - 'url': video_url, - 'ext': mimetype2ext(video_info.get('videoMimeType')), - 'quality': quality(format_id), - }) + format_id = encoding.get('definition') + formats.append({ + 'format_id': format_id, + 'url': video_url, + 'ext': ext, + 'quality': quality(format_id), + })
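For context on the ImdbIE rewrite above: instead of fetching one player page per format and scraping imdb-player-data script islands, the extractor now parses the React state blob that the /videoplayer page pushes and walks its encodings list. A self-contained sketch of that parsing step; the regex and the videos.videoMetadata lookup follow the diff, while the toy page and everything else here are illustrative, not youtube-dl API:

    import json
    import re

    # Toy stand-in for https://www.imdb.com/videoplayer/vi123 (invented markup).
    page = ('<script>window.IMDbReactInitialState.push({"videos": '
            '{"videoMetadata": {"vi123": {"title": "demo", "encodings": '
            '[{"definition": "720p", "mimeType": "video/mp4", '
            '"videoUrl": "https://example.com/v.mp4"}]}}}});</script>')

    # Same regex the extractor applies to the downloaded webpage.
    state = json.loads(re.search(
        r'window\.IMDbReactInitialState\.push\(({.+?})\);', page).group(1))
    meta = state['videos']['videoMetadata']['vi123']
    for encoding in meta.get('encodings', []):
        # 'definition' becomes format_id; m3u8 URLs are expanded into HLS formats.
        print(encoding.get('definition'), encoding.get('videoUrl'))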
self._sort_formats(formats) return { 'id': video_id, - 'title': remove_end(self._og_search_title(webpage), ' - IMDb'), + 'title': title, 'formats': formats, - 'description': descr, - 'thumbnail': format_info.get('slate'), + 'description': video_metadata.get('description'), + 'thumbnail': video_metadata.get('slate', {}).get('url'), + 'duration': parse_duration(video_metadata.get('duration')), } class ImdbListIE(InfoExtractor): IE_NAME = 'imdb:list' IE_DESC = 'Internet Movie Database lists' - _VALID_URL = r'https?://(?:www\.)?imdb\.com/list/(?P<id>[\da-zA-Z_-]{11})' + _VALID_URL = r'https?://(?:www\.)?imdb\.com/list/ls(?P<id>\d{9})(?!/videoplayer/vi\d+)' _TEST = { - 'url': 'http://www.imdb.com/list/JFs9NWw6XI0', + 'url': 'https://www.imdb.com/list/ls009921623/', 'info_dict': { - 'id': 'JFs9NWw6XI0', - 'title': 'March 23, 2012 Releases', + 'id': '009921623', + 'title': 'The Bourne Legacy', + 'description': 'A list of trailers, clips, and more from The Bourne Legacy, starring Jeremy Renner and Rachel Weisz.', }, - 'playlist_count': 7, + 'playlist_count': 8, } def _real_extract(self, url): @@ -123,9 +109,13 @@ class ImdbListIE(InfoExtractor): webpage = self._download_webpage(url, list_id) entries = [ self.url_result('http://www.imdb.com' + m, 'Imdb') - for m in re.findall(r'href="(/video/imdb/vi[^"]+)"\s+data-type="playlist"', webpage)] + for m in re.findall(r'href="(/list/ls%s/videoplayer/vi[^"]+)"' % list_id, webpage)] list_title = self._html_search_regex( - r'<h1 class="header">(.*?)</h1>', webpage, 'list title') + r'<h1[^>]+class="[^"]*header[^"]*"[^>]*>(.*?)</h1>', + webpage, 'list title') + list_description = self._html_search_regex( + r'<div[^>]+class="[^"]*list-description[^"]*"[^>]*><p>(.*?)</p>
', + webpage, 'list description') - return self.playlist_result(entries, list_id, list_title) + return self.playlist_result(entries, list_id, list_title, list_description) diff --git a/youtube_dl/extractor/imgur.py b/youtube_dl/extractor/imgur.py index 67c24a51c..ecc958a17 100644 --- a/youtube_dl/extractor/imgur.py +++ b/youtube_dl/extractor/imgur.py @@ -3,7 +3,6 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_urlparse from ..utils import ( int_or_none, js_to_json, @@ -13,7 +12,7 @@ from ..utils import ( class ImgurIE(InfoExtractor): - _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?:(?:gallery|(?:topic|r)/[^/]+)/)?(?P[a-zA-Z0-9]{6,})(?:[/?#&]+|\.[a-z]+)?$' + _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?:(?:gallery|(?:topic|r)/[^/]+)/)?(?P[a-zA-Z0-9]{6,})(?:[/?#&]+|\.[a-z0-9]+)?$' _TESTS = [{ 'url': 'https://i.imgur.com/A61SaA1.gifv', @@ -21,7 +20,7 @@ class ImgurIE(InfoExtractor): 'id': 'A61SaA1', 'ext': 'mp4', 'title': 're:Imgur GIF$|MRW gifv is up and running without any bugs$', - 'description': 'Imgur: The most awesome images on the Internet.', + 'description': 'Imgur: The magic of the Internet', }, }, { 'url': 'https://imgur.com/A61SaA1', @@ -29,7 +28,7 @@ class ImgurIE(InfoExtractor): 'id': 'A61SaA1', 'ext': 'mp4', 'title': 're:Imgur GIF$|MRW gifv is up and running without any bugs$', - 'description': 'Imgur: The most awesome images on the Internet.', + 'description': 'Imgur: The magic of the Internet', }, }, { 'url': 'https://imgur.com/gallery/YcAQlkx', @@ -37,8 +36,6 @@ class ImgurIE(InfoExtractor): 'id': 'YcAQlkx', 'ext': 'mp4', 'title': 'Classic Steve Carell gif...cracks me up everytime....damn the repost downvotes....', - 'description': 'Imgur: The most awesome images on the Internet.' - } }, { 'url': 'http://imgur.com/topic/Funny/N8rOudd', @@ -46,12 +43,15 @@ class ImgurIE(InfoExtractor): }, { 'url': 'http://imgur.com/r/aww/VQcQPhM', 'only_matching': True, + }, { + 'url': 'https://i.imgur.com/crGpqCV.mp4', + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage( - compat_urlparse.urljoin(url, video_id), video_id) + gifv_url = 'https://i.imgur.com/{id}.gifv'.format(id=video_id) + webpage = self._download_webpage(gifv_url, video_id) width = int_or_none(self._og_search_property( 'video:width', webpage, default=None)) @@ -107,7 +107,7 @@ class ImgurIE(InfoExtractor): return { 'id': video_id, 'formats': formats, - 'description': self._og_search_description(webpage), + 'description': self._og_search_description(webpage, default=None), 'title': self._og_search_title(webpage), } diff --git a/youtube_dl/extractor/inc.py b/youtube_dl/extractor/inc.py index 241ec83c4..d5b258a0f 100644 --- a/youtube_dl/extractor/inc.py +++ b/youtube_dl/extractor/inc.py @@ -21,6 +21,21 @@ class IncIE(InfoExtractor): 'params': { 'skip_download': True, }, + }, { + # div with id=kaltura_player_1_kqs38cgm + 'url': 'https://www.inc.com/oscar-raymundo/richard-branson-young-entrepeneurs.html', + 'info_dict': { + 'id': '1_kqs38cgm', + 'ext': 'mp4', + 'title': 'Branson: "In the end, you have to say, Screw it. 
Just do it."', + 'description': 'md5:21b832d034f9af5191ca5959da5e9cb6', + 'timestamp': 1364403232, + 'upload_date': '20130327', + 'uploader_id': 'incdigital@inc.com', + }, + 'params': { + 'skip_download': True, + }, }, { 'url': 'http://www.inc.com/video/david-whitford/founders-forum-tripadvisor-steve-kaufer-most-enjoyable-moment-for-entrepreneur.html', 'only_matching': True, @@ -31,10 +46,13 @@ webpage = self._download_webpage(url, display_id) partner_id = self._search_regex( - r'var\s+_?bizo_data_partner_id\s*=\s*["\'](\d+)', webpage, 'partner id') + r'var\s+_?bizo_data_partner_id\s*=\s*["\'](\d+)', webpage, + 'partner id', default='1034971') - kaltura_id = self._parse_json(self._search_regex( - r'pageInfo\.videos\s*=\s*\[(.+)\];', webpage, 'kaltura id'), + kaltura_id = self._search_regex( + r'id=(["\'])kaltura_player_(?P<id>.+?)\1', webpage, 'kaltura id', + default=None, group='id') or self._parse_json(self._search_regex( + r'pageInfo\.videos\s*=\s*\[(.+)\];', webpage, 'kaltura id'), display_id)['vid_kaltura_id'] return self.url_result( diff --git a/youtube_dl/extractor/indavideo.py b/youtube_dl/extractor/indavideo.py index 11cf3c609..2b5b2b5b0 100644 --- a/youtube_dl/extractor/indavideo.py +++ b/youtube_dl/extractor/indavideo.py @@ -1,11 +1,15 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( int_or_none, parse_age_limit, parse_iso8601, + update_url_query, ) @@ -13,7 +17,7 @@ class IndavideoEmbedIE(InfoExtractor): _VALID_URL = r'https?://(?:(?:embed\.)?indavideo\.hu/player/video/|assets\.indavideo\.hu/swf/player\.swf\?.*\b(?:v(?:ID|id))=)(?P<id>[\da-f]+)' _TESTS = [{ 'url': 'http://indavideo.hu/player/video/1bdc3c6d80/', - 'md5': 'f79b009c66194acacd40712a6778acfa', + 'md5': 'c8a507a1c7410685f83a06eaeeaafeab', 'info_dict': { 'id': '1837039', 'ext': 'mp4', @@ -36,6 +40,20 @@ 'only_matching': True, }] + # Some example URLs covered by generic extractor: + # http://indavideo.hu/video/Vicces_cica_1 + # http://index.indavideo.hu/video/2015_0728_beregszasz + # http://auto.indavideo.hu/video/Sajat_utanfutoban_a_kis_tacsko + # http://erotika.indavideo.hu/video/Amator_tini_punci + # http://film.indavideo.hu/video/f_hrom_nagymamm_volt + # http://palyazat.indavideo.hu/video/Embertelen_dal_Dodgem_egyuttes + + @staticmethod + def _extract_urls(webpage): + return re.findall( + r'<iframe[^>]+\bsrc=["\'](?P<url>(?:https?:)?//embed\.indavideo\.hu/player/video/[\da-f]+)', + webpage) + def _real_extract(self, url): video_id = self._match_id(url) @@ -45,7 +63,14 @@ title = video['title'] - video_urls = video.get('video_files', []) + video_urls = [] + + video_files = video.get('video_files') + if isinstance(video_files, list): + video_urls.extend(video_files) + elif isinstance(video_files, dict): + video_urls.extend(video_files.values()) + video_file = video.get('video_file') if video: video_urls.append(video_file) @@ -58,11 +83,23 @@ if flv_url not in video_urls: video_urls.append(flv_url) - formats = [{ - 'url': video_url, - 'height': int_or_none(self._search_regex( - r'\.(\d{3,4})\.mp4(?:\?|$)', video_url, 'height', default=None)), - } for video_url in video_urls] + filesh = video.get('filesh') + + formats = [] + for video_url in video_urls: + height = int_or_none(self._search_regex( + r'\.(\d{3,4})\.mp4(?:\?|$)', video_url, 'height', default=None))
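For context on the filesh handling that follows: indavideo's video metadata can carry a filesh dict that maps a stream height (as a string) to a per-format token, and that token has to be appended to the stream's URL; heights without a token are dropped. A minimal sketch of the same step using only the Python 3 stdlib in place of youtube-dl's update_url_query (the sample URL, height and token are invented):

    from urllib.parse import parse_qsl, urlencode, urlparse, urlunparse

    def add_token(video_url, height, filesh):
        # filesh maps str(height) -> token; no token means the format is unusable
        token = filesh.get(str(height))
        if token is None:
            return None
        parts = urlparse(video_url)
        query = dict(parse_qsl(parts.query))
        query['token'] = token  # same effect as update_url_query in the diff
        return urlunparse(parts._replace(query=urlencode(query)))

    print(add_token('https://video.example.hu/clip.720.mp4', 720, {'720': 'abc123'}))
    # https://video.example.hu/clip.720.mp4?token=abc123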
+ if filesh: + if not height: + continue + token = filesh.get(compat_str(height)) + if token is None: + continue + video_url = update_url_query(video_url, {'token': token}) + formats.append({ + 'url': video_url, + 'height': height, + }) self._sort_formats(formats) timestamp = video.get('date') @@ -89,55 +126,3 @@ 'tags': tags, 'formats': formats, } - - -class IndavideoIE(InfoExtractor): - _VALID_URL = r'https?://(?:.+?\.)?indavideo\.hu/video/(?P<id>[^/#?]+)' - _TESTS = [{ - 'url': 'http://indavideo.hu/video/Vicces_cica_1', - 'md5': '8c82244ba85d2a2310275b318eb51eac', - 'info_dict': { - 'id': '1335611', - 'display_id': 'Vicces_cica_1', - 'ext': 'mp4', - 'title': 'Vicces cica', - 'description': 'Játszik a tablettel. :D', - 'thumbnail': r're:^https?://.*\.jpg$', - 'uploader': 'Jet_Pack', - 'uploader_id': '491217', - 'timestamp': 1390821212, - 'upload_date': '20140127', - 'duration': 7, - 'age_limit': 0, - 'tags': ['vicces', 'macska', 'cica', 'ügyes', 'nevetés', 'játszik', 'Cukiság', 'Jet_Pack'], - }, - }, { - 'url': 'http://index.indavideo.hu/video/2015_0728_beregszasz', - 'only_matching': True, - }, { - 'url': 'http://auto.indavideo.hu/video/Sajat_utanfutoban_a_kis_tacsko', - 'only_matching': True, - }, { - 'url': 'http://erotika.indavideo.hu/video/Amator_tini_punci', - 'only_matching': True, - }, { - 'url': 'http://film.indavideo.hu/video/f_hrom_nagymamm_volt', - 'only_matching': True, - }, { - 'url': 'http://palyazat.indavideo.hu/video/Embertelen_dal_Dodgem_egyuttes', - 'only_matching': True, - }] - - def _real_extract(self, url): - display_id = self._match_id(url) - - webpage = self._download_webpage(url, display_id) - embed_url = self._search_regex( - r'<link[^>]+rel="video_src"[^>]+href="(.+?)"', webpage, 'embed url') - - return { - '_type': 'url_transparent', - 'ie_key': 'IndavideoEmbed', - 'url': embed_url, - 'display_id': display_id, - } diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py index 0c13f54ee..7e0e838f0 100644 @@ -17,6 +17,7 @@ from ..utils import ( lowercase_escape, std_headers, try_get, + url_or_none, ) @@ -170,7 +171,7 @@ class InstagramIE(InfoExtractor): node = try_get(edge, lambda x: x['node'], dict) if not node: continue - node_video_url = try_get(node, lambda x: x['video_url'], compat_str) + node_video_url = url_or_none(node.get('video_url')) if not node_video_url: continue entries.append({ diff --git a/youtube_dl/extractor/iqiyi.py b/youtube_dl/extractor/iqiyi.py index fdfa7de9e..4b081bd46 100644 @@ -239,7 +239,7 @@ class IqiyiIE(InfoExtractor): return ohdave_rsa_encrypt(data, e, N) def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() # No authentication to be performed if not username: diff --git a/youtube_dl/extractor/itv.py b/youtube_dl/extractor/itv.py index 6a4f8a505..de65b6bb4 100644 @@ -13,15 +13,17 @@ from ..compat import ( compat_etree_register_namespace, ) from ..utils import ( + determine_ext, + ExtractorError, extract_attributes, + int_or_none, + merge_dicts, + parse_duration, + smuggle_url, + url_or_none, xpath_with_ns, xpath_element, xpath_text, - int_or_none, - parse_duration, - smuggle_url, - ExtractorError, - determine_ext, ) @@ -129,64 +131,65 @@ class ITVIE(InfoExtractor): resp_env = self._download_xml( params['data-playlist-url'],
video_id, - headers=headers, data=etree.tostring(req_env)) - playlist = xpath_element(resp_env, './/Playlist') - if playlist is None: - fault_code = xpath_text(resp_env, './/faultcode') - fault_string = xpath_text(resp_env, './/faultstring') - if fault_code == 'InvalidGeoRegion': - self.raise_geo_restricted( - msg=fault_string, countries=self._GEO_COUNTRIES) - elif fault_code not in ( - 'InvalidEntity', 'InvalidVodcrid', 'ContentUnavailable'): - raise ExtractorError( - '%s said: %s' % (self.IE_NAME, fault_string), expected=True) - info.update({ - 'title': self._og_search_title(webpage), - 'episode_title': params.get('data-video-episode'), - 'series': params.get('data-video-title'), - }) - else: - title = xpath_text(playlist, 'EpisodeTitle', default=None) - info.update({ - 'title': title, - 'episode_title': title, - 'episode_number': int_or_none(xpath_text(playlist, 'EpisodeNumber')), - 'series': xpath_text(playlist, 'ProgrammeTitle'), - 'duration': parse_duration(xpath_text(playlist, 'Duration')), - }) - video_element = xpath_element(playlist, 'VideoEntries/Video', fatal=True) - media_files = xpath_element(video_element, 'MediaFiles', fatal=True) - rtmp_url = media_files.attrib['base'] + headers=headers, data=etree.tostring(req_env), fatal=False) + if resp_env: + playlist = xpath_element(resp_env, './/Playlist') + if playlist is None: + fault_code = xpath_text(resp_env, './/faultcode') + fault_string = xpath_text(resp_env, './/faultstring') + if fault_code == 'InvalidGeoRegion': + self.raise_geo_restricted( + msg=fault_string, countries=self._GEO_COUNTRIES) + elif fault_code not in ( + 'InvalidEntity', 'InvalidVodcrid', 'ContentUnavailable'): + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, fault_string), expected=True) + info.update({ + 'title': self._og_search_title(webpage), + 'episode_title': params.get('data-video-episode'), + 'series': params.get('data-video-title'), + }) + else: + title = xpath_text(playlist, 'EpisodeTitle', default=None) + info.update({ + 'title': title, + 'episode_title': title, + 'episode_number': int_or_none(xpath_text(playlist, 'EpisodeNumber')), + 'series': xpath_text(playlist, 'ProgrammeTitle'), + 'duration': parse_duration(xpath_text(playlist, 'Duration')), + }) + video_element = xpath_element(playlist, 'VideoEntries/Video', fatal=True) + media_files = xpath_element(video_element, 'MediaFiles', fatal=True) + rtmp_url = media_files.attrib['base'] - for media_file in media_files.findall('MediaFile'): - play_path = xpath_text(media_file, 'URL') - if not play_path: - continue - tbr = int_or_none(media_file.get('bitrate'), 1000) - f = { - 'format_id': 'rtmp' + ('-%d' % tbr if tbr else ''), - 'play_path': play_path, - # Providing this swfVfy allows to avoid truncated downloads - 'player_url': 'http://www.itv.com/mercury/Mercury_VideoPlayer.swf', - 'page_url': url, - 'tbr': tbr, - 'ext': 'flv', - } - app = self._search_regex( - 'rtmpe?://[^/]+/(.+)$', rtmp_url, 'app', default=None) - if app: - f.update({ - 'url': rtmp_url.split('?', 1)[0], - 'app': app, - }) - else: - f['url'] = rtmp_url - formats.append(f) + for media_file in media_files.findall('MediaFile'): + play_path = xpath_text(media_file, 'URL') + if not play_path: + continue + tbr = int_or_none(media_file.get('bitrate'), 1000) + f = { + 'format_id': 'rtmp' + ('-%d' % tbr if tbr else ''), + 'play_path': play_path, + # Providing this swfVfy allows to avoid truncated downloads + 'player_url': 'http://www.itv.com/mercury/Mercury_VideoPlayer.swf', + 'page_url': url, + 'tbr': tbr, + 'ext': 'flv', + } + 
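For context on the ITV hunk above: the only functional change is that the Mercury playlist request is now issued with fatal=False and the whole SOAP playlist/rtmp branch is guarded by if resp_env:, so a dead legacy endpoint no longer aborts extraction before the ios playlist handling further down runs. A generic, runnable sketch of that fallback pattern (the fetch_* callables are illustrative stand-ins, not youtube-dl APIs):

    def gather_formats(fetch_legacy_playlist, fetch_ios_playlist):
        formats = []
        legacy = None
        try:
            legacy = fetch_legacy_playlist()  # best effort, mirrors fatal=False
        except Exception:
            pass
        if legacy:  # mirrors the new 'if resp_env:' guard
            formats.extend(legacy)
        formats.extend(fetch_ios_playlist())  # always attempted
        return formats

    # gather_formats(lambda: None, lambda: [{'format_id': 'hls'}])
    # -> [{'format_id': 'hls'}]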
app = self._search_regex( + 'rtmpe?://[^/]+/(.+)$', rtmp_url, 'app', default=None) + if app: + f.update({ + 'url': rtmp_url.split('?', 1)[0], + 'app': app, + }) + else: + f['url'] = rtmp_url + formats.append(f) - for caption_url in video_element.findall('ClosedCaptioningURIs/URL'): - if caption_url.text: - extract_subtitle(caption_url.text) + for caption_url in video_element.findall('ClosedCaptioningURIs/URL'): + if caption_url.text: + extract_subtitle(caption_url.text) ios_playlist_url = params.get('data-video-playlist') or params.get('data-video-id') hmac = params.get('data-video-hmac') @@ -248,8 +251,8 @@ class ITVIE(InfoExtractor): for sub in subs: if not isinstance(sub, dict): continue - href = sub.get('Href') - if isinstance(href, compat_str): + href = url_or_none(sub.get('Href')) + if href: extract_subtitle(href) if not info.get('duration'): info['duration'] = parse_duration(video_data.get('Duration')) @@ -261,7 +264,17 @@ 'formats': formats, 'subtitles': subtitles, }) - return info + + webpage_info = self._search_json_ld(webpage, video_id, default={}) + if not webpage_info.get('title'): + webpage_info['title'] = self._html_search_regex( + r'(?s)<h\d+[^>]+\bclass=["\'][^>]*episode-title["\'][^>]*>([^<]+)<', + webpage, 'title', default=None) or self._og_search_title( + webpage, default=None) or self._html_search_meta( + 'twitter:title', webpage, 'title', + default=None) or webpage_info['episode'] + + return merge_dicts(info, webpage_info) class ITVBTCCIE(InfoExtractor): diff --git a/youtube_dl/extractor/iwara.py b/youtube_dl/extractor/iwara.py index a7514fc80..907d5fc8b 100644 @@ -7,6 +7,7 @@ from ..utils import ( int_or_none, mimetype2ext, remove_end, + url_or_none, ) @@ -73,11 +74,14 @@ class IwaraIE(InfoExtractor): formats = [] for a_format in video_data: + format_uri = url_or_none(a_format.get('uri')) + if not format_uri: + continue format_id = a_format.get('resolution') height = int_or_none(self._search_regex( r'(\d+)p', format_id, 'height', default=None)) formats.append({ - 'url': a_format['uri'], + 'url': self._proto_relative_url(format_uri, 'https:'), 'format_id': format_id, 'ext': mimetype2ext(a_format.get('mime')) or 'mp4', 'height': height, diff --git a/youtube_dl/extractor/izlesene.py b/youtube_dl/extractor/izlesene.py index b1d72177d..f8fca6c8f 100644 @@ -1,10 +1,11 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor -from ..compat import compat_urllib_parse_unquote +from ..compat import ( + compat_str, + compat_urllib_parse_unquote, +) from ..utils import ( determine_ext, float_or_none, @@ -57,12 +58,33 @@ def _real_extract(self, url): video_id = self._match_id(url) - url = 'http://www.izlesene.com/video/%s' % video_id - webpage = self._download_webpage(url, video_id) + webpage = self._download_webpage('http://www.izlesene.com/video/%s' % video_id, video_id) + + video = self._parse_json( + self._search_regex( + r'videoObj\s*=\s*({.+?})\s*;\s*\n', webpage, 'streams'), + video_id) + + title = video.get('videoTitle') or self._og_search_title(webpage) + + formats = [] + for stream in video['media']['level']: + source_url = stream.get('source') + if not source_url or not isinstance(source_url, compat_str): + continue + ext = determine_ext(url, 'mp4') + quality = stream.get('value') + height = int_or_none(quality)
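For context on the izlesene rewrite above: formats and most metadata now come from the videoObj JSON embedded in the page instead of scattered meta fields, and the stream sources arrive percent-encoded. A self-contained sketch of that step against an invented one-line page snippet (the regex and field names follow the diff; urllib's unquote stands in for compat_urllib_parse_unquote):

    import json
    import re
    from urllib.parse import unquote

    page = ('videoObj = {"videoTitle": "demo", "media": {"level": '
            '[{"value": "720", "source": "http%3A//cdn.example/v.720.mp4"}]}} ;\n')

    video = json.loads(re.search(
        r'videoObj\s*=\s*({.+?})\s*;\s*\n', page).group(1))
    for stream in video['media']['level']:
        # 'value' feeds both the height and the '%sp' format_id
        print(stream.get('value'), unquote(stream['source']))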
+ formats.append({ + 'format_id': '%sp' % quality if quality else 'sd', + 'url': compat_urllib_parse_unquote(source_url), + 'ext': ext, + 'height': height, + }) + self._sort_formats(formats) - title = self._og_search_title(webpage) description = self._og_search_description(webpage, default=None) - thumbnail = self._proto_relative_url( + thumbnail = video.get('posterURL') or self._proto_relative_url( self._og_search_thumbnail(webpage), scheme='http:') uploader = self._html_search_regex( @@ -71,41 +93,15 @@ timestamp = parse_iso8601(self._html_search_meta( 'uploadDate', webpage, 'upload date')) - duration = float_or_none(self._html_search_regex( - r'"videoduration"\s*:\s*"([^"]+)"', - webpage, 'duration', fatal=False), scale=1000) + duration = float_or_none(video.get('duration') or self._html_search_regex( + r'videoduration["\']?\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1', + webpage, 'duration', fatal=False, group='value'), scale=1000) view_count = str_to_int(get_element_by_id('videoViewCount', webpage)) comment_count = self._html_search_regex( r'comment_count\s*=\s*\'([^\']+)\';', webpage, 'comment_count', fatal=False) - content_url = self._html_search_meta( - 'contentURL', webpage, 'content URL', fatal=False) - ext = determine_ext(content_url, 'mp4') - - # Might be empty for some videos. - streams = self._html_search_regex( - r'"qualitylevel"\s*:\s*"([^"]+)"', webpage, 'streams', default='') - - formats = [] - if streams: - for stream in streams.split('|'): - quality, url = re.search(r'\[(\w+)\](.+)', stream).groups() - formats.append({ - 'format_id': '%sp' % quality if quality else 'sd', - 'url': compat_urllib_parse_unquote(url), - 'ext': ext, - }) - else: - stream_url = self._search_regex( - r'"streamurl"\s*:\s*"([^"]+)"', webpage, 'stream URL') - formats.append({ - 'format_id': 'sd', - 'url': compat_urllib_parse_unquote(stream_url), - 'ext': ext, - }) - return { 'id': video_id, 'title': title, diff --git a/youtube_dl/extractor/joj.py b/youtube_dl/extractor/joj.py index a764023e9..d9f8dbfd2 100644 @@ -18,7 +18,7 @@ class JojIE(InfoExtractor): joj:| https?://media\.joj\.sk/embed/ ) - (?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}) + (?P<id>[^/?#^]+) ''' _TESTS = [{ 'url': 'https://media.joj.sk/embed/a388ec4c-6019-4a4a-9312-b1bee194e932', @@ -29,16 +29,24 @@ 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 3118, } + }, { + 'url': 'https://media.joj.sk/embed/9i1cxv', + 'only_matching': True, }, { 'url': 'joj:a388ec4c-6019-4a4a-9312-b1bee194e932', 'only_matching': True, + }, { + 'url': 'joj:9i1cxv', + 'only_matching': True, }] @staticmethod def _extract_urls(webpage): - return re.findall( - r'<iframe[^>]+\bsrc=["\'](?P<url>(?:https?:)?//media\.joj\.sk/embed/[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})', - webpage) + return [ + mobj.group('url') + for mobj in re.finditer( + r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//media\.joj\.sk/embed/(?:(?!\1).)+)\1', + webpage)] def _real_extract(self, url): video_id = self._match_id(url) diff --git a/youtube_dl/extractor/keezmovies.py b/youtube_dl/extractor/keezmovies.py index d4e6f7ac1..c3eb74c17 100644 @@ -4,16 +4,14 @@ import re from .common import InfoExtractor from ..aes import aes_decrypt_text -from ..compat import ( - compat_str, - compat_urllib_parse_unquote, -) +from ..compat import compat_urllib_parse_unquote from ..utils import ( determine_ext,
ExtractorError, int_or_none, str_to_int, strip_or_none, + url_or_none, ) @@ -55,7 +53,8 @@ class KeezMoviesIE(InfoExtractor): encrypted = False def extract_format(format_url, height=None): - if not isinstance(format_url, compat_str) or not format_url.startswith(('http', '//')): + format_url = url_or_none(format_url) + if not format_url or not format_url.startswith(('http', '//')): return if format_url in format_urls: return diff --git a/youtube_dl/extractor/konserthusetplay.py b/youtube_dl/extractor/konserthusetplay.py index c11cbcf47..dd42bb2f2 100644 --- a/youtube_dl/extractor/konserthusetplay.py +++ b/youtube_dl/extractor/konserthusetplay.py @@ -2,11 +2,11 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( determine_ext, float_or_none, int_or_none, + url_or_none, ) @@ -109,7 +109,8 @@ class KonserthusetPlayIE(InfoExtractor): captions = source.get('captionsAvailableLanguages') if isinstance(captions, dict): for lang, subtitle_url in captions.items(): - if lang != 'none' and isinstance(subtitle_url, compat_str): + subtitle_url = url_or_none(subtitle_url) + if lang != 'none' and subtitle_url: subtitles.setdefault(lang, []).append({'url': subtitle_url}) return { diff --git a/youtube_dl/extractor/lci.py b/youtube_dl/extractor/lci.py index af34829e7..920872f5c 100644 --- a/youtube_dl/extractor/lci.py +++ b/youtube_dl/extractor/lci.py @@ -20,5 +20,7 @@ class LCIIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - wat_id = self._search_regex(r'data-watid=[\'"](\d+)', webpage, 'wat id') + wat_id = self._search_regex( + (r'data-watid=[\'"](\d+)', r'idwat["\']?\s*:\s*["\']?(\d+)'), + webpage, 'wat id') return self.url_result('wat:' + wat_id, 'Wat', wat_id) diff --git a/youtube_dl/extractor/leeco.py b/youtube_dl/extractor/leeco.py index ffe10154b..8dd1ce0d0 100644 --- a/youtube_dl/extractor/leeco.py +++ b/youtube_dl/extractor/leeco.py @@ -130,7 +130,7 @@ class LeIE(InfoExtractor): media_id, 'Downloading flash playJson data', query={ 'id': media_id, 'platid': 1, - 'splatid': 101, + 'splatid': 105, 'format': 1, 'source': 1000, 'tkey': self.calc_time_key(int(time.time())), diff --git a/youtube_dl/extractor/lynda.py b/youtube_dl/extractor/lynda.py index f5c7abc13..4ba61cd8a 100644 --- a/youtube_dl/extractor/lynda.py +++ b/youtube_dl/extractor/lynda.py @@ -4,7 +4,6 @@ import re from .common import InfoExtractor from ..compat import ( - compat_HTTPError, compat_str, compat_urlparse, ) @@ -44,21 +43,15 @@ class LyndaBaseIE(InfoExtractor): form_data = self._hidden_inputs(form_html) form_data.update(extra_form_data) - try: - response = self._download_json( - action_url, None, note, - data=urlencode_postdata(form_data), - headers={ - 'Referer': referrer_url, - 'X-Requested-With': 'XMLHttpRequest', - }) - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 500: - response = self._parse_json(e.cause.read().decode('utf-8'), None) - self._check_error(response, ('email', 'password')) - raise + response = self._download_json( + action_url, None, note, + data=urlencode_postdata(form_data), + headers={ + 'Referer': referrer_url, + 'X-Requested-With': 'XMLHttpRequest', + }, expected_status=(418, 500, )) - self._check_error(response, 'ErrorMessage') + self._check_error(response, ('email', 'password', 'ErrorMessage')) return response, action_url diff --git a/youtube_dl/extractor/markiza.py b/youtube_dl/extractor/markiza.py 
new file mode 100644 index 000000000..def960a0c --- /dev/null +++ b/youtube_dl/extractor/markiza.py @@ -0,0 +1,125 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + orderedSet, + parse_duration, + try_get, +) + + +class MarkizaIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?videoarchiv\.markiza\.sk/(?:video/(?:[^/]+/)*|embed/)(?P\d+)(?:[_/]|$)' + _TESTS = [{ + 'url': 'http://videoarchiv.markiza.sk/video/oteckovia/84723_oteckovia-109', + 'md5': 'ada4e9fad038abeed971843aa028c7b0', + 'info_dict': { + 'id': '139078', + 'ext': 'mp4', + 'title': 'Oteckovia 109', + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 2760, + }, + }, { + 'url': 'http://videoarchiv.markiza.sk/video/televizne-noviny/televizne-noviny/85430_televizne-noviny', + 'info_dict': { + 'id': '85430', + 'title': 'Televízne noviny', + }, + 'playlist_count': 23, + }, { + 'url': 'http://videoarchiv.markiza.sk/video/oteckovia/84723', + 'only_matching': True, + }, { + 'url': 'http://videoarchiv.markiza.sk/video/84723', + 'only_matching': True, + }, { + 'url': 'http://videoarchiv.markiza.sk/video/filmy/85190_kamenak', + 'only_matching': True, + }, { + 'url': 'http://videoarchiv.markiza.sk/video/reflex/zo-zakulisia/84651_pribeh-alzbetky', + 'only_matching': True, + }, { + 'url': 'http://videoarchiv.markiza.sk/embed/85295', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + data = self._download_json( + 'http://videoarchiv.markiza.sk/json/video_jwplayer7.json', + video_id, query={'id': video_id}) + + info = self._parse_jwplayer_data(data, m3u8_id='hls', mpd_id='dash') + + if info.get('_type') == 'playlist': + info.update({ + 'id': video_id, + 'title': try_get( + data, lambda x: x['details']['name'], compat_str), + }) + else: + info['duration'] = parse_duration( + try_get(data, lambda x: x['details']['duration'], compat_str)) + return info + + +class MarkizaPageIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?(?:(?:[^/]+\.)?markiza|tvnoviny)\.sk/(?:[^/]+/)*(?P\d+)_' + _TESTS = [{ + 'url': 'http://www.markiza.sk/soubiz/zahranicny/1923705_oteckovia-maju-svoj-den-ti-slavni-nie-su-o-nic-menej-rozkosni', + 'md5': 'ada4e9fad038abeed971843aa028c7b0', + 'info_dict': { + 'id': '139355', + 'ext': 'mp4', + 'title': 'Oteckovia 110', + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 2604, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://dajto.markiza.sk/filmy-a-serialy/1774695_frajeri-vo-vegas', + 'only_matching': True, + }, { + 'url': 'http://superstar.markiza.sk/aktualne/1923870_to-je-ale-telo-spevacka-ukazala-sexy-postavicku-v-bikinach', + 'only_matching': True, + }, { + 'url': 'http://hybsa.markiza.sk/aktualne/1923790_uzasna-atmosfera-na-hybsa-v-poprade-superstaristi-si-prve-koncerty-pred-davom-ludi-poriadne-uzili', + 'only_matching': True, + }, { + 'url': 'http://doma.markiza.sk/filmy/1885250_moja-vysnivana-svadba', + 'only_matching': True, + }, { + 'url': 'http://www.tvnoviny.sk/domace/1923887_po-smrti-manzela-ju-cakalo-poriadne-prekvapenie', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if MarkizaIE.suitable(url) else super(MarkizaPageIE, cls).suitable(url) + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + webpage = self._download_webpage( + # Downloading 
+            # Downloading for some hosts (e.g. dajto, doma) fails with 500
+            # although everything seems to be OK, so considering 500
+            # status code to be expected.
+            url, playlist_id, expected_status=500)
+
+        entries = [
+            self.url_result('http://videoarchiv.markiza.sk/video/%s' % video_id)
+            for video_id in orderedSet(re.findall(
+                r'(?:initPlayer_|data-entity=["\']|id=["\']player_)(\d+)',
+                webpage))]
+
+        return self.playlist_result(entries, playlist_id)
diff --git a/youtube_dl/extractor/mediaset.py b/youtube_dl/extractor/mediaset.py
index 9760eafd5..57f97409d 100644
--- a/youtube_dl/extractor/mediaset.py
+++ b/youtube_dl/extractor/mediaset.py
@@ -3,59 +3,75 @@ from __future__ import unicode_literals
 
 import re
 
-from .common import InfoExtractor
-from ..compat import compat_str
+from .theplatform import ThePlatformBaseIE
 from ..utils import (
-    determine_ext,
-    parse_duration,
-    try_get,
-    unified_strdate,
+    ExtractorError,
+    int_or_none,
+    update_url_query,
 )
 
 
-class MediasetIE(InfoExtractor):
+class MediasetIE(ThePlatformBaseIE):
+    _TP_TLD = 'eu'
     _VALID_URL = r'''(?x)
                     (?:
                         mediaset:|
                         https?://
-                            (?:www\.)?video\.mediaset\.it/
+                            (?:(?:www|static3)\.)?mediasetplay\.mediaset\.it/
                             (?:
                                 (?:video|on-demand)/(?:[^/]+/)+[^/]+_|
-                                player/playerIFrame(?:Twitter)?\.shtml\?.*?\bid=
+                                player/index\.html\?.*?\bprogramGuid=
                             )
-                    )(?P<id>[0-9]+)
+                    )(?P<id>[0-9A-Z]{16})
                     '''
     _TESTS = [{
         # full episode
-        'url': 'http://www.video.mediaset.it/video/hello_goodbye/full/quarta-puntata_661824.html',
+        'url': 'https://www.mediasetplay.mediaset.it/video/hellogoodbye/quarta-puntata_FAFU000000661824',
         'md5': '9b75534d42c44ecef7bf1ffeacb7f85d',
         'info_dict': {
-            'id': '661824',
+            'id': 'FAFU000000661824',
             'ext': 'mp4',
             'title': 'Quarta puntata',
-            'description': 'md5:7183696d6df570e3412a5ef74b27c5e2',
+            'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
             'thumbnail': r're:^https?://.*\.jpg$',
-            'duration': 1414,
-            'creator': 'mediaset',
+            'duration': 1414.26,
             'upload_date': '20161107',
             'series': 'Hello Goodbye',
-            'categories': ['reality'],
+            'timestamp': 1478532900,
+            'uploader': 'Rete 4',
+            'uploader_id': 'R4',
         },
-        'expected_warnings': ['is not a supported codec'],
+    }, {
+        'url': 'https://www.mediasetplay.mediaset.it/video/matrix/puntata-del-25-maggio_F309013801000501',
+        'md5': '288532f0ad18307705b01e581304cd7b',
+        'info_dict': {
+            'id': 'F309013801000501',
+            'ext': 'mp4',
+            'title': 'Puntata del 25 maggio',
+            'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 6565.007,
+            'upload_date': '20180526',
+            'series': 'Matrix',
+            'timestamp': 1527326245,
+            'uploader': 'Canale 5',
+            'uploader_id': 'C5',
+        },
+        'expected_warnings': ['HTTP Error 403: Forbidden'],
     }, {
         # clip
-        'url': 'http://www.video.mediaset.it/video/gogglebox/clip/un-grande-classico-della-commedia-sexy_661680.html',
+        'url': 'https://www.mediasetplay.mediaset.it/video/gogglebox/un-grande-classico-della-commedia-sexy_FAFU000000661680',
         'only_matching': True,
     }, {
         # iframe simple
-        'url': 'http://www.video.mediaset.it/player/playerIFrame.shtml?id=665924&autoplay=true',
+        'url': 'https://static3.mediasetplay.mediaset.it/player/index.html?appKey=5ad3966b1de1c4000d5cec48&programGuid=FAFU000000665924&id=665924',
        'only_matching': True,
     }, {
         # iframe twitter (from http://www.wittytv.it/se-prima-mi-fidavo-zero/)
-        'url': 'https://www.video.mediaset.it/player/playerIFrameTwitter.shtml?id=665104&playrelated=false&autoplay=false&related=true&hidesocial=true',
+        'url': 'https://static3.mediasetplay.mediaset.it/player/index.html?appKey=5ad3966b1de1c4000d5cec48&programGuid=FAFU000000665104&id=665104',
         'only_matching': True,
     }, {
-        'url': 'mediaset:661824',
+        'url': 'mediaset:FAFU000000665924',
         'only_matching': True,
     }]
@@ -68,51 +84,54 @@ class MediasetIE(InfoExtractor):
                 webpage)]
 
     def _real_extract(self, url):
-        video_id = self._match_id(url)
-
-        video_list = self._download_json(
-            'http://cdnsel01.mediaset.net/GetCdn.aspx',
-            video_id, 'Downloading video CDN JSON', query={
-                'streamid': video_id,
-                'format': 'json',
-            })['videoList']
+        guid = self._match_id(url)
+        tp_path = 'PR1GhC/media/guid/2702976343/' + guid
+        info = self._extract_theplatform_metadata(tp_path, guid)
 
         formats = []
-        for format_url in video_list:
-            if '.ism' in format_url:
-                formats.extend(self._extract_ism_formats(
-                    format_url, video_id, ism_id='mss', fatal=False))
-            else:
-                formats.append({
-                    'url': format_url,
-                    'format_id': determine_ext(format_url),
-                })
+        subtitles = {}
+        first_e = None
+        for asset_type in ('SD', 'HD'):
+            for f in ('MPEG4', 'MPEG-DASH', 'M3U', 'ISM'):
+                try:
+                    tp_formats, tp_subtitles = self._extract_theplatform_smil(
+                        update_url_query('http://link.theplatform.%s/s/%s' % (self._TP_TLD, tp_path), {
+                            'mbr': 'true',
+                            'formats': f,
+                            'assetTypes': asset_type,
+                        }), guid, 'Downloading %s %s SMIL data' % (f, asset_type))
+                except ExtractorError as e:
+                    if not first_e:
+                        first_e = e
+                    break
+                for tp_f in tp_formats:
+                    tp_f['quality'] = 1 if asset_type == 'HD' else 0
+                formats.extend(tp_formats)
+                subtitles = self._merge_subtitles(subtitles, tp_subtitles)
+        if first_e and not formats:
+            raise first_e
         self._sort_formats(formats)
 
-        mediainfo = self._download_json(
-            'http://plr.video.mediaset.it/html/metainfo.sjson',
-            video_id, 'Downloading video info JSON', query={
-                'id': video_id,
-            })['video']
+        fields = []
+        for templ, repls in (('tvSeason%sNumber', ('', 'Episode')), ('mediasetprogram$%s', ('brandTitle', 'numberOfViews', 'publishInfo'))):
+            fields.extend(templ % repl for repl in repls)
+        feed_data = self._download_json(
+            'https://feed.entertainment.tv.theplatform.eu/f/PR1GhC/mediaset-prod-all-programs/guid/-/' + guid,
+            guid, fatal=False, query={'fields': ','.join(fields)})
+        if feed_data:
+            publish_info = feed_data.get('mediasetprogram$publishInfo') or {}
+            info.update({
+                'episode_number': int_or_none(feed_data.get('tvSeasonEpisodeNumber')),
+                'season_number': int_or_none(feed_data.get('tvSeasonNumber')),
+                'series': feed_data.get('mediasetprogram$brandTitle'),
+                'uploader': publish_info.get('description'),
+                'uploader_id': publish_info.get('channel'),
+                'view_count': int_or_none(feed_data.get('mediasetprogram$numberOfViews')),
+            })
 
-        title = mediainfo['title']
-
-        creator = try_get(
-            mediainfo, lambda x: x['brand-info']['publisher'], compat_str)
-        category = try_get(
-            mediainfo, lambda x: x['brand-info']['category'], compat_str)
-        categories = [category] if category else None
-
-        return {
-            'id': video_id,
-            'title': title,
-            'description': mediainfo.get('short-description'),
-            'thumbnail': mediainfo.get('thumbnail'),
-            'duration': parse_duration(mediainfo.get('duration')),
-            'creator': creator,
-            'upload_date': unified_strdate(mediainfo.get('production-date')),
-            'webpage_url': mediainfo.get('url'),
-            'series': mediainfo.get('brand-value'),
-            'categories': categories,
+        info.update({
+            'id': guid,
             'formats': formats,
-        }
+            'subtitles': subtitles,
+        })
+        return info
diff --git a/youtube_dl/extractor/mediasite.py b/youtube_dl/extractor/mediasite.py
index 0e2645c55..84876b883 100644
--- a/youtube_dl/extractor/mediasite.py
+++ b/youtube_dl/extractor/mediasite.py
@@ -15,6 +15,7 @@ from ..utils import (
     mimetype2ext,
     unescapeHTML,
     unsmuggle_url,
+    url_or_none,
     urljoin,
 )
@@ -156,8 +157,8 @@ class MediasiteIE(InfoExtractor):
         stream_formats = []
         for unum, VideoUrl in enumerate(video_urls):
-            video_url = VideoUrl.get('Location')
-            if not video_url or not isinstance(video_url, compat_str):
+            video_url = url_or_none(VideoUrl.get('Location'))
+            if not video_url:
                 continue
             # XXX: if Stream.get('CanChangeScheme', False), switch scheme to HTTP/HTTPS
diff --git a/youtube_dl/extractor/minoto.py b/youtube_dl/extractor/minoto.py
index 959a10589..636731195 100644
--- a/youtube_dl/extractor/minoto.py
+++ b/youtube_dl/extractor/minoto.py
@@ -4,7 +4,10 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
-from ..utils import int_or_none
+from ..utils import (
+    int_or_none,
+    parse_codecs,
+)
 
 
 class MinotoIE(InfoExtractor):
@@ -26,7 +29,7 @@ class MinotoIE(InfoExtractor):
                 formats.extend(self._extract_m3u8_formats(fmt_url, video_id, 'mp4', m3u8_id='hls', fatal=False))
             else:
                 fmt_profile = fmt.get('profile') or {}
-                f = {
+                formats.append({
                     'format_id': fmt_profile.get('name-short'),
                     'format_note': fmt_profile.get('name'),
                     'url': fmt_url,
@@ -35,16 +38,8 @@ class MinotoIE(InfoExtractor):
                     'filesize': int_or_none(fmt.get('filesize')),
                     'width': int_or_none(fmt.get('width')),
                     'height': int_or_none(fmt.get('height')),
-                }
-                codecs = fmt.get('codecs')
-                if codecs:
-                    codecs = codecs.split(',')
-                    if len(codecs) == 2:
-                        f.update({
-                            'vcodec': codecs[0],
-                            'acodec': codecs[1],
-                        })
-                formats.append(f)
+                    'codecs': parse_codecs(fmt.get('codecs')),
+                })
         self._sort_formats(formats)
 
         return {
diff --git a/youtube_dl/extractor/mitele.py b/youtube_dl/extractor/mitele.py
index 42759eae8..40f214a87 100644
--- a/youtube_dl/extractor/mitele.py
+++ b/youtube_dl/extractor/mitele.py
@@ -1,84 +1,14 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-import json
-import uuid
-
 from .common import InfoExtractor
-from .ooyala import OoyalaIE
-from ..compat import (
-    compat_str,
-    compat_urlparse,
-)
 from ..utils import (
     int_or_none,
-    extract_attributes,
-    determine_ext,
     smuggle_url,
     parse_duration,
 )
 
 
-class MiTeleBaseIE(InfoExtractor):
-    def _get_player_info(self, url, webpage):
-        player_data = extract_attributes(self._search_regex(
-            r'(?s)(<ms-video-player.+?</ms-video-player>)',
-            webpage, 'ms video player'))
-        video_id = player_data['data-media-id']
-        if player_data.get('data-cms-id') == 'ooyala':
-            return self.url_result(
-                'ooyala:%s' % video_id, ie=OoyalaIE.ie_key(), video_id=video_id)
-        config_url = compat_urlparse.urljoin(url, player_data['data-config'])
-        config = self._download_json(
-            config_url, video_id, 'Downloading config JSON')
-        mmc_url = config['services']['mmc']
-
-        duration = None
-        formats = []
-        for m_url in (mmc_url, mmc_url.replace('/flash.json', '/html5.json')):
-            mmc = self._download_json(
-                m_url, video_id, 'Downloading mmc JSON')
-            if not duration:
-                duration = int_or_none(mmc.get('duration'))
-            for location in mmc['locations']:
-                gat = self._proto_relative_url(location.get('gat'), 'http:')
-                gcp = location.get('gcp')
-                ogn = location.get('ogn')
-                if None in (gat, gcp, ogn):
-                    continue
-                token_data = {
-                    'gcp': gcp,
-                    'ogn': ogn,
-                    'sta': 0,
-                }
-                media = self._download_json(
-                    gat, video_id, data=json.dumps(token_data).encode('utf-8'),
-                    headers={
-                        'Content-Type': 'application/json;charset=utf-8',
-                        'Referer': url,
-                    })
-                stream = media.get('stream') or media.get('file')
-                if not stream:
-                    continue
-                ext = determine_ext(stream)
-                if ext == 'f4m':
-                    formats.extend(self._extract_f4m_formats(
-                        stream + '&hdcore=3.2.0&plugin=aasp-3.2.0.77.18',
-                        video_id, f4m_id='hds', fatal=False))
-                elif ext == 'm3u8':
-                    formats.extend(self._extract_m3u8_formats(
-                        stream, video_id, 'mp4', 'm3u8_native',
-                        m3u8_id='hls', fatal=False))
-        self._sort_formats(formats)
-
-        return {
-            'id': video_id,
-            'formats': formats,
-            'thumbnail': player_data.get('data-poster') or config.get('poster', {}).get('imageUrl'),
-            'duration': duration,
-        }
-
-
 class MiTeleIE(InfoExtractor):
     IE_DESC = 'mitele.es'
     _VALID_URL = r'https?://(?:www\.)?mitele\.es/(?:[^/]+/)+(?P<id>[^/]+)/player'
@@ -86,7 +16,7 @@ class MiTeleIE(InfoExtractor):
     _TESTS = [{
         'url': 'http://www.mitele.es/programas-tv/diario-de/57b0dfb9c715da65618b4afa/player',
         'info_dict': {
-            'id': '57b0dfb9c715da65618b4afa',
+            'id': 'FhYW1iNTE6J6H7NkQRIEzfne6t2quqPg',
             'ext': 'mp4',
             'title': 'Tor, la web invisible',
             'description': 'md5:3b6fce7eaa41b2d97358726378d9369f',
@@ -104,7 +34,7 @@ class MiTeleIE(InfoExtractor):
         # no explicit title
         'url': 'http://www.mitele.es/programas-tv/cuarto-milenio/57b0de3dc915da14058b4876/player',
         'info_dict': {
-            'id': '57b0de3dc915da14058b4876',
+            'id': 'oyNG1iNTE6TAPP-JmCjbwfwJqqMMX3Vq',
             'ext': 'mp4',
             'title': 'Cuarto Milenio Temporada 6 Programa 226',
             'description': 'md5:5ff132013f0cd968ffbf1f5f3538a65f',
@@ -128,40 +58,21 @@ class MiTeleIE(InfoExtractor):
     def _real_extract(self, url):
         video_id = self._match_id(url)
-        webpage = self._download_webpage(url, video_id)
-
-        gigya_url = self._search_regex(
-            r'<gigya-api>[^>]*</gigya-api>[^>]*<script\s+src="([^"]*)">[^>]*</script>',
-            webpage, 'gigya', default=None)
-        gigya_sc = self._download_webpage(
-            compat_urlparse.urljoin('http://www.mitele.es/', gigya_url),
-            video_id, 'Downloading gigya script')
-
-        # Get a appKey/uuid for getting the session key
-        appKey = self._search_regex(
-            r'constant\s*\(\s*["\']_appGridApplicationKey["\']\s*,\s*["\']([0-9a-f]+)',
-            gigya_sc, 'appKey')
-
-        session_json = self._download_json(
-            'https://appgrid-api.cloud.accedo.tv/session',
-            video_id, 'Downloading session keys', query={
-                'appKey': appKey,
-                'uuid': compat_str(uuid.uuid4()),
-            })
 
         paths = self._download_json(
-            'https://appgrid-api.cloud.accedo.tv/metadata/general_configuration,%20web_configuration',
-            video_id, 'Downloading paths JSON',
-            query={'sessionKey': compat_str(session_json['sessionKey'])})
+            'https://www.mitele.es/amd/agp/web/metadata/general_configuration',
+            video_id, 'Downloading paths JSON')
 
         ooyala_s = paths['general_configuration']['api_configuration']['ooyala_search']
+        base_url = ooyala_s.get('base_url', 'cdn-search-mediaset.carbyne.ps.ooyala.com')
+        full_path = ooyala_s.get('full_path', '/search/v1/full/providers/')
         source = self._download_json(
-            'http://%s%s%s/docs/%s' % (
-                ooyala_s['base_url'], ooyala_s['full_path'],
-                ooyala_s['provider_id'], video_id),
+            '%s://%s%s%s/docs/%s' % (
+                ooyala_s.get('protocol', 'https'), base_url, full_path,
+                ooyala_s.get('provider_id', '104951'), video_id),
             video_id, 'Downloading data JSON', query={
                 'include_titles': 'Series,Season',
-                'product_name': 'test',
+                'product_name': ooyala_s.get('product_name', 'test'),
                 'format': 'full',
             })['hits']['hits'][0]['_source']
diff --git a/youtube_dl/extractor/mlb.py b/youtube_dl/extractor/mlb.py
index 675ff6873..b907f6b49 100644
--- a/youtube_dl/extractor/mlb.py
+++ b/youtube_dl/extractor/mlb.py
@@ -1,96 +1,90 @@
 from __future__ import unicode_literals
 
-import re
-
-from .common import InfoExtractor
-from ..utils import (
-    parse_duration,
-    parse_iso8601,
-)
+from .nhl import NHLBaseIE
 
 
-class MLBIE(InfoExtractor):
+class MLBIE(NHLBaseIE):
     _VALID_URL = r'''(?x)
                     https?://
-                        (?:[\da-z_-]+\.)*mlb\.com/
+                        (?:[\da-z_-]+\.)*(?P<site>mlb)\.com/
                         (?:
                             (?:
-                                (?:.*?/)?video/(?:topic/[\da-z_-]+/)?(?:v|.*?/c-)|
+                                (?:[^/]+/)*c-|
                                 (?:
                                     shared/video/embed/(?:embed|m-internal-embed)\.html|
                                     (?:[^/]+/)+(?:play|index)\.jsp|
                                 )\?.*?\bcontent_id=
                             )
-                            (?P<id>n?\d+)|
-                            (?:[^/]+/)*(?P<path>[^/]+)
+                            (?P<id>\d+)
                         )
                     '''
+    _CONTENT_DOMAIN = 'content.mlb.com'
     _TESTS = [
         {
-            'url': 'http://m.mlb.com/sea/video/topic/51231442/v34698933/nymsea-ackley-robs-a-home-run-with-an-amazing-catch/?c_id=sea',
-            'md5': 'ff56a598c2cf411a9a38a69709e97079',
+            'url': 'https://www.mlb.com/mariners/video/ackleys-spectacular-catch/c-34698933',
+            'md5': '632358dacfceec06bad823b83d21df2d',
            'info_dict': {
                 'id': '34698933',
                 'ext': 'mp4',
                 'title': "Ackley's spectacular catch",
                 'description': 'md5:7f5a981eb4f3cbc8daf2aeffa2215bf0',
                 'duration': 66,
-                'timestamp': 1405980600,
-                'upload_date': '20140721',
+                'timestamp': 1405995000,
+                'upload_date': '20140722',
                 'thumbnail': r're:^https?://.*\.jpg$',
             },
         },
         {
-            'url': 'http://m.mlb.com/video/topic/81536970/v34496663/mianym-stanton-practices-for-the-home-run-derby',
-            'md5': 'd9c022c10d21f849f49c05ae12a8a7e9',
+            'url': 'https://www.mlb.com/video/stanton-prepares-for-derby/c-34496663',
+            'md5': 'bf2619bf9cacc0a564fc35e6aeb9219f',
             'info_dict': {
                 'id': '34496663',
                 'ext': 'mp4',
                 'title': 'Stanton prepares for Derby',
                 'description': 'md5:d00ce1e5fd9c9069e9c13ab4faedfa57',
                 'duration': 46,
-                'timestamp': 1405105800,
+                'timestamp': 1405120200,
                 'upload_date': '20140711',
                 'thumbnail': r're:^https?://.*\.jpg$',
             },
         },
         {
-            'url': 'http://m.mlb.com/video/topic/vtp_hrd_sponsor/v34578115/hrd-cespedes-wins-2014-gillette-home-run-derby',
-            'md5': '0e6e73d509321e142409b695eadd541f',
+            'url': 'https://www.mlb.com/video/cespedes-repeats-as-derby-champ/c-34578115',
+            'md5': '99bb9176531adc600b90880fb8be9328',
             'info_dict': {
                 'id': '34578115',
                 'ext': 'mp4',
                 'title': 'Cespedes repeats as Derby champ',
                 'description': 'md5:08df253ce265d4cf6fb09f581fafad07',
                 'duration': 488,
-                'timestamp': 1405399936,
+                'timestamp': 1405414336,
                 'upload_date': '20140715',
                 'thumbnail': r're:^https?://.*\.jpg$',
             },
         },
         {
-            'url': 'http://m.mlb.com/video/v34577915/bautista-on-derby-captaining-duties-his-performance',
-            'md5': 'b8fd237347b844365d74ea61d4245967',
+            'url': 'https://www.mlb.com/video/bautista-on-home-run-derby/c-34577915',
+            'md5': 'da8b57a12b060e7663ee1eebd6f330ec',
             'info_dict': {
                 'id': '34577915',
                 'ext': 'mp4',
                 'title': 'Bautista on Home Run Derby',
                 'description': 'md5:b80b34031143d0986dddc64a8839f0fb',
                 'duration': 52,
-                'timestamp': 1405390722,
+                'timestamp': 1405405122,
                 'upload_date': '20140715',
                 'thumbnail': r're:^https?://.*\.jpg$',
             },
         },
         {
-            'url': 'http://m.mlb.com/news/article/118550098/blue-jays-kevin-pillar-goes-spidey-up-the-wall-to-rob-tim-beckham-of-a-homer',
-            'md5': 'aafaf5b0186fee8f32f20508092f8111',
+            'url': 'https://www.mlb.com/news/blue-jays-kevin-pillar-goes-spidey-up-the-wall-to-rob-tim-beckham-of-a-homer/c-118550098',
+            'md5': 'e09e37b552351fddbf4d9e699c924d68',
             'info_dict': {
                 'id': '75609783',
                 'ext': 'mp4',
                 'title': 'Must C: Pillar climbs for catch',
                 'description': '4/15/15: Blue Jays outfielder Kevin Pillar continues his defensive dominance by climbing the wall in left to rob Tim Beckham of a home run',
-                'timestamp': 1429124820,
+                'timestamp': 1429139220,
                 'upload_date': '20150415',
             }
         },
@@ -111,7 +105,7 @@ class MLBIE(InfoExtractor):
             'only_matching': True,
         },
         {
-            'url': 'http://m.cardinals.mlb.com/stl/video/v51175783/atlstl-piscotty-makes-great-sliding-catch-on-line/?partnerId=as_mlb_20150321_42500876&adbid=579409712979910656&adbpl=tw&adbpr=52847728',
+            'url': 'https://www.mlb.com/cardinals/video/piscottys-great-sliding-catch/c-51175783',
             'only_matching': True,
         },
         {
@@ -120,58 +114,7 @@ class MLBIE(InfoExtractor):
             'only_matching': True,
         },
         {
-            'url': 'http://washington.nationals.mlb.com/mlb/gameday/index.jsp?c_id=was&gid=2015_05_09_atlmlb_wasmlb_1&lang=en&content_id=108309983&mode=video#',
+            'url': 'https://www.mlb.com/cut4/carlos-gomez-borrowed-sunglasses-from-an-as-fan/c-278912842',
             'only_matching': True,
         }
     ]
-
-    def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
-
-        if not video_id:
-            video_path = mobj.group('path')
-            webpage = self._download_webpage(url, video_path)
-            video_id = self._search_regex(
-                [r'data-video-?id="(\d+)"', r'content_id=(\d+)'], webpage, 'video id')
-
-        detail = self._download_xml(
-            'http://m.mlb.com/gen/multimedia/detail/%s/%s/%s/%s.xml'
-            % (video_id[-3], video_id[-2], video_id[-1], video_id), video_id)
-
-        title = detail.find('./headline').text
-        description = detail.find('./big-blurb').text
-        duration = parse_duration(detail.find('./duration').text)
-        timestamp = parse_iso8601(detail.attrib['date'][:-5])
-
-        thumbnails = [{
-            'url': thumbnail.text,
-        } for thumbnail in detail.findall('./thumbnailScenarios/thumbnailScenario')]
-
-        formats = []
-        for media_url in detail.findall('./url'):
-            playback_scenario = media_url.attrib['playback_scenario']
-            fmt = {
-                'url': media_url.text,
-                'format_id': playback_scenario,
-            }
-            m = re.search(r'(?P<vbr>\d+)K_(?P<width>\d+)X(?P<height>\d+)', playback_scenario)
-            if m:
-                fmt.update({
-                    'vbr': int(m.group('vbr')) * 1000,
-                    'width': int(m.group('width')),
-                    'height': int(m.group('height')),
-                })
-            formats.append(fmt)
-
-        self._sort_formats(formats)
-
-        return {
-            'id': video_id,
-            'title': title,
-            'description': description,
-            'duration': duration,
-            'timestamp': timestamp,
-            'formats': formats,
-            'thumbnails': thumbnails,
-        }
diff --git a/youtube_dl/extractor/moniker.py b/youtube_dl/extractor/moniker.py
deleted file mode 100644
index b208820fe..000000000
--- a/youtube_dl/extractor/moniker.py
+++ /dev/null
@@ -1,116 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-import os.path
-import re
-
-from .common import InfoExtractor
-from ..utils import (
-    ExtractorError,
-    remove_start,
-    sanitized_Request,
-    urlencode_postdata,
-)
-
-
-class MonikerIE(InfoExtractor):
-    IE_DESC = 'allmyvideos.net and vidspot.net'
-    _VALID_URL = r'https?://(?:www\.)?(?:allmyvideos|vidspot)\.net/(?:(?:2|v)/v-)?(?P<id>[a-zA-Z0-9_-]+)'
-
-    _TESTS = [{
-        'url': 'http://allmyvideos.net/jih3nce3x6wn',
-        'md5': '710883dee1bfc370ecf9fa6a89307c88',
-        'info_dict': {
-            'id': 'jih3nce3x6wn',
-            'ext': 'mp4',
-            'title': 'youtube-dl test video',
-        },
-    }, {
-        'url': 'http://allmyvideos.net/embed-jih3nce3x6wn',
-        'md5': '710883dee1bfc370ecf9fa6a89307c88',
-        'info_dict': {
-            'id': 'jih3nce3x6wn',
-            'ext': 'mp4',
-            'title': 'youtube-dl test video',
-        },
-    }, {
-        'url': 'http://vidspot.net/l2ngsmhs8ci5',
-        'md5': '710883dee1bfc370ecf9fa6a89307c88',
-        'info_dict': {
-            'id': 'l2ngsmhs8ci5',
-            'ext': 'mp4',
-            'title': 'youtube-dl test video',
-        },
-    }, {
-        'url': 'https://www.vidspot.net/l2ngsmhs8ci5',
-        'only_matching': True,
-    }, {
-        'url': 'http://vidspot.net/2/v-ywDf99',
-        'md5': '5f8254ce12df30479428b0152fb8e7ba',
-        'info_dict': {
-            'id': 'ywDf99',
-            'ext': 'mp4',
-            'title': 'IL FAIT LE MALIN EN PORSHE CAYENNE ( mais pas pour longtemps)',
-            'description': 'IL FAIT LE MALIN EN PORSHE CAYENNE.',
-        },
-    }, {
-        'url': 'http://allmyvideos.net/v/v-HXZm5t',
-        'only_matching': True,
-    }]
-
-    def _real_extract(self, url):
-        orig_video_id = self._match_id(url)
-        video_id = remove_start(orig_video_id, 'embed-')
-        url = url.replace(orig_video_id, video_id)
-        assert re.match(self._VALID_URL, url) is not None
-        orig_webpage = self._download_webpage(url, video_id)
-
-        if '>File Not Found<' in orig_webpage:
-            raise ExtractorError('Video %s does not exist' % video_id, expected=True)
-
-        error = self._search_regex(
-            r'class="err">([^<]+)<', orig_webpage, 'error', default=None)
-        if error:
-            raise ExtractorError(
-                '%s returned error: %s' % (self.IE_NAME, error), expected=True)
-
-        builtin_url = self._search_regex(
-            r'<iframe[^>]+src=(["\'])(?P<url>.+?/builtin-.+?)\1',
-            orig_webpage, 'builtin URL', default=None, group='url')
-
-        if builtin_url:
-            req = sanitized_Request(builtin_url)
-            req.add_header('Referer', url)
-            webpage = self._download_webpage(req, video_id, 'Downloading builtin page')
-            title = self._og_search_title(orig_webpage).strip()
-            description = self._og_search_description(orig_webpage).strip()
-        else:
-            fields = re.findall(r'type="hidden" name="(.+?)"\s* value="?(.+?)">', orig_webpage)
-            data = dict(fields)
-
-            post = urlencode_postdata(data)
-            headers = {
-                b'Content-Type': b'application/x-www-form-urlencoded',
-            }
-            req = sanitized_Request(url, post, headers)
-            webpage = self._download_webpage(
-                req, video_id, note='Downloading video page ...')
-
-            title = os.path.splitext(data['fname'])[0]
-            description = None
-
-        # Could be several links with different quality
-        links = re.findall(r'"file" : "?(.+?)",', webpage)
-        # Assume the links are ordered in quality
-        formats = [{
-            'url': l,
-            'quality': i,
-        } for i, l in enumerate(links)]
-        self._sort_formats(formats)
-
-        return {
-            'id': video_id,
-            'title': title,
-            'description': description,
-            'formats': formats,
-        }
diff --git a/youtube_dl/extractor/motherless.py b/youtube_dl/extractor/motherless.py
index e24396e79..bed5645f2 100644
--- a/youtube_dl/extractor/motherless.py
+++ b/youtube_dl/extractor/motherless.py
@@ -77,8 +77,11 @@ class MotherlessIE(InfoExtractor):
         title = self._html_search_regex(
             r'id="view-upload-title">\s+([^<]+)<', webpage, 'title')
-        video_url = self._html_search_regex(
-            r'setup\(\{\s+"file".+: "([^"]+)",', webpage, 'video URL')
+        video_url = (self._html_search_regex(
+            (r'setup\(\{\s*["\']file["\']\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1',
+             r'fileurl\s*=\s*(["\'])(?P<url>(?:(?!\1).)+)\1'),
+            webpage, 'video URL', default=None, group='url') or
+            'http://cdn4.videos.motherlessmedia.com/videos/%s.mp4?fs=opencloud' % video_id)
         age_limit = self._rta_search(webpage)
         view_count = str_to_int(self._html_search_regex(
             r'<strong>Views</strong>\s+([^<]+)<',
@@ -120,7 +123,7 @@ class MotherlessIE(InfoExtractor):
 
 class MotherlessGroupIE(InfoExtractor):
-    _VALID_URL = 'https?://(?:www\.)?motherless\.com/gv?/(?P<id>[a-z0-9_]+)'
+    _VALID_URL = r'https?://(?:www\.)?motherless\.com/gv?/(?P<id>[a-z0-9_]+)'
     _TESTS = [{
         'url': 'http://motherless.com/g/movie_scenes',
         'info_dict': {
diff --git a/youtube_dl/extractor/makerschannel.py b/youtube_dl/extractor/mychannels.py
similarity index 59%
rename from youtube_dl/extractor/makerschannel.py
rename to youtube_dl/extractor/mychannels.py
index f5d00e61d..b1ffe7848 100644
--- a/youtube_dl/extractor/makerschannel.py
+++ b/youtube_dl/extractor/mychannels.py
@@ -6,17 +6,17 @@
 import re
 
 from .common import InfoExtractor
 
 
-class MakersChannelIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?makerschannel\.com/.*(?P<id_type>video|production)_id=(?P<id>[0-9]+)'
+class MyChannelsIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?mychannels\.com/.*(?P<id_type>video|production)_id=(?P<id>[0-9]+)'
     _TEST = {
-        'url': 'http://makerschannel.com/en/zoomin/community-highlights?video_id=849',
-        'md5': '624a512c6969236b5967bf9286345ad1',
+        'url': 'https://mychannels.com/missholland/miss-holland?production_id=3416',
+        'md5': 'b8993daad4262dd68d89d651c0c52c45',
         'info_dict': {
-            'id': '849',
+            'id': 'wUUDZZep6vQD',
             'ext': 'mp4',
-            'title': 'Landing a bus on a plane is an epic win',
-            'uploader': 'ZoomIn',
-            'description': 'md5:cd9cca2ea7b69b78be81d07020c97139',
+            'title': 'Miss Holland joins VOTE LEAVE',
+            'description': 'Miss Holland | #13 Not a potato',
+            'uploader': 'Miss Holland',
         }
     }
 
@@ -27,12 +27,12 @@ class MakersChannelIE(InfoExtractor):
         def extract_data_val(attr, fatal=False):
             return self._html_search_regex(r'data-%s\s*=\s*"([^"]+)"' % attr, video_data, attr, fatal=fatal)
 
-        minoto_id = self._search_regex(r'/id/([a-zA-Z0-9]+)', extract_data_val('video-src', True), 'minoto id')
+        minoto_id = extract_data_val('minoto-id') or self._search_regex(r'/id/([a-zA-Z0-9]+)', extract_data_val('video-src', True), 'minoto id')
 
         return {
             '_type': 'url_transparent',
             'url': 'minoto:%s' % minoto_id,
-            'id': extract_data_val('video-id', True),
+            'id': url_id,
             'title': extract_data_val('title', True),
             'description': extract_data_val('description'),
             'thumbnail': extract_data_val('image'),
diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py
index 1b1722cfa..c843f8649 100644
--- a/youtube_dl/extractor/nbc.py
+++ b/youtube_dl/extractor/nbc.py
@@ -1,7 +1,8 @@
 from __future__ import unicode_literals
 
-import re
 import base64
+import json
+import re
 
 from .common import InfoExtractor
 from .theplatform import ThePlatformIE
@@ -175,6 +176,65 @@ class NBCSportsIE(InfoExtractor):
             NBCSportsVPlayerIE._extract_url(webpage), 'NBCSportsVPlayer')
 
 
+class NBCSportsStreamIE(AdobePassIE):
+    _VALID_URL = r'https?://stream\.nbcsports\.com/.+?\bpid=(?P<id>\d+)'
+    _TEST = {
+        'url': 'http://stream.nbcsports.com/nbcsn/generic?pid=206559',
+        'info_dict': {
+            'id': '206559',
+            'ext': 'mp4',
+            'title': 'Amgen Tour of California Women\'s Recap',
+            'description': 'md5:66520066b3b5281ada7698d0ea2aa894',
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+        'skip': 'Requires Adobe Pass Authentication',
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        live_source = self._download_json(
+            'http://stream.nbcsports.com/data/live_sources_%s.json' % video_id,
+            video_id)
+        video_source = live_source['videoSources'][0]
+        title = video_source['title']
+        source_url = None
+        for k in ('source', 'msl4source', 'iossource', 'hlsv4'):
+            sk = k + 'Url'
+            source_url = video_source.get(sk) or video_source.get(sk + 'Alt')
+            if source_url:
+                break
+        else:
+            source_url = video_source['ottStreamUrl']
+        is_live = video_source.get('type') == 'live' or video_source.get('status') == 'Live'
+        resource = self._get_mvpd_resource('nbcsports', title, video_id, '')
+        token = self._extract_mvpd_auth(url, video_id, 'nbcsports', resource)
+        tokenized_url = self._download_json(
+            'https://token.playmakerservices.com/cdn',
+            video_id, data=json.dumps({
+                'requestorId': 'nbcsports',
+                'pid': video_id,
+                'application': 'NBCSports',
+                'version': 'v1',
+                'platform': 'desktop',
+                'cdn': 'akamai',
+                'url': video_source['sourceUrl'],
+                'token':
base64.b64encode(token.encode()).decode(), + 'resourceId': base64.b64encode(resource.encode()).decode(), + }).encode())['tokenizedUrl'] + formats = self._extract_m3u8_formats(tokenized_url, video_id, 'mp4') + self._sort_formats(formats) + return { + 'id': video_id, + 'title': self._live_title(title) if is_live else title, + 'description': live_source.get('description'), + 'formats': formats, + 'is_live': is_live, + } + + class CSNNEIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?csnne\.com/video/(?P[0-9a-z-]+)' diff --git a/youtube_dl/extractor/nexx.py b/youtube_dl/extractor/nexx.py index 5e46a75c0..82d526c22 100644 --- a/youtube_dl/extractor/nexx.py +++ b/youtube_dl/extractor/nexx.py @@ -29,14 +29,13 @@ class NexxIE(InfoExtractor): _TESTS = [{ # movie 'url': 'https://api.nexx.cloud/v3/748/videos/byid/128907', - 'md5': '828cea195be04e66057b846288295ba1', + 'md5': '31899fd683de49ad46f4ee67e53e83fe', 'info_dict': { 'id': '128907', 'ext': 'mp4', 'title': 'Stiftung Warentest', 'alt_title': 'Wie ein Test abläuft', 'description': 'md5:d1ddb1ef63de721132abd38639cc2fd2', - 'release_year': 2013, 'creator': 'SPIEGEL TV', 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 2509, @@ -62,6 +61,7 @@ class NexxIE(InfoExtractor): 'params': { 'skip_download': True, }, + 'skip': 'HTTP Error 404: Not Found', }, { # does not work via arc 'url': 'nexx:741:1269984', @@ -71,12 +71,26 @@ class NexxIE(InfoExtractor): 'ext': 'mp4', 'title': '1 TAG ohne KLO... wortwörtlich! 😑', 'alt_title': '1 TAG ohne KLO... wortwörtlich! 😑', - 'description': 'md5:4604539793c49eda9443ab5c5b1d612f', 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 607, 'timestamp': 1518614955, 'upload_date': '20180214', }, + }, { + # free cdn from http://www.spiegel.de/video/eifel-zoo-aufregung-um-ausgebrochene-raubtiere-video-99018031.html + 'url': 'nexx:747:1533779', + 'md5': '6bf6883912b82b7069fb86c2297e9893', + 'info_dict': { + 'id': '1533779', + 'ext': 'mp4', + 'title': 'Aufregung um ausgebrochene Raubtiere', + 'alt_title': 'Eifel-Zoo', + 'description': 'md5:f21375c91c74ad741dcb164c427999d2', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 111, + 'timestamp': 1527874460, + 'upload_date': '20180601', + }, }, { 'url': 'https://api.nexxcdn.com/v3/748/videos/byid/128907', 'only_matching': True, @@ -141,6 +155,139 @@ class NexxIE(InfoExtractor): self._handle_error(result) return result['result'] + def _extract_free_formats(self, video, video_id): + stream_data = video['streamdata'] + cdn = stream_data['cdnType'] + assert cdn == 'free' + + hash = video['general']['hash'] + + ps = compat_str(stream_data['originalDomain']) + if stream_data['applyFolderHierarchy'] == 1: + s = ('%04d' % int(video_id))[::-1] + ps += '/%s/%s' % (s[0:2], s[2:4]) + ps += '/%s/%s_' % (video_id, hash) + + t = 'http://%s' + ps + fd = stream_data['azureFileDistribution'].split(',') + cdn_provider = stream_data['cdnProvider'] + + def p0(p): + return '_%s' % p if stream_data['applyAzureStructure'] == 1 else '' + + formats = [] + if cdn_provider == 'ak': + t += ',' + for i in fd: + p = i.split(':') + t += p[1] + p0(int(p[0])) + ',' + t += '.mp4.csmil/master.%s' + elif cdn_provider == 'ce': + k = t.split('/') + h = k.pop() + http_base = t = '/'.join(k) + http_base = http_base % stream_data['cdnPathHTTP'] + t += '/asset.ism/manifest.%s?dcp_ver=aos4&videostream=' + for i in fd: + p = i.split(':') + tbr = int(p[0]) + filename = '%s%s%s.mp4' % (h, p[1], p0(tbr)) + f = { + 'url': http_base + '/' + filename, + 'format_id': '%s-http-%d' % (cdn, tbr), + 'tbr': tbr, + } + 
width_height = p[1].split('x') + if len(width_height) == 2: + f.update({ + 'width': int_or_none(width_height[0]), + 'height': int_or_none(width_height[1]), + }) + formats.append(f) + a = filename + ':%s' % (tbr * 1000) + t += a + ',' + t = t[:-1] + '&audiostream=' + a.split(':')[0] + else: + assert False + + if cdn_provider == 'ce': + formats.extend(self._extract_mpd_formats( + t % (stream_data['cdnPathDASH'], 'mpd'), video_id, + mpd_id='%s-dash' % cdn, fatal=False)) + formats.extend(self._extract_m3u8_formats( + t % (stream_data['cdnPathHLS'], 'm3u8'), video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='%s-hls' % cdn, fatal=False)) + + return formats + + def _extract_azure_formats(self, video, video_id): + stream_data = video['streamdata'] + cdn = stream_data['cdnType'] + assert cdn == 'azure' + + azure_locator = stream_data['azureLocator'] + + def get_cdn_shield_base(shield_type='', static=False): + for secure in ('', 's'): + cdn_shield = stream_data.get('cdnShield%sHTTP%s' % (shield_type, secure.upper())) + if cdn_shield: + return 'http%s://%s' % (secure, cdn_shield) + else: + if 'fb' in stream_data['azureAccount']: + prefix = 'df' if static else 'f' + else: + prefix = 'd' if static else 'p' + account = int(stream_data['azureAccount'].replace('nexxplayplus', '').replace('nexxplayfb', '')) + return 'http://nx-%s%02d.akamaized.net/' % (prefix, account) + + language = video['general'].get('language_raw') or '' + + azure_stream_base = get_cdn_shield_base() + is_ml = ',' in language + azure_manifest_url = '%s%s/%s_src%s.ism/Manifest' % ( + azure_stream_base, azure_locator, video_id, ('_manifest' if is_ml else '')) + '%s' + + protection_token = try_get( + video, lambda x: x['protectiondata']['token'], compat_str) + if protection_token: + azure_manifest_url += '?hdnts=%s' % protection_token + + formats = self._extract_m3u8_formats( + azure_manifest_url % '(format=m3u8-aapl)', + video_id, 'mp4', 'm3u8_native', + m3u8_id='%s-hls' % cdn, fatal=False) + formats.extend(self._extract_mpd_formats( + azure_manifest_url % '(format=mpd-time-csf)', + video_id, mpd_id='%s-dash' % cdn, fatal=False)) + formats.extend(self._extract_ism_formats( + azure_manifest_url % '', video_id, ism_id='%s-mss' % cdn, fatal=False)) + + azure_progressive_base = get_cdn_shield_base('Prog', True) + azure_file_distribution = stream_data.get('azureFileDistribution') + if azure_file_distribution: + fds = azure_file_distribution.split(',') + if fds: + for fd in fds: + ss = fd.split(':') + if len(ss) == 2: + tbr = int_or_none(ss[0]) + if tbr: + f = { + 'url': '%s%s/%s_src_%s_%d.mp4' % ( + azure_progressive_base, azure_locator, video_id, ss[1], tbr), + 'format_id': '%s-http-%d' % (cdn, tbr), + 'tbr': tbr, + } + width_height = ss[1].split('x') + if len(width_height) == 2: + f.update({ + 'width': int_or_none(width_height[0]), + 'height': int_or_none(width_height[1]), + }) + formats.append(f) + + return formats + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) domain_id = mobj.group('domain_id') or mobj.group('domain_id_s') @@ -220,72 +367,15 @@ class NexxIE(InfoExtractor): general = video['general'] title = general['title'] - stream_data = video['streamdata'] - language = general.get('language_raw') or '' + cdn = video['streamdata']['cdnType'] - # TODO: reverse more cdns - - cdn = stream_data['cdnType'] - assert cdn == 'azure' - - azure_locator = stream_data['azureLocator'] - - def get_cdn_shield_base(shield_type='', static=False): - for secure in ('', 's'): - cdn_shield = stream_data.get('cdnShield%sHTTP%s' 
% (shield_type, secure.upper())) - if cdn_shield: - return 'http%s://%s' % (secure, cdn_shield) - else: - if 'fb' in stream_data['azureAccount']: - prefix = 'df' if static else 'f' - else: - prefix = 'd' if static else 'p' - account = int(stream_data['azureAccount'].replace('nexxplayplus', '').replace('nexxplayfb', '')) - return 'http://nx-%s%02d.akamaized.net/' % (prefix, account) - - azure_stream_base = get_cdn_shield_base() - is_ml = ',' in language - azure_manifest_url = '%s%s/%s_src%s.ism/Manifest' % ( - azure_stream_base, azure_locator, video_id, ('_manifest' if is_ml else '')) + '%s' - - protection_token = try_get( - video, lambda x: x['protectiondata']['token'], compat_str) - if protection_token: - azure_manifest_url += '?hdnts=%s' % protection_token - - formats = self._extract_m3u8_formats( - azure_manifest_url % '(format=m3u8-aapl)', - video_id, 'mp4', 'm3u8_native', - m3u8_id='%s-hls' % cdn, fatal=False) - formats.extend(self._extract_mpd_formats( - azure_manifest_url % '(format=mpd-time-csf)', - video_id, mpd_id='%s-dash' % cdn, fatal=False)) - formats.extend(self._extract_ism_formats( - azure_manifest_url % '', video_id, ism_id='%s-mss' % cdn, fatal=False)) - - azure_progressive_base = get_cdn_shield_base('Prog', True) - azure_file_distribution = stream_data.get('azureFileDistribution') - if azure_file_distribution: - fds = azure_file_distribution.split(',') - if fds: - for fd in fds: - ss = fd.split(':') - if len(ss) == 2: - tbr = int_or_none(ss[0]) - if tbr: - f = { - 'url': '%s%s/%s_src_%s_%d.mp4' % ( - azure_progressive_base, azure_locator, video_id, ss[1], tbr), - 'format_id': '%s-http-%d' % (cdn, tbr), - 'tbr': tbr, - } - width_height = ss[1].split('x') - if len(width_height) == 2: - f.update({ - 'width': int_or_none(width_height[0]), - 'height': int_or_none(width_height[1]), - }) - formats.append(f) + if cdn == 'azure': + formats = self._extract_azure_formats(video, video_id) + elif cdn == 'free': + formats = self._extract_free_formats(video, video_id) + else: + # TODO: reverse more cdns + assert False self._sort_formats(formats) diff --git a/youtube_dl/extractor/nhl.py b/youtube_dl/extractor/nhl.py index 62ce800c0..cf440f713 100644 --- a/youtube_dl/extractor/nhl.py +++ b/youtube_dl/extractor/nhl.py @@ -1,18 +1,10 @@ from __future__ import unicode_literals import re -import json -import os from .common import InfoExtractor -from ..compat import ( - compat_urlparse, - compat_urllib_parse_urlencode, - compat_urllib_parse_urlparse, - compat_str, -) +from ..compat import compat_str from ..utils import ( - unified_strdate, determine_ext, int_or_none, parse_iso8601, @@ -20,236 +12,77 @@ from ..utils import ( ) -class NHLBaseInfoExtractor(InfoExtractor): - @staticmethod - def _fix_json(json_string): - return json_string.replace('\\\'', '\'') +class NHLBaseIE(InfoExtractor): + def _real_extract(self, url): + site, tmp_id = re.match(self._VALID_URL, url).groups() + video_data = self._download_json( + 'https://%s/%s/%sid/v1/%s/details/web-v1.json' + % (self._CONTENT_DOMAIN, site[:3], 'item/' if site == 'mlb' else '', tmp_id), tmp_id) + if video_data.get('type') != 'video': + video_data = video_data['media'] + video = video_data.get('video') + if video: + video_data = video + else: + videos = video_data.get('videos') + if videos: + video_data = videos[0] - def _real_extract_video(self, video_id): - vid_parts = video_id.split(',') - if len(vid_parts) == 3: - video_id = '%s0%s%s-X-h' % (vid_parts[0][:4], vid_parts[1], vid_parts[2].rjust(4, '0')) - json_url = 
'http://video.nhl.com/videocenter/servlets/playlist?ids=%s&format=json' % video_id - data = self._download_json( - json_url, video_id, transform_source=self._fix_json) - return self._extract_video(data[0]) + video_id = compat_str(video_data['id']) + title = video_data['title'] - def _extract_video(self, info): - video_id = info['id'] - self.report_extraction(video_id) + formats = [] + for playback in video_data.get('playbacks', []): + playback_url = playback.get('url') + if not playback_url: + continue + ext = determine_ext(playback_url) + if ext == 'm3u8': + m3u8_formats = self._extract_m3u8_formats( + playback_url, video_id, 'mp4', 'm3u8_native', + m3u8_id=playback.get('name', 'hls'), fatal=False) + self._check_formats(m3u8_formats, video_id) + formats.extend(m3u8_formats) + else: + height = int_or_none(playback.get('height')) + formats.append({ + 'format_id': playback.get('name', 'http' + ('-%dp' % height if height else '')), + 'url': playback_url, + 'width': int_or_none(playback.get('width')), + 'height': height, + 'tbr': int_or_none(self._search_regex(r'_(\d+)[kK]', playback_url, 'bitrate', default=None)), + }) + self._sort_formats(formats) - initial_video_url = info['publishPoint'] - if info['formats'] == '1': - parsed_url = compat_urllib_parse_urlparse(initial_video_url) - filename, ext = os.path.splitext(parsed_url.path) - path = '%s_sd%s' % (filename, ext) - data = compat_urllib_parse_urlencode({ - 'type': 'fvod', - 'path': compat_urlparse.urlunparse(parsed_url[:2] + (path,) + parsed_url[3:]) + thumbnails = [] + cuts = video_data.get('image', {}).get('cuts') or [] + if isinstance(cuts, dict): + cuts = cuts.values() + for thumbnail_data in cuts: + thumbnail_url = thumbnail_data.get('src') + if not thumbnail_url: + continue + thumbnails.append({ + 'url': thumbnail_url, + 'width': int_or_none(thumbnail_data.get('width')), + 'height': int_or_none(thumbnail_data.get('height')), }) - path_url = 'http://video.nhl.com/videocenter/servlets/encryptvideopath?' 
+ data - path_doc = self._download_xml( - path_url, video_id, 'Downloading final video url') - video_url = path_doc.find('path').text - else: - video_url = initial_video_url - - join = compat_urlparse.urljoin - ret = { - 'id': video_id, - 'title': info['name'], - 'url': video_url, - 'description': info['description'], - 'duration': int(info['duration']), - 'thumbnail': join(join(video_url, '/u/'), info['bigImage']), - 'upload_date': unified_strdate(info['releaseDate'].split('.')[0]), - } - if video_url.startswith('rtmp:'): - mobj = re.match(r'(?Prtmp://[^/]+/(?P[a-z0-9/]+))/(?Pmp4:.*)', video_url) - ret.update({ - 'tc_url': mobj.group('tc_url'), - 'play_path': mobj.group('play_path'), - 'app': mobj.group('app'), - 'no_resume': True, - }) - return ret - - -class NHLVideocenterIE(NHLBaseInfoExtractor): - IE_NAME = 'nhl.com:videocenter' - _VALID_URL = r'https?://video(?P\.[^.]*)?\.nhl\.com/videocenter/(?:console|embed)?(?:\?(?:.*?[?&])?)(?:id|hlg|playlist)=(?P[-0-9a-zA-Z,]+)' - - _TESTS = [{ - 'url': 'http://video.canucks.nhl.com/videocenter/console?catid=6?id=453614', - 'md5': 'db704a4ea09e8d3988c85e36cc892d09', - 'info_dict': { - 'id': '453614', - 'ext': 'mp4', - 'title': 'Quick clip: Weise 4-3 goal vs Flames', - 'description': 'Dale Weise scores his first of the season to put the Canucks up 4-3.', - 'duration': 18, - 'upload_date': '20131006', - }, - }, { - 'url': 'http://video.nhl.com/videocenter/console?id=2014020024-628-h', - 'md5': 'd22e82bc592f52d37d24b03531ee9696', - 'info_dict': { - 'id': '2014020024-628-h', - 'ext': 'mp4', - 'title': 'Alex Galchenyuk Goal on Ray Emery (14:40/3rd)', - 'description': 'Home broadcast - Montreal Canadiens at Philadelphia Flyers - October 11, 2014', - 'duration': 0, - 'upload_date': '20141011', - }, - }, { - 'url': 'http://video.mapleleafs.nhl.com/videocenter/console?id=58665&catid=802', - 'md5': 'c78fc64ea01777e426cfc202b746c825', - 'info_dict': { - 'id': '58665', - 'ext': 'flv', - 'title': 'Classic Game In Six - April 22, 1979', - 'description': 'It was the last playoff game for the Leafs in the decade, and the last time the Leafs and Habs played in the playoffs. 
Great game, not a great ending.', - 'duration': 400, - 'upload_date': '20100129' - }, - }, { - 'url': 'http://video.flames.nhl.com/videocenter/console?id=630616', - 'only_matching': True, - }, { - 'url': 'http://video.nhl.com/videocenter/?id=736722', - 'only_matching': True, - }, { - 'url': 'http://video.nhl.com/videocenter/console?hlg=20142015,2,299&lang=en', - 'md5': '076fcb88c255154aacbf0a7accc3f340', - 'info_dict': { - 'id': '2014020299-X-h', - 'ext': 'mp4', - 'title': 'Penguins at Islanders / Game Highlights', - 'description': 'Home broadcast - Pittsburgh Penguins at New York Islanders - November 22, 2014', - 'duration': 268, - 'upload_date': '20141122', - } - }, { - 'url': 'http://video.oilers.nhl.com/videocenter/console?id=691469&catid=4', - 'info_dict': { - 'id': '691469', - 'ext': 'mp4', - 'title': 'RAW | Craig MacTavish Full Press Conference', - 'description': 'Oilers GM Craig MacTavish addresses the media at Rexall Place on Friday.', - 'upload_date': '20141205', - }, - 'params': { - 'skip_download': True, # Requires rtmpdump - } - }, { - 'url': 'http://video.nhl.com/videocenter/embed?playlist=836127', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - return self._real_extract_video(video_id) - - -class NHLNewsIE(NHLBaseInfoExtractor): - IE_NAME = 'nhl.com:news' - IE_DESC = 'NHL news' - _VALID_URL = r'https?://(?:.+?\.)?nhl\.com/(?:ice|club)/news\.html?(?:\?(?:.*?[?&])?)id=(?P[-0-9a-zA-Z]+)' - - _TESTS = [{ - 'url': 'http://www.nhl.com/ice/news.htm?id=750727', - 'md5': '4b3d1262e177687a3009937bd9ec0be8', - 'info_dict': { - 'id': '736722', - 'ext': 'mp4', - 'title': 'Cal Clutterbuck has been fined $2,000', - 'description': 'md5:45fe547d30edab88b23e0dd0ab1ed9e6', - 'duration': 37, - 'upload_date': '20150128', - }, - }, { - # iframe embed - 'url': 'http://sabres.nhl.com/club/news.htm?id=780189', - 'md5': '9f663d1c006c90ac9fb82777d4294e12', - 'info_dict': { - 'id': '836127', - 'ext': 'mp4', - 'title': 'Morning Skate: OTT vs. 
BUF (9/23/15)', - 'description': "Brian Duff chats with Tyler Ennis prior to Buffalo's first preseason home game.", - 'duration': 93, - 'upload_date': '20150923', - }, - }] - - def _real_extract(self, url): - news_id = self._match_id(url) - webpage = self._download_webpage(url, news_id) - video_id = self._search_regex( - [r'pVid(\d+)', r"nlid\s*:\s*'(\d+)'", - r']+src=["\']https?://video.*?\.nhl\.com/videocenter/embed\?.*\bplaylist=(\d+)'], - webpage, 'video id') - return self._real_extract_video(video_id) - - -class NHLVideocenterCategoryIE(NHLBaseInfoExtractor): - IE_NAME = 'nhl.com:videocenter:category' - IE_DESC = 'NHL videocenter category' - _VALID_URL = r'https?://video\.(?P[^.]*)\.nhl\.com/videocenter/(console\?[^(id=)]*catid=(?P[0-9]+)(?![&?]id=).*?)?$' - _TEST = { - 'url': 'http://video.canucks.nhl.com/videocenter/console?catid=999', - 'info_dict': { - 'id': '999', - 'title': 'Highlights', - }, - 'playlist_count': 12, - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - team = mobj.group('team') - webpage = self._download_webpage(url, team) - cat_id = self._search_regex( - [r'var defaultCatId = "(.+?)";', - r'{statusIndex:0,index:0,.*?id:(.*?),'], - webpage, 'category id') - playlist_title = self._html_search_regex( - r'tab0"[^>]*?>(.*?)', - webpage, 'playlist title', flags=re.DOTALL).lower().capitalize() - - data = compat_urllib_parse_urlencode({ - 'cid': cat_id, - # This is the default value - 'count': 12, - 'ptrs': 3, - 'format': 'json', - }) - path = '/videocenter/servlets/browse?' + data - request_url = compat_urlparse.urljoin(url, path) - response = self._download_webpage(request_url, playlist_title) - response = self._fix_json(response) - if not response.strip(): - self._downloader.report_warning('Got an empty response, trying ' - 'adding the "newvideos" parameter') - response = self._download_webpage(request_url + '&newvideos=true', - playlist_title) - response = self._fix_json(response) - videos = json.loads(response) return { - '_type': 'playlist', - 'title': playlist_title, - 'id': cat_id, - 'entries': [self._extract_video(v) for v in videos], + 'id': video_id, + 'title': title, + 'description': video_data.get('description'), + 'timestamp': parse_iso8601(video_data.get('date')), + 'duration': parse_duration(video_data.get('duration')), + 'thumbnails': thumbnails, + 'formats': formats, } -class NHLIE(InfoExtractor): +class NHLIE(NHLBaseIE): IE_NAME = 'nhl.com' _VALID_URL = r'https?://(?:www\.)?(?Pnhl|wch2016)\.com/(?:[^/]+/)*c-(?P\d+)' - _SITES_MAP = { - 'nhl': 'nhl', - 'wch2016': 'wch', - } + _CONTENT_DOMAIN = 'nhl.bamcontent.com' _TESTS = [{ # type=video 'url': 'https://www.nhl.com/video/anisimov-cleans-up-mess/t-277752844/c-43663503', @@ -293,59 +126,3 @@ class NHLIE(InfoExtractor): 'url': 'https://www.wch2016.com/news/3-stars-team-europe-vs-team-canada/c-282195068', 'only_matching': True, }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - tmp_id, site = mobj.group('id'), mobj.group('site') - video_data = self._download_json( - 'https://nhl.bamcontent.com/%s/id/v1/%s/details/web-v1.json' - % (self._SITES_MAP[site], tmp_id), tmp_id) - if video_data.get('type') == 'article': - video_data = video_data['media'] - - video_id = compat_str(video_data['id']) - title = video_data['title'] - - formats = [] - for playback in video_data.get('playbacks', []): - playback_url = playback.get('url') - if not playback_url: - continue - ext = determine_ext(playback_url) - if ext == 'm3u8': - m3u8_formats = 
self._extract_m3u8_formats( - playback_url, video_id, 'mp4', 'm3u8_native', - m3u8_id=playback.get('name', 'hls'), fatal=False) - self._check_formats(m3u8_formats, video_id) - formats.extend(m3u8_formats) - else: - height = int_or_none(playback.get('height')) - formats.append({ - 'format_id': playback.get('name', 'http' + ('-%dp' % height if height else '')), - 'url': playback_url, - 'width': int_or_none(playback.get('width')), - 'height': height, - }) - self._sort_formats(formats, ('preference', 'width', 'height', 'tbr', 'format_id')) - - thumbnails = [] - for thumbnail_id, thumbnail_data in video_data.get('image', {}).get('cuts', {}).items(): - thumbnail_url = thumbnail_data.get('src') - if not thumbnail_url: - continue - thumbnails.append({ - 'id': thumbnail_id, - 'url': thumbnail_url, - 'width': int_or_none(thumbnail_data.get('width')), - 'height': int_or_none(thumbnail_data.get('height')), - }) - - return { - 'id': video_id, - 'title': title, - 'description': video_data.get('description'), - 'timestamp': parse_iso8601(video_data.get('date')), - 'duration': parse_duration(video_data.get('duration')), - 'thumbnails': thumbnails, - 'formats': formats, - } diff --git a/youtube_dl/extractor/niconico.py b/youtube_dl/extractor/niconico.py index df7f528be..dbe871f16 100644 --- a/youtube_dl/extractor/niconico.py +++ b/youtube_dl/extractor/niconico.py @@ -163,7 +163,7 @@ class NiconicoIE(InfoExtractor): self._login() def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() # No authentication to be performed if not username: return True diff --git a/youtube_dl/extractor/ninecninemedia.py b/youtube_dl/extractor/ninecninemedia.py index 8961309fd..65754c5e7 100644 --- a/youtube_dl/extractor/ninecninemedia.py +++ b/youtube_dl/extractor/ninecninemedia.py @@ -4,7 +4,6 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( parse_iso8601, float_or_none, @@ -13,38 +12,11 @@ from ..utils import ( ) -class NineCNineMediaBaseIE(InfoExtractor): - _API_BASE_TEMPLATE = 'http://capi.9c9media.com/destinations/%s/platforms/desktop/contents/%s/' - - -class NineCNineMediaStackIE(NineCNineMediaBaseIE): - IE_NAME = '9c9media:stack' - _GEO_COUNTRIES = ['CA'] - _VALID_URL = r'9c9media:stack:(?P[^:]+):(?P\d+):(?P\d+):(?P\d+)' - - def _real_extract(self, url): - destination_code, content_id, package_id, stack_id = re.match(self._VALID_URL, url).groups() - stack_base_url_template = self._API_BASE_TEMPLATE + 'contentpackages/%s/stacks/%s/manifest.' 
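The 9c9media change in progress here folds the separate stack extractor into a single pass that probes every manifest flavour off one base URL, as the added lines below show. A self-contained sketch of that probing pattern, assuming a hypothetical manifest base URL with the same trailing-dot shape as in the patch:

from youtube_dl.extractor.common import InfoExtractor


class ManifestProbeSketchIE(InfoExtractor):
    def _probe_formats(self, manifest_base_url, content_id):
        # manifest_base_url is e.g. '.../contentpackages/42/manifest.'
        # (hypothetical); appending an extension selects the protocol.
        formats = []
        formats.extend(self._extract_m3u8_formats(
            manifest_base_url + 'm3u8', content_id, 'mp4',
            'm3u8_native', m3u8_id='hls', fatal=False))
        formats.extend(self._extract_f4m_formats(
            manifest_base_url + 'f4m', content_id,
            f4m_id='hds', fatal=False))
        formats.extend(self._extract_mpd_formats(
            manifest_base_url + 'mpd', content_id,
            mpd_id='dash', fatal=False))
        self._sort_formats(formats)
        return formats

Each helper is non-fatal, so a missing flavour simply contributes no formats instead of aborting the extraction.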
- stack_base_url = stack_base_url_template % (destination_code, content_id, package_id, stack_id) - - formats = [] - formats.extend(self._extract_m3u8_formats( - stack_base_url + 'm3u8', stack_id, 'mp4', - 'm3u8_native', m3u8_id='hls', fatal=False)) - formats.extend(self._extract_f4m_formats( - stack_base_url + 'f4m', stack_id, - f4m_id='hds', fatal=False)) - self._sort_formats(formats) - - return { - 'id': stack_id, - 'formats': formats, - } - - -class NineCNineMediaIE(NineCNineMediaBaseIE): +class NineCNineMediaIE(InfoExtractor): IE_NAME = '9c9media' + _GEO_COUNTRIES = ['CA'] _VALID_URL = r'9c9media:(?P[^:]+):(?P\d+)' + _API_BASE_TEMPLATE = 'http://capi.9c9media.com/destinations/%s/platforms/desktop/contents/%s/' def _real_extract(self, url): destination_code, content_id = re.match(self._VALID_URL, url).groups() @@ -58,13 +30,26 @@ class NineCNineMediaIE(NineCNineMediaBaseIE): content_package = content['ContentPackages'][0] package_id = content_package['Id'] content_package_url = api_base_url + 'contentpackages/%s/' % package_id - content_package = self._download_json(content_package_url, content_id) + content_package = self._download_json( + content_package_url, content_id, query={ + '$include': '[HasClosedCaptions]', + }) - if content_package.get('Constraints', {}).get('Security', {}).get('Type') == 'adobe-drm': + if content_package.get('Constraints', {}).get('Security', {}).get('Type'): raise ExtractorError('This video is DRM protected.', expected=True) - stacks = self._download_json(content_package_url + 'stacks/', package_id)['Items'] - multistacks = len(stacks) > 1 + manifest_base_url = content_package_url + 'manifest.' + formats = [] + formats.extend(self._extract_m3u8_formats( + manifest_base_url + 'm3u8', content_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False)) + formats.extend(self._extract_f4m_formats( + manifest_base_url + 'f4m', content_id, + f4m_id='hds', fatal=False)) + formats.extend(self._extract_mpd_formats( + manifest_base_url + 'mpd', content_id, + mpd_id='dash', fatal=False)) + self._sort_formats(formats) thumbnails = [] for image in content.get('Images', []): @@ -85,10 +70,12 @@ class NineCNineMediaIE(NineCNineMediaBaseIE): continue container.append(e_name) - description = content.get('Desc') or content.get('ShortDesc') season = content.get('Season', {}) - base_info = { - 'description': description, + + info = { + 'id': content_id, + 'title': title, + 'description': content.get('Desc') or content.get('ShortDesc'), 'timestamp': parse_iso8601(content.get('BroadcastDateTime')), 'episode_number': int_or_none(content.get('Episode')), 'season': season.get('Name'), @@ -97,26 +84,19 @@ class NineCNineMediaIE(NineCNineMediaBaseIE): 'series': content.get('Media', {}).get('Name'), 'tags': tags, 'categories': categories, + 'duration': float_or_none(content_package.get('Duration')), + 'formats': formats, } - entries = [] - for stack in stacks: - stack_id = compat_str(stack['Id']) - entry = { - '_type': 'url_transparent', - 'url': '9c9media:stack:%s:%s:%s:%s' % (destination_code, content_id, package_id, stack_id), - 'id': stack_id, - 'title': '%s_part%s' % (title, stack['Name']) if multistacks else title, - 'duration': float_or_none(stack.get('Duration')), - 'ie_key': 'NineCNineMediaStack', + if content_package.get('HasClosedCaptions'): + info['subtitles'] = { + 'en': [{ + 'url': manifest_base_url + 'vtt', + 'ext': 'vtt', + }, { + 'url': manifest_base_url + 'srt', + 'ext': 'srt', + }] } - entry.update(base_info) - entries.append(entry) - return { - '_type': 'multi_video', 
- 'id': content_id, - 'title': title, - 'description': description, - 'entries': entries, - } + return info diff --git a/youtube_dl/extractor/noco.py b/youtube_dl/extractor/noco.py index a9f9b10c4..58b371ed7 100644 --- a/youtube_dl/extractor/noco.py +++ b/youtube_dl/extractor/noco.py @@ -65,7 +65,7 @@ class NocoIE(InfoExtractor): self._login() def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: return diff --git a/youtube_dl/extractor/nova.py b/youtube_dl/extractor/nova.py index 06cb8cb3f..edf8b82c6 100644 --- a/youtube_dl/extractor/nova.py +++ b/youtube_dl/extractor/nova.py @@ -6,7 +6,9 @@ import re from .common import InfoExtractor from ..utils import ( clean_html, + js_to_json, unified_strdate, + url_or_none, ) @@ -111,8 +113,21 @@ class NovaIE(InfoExtractor): webpage, 'video id') config_url = self._search_regex( - r'src="(http://tn\.nova\.cz/bin/player/videojs/config\.php\?[^"]+)"', + r'src="(https?://(?:tn|api)\.nova\.cz/bin/player/videojs/config\.php\?[^"]+)"', webpage, 'config url', default=None) + config_params = {} + + if not config_url: + player = self._parse_json( + self._search_regex( + r'(?s)Player\s*\(.+?\s*,\s*({.+?\bmedia\b["\']?\s*:\s*["\']?\d+.+?})\s*\)', webpage, + 'player', default='{}'), + video_id, transform_source=js_to_json, fatal=False) + if player: + config_url = url_or_none(player.get('configUrl')) + params = player.get('configParams') + if isinstance(params, dict): + config_params = params if not config_url: DEFAULT_SITE_ID = '23000' @@ -127,14 +142,20 @@ class NovaIE(InfoExtractor): } site_id = self._search_regex( - r'site=(\d+)', webpage, 'site id', default=None) or SITES.get(site, DEFAULT_SITE_ID) + r'site=(\d+)', webpage, 'site id', default=None) or SITES.get( + site, DEFAULT_SITE_ID) - config_url = ('http://tn.nova.cz/bin/player/videojs/config.php?site=%s&media=%s&jsVar=vjsconfig' - % (site_id, video_id)) + config_url = 'https://api.nova.cz/bin/player/videojs/config.php' + config_params = { + 'site': site_id, + 'media': video_id, + 'quality': 3, + 'version': 1, + } config = self._download_json( config_url, display_id, - 'Downloading config JSON', + 'Downloading config JSON', query=config_params, transform_source=lambda s: s[s.index('{'):s.rindex('}') + 1]) mediafile = config['mediafile'] diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index ff2153387..c2cb85a73 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -36,8 +36,8 @@ class NPOIE(NPOBaseIE): https?:// (?:www\.)? 
(?: - npo\.nl/(?!(?:live|radio)/)(?:[^/]+/){2}| - ntr\.nl/(?:[^/]+/){2,}| + npo\.nl/(?:[^/]+/)*| + (?:ntr|npostart)\.nl/(?:[^/]+/){2,}| omroepwnl\.nl/video/fragment/[^/]+__| (?:zapp|npo3)\.nl/(?:[^/]+/){2,} ) @@ -160,8 +160,20 @@ class NPOIE(NPOBaseIE): }, { 'url': 'https://www.zapp.nl/1803-skelterlab/instructie-video-s/740-instructievideo-s/POMS_AT_11736927', 'only_matching': True, + }, { + 'url': 'https://www.npostart.nl/broodje-gezond-ei/28-05-2018/KN_1698996', + 'only_matching': True, + }, { + 'url': 'https://npo.nl/KN_1698996', + 'only_matching': True, }] + @classmethod + def suitable(cls, url): + return (False if any(ie.suitable(url) + for ie in (NPOLiveIE, NPORadioIE, NPORadioFragmentIE)) + else super(NPOIE, cls).suitable(url)) + def _real_extract(self, url): video_id = self._match_id(url) return self._get_info(video_id) @@ -270,7 +282,7 @@ class NPOIE(NPOBaseIE): video_url = stream_info.get('url') if not video_url or video_url in urls: continue - urls.add(item_url) + urls.add(video_url) if determine_ext(video_url) == 'm3u8': formats.extend(self._extract_m3u8_formats( video_url, video_id, ext='mp4', @@ -389,7 +401,7 @@ class NPOLiveIE(NPOBaseIE): class NPORadioIE(InfoExtractor): IE_NAME = 'npo.nl:radio' - _VALID_URL = r'https?://(?:www\.)?npo\.nl/radio/(?P[^/]+)/?$' + _VALID_URL = r'https?://(?:www\.)?npo\.nl/radio/(?P[^/]+)' _TEST = { 'url': 'http://www.npo.nl/radio/radio-1', @@ -404,6 +416,10 @@ class NPORadioIE(InfoExtractor): } } + @classmethod + def suitable(cls, url): + return False if NPORadioFragmentIE.suitable(url) else super(NPORadioIE, cls).suitable(url) + @staticmethod def _html_get_attribute_regex(attribute): return r'{0}\s*=\s*\'([^\']+)\''.format(attribute) diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index 3b4f51f61..a231735fb 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -4,24 +4,40 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_urllib_parse_unquote +from ..compat import ( + compat_str, + compat_urllib_parse_unquote, +) from ..utils import ( ExtractorError, int_or_none, + JSON_LD_RE, + NO_DEFAULT, parse_age_limit, parse_duration, + try_get, ) class NRKBaseIE(InfoExtractor): _GEO_COUNTRIES = ['NO'] + _api_host = None + def _real_extract(self, url): video_id = self._match_id(url) - data = self._download_json( - 'http://%s/mediaelement/%s' % (self._API_HOST, video_id), - video_id, 'Downloading mediaelement JSON') + api_hosts = (self._api_host, ) if self._api_host else self._API_HOSTS + + for api_host in api_hosts: + data = self._download_json( + 'http://%s/mediaelement/%s' % (api_host, video_id), + video_id, 'Downloading mediaelement JSON', + fatal=api_host == api_hosts[-1]) + if not data: + continue + self._api_host = api_host + break title = data.get('fullTitle') or data.get('mainTitle') or data['title'] video_id = data.get('id') or video_id @@ -191,7 +207,7 @@ class NRKIE(NRKBaseIE): ) (?P[^?#&]+) ''' - _API_HOST = 'v8-psapi.nrk.no' + _API_HOSTS = ('psapi.nrk.no', 'v8-psapi.nrk.no') _TESTS = [{ # video 'url': 'http://www.nrk.no/video/PS*150533', @@ -237,8 +253,7 @@ class NRKTVIE(NRKBaseIE): (?:/\d{2}-\d{2}-\d{4})? (?:\#del=(?P\d+))? 
''' % _EPISODE_RE - _API_HOST = 'psapi-we.nrk.no' - + _API_HOSTS = ('psapi-ne.nrk.no', 'psapi-we.nrk.no') _TESTS = [{ 'url': 'https://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014', 'md5': '4e9ca6629f09e588ed240fb11619922a', @@ -350,6 +365,182 @@ class NRKTVIE(NRKBaseIE): }] +class NRKTVEpisodeIE(InfoExtractor): + _VALID_URL = r'https?://tv\.nrk\.no/serie/(?P[^/]+/sesong/\d+/episode/\d+)' + _TEST = { + 'url': 'https://tv.nrk.no/serie/backstage/sesong/1/episode/8', + 'info_dict': { + 'id': 'MSUI14000816AA', + 'ext': 'mp4', + 'title': 'Backstage 8:30', + 'description': 'md5:de6ca5d5a2d56849e4021f2bf2850df4', + 'duration': 1320, + 'series': 'Backstage', + 'season_number': 1, + 'episode_number': 8, + 'episode': '8:30', + }, + 'params': { + 'skip_download': True, + }, + } + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + nrk_id = self._parse_json( + self._search_regex(JSON_LD_RE, webpage, 'JSON-LD', group='json_ld'), + display_id)['@id'] + + assert re.match(NRKTVIE._EPISODE_RE, nrk_id) + return self.url_result( + 'nrk:%s' % nrk_id, ie=NRKIE.ie_key(), video_id=nrk_id) + + +class NRKTVSerieBaseIE(InfoExtractor): + def _extract_series(self, webpage, display_id, fatal=True): + config = self._parse_json( + self._search_regex( + r'({.+?})\s*,\s*"[^"]+"\s*\)\s*', webpage, 'config', + default='{}' if not fatal else NO_DEFAULT), + display_id, fatal=False) + if not config: + return + return try_get(config, lambda x: x['series'], dict) + + def _extract_episodes(self, season): + entries = [] + if not isinstance(season, dict): + return entries + episodes = season.get('episodes') + if not isinstance(episodes, list): + return entries + for episode in episodes: + nrk_id = episode.get('prfId') + if not nrk_id or not isinstance(nrk_id, compat_str): + continue + entries.append(self.url_result( + 'nrk:%s' % nrk_id, ie=NRKIE.ie_key(), video_id=nrk_id)) + return entries + + +class NRKTVSeasonIE(NRKTVSerieBaseIE): + _VALID_URL = r'https?://tv\.nrk\.no/serie/[^/]+/sesong/(?P\d+)' + _TEST = { + 'url': 'https://tv.nrk.no/serie/backstage/sesong/1', + 'info_dict': { + 'id': '1', + 'title': 'Sesong 1', + }, + 'playlist_mincount': 30, + } + + @classmethod + def suitable(cls, url): + return (False if NRKTVIE.suitable(url) or NRKTVEpisodeIE.suitable(url) + else super(NRKTVSeasonIE, cls).suitable(url)) + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + series = self._extract_series(webpage, display_id) + + season = next( + s for s in series['seasons'] + if int(display_id) == s.get('seasonNumber')) + + title = try_get(season, lambda x: x['titles']['title'], compat_str) + return self.playlist_result( + self._extract_episodes(season), display_id, title) + + +class NRKTVSeriesIE(NRKTVSerieBaseIE): + _VALID_URL = r'https?://(?:tv|radio)\.nrk(?:super)?\.no/serie/(?P[^/]+)' + _ITEM_RE = r'(?:data-season=["\']|id=["\']season-)(?P\d+)' + _TESTS = [{ + # new layout + 'url': 'https://tv.nrk.no/serie/backstage', + 'info_dict': { + 'id': 'backstage', + 'title': 'Backstage', + 'description': 'md5:c3ec3a35736fca0f9e1207b5511143d3', + }, + 'playlist_mincount': 60, + }, { + # old layout + 'url': 'https://tv.nrk.no/serie/groenn-glede', + 'info_dict': { + 'id': 'groenn-glede', + 'title': 'Grønn glede', + 'description': 'md5:7576e92ae7f65da6993cf90ee29e4608', + }, + 'playlist_mincount': 9, + }, { + 'url': 'http://tv.nrksuper.no/serie/labyrint', + 'info_dict': { + 'id': 'labyrint', + 
'title': 'Labyrint', + 'description': 'md5:58afd450974c89e27d5a19212eee7115', + }, + 'playlist_mincount': 3, + }, { + 'url': 'https://tv.nrk.no/serie/broedrene-dal-og-spektralsteinene', + 'only_matching': True, + }, { + 'url': 'https://tv.nrk.no/serie/saving-the-human-race', + 'only_matching': True, + }, { + 'url': 'https://tv.nrk.no/serie/postmann-pat', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return ( + False if any(ie.suitable(url) + for ie in (NRKTVIE, NRKTVEpisodeIE, NRKTVSeasonIE)) + else super(NRKTVSeriesIE, cls).suitable(url)) + + def _real_extract(self, url): + series_id = self._match_id(url) + + webpage = self._download_webpage(url, series_id) + + # New layout (e.g. https://tv.nrk.no/serie/backstage) + series = self._extract_series(webpage, series_id, fatal=False) + if series: + title = try_get(series, lambda x: x['titles']['title'], compat_str) + description = try_get( + series, lambda x: x['titles']['subtitle'], compat_str) + entries = [] + for season in series['seasons']: + entries.extend(self._extract_episodes(season)) + return self.playlist_result(entries, series_id, title, description) + + # Old layout (e.g. https://tv.nrk.no/serie/groenn-glede) + entries = [ + self.url_result( + 'https://tv.nrk.no/program/Episodes/{series}/{season}'.format( + series=series_id, season=season_id)) + for season_id in re.findall(self._ITEM_RE, webpage) + ] + + title = self._html_search_meta( + 'seriestitle', webpage, + 'title', default=None) or self._og_search_title( + webpage, fatal=False) + + description = self._html_search_meta( + 'series_description', webpage, + 'description', default=None) or self._og_search_description(webpage) + + return self.playlist_result(entries, series_id, title, description) + + class NRKTVDirekteIE(NRKTVIE): IE_DESC = 'NRK TV Direkte and NRK Radio Direkte' _VALID_URL = r'https?://(?:tv|radio)\.nrk\.no/direkte/(?P[^/?#&]+)' @@ -429,64 +620,6 @@ class NRKTVEpisodesIE(NRKPlaylistBaseIE): r'
<h1>([^<]+)</h1>
', webpage, 'title', fatal=False) -class NRKTVSeriesIE(InfoExtractor): - _VALID_URL = r'https?://(?:tv|radio)\.nrk(?:super)?\.no/serie/(?P[^/]+)' - _ITEM_RE = r'(?:data-season=["\']|id=["\']season-)(?P\d+)' - _TESTS = [{ - 'url': 'https://tv.nrk.no/serie/groenn-glede', - 'info_dict': { - 'id': 'groenn-glede', - 'title': 'Grønn glede', - 'description': 'md5:7576e92ae7f65da6993cf90ee29e4608', - }, - 'playlist_mincount': 9, - }, { - 'url': 'http://tv.nrksuper.no/serie/labyrint', - 'info_dict': { - 'id': 'labyrint', - 'title': 'Labyrint', - 'description': 'md5:58afd450974c89e27d5a19212eee7115', - }, - 'playlist_mincount': 3, - }, { - 'url': 'https://tv.nrk.no/serie/broedrene-dal-og-spektralsteinene', - 'only_matching': True, - }, { - 'url': 'https://tv.nrk.no/serie/saving-the-human-race', - 'only_matching': True, - }, { - 'url': 'https://tv.nrk.no/serie/postmann-pat', - 'only_matching': True, - }] - - @classmethod - def suitable(cls, url): - return False if NRKTVIE.suitable(url) else super(NRKTVSeriesIE, cls).suitable(url) - - def _real_extract(self, url): - series_id = self._match_id(url) - - webpage = self._download_webpage(url, series_id) - - entries = [ - self.url_result( - 'https://tv.nrk.no/program/Episodes/{series}/{season}'.format( - series=series_id, season=season_id)) - for season_id in re.findall(self._ITEM_RE, webpage) - ] - - title = self._html_search_meta( - 'seriestitle', webpage, - 'title', default=None) or self._og_search_title( - webpage, fatal=False) - - description = self._html_search_meta( - 'series_description', webpage, - 'description', default=None) or self._og_search_description(webpage) - - return self.playlist_result(entries, series_id, title, description) - - class NRKSkoleIE(InfoExtractor): IE_DESC = 'NRK Skole' _VALID_URL = r'https?://(?:www\.)?nrk\.no/skole/?\?.*\bmediaId=(?P\d+)' diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index d0bdd60b8..d264fe206 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -243,7 +243,7 @@ class PhantomJSwrapper(object): class OpenloadIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?:openload\.(?:co|io|link)|oload\.(?:tv|stream|site|xyz))/(?:f|embed)/(?P[a-zA-Z0-9-_]+)' + _VALID_URL = r'https?://(?:www\.)?(?:openload\.(?:co|io|link)|oload\.(?:tv|stream|site|xyz|win|download))/(?:f|embed)/(?P[a-zA-Z0-9-_]+)' _TESTS = [{ 'url': 'https://openload.co/f/kUEfGclsU9o', @@ -301,6 +301,16 @@ class OpenloadIE(InfoExtractor): }, { 'url': 'https://oload.xyz/f/WwRBpzW8Wtk', 'only_matching': True, + }, { + 'url': 'https://oload.win/f/kUEfGclsU9o', + 'only_matching': True, + }, { + 'url': 'https://oload.download/f/kUEfGclsU9o', + 'only_matching': True, + }, { + # Its title has not got its extension but url has it + 'url': 'https://oload.download/f/N4Otkw39VCw/Tomb.Raider.2018.HDRip.XviD.AC3-EVO.avi.mp4', + 'only_matching': True, }] _USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36' @@ -362,8 +372,7 @@ class OpenloadIE(InfoExtractor): 'title': title, 'thumbnail': entry.get('thumbnail') or self._og_search_thumbnail(webpage, default=None), 'url': video_url, - # Seems all videos have extensions in their titles - 'ext': determine_ext(title, 'mp4'), + 'ext': determine_ext(title, None) or determine_ext(url, 'mp4'), 'subtitles': subtitles, 'http_headers': headers, } diff --git a/youtube_dl/extractor/packtpub.py b/youtube_dl/extractor/packtpub.py index 8ed3c6347..56a2a1083 100644 --- 
a/youtube_dl/extractor/packtpub.py +++ b/youtube_dl/extractor/packtpub.py @@ -42,7 +42,7 @@ class PacktPubIE(PacktPubBaseIE): _TOKEN = None def _real_initialize(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: return try: diff --git a/youtube_dl/extractor/patreon.py b/youtube_dl/extractor/patreon.py index d4b1d34ca..9eb027679 100644 --- a/youtube_dl/extractor/patreon.py +++ b/youtube_dl/extractor/patreon.py @@ -53,7 +53,7 @@ class PatreonIE(InfoExtractor): # needed. Keeping this commented for when this inevitably changes. ''' def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: return diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index f11d5da52..80340f595 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( ExtractorError, determine_ext, @@ -14,6 +15,7 @@ from ..utils import ( strip_jsonp, strip_or_none, unified_strdate, + url_or_none, US_RATINGS, ) @@ -360,6 +362,50 @@ class PBSIE(InfoExtractor): 'skip_download': True, }, }, + { + 'url': 'http://www.pbs.org/wgbh/roadshow/watch/episode/2105-indianapolis-hour-2/', + 'info_dict': { + 'id': '2365936247', + 'ext': 'mp4', + 'title': 'Antiques Roadshow - Indianapolis, Hour 2', + 'description': 'md5:524b32249db55663e7231b6b8d1671a2', + 'duration': 3180, + 'thumbnail': r're:^https?://.*\.jpg$', + }, + 'params': { + 'skip_download': True, + }, + 'expected_warnings': ['HTTP Error 403: Forbidden'], + }, + { + 'url': 'https://www.pbs.org/wgbh/masterpiece/episodes/victoria-s2-e1/', + 'info_dict': { + 'id': '3007193718', + 'ext': 'mp4', + 'title': "Victoria - A Soldier's Daughter / The Green-Eyed Monster", + 'description': 'md5:37efbac85e0c09b009586523ec143652', + 'duration': 6292, + 'thumbnail': r're:^https?://.*\.(?:jpg|JPG)$', + }, + 'params': { + 'skip_download': True, + }, + 'expected_warnings': ['HTTP Error 403: Forbidden'], + }, + { + 'url': 'https://player.pbs.org/partnerplayer/tOz9tM5ljOXQqIIWke53UA==/', + 'info_dict': { + 'id': '3011407934', + 'ext': 'mp4', + 'title': 'Stories from the Stage - Road Trip', + 'duration': 1619, + 'thumbnail': r're:^https?://.*\.(?:jpg|JPG)$', + }, + 'params': { + 'skip_download': True, + }, + 'expected_warnings': ['HTTP Error 403: Forbidden'], + }, { 'url': 'http://player.pbs.org/widget/partnerplayer/2365297708/?start=0&end=0&chapterbar=false&endscreen=false&topbar=true', 'only_matching': True, @@ -422,6 +468,8 @@ class PBSIE(InfoExtractor): r']+data-coveid="(\d+)"', # coveplayer from http://www.pbs.org/wgbh/frontline/film/real-csi/ r'', # jwplayer r"(?s)window\.PBS\.playerConfig\s*=\s*{.*?id\s*:\s*'([0-9]+)',", + r']+\bdata-cove-id=["\'](\d+)"', # http://www.pbs.org/wgbh/roadshow/watch/episode/2105-indianapolis-hour-2/ + r']+\bsrc=["\'](?:https?:)?//video\.pbs\.org/widget/partnerplayer/(\d+)', # https://www.pbs.org/wgbh/masterpiece/episodes/victoria-s2-e1/ ] media_id = self._search_regex( @@ -456,7 +504,8 @@ class PBSIE(InfoExtractor): if not url: url = self._og_search_url(webpage) - mobj = re.match(self._VALID_URL, url) + mobj = re.match( + self._VALID_URL, self._proto_relative_url(url.strip())) player_id = mobj.group('player_id') if not display_id: @@ -466,13 +515,27 @@ class PBSIE(InfoExtractor): url, display_id, note='Downloading player page', errnote='Could 
not download player page') video_id = self._search_regex( - r'[^:]+):| + https?://(?P%s)/(?:videos/(?:watch|embed)|api/v\d/videos)/ + ) + (?P%s) + ''' % (_INSTANCES_RE, _UUID_RE) + _TESTS = [{ + 'url': 'https://peertube.moe/videos/watch/2790feb0-8120-4e63-9af3-c943c69f5e6c', + 'md5': '80f24ff364cc9d333529506a263e7feb', + 'info_dict': { + 'id': '2790feb0-8120-4e63-9af3-c943c69f5e6c', + 'ext': 'mp4', + 'title': 'wow', + 'description': 'wow such video, so gif', + 'thumbnail': r're:https?://.*\.(?:jpg|png)', + 'timestamp': 1519297480, + 'upload_date': '20180222', + 'uploader': 'Luclu7', + 'uploader_id': '7fc42640-efdb-4505-a45d-a15b1a5496f1', + 'uploder_url': 'https://peertube.nsa.ovh/accounts/luclu7', + 'license': 'Unknown', + 'duration': 3, + 'view_count': int, + 'like_count': int, + 'dislike_count': int, + 'tags': list, + 'categories': list, + } + }, { + 'url': 'https://peertube.tamanoir.foucry.net/videos/watch/0b04f13d-1e18-4f1d-814e-4979aa7c9c44', + 'only_matching': True, + }, { + # nsfw + 'url': 'https://tube.22decembre.eu/videos/watch/9bb88cd3-9959-46d9-9ab9-33d2bb704c39', + 'only_matching': True, + }, { + 'url': 'https://tube.22decembre.eu/videos/embed/fed67262-6edb-4d1c-833b-daa9085c71d7', + 'only_matching': True, + }, { + 'url': 'https://tube.openalgeria.org/api/v1/videos/c1875674-97d0-4c94-a058-3f7e64c962e8', + 'only_matching': True, + }, { + 'url': 'peertube:video.blender.org:b37a5b9f-e6b5-415c-b700-04a5cd6ec205', + 'only_matching': True, + }] + + @staticmethod + def _extract_peertube_url(webpage, source_url): + mobj = re.match( + r'https?://(?P[^/]+)/videos/watch/(?P%s)' + % PeerTubeIE._UUID_RE, source_url) + if mobj and any(p in webpage for p in ( + 'PeerTube<', + 'There will be other non JS-based clients to access PeerTube', + '>We are sorry but it seems that PeerTube is not compatible with your web browser.<')): + return 'peertube:%s:%s' % mobj.group('host', 'id') + + @staticmethod + def _extract_urls(webpage, source_url): + entries = re.findall( + r'''(?x)<iframe[^>]+\bsrc=["\'](?P<url>(?:https?:)?//%s/videos/embed/%s)''' + % (PeerTubeIE._INSTANCES_RE, PeerTubeIE._UUID_RE), webpage) + if not entries: + peertube_url = PeerTubeIE._extract_peertube_url(webpage, source_url) + if peertube_url: + entries = [peertube_url] + return entries + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + host = mobj.group('host') or mobj.group('host_2') + video_id = mobj.group('id') + + video = self._download_json( + 'https://%s/api/v1/videos/%s' % (host, video_id), video_id) + + title = video['name'] + + formats = [] + for file_ in video['files']: + if not isinstance(file_, dict): + continue + file_url = url_or_none(file_.get('fileUrl')) + if not file_url: + continue + file_size = int_or_none(file_.get('size')) + format_id = try_get( + file_, lambda x: x['resolution']['label'], compat_str) + f = parse_resolution(format_id) + f.update({ + 'url': file_url, + 'format_id': format_id, + 'filesize': file_size, + }) + formats.append(f) + self._sort_formats(formats) + + def account_data(field): + return try_get(video, lambda x: x['account'][field], compat_str) + + category = try_get(video, lambda x: x['category']['label'], compat_str) + categories = [category] if category else None + + nsfw = video.get('nsfw') + if nsfw is bool: + age_limit = 18 if nsfw else 0 + else: + age_limit = None + + return { + 'id': video_id, + 'title': title, + 'description': video.get('description'), + 'thumbnail': urljoin(url, video.get('thumbnailPath')), + 'timestamp': 
unified_timestamp(video.get('publishedAt')), + 'uploader': account_data('displayName'), + 'uploader_id': account_data('uuid'), + 'uploder_url': account_data('url'), + 'license': try_get( + video, lambda x: x['licence']['label'], compat_str), + 'duration': int_or_none(video.get('duration')), + 'view_count': int_or_none(video.get('views')), + 'like_count': int_or_none(video.get('likes')), + 'dislike_count': int_or_none(video.get('dislikes')), + 'age_limit': age_limit, + 'tags': try_get(video, lambda x: x['tags'], list), + 'categories': categories, + 'formats': formats, + } diff --git a/youtube_dl/extractor/pluralsight.py b/youtube_dl/extractor/pluralsight.py index aacc5d4bb..1257841e4 100644 --- a/youtube_dl/extractor/pluralsight.py +++ b/youtube_dl/extractor/pluralsight.py @@ -27,6 +27,60 @@ from ..utils import ( class PluralsightBaseIE(InfoExtractor): _API_BASE = 'https://app.pluralsight.com' + _GRAPHQL_EP = '%s/player/api/graphql' % _API_BASE + _GRAPHQL_HEADERS = { + 'Content-Type': 'application/json;charset=UTF-8', + } + _GRAPHQL_COURSE_TMPL = ''' +query BootstrapPlayer { + rpc { + bootstrapPlayer { + profile { + firstName + lastName + email + username + userHandle + authed + isAuthed + plan + } + course(courseId: "%s") { + name + title + courseHasCaptions + translationLanguages { + code + name + } + supportsWideScreenVideoFormats + timestamp + modules { + name + title + duration + formattedDuration + author + authorized + clips { + authorized + clipId + duration + formattedDuration + id + index + moduleIndex + moduleTitle + name + title + watched + } + } + } + } + } +}''' + def _download_course(self, course_id, url, display_id): try: return self._download_course_rpc(course_id, url, display_id) @@ -39,20 +93,14 @@ class PluralsightBaseIE(InfoExtractor): def _download_course_rpc(self, course_id, url, display_id): response = self._download_json( - '%s/player/functions/rpc' % self._API_BASE, display_id, - 'Downloading course JSON', - data=json.dumps({ - 'fn': 'bootstrapPlayer', - 'payload': { - 'courseId': course_id, - }, - }).encode('utf-8'), - headers={ - 'Content-Type': 'application/json;charset=utf-8', - 'Referer': url, - }) + self._GRAPHQL_EP, display_id, data=json.dumps({ + 'query': self._GRAPHQL_COURSE_TMPL % course_id, + 'variables': {} + }).encode('utf-8'), headers=self._GRAPHQL_HEADERS) - course = try_get(response, lambda x: x['payload']['course'], dict) + course = try_get( + response, lambda x: x['data']['rpc']['bootstrapPlayer']['course'], + dict) if course: return course @@ -90,11 +138,33 @@ class PluralsightIE(PluralsightBaseIE): 'only_matching': True, }] + GRAPHQL_VIEWCLIP_TMPL = ''' +query viewClip { + viewClip(input: { + author: "%(author)s", + clipIndex: %(clipIndex)d, + courseName: "%(courseName)s", + includeCaptions: %(includeCaptions)s, + locale: "%(locale)s", + mediaType: "%(mediaType)s", + moduleName: "%(moduleName)s", + quality: "%(quality)s" + }) { + urls { + url + cdn + rank + source + }, + status + } +}''' + def _real_initialize(self): self._login() def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: return @@ -140,10 +210,10 @@ class PluralsightIE(PluralsightBaseIE): raise ExtractorError('Unable to log in') - def _get_subtitles(self, author, clip_id, lang, name, duration, video_id): + def _get_subtitles(self, author, clip_idx, lang, name, duration, video_id): captions_post = { 'a': author, - 'cn': clip_id, + 'cn': clip_idx, 'lc': lang, 'm': name, } @@ -195,13 +265,13 @@ class 
PluralsightIE(PluralsightBaseIE): author = qs.get('author', [None])[0] name = qs.get('name', [None])[0] - clip_id = qs.get('clip', [None])[0] + clip_idx = qs.get('clip', [None])[0] course_name = qs.get('course', [None])[0] - if any(not f for f in (author, name, clip_id, course_name,)): + if any(not f for f in (author, name, clip_idx, course_name,)): raise ExtractorError('Invalid URL', expected=True) - display_id = '%s-%s' % (name, clip_id) + display_id = '%s-%s' % (name, clip_idx) course = self._download_course(course_name, url, display_id) @@ -217,7 +287,7 @@ class PluralsightIE(PluralsightBaseIE): clip_index = clip_.get('index') if clip_index is None: continue - if compat_str(clip_index) == clip_id: + if compat_str(clip_index) == clip_idx: clip = clip_ break @@ -225,6 +295,7 @@ class PluralsightIE(PluralsightBaseIE): raise ExtractorError('Unable to resolve clip') title = clip['title'] + clip_id = clip.get('clipName') or clip.get('name') or clip['clipId'] QUALITIES = { 'low': {'width': 640, 'height': 480}, @@ -276,8 +347,8 @@ class PluralsightIE(PluralsightBaseIE): f = QUALITIES[quality].copy() clip_post = { 'author': author, - 'includeCaptions': False, - 'clipIndex': int(clip_id), + 'includeCaptions': 'false', + 'clipIndex': int(clip_idx), 'courseName': course_name, 'locale': 'en', 'moduleName': name, @@ -285,11 +356,23 @@ class PluralsightIE(PluralsightBaseIE): 'quality': '%dx%d' % (f['width'], f['height']), } format_id = '%s-%s' % (ext, quality) - viewclip = self._download_json( - '%s/video/clips/viewclip' % self._API_BASE, display_id, - 'Downloading %s viewclip JSON' % format_id, fatal=False, - data=json.dumps(clip_post).encode('utf-8'), - headers={'Content-Type': 'application/json;charset=utf-8'}) + + try: + viewclip = self._download_json( + self._GRAPHQL_EP, display_id, + 'Downloading %s viewclip graphql' % format_id, + data=json.dumps({ + 'query': self.GRAPHQL_VIEWCLIP_TMPL % clip_post, + 'variables': {} + }).encode('utf-8'), + headers=self._GRAPHQL_HEADERS)['data']['viewClip'] + except ExtractorError: + # Still works but most likely will go soon + viewclip = self._download_json( + '%s/video/clips/viewclip' % self._API_BASE, display_id, + 'Downloading %s viewclip JSON' % format_id, fatal=False, + data=json.dumps(clip_post).encode('utf-8'), + headers={'Content-Type': 'application/json;charset=utf-8'}) # Pluralsight tracks multiple sequential calls to ViewClip API and start # to return 429 HTTP errors after some time (see @@ -330,10 +413,10 @@ class PluralsightIE(PluralsightBaseIE): # TODO: other languages? 
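# A minimal standalone sketch of the request shape behind the new GraphQL code
# path in this hunk: a rendered query template is POSTed as JSON with empty
# variables to the /player/api/graphql endpoint taken from the diff above.
# Session cookies and error handling that the real extractor relies on are
# deliberately omitted; this is an illustration, not the extractor's API.
import json

try:
    from urllib.request import Request, urlopen  # Python 3
except ImportError:
    from urllib2 import Request, urlopen  # Python 2, still supported by youtube-dl

def post_graphql(endpoint, query):
    # Mirrors the body {'query': ..., 'variables': {}} and the
    # 'application/json;charset=UTF-8' content type used in the hunk above.
    req = Request(endpoint, data=json.dumps({
        'query': query,
        'variables': {},
    }).encode('utf-8'), headers={'Content-Type': 'application/json;charset=UTF-8'})
    return json.loads(urlopen(req).read().decode('utf-8'))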
subtitles = self.extract_subtitles( - author, clip_id, 'en', name, duration, display_id) + author, clip_idx, 'en', name, duration, display_id) return { - 'id': clip.get('clipName') or clip['name'], + 'id': clip_id, 'title': title, 'duration': duration, 'creator': author, diff --git a/youtube_dl/extractor/porncom.py b/youtube_dl/extractor/porncom.py index 60ade06da..5726cab3a 100644 --- a/youtube_dl/extractor/porncom.py +++ b/youtube_dl/extractor/porncom.py @@ -43,7 +43,8 @@ class PornComIE(InfoExtractor): config = self._parse_json( self._search_regex( - r'=\s*({.+?})\s*,\s*[\da-zA-Z_]+\s*=', + (r'=\s*({.+?})\s*;\s*v1ar\b', + r'=\s*({.+?})\s*,\s*[\da-zA-Z_]+\s*='), webpage, 'config', default='{}'), display_id, transform_source=js_to_json, fatal=False) @@ -69,7 +70,7 @@ class PornComIE(InfoExtractor): 'height': int(height), 'filesize_approx': parse_filesize(filesize), } for format_url, height, filesize in re.findall( - r'<a[^>]+href="(/download/[^"]+)">MPEG4 (\d+)p<span[^>]*>(\d+\s+[a-zA-Z]+)<', + r'<a[^>]+href="(/download/[^"]+)">[^<]*?(\d+)p<span[^>]*>(\d+\s*[a-zA-Z]+)<', webpage)] thumbnail = None duration = None diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 23e24d216..ffc4405a8 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -4,28 +4,22 @@ from __future__ import unicode_literals import functools import itertools import operator -# import os import re from .common import InfoExtractor from ..compat import ( compat_HTTPError, - # compat_urllib_parse_unquote, - # compat_urllib_parse_unquote_plus, - # compat_urllib_parse_urlparse, + compat_str, ) from ..utils import ( ExtractorError, int_or_none, js_to_json, orderedSet, - # sanitized_Request, remove_quotes, str_to_int, + url_or_none, ) -# from ..aes import ( -# aes_decrypt_text -# ) class PornHubIE(InfoExtractor): @@ -62,7 +56,7 @@ class PornHubIE(InfoExtractor): 'id': '1331683002', 'ext': 'mp4', 'title': '重庆婷婷女王足交', - 'uploader': 'cj397186295', + 'uploader': 'Unknown', 'duration': 1753, 'view_count': int, 'like_count': int, @@ -75,6 +69,31 @@ class PornHubIE(InfoExtractor): 'params': { 'skip_download': True, }, + }, { + # subtitles + 'url': 'https://www.pornhub.com/view_video.php?viewkey=ph5af5fef7c2aa7', + 'info_dict': { + 'id': 'ph5af5fef7c2aa7', + 'ext': 'mp4', + 'title': 'BFFS - Cute Teen Girls Share Cock On the Floor', + 'uploader': 'BFFs', + 'duration': 622, + 'view_count': int, + 'like_count': int, + 'dislike_count': int, + 'comment_count': int, + 'age_limit': 18, + 'tags': list, + 'categories': list, + 'subtitles': { + 'en': [{ + "ext": 'srt' + }] + }, + }, + 'params': { + 'skip_download': True, + }, }, { 'url': 'http://www.pornhub.com/view_video.php?viewkey=ph557bbb6676d2d', 'only_matching': True, @@ -121,7 +140,7 @@ class PornHubIE(InfoExtractor): self._set_cookie('pornhub.com', 'platform', platform) return self._download_webpage( 'http://www.pornhub.com/view_video.php?viewkey=%s' % video_id, - video_id) + video_id, 'Downloading %s webpage' % platform) webpage = dl_webpage('pc') @@ -134,57 +153,105 @@ class PornHubIE(InfoExtractor): 'PornHub said: %s' % error_msg, expected=True, video_id=video_id) - tv_webpage = dl_webpage('tv') - - assignments = self._search_regex( - r'(var.+?mediastring.+?)</script>', tv_webpage, - 'encoded url').split(';') - - js_vars = {} - - def parse_js_value(inp): - inp = re.sub(r'/\*(?:(?!\*/).)*?\*/', '', inp) - if '+' in inp: - inps = inp.split('+') - return functools.reduce( - operator.concat, map(parse_js_value, inps)) - inp = 
inp.strip() - if inp in js_vars: - return js_vars[inp] - return remove_quotes(inp) - - for assn in assignments: - assn = assn.strip() - if not assn: - continue - assn = re.sub(r'var\s+', '', assn) - vname, value = assn.split('=', 1) - js_vars[vname] = parse_js_value(value) - - video_url = js_vars['mediastring'] - - title = self._search_regex( - r'<h1>([^>]+)</h1>', tv_webpage, 'title', default=None) - # video_title from flashvars contains whitespace instead of non-ASCII (see # http://www.pornhub.com/view_video.php?viewkey=1331683002), not relying # on that anymore. - title = title or self._html_search_meta( + title = self._html_search_meta( 'twitter:title', webpage, default=None) or self._search_regex( (r'<h1[^>]+class=["\']title["\'][^>]*>(?P<title>[^<]+)', r'<div[^>]+data-video-title=(["\'])(?P<title>.+?)\1', r'shareTitle\s*=\s*(["\'])(?P<title>.+?)\1'), webpage, 'title', group='title') + video_urls = [] + video_urls_set = set() + subtitles = {} + flashvars = self._parse_json( self._search_regex( r'var\s+flashvars_\d+\s*=\s*({.+?});', webpage, 'flashvars', default='{}'), video_id) if flashvars: + subtitle_url = url_or_none(flashvars.get('closedCaptionsFile')) + if subtitle_url: + subtitles.setdefault('en', []).append({ + 'url': subtitle_url, + 'ext': 'srt', + }) thumbnail = flashvars.get('image_url') duration = int_or_none(flashvars.get('video_duration')) + media_definitions = flashvars.get('mediaDefinitions') + if isinstance(media_definitions, list): + for definition in media_definitions: + if not isinstance(definition, dict): + continue + video_url = definition.get('videoUrl') + if not video_url or not isinstance(video_url, compat_str): + continue + if video_url in video_urls_set: + continue + video_urls_set.add(video_url) + video_urls.append( + (video_url, int_or_none(definition.get('quality')))) else: - title, thumbnail, duration = [None] * 3 + thumbnail, duration = [None] * 2 + + if not video_urls: + tv_webpage = dl_webpage('tv') + + assignments = self._search_regex( + r'(var.+?mediastring.+?)</script>', tv_webpage, + 'encoded url').split(';') + + js_vars = {} + + def parse_js_value(inp): + inp = re.sub(r'/\*(?:(?!\*/).)*?\*/', '', inp) + if '+' in inp: + inps = inp.split('+') + return functools.reduce( + operator.concat, map(parse_js_value, inps)) + inp = inp.strip() + if inp in js_vars: + return js_vars[inp] + return remove_quotes(inp) + + for assn in assignments: + assn = assn.strip() + if not assn: + continue + assn = re.sub(r'var\s+', '', assn) + vname, value = assn.split('=', 1) + js_vars[vname] = parse_js_value(value) + + video_url = js_vars['mediastring'] + if video_url not in video_urls_set: + video_urls.append((video_url, None)) + video_urls_set.add(video_url) + + for mobj in re.finditer( + r'<a[^>]+\bclass=["\']downloadBtn\b[^>]+\bhref=(["\'])(?P<url>(?:(?!\1).)+)\1', + webpage): + video_url = mobj.group('url') + if video_url not in video_urls_set: + video_urls.append((video_url, None)) + video_urls_set.add(video_url) + + formats = [] + for video_url, height in video_urls: + tbr = None + mobj = re.search(r'(?P<height>\d+)[pP]?_(?P<tbr>\d+)[kK]', video_url) + if mobj: + if not height: + height = int(mobj.group('height')) + tbr = int(mobj.group('tbr')) + formats.append({ + 'url': video_url, + 'format_id': '%dp' % height if height else None, + 'height': height, + 'tbr': tbr, + }) + self._sort_formats(formats) video_uploader = self._html_search_regex( r'(?s)From: .+?<(?:a\b[^>]+\bhref=["\']/(?:user|channel)s/|span\b[^>]+\bclass=["\']username)[^>]+>(.+?)<', @@ -210,7 +277,6 @@ 
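# A standalone illustration of the URL sniffing used when building the new
# PornHub formats list above: if a media definition carries no explicit
# quality, height and total bitrate are guessed from filename segments such as
# '720P_1500K'. The regex is copied from the diff; the sample URL is invented.
import re

def sniff_quality(video_url):
    mobj = re.search(r'(?P<height>\d+)[pP]?_(?P<tbr>\d+)[kK]', video_url)
    if not mobj:
        return None, None
    return int(mobj.group('height')), int(mobj.group('tbr'))

# sniff_quality('https://cv.example.com/videos/720P_1500K_123.mp4') -> (720, 1500)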
class PornHubIE(InfoExtractor): return { 'id': video_id, - 'url': video_url, 'uploader': video_uploader, 'title': title, 'thumbnail': thumbnail, @@ -219,10 +285,11 @@ class PornHubIE(InfoExtractor): 'like_count': like_count, 'dislike_count': dislike_count, 'comment_count': comment_count, - # 'formats': formats, + 'formats': formats, 'age_limit': 18, 'tags': tags, 'categories': categories, + 'subtitles': subtitles, } diff --git a/youtube_dl/extractor/puhutv.py b/youtube_dl/extractor/puhutv.py new file mode 100644 index 000000000..5465e8ab7 --- /dev/null +++ b/youtube_dl/extractor/puhutv.py @@ -0,0 +1,247 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import ( + compat_HTTPError, + compat_str, +) +from ..utils import ( + ExtractorError, + int_or_none, + float_or_none, + parse_resolution, + str_or_none, + try_get, + unified_timestamp, + url_or_none, + urljoin, +) + + +class PuhuTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?puhutv\.com/(?P<id>[^/?#&]+)-izle' + IE_NAME = 'puhutv' + _TESTS = [{ + # film + 'url': 'https://puhutv.com/sut-kardesler-izle', + 'md5': 'fbd8f2d8e7681f8bcd51b592475a6ae7', + 'info_dict': { + 'id': '5085', + 'display_id': 'sut-kardesler', + 'ext': 'mp4', + 'title': 'Süt Kardeşler', + 'description': 'md5:405fd024df916ca16731114eb18e511a', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 4832.44, + 'creator': 'Arzu Film', + 'timestamp': 1469778212, + 'upload_date': '20160729', + 'release_year': 1976, + 'view_count': int, + 'tags': ['Aile', 'Komedi', 'Klasikler'], + }, + }, { + # episode, geo restricted, bypassable with --geo-verification-proxy + 'url': 'https://puhutv.com/jet-sosyete-1-bolum-izle', + 'only_matching': True, + }, { + # 4k, with subtitles + 'url': 'https://puhutv.com/dip-1-bolum-izle', + 'only_matching': True, + }] + _SUBTITLE_LANGS = { + 'English': 'en', + 'Deutsch': 'de', + 'عربى': 'ar' + } + + def _real_extract(self, url): + display_id = self._match_id(url) + + info = self._download_json( + urljoin(url, '/api/slug/%s-izle' % display_id), + display_id)['data'] + + video_id = compat_str(info['id']) + title = info.get('name') or info['title']['name'] + if info.get('display_name'): + title = '%s %s' % (title, info.get('display_name')) + + try: + videos = self._download_json( + 'https://puhutv.com/api/assets/%s/videos' % video_id, + display_id, 'Downloading video JSON', + headers=self.geo_verification_headers()) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + self.raise_geo_restricted() + raise + + formats = [] + for video in videos['data']['videos']: + media_url = url_or_none(video.get('url')) + if not media_url: + continue + playlist = video.get('is_playlist') + if video.get('stream_type') == 'hls' and playlist is True: + formats.extend(self._extract_m3u8_formats( + media_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + continue + quality = int_or_none(video.get('quality')) + f = { + 'url': media_url, + 'ext': 'mp4', + 'height': quality + } + video_format = video.get('video_format') + if video_format == 'hls' and playlist is False: + format_id = 'hls' + f['protocol'] = 'm3u8_native' + elif video_format == 'mp4': + format_id = 'http' + + else: + continue + if quality: + format_id += '-%sp' % quality + f['format_id'] = format_id + formats.append(f) + self._sort_formats(formats) + + description = try_get( + info, lambda x: x['title']['description'], + compat_str) or info.get('description') + 
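# A minimal sketch of the two-step PuhuTV lookup shown above: the slug API
# resolves a display id to a numeric asset id, then the assets API lists the
# playable renditions. Both endpoint paths come from the diff; the geo
# verification headers and HTTP 403 handling the real extractor uses are
# deliberately left out of this illustration.
import json

try:
    from urllib.request import urlopen  # Python 3
except ImportError:
    from urllib2 import urlopen  # Python 2

def puhutv_videos(display_id):
    info = json.loads(urlopen(
        'https://puhutv.com/api/slug/%s-izle' % display_id
    ).read().decode('utf-8'))['data']
    return json.loads(urlopen(
        'https://puhutv.com/api/assets/%s/videos' % info['id']
    ).read().decode('utf-8'))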
timestamp = unified_timestamp(info.get('created_at')) + creator = try_get( + info, lambda x: x['title']['producer']['name'], compat_str) + + duration = float_or_none( + try_get(info, lambda x: x['content']['duration_in_ms'], int), + scale=1000) + view_count = try_get(info, lambda x: x['content']['watch_count'], int) + + images = try_get( + info, lambda x: x['content']['images']['wide'], dict) or {} + thumbnails = [] + for image_id, image_url in images.items(): + if not isinstance(image_url, compat_str): + continue + if not image_url.startswith(('http', '//')): + image_url = 'https://%s' % image_url + t = parse_resolution(image_id) + t.update({ + 'id': image_id, + 'url': image_url + }) + thumbnails.append(t) + + release_year = try_get(info, lambda x: x['title']['released_at'], int) + + season_number = int_or_none(info.get('season_number')) + season_id = str_or_none(info.get('season_id')) + episode_number = int_or_none(info.get('episode_number')) + + tags = [] + for genre in try_get(info, lambda x: x['title']['genres'], list) or []: + if not isinstance(genre, dict): + continue + genre_name = genre.get('name') + if genre_name and isinstance(genre_name, compat_str): + tags.append(genre_name) + + subtitles = {} + for subtitle in try_get( + info, lambda x: x['content']['subtitles'], list) or []: + if not isinstance(subtitle, dict): + continue + lang = subtitle.get('language') + sub_url = url_or_none(subtitle.get('url')) + if not lang or not isinstance(lang, compat_str) or not sub_url: + continue + subtitles[self._SUBTITLE_LANGS.get(lang, lang)] = [{ + 'url': sub_url + }] + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': description, + 'season_id': season_id, + 'season_number': season_number, + 'episode_number': episode_number, + 'release_year': release_year, + 'timestamp': timestamp, + 'creator': creator, + 'view_count': view_count, + 'duration': duration, + 'tags': tags, + 'subtitles': subtitles, + 'thumbnails': thumbnails, + 'formats': formats + } + + +class PuhuTVSerieIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?puhutv\.com/(?P<id>[^/?#&]+)-detay' + IE_NAME = 'puhutv:serie' + _TESTS = [{ + 'url': 'https://puhutv.com/deniz-yildizi-detay', + 'info_dict': { + 'title': 'Deniz Yıldızı', + 'id': 'deniz-yildizi', + }, + 'playlist_mincount': 205, + }, { + # a film detail page which is using same url with serie page + 'url': 'https://puhutv.com/kaybedenler-kulubu-detay', + 'only_matching': True, + }] + + def _extract_entries(self, seasons): + for season in seasons: + season_id = season.get('id') + if not season_id: + continue + page = 1 + has_more = True + while has_more is True: + season = self._download_json( + 'https://galadriel.puhutv.com/seasons/%s' % season_id, + season_id, 'Downloading page %s' % page, query={ + 'page': page, + 'per': 40, + }) + episodes = season.get('episodes') + if isinstance(episodes, list): + for ep in episodes: + slug_path = str_or_none(ep.get('slugPath')) + if not slug_path: + continue + video_id = str_or_none(int_or_none(ep.get('id'))) + yield self.url_result( + 'https://puhutv.com/%s' % slug_path, + ie=PuhuTVIE.ie_key(), video_id=video_id, + video_title=ep.get('name') or ep.get('eventLabel')) + page += 1 + has_more = season.get('hasMore') + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + info = self._download_json( + urljoin(url, '/api/slug/%s-detay' % playlist_id), + playlist_id)['data'] + + seasons = info.get('seasons') + if seasons: + return self.playlist_result( + 
self._extract_entries(seasons), playlist_id, info.get('name')) + + # For films, these are using same url with series + video_id = info.get('slug') or info['assets'][0]['slug'] + return self.url_result( + 'https://puhutv.com/%s-izle' % video_id, + PuhuTVIE.ie_key(), video_id) diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py index d22311031..f916b2619 100644 --- a/youtube_dl/extractor/rai.py +++ b/youtube_dl/extractor/rai.py @@ -32,6 +32,9 @@ class RaiBaseIE(InfoExtractor): _GEO_BYPASS = False def _extract_relinker_info(self, relinker_url, video_id): + if not re.match(r'https?://', relinker_url): + return {'formats': [{'url': relinker_url}]} + formats = [] geoprotection = None is_live = None @@ -369,6 +372,10 @@ class RaiIE(RaiBaseIE): 'params': { 'skip_download': True, }, + }, { + # Direct MMS URL + 'url': 'http://www.rai.it/dl/RaiTV/programmi/media/ContentItem-b63a4089-ac28-48cf-bca5-9f5b5bc46df5.html', + 'only_matching': True, }] def _extract_from_content_id(self, content_id, url): diff --git a/youtube_dl/extractor/raywenderlich.py b/youtube_dl/extractor/raywenderlich.py index 640c3ee23..5411ece21 100644 --- a/youtube_dl/extractor/raywenderlich.py +++ b/youtube_dl/extractor/raywenderlich.py @@ -4,24 +4,37 @@ import re from .common import InfoExtractor from .vimeo import VimeoIE +from ..compat import compat_str from ..utils import ( - extract_attributes, ExtractorError, - smuggle_url, - unsmuggle_url, + int_or_none, + merge_dicts, + try_get, + unescapeHTML, + unified_timestamp, urljoin, ) class RayWenderlichIE(InfoExtractor): - _VALID_URL = r'https?://videos\.raywenderlich\.com/courses/(?P<course_id>[^/]+)/lessons/(?P<id>\d+)' + _VALID_URL = r'''(?x) + https?:// + (?: + videos\.raywenderlich\.com/courses| + (?:www\.)?raywenderlich\.com + )/ + (?P<course_id>[^/]+)/lessons/(?P<id>\d+) + ''' _TESTS = [{ - 'url': 'https://videos.raywenderlich.com/courses/105-testing-in-ios/lessons/1', + 'url': 'https://www.raywenderlich.com/3530-testing-in-ios/lessons/1', 'info_dict': { 'id': '248377018', 'ext': 'mp4', - 'title': 'Testing In iOS Episode 1: Introduction', + 'title': 'Introduction', + 'description': 'md5:804d031b3efa9fcb49777d512d74f722', + 'timestamp': 1513906277, + 'upload_date': '20171222', 'duration': 133, 'uploader': 'Ray Wenderlich', 'uploader_id': 'user3304672', @@ -34,69 +47,133 @@ class RayWenderlichIE(InfoExtractor): 'expected_warnings': ['HTTP Error 403: Forbidden'], }, { 'url': 'https://videos.raywenderlich.com/courses/105-testing-in-ios/lessons/1', + 'only_matching': True, + }] + + @staticmethod + def _extract_video_id(data, lesson_id): + if not data: + return + groups = try_get(data, lambda x: x['groups'], list) or [] + if not groups: + return + for group in groups: + if not isinstance(group, dict): + continue + contents = try_get(data, lambda x: x['contents'], list) or [] + for content in contents: + if not isinstance(content, dict): + continue + ordinal = int_or_none(content.get('ordinal')) + if ordinal != lesson_id: + continue + video_id = content.get('identifier') + if video_id: + return compat_str(video_id) + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + course_id, lesson_id = mobj.group('course_id', 'id') + display_id = '%s/%s' % (course_id, lesson_id) + + webpage = self._download_webpage(url, display_id) + + thumbnail = self._og_search_thumbnail( + webpage, default=None) or self._html_search_meta( + 'twitter:image', webpage, 'thumbnail') + + if '>Subscribe to unlock' in webpage: + raise ExtractorError( + 'This content is only 
available for subscribers', + expected=True) + + info = { + 'thumbnail': thumbnail, + } + + vimeo_id = self._search_regex( + r'data-vimeo-id=["\'](\d+)', webpage, 'vimeo id', default=None) + + if not vimeo_id: + data = self._parse_json( + self._search_regex( + r'data-collection=(["\'])(?P<data>{.+?})\1', webpage, + 'data collection', default='{}', group='data'), + display_id, transform_source=unescapeHTML, fatal=False) + video_id = self._extract_video_id( + data, lesson_id) or self._search_regex( + r'/videos/(\d+)/', thumbnail, 'video id') + headers = { + 'Referer': url, + 'X-Requested-With': 'XMLHttpRequest', + } + csrf_token = self._html_search_meta( + 'csrf-token', webpage, 'csrf token', default=None) + if csrf_token: + headers['X-CSRF-Token'] = csrf_token + video = self._download_json( + 'https://videos.raywenderlich.com/api/v1/videos/%s.json' + % video_id, display_id, headers=headers)['video'] + vimeo_id = video['clips'][0]['provider_id'] + info.update({ + '_type': 'url_transparent', + 'title': video.get('name'), + 'description': video.get('description') or video.get( + 'meta_description'), + 'duration': int_or_none(video.get('duration')), + 'timestamp': unified_timestamp(video.get('created_at')), + }) + + return merge_dicts(info, self.url_result( + VimeoIE._smuggle_referrer( + 'https://player.vimeo.com/video/%s' % vimeo_id, url), + ie=VimeoIE.ie_key(), video_id=vimeo_id)) + + +class RayWenderlichCourseIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?:// + (?: + videos\.raywenderlich\.com/courses| + (?:www\.)?raywenderlich\.com + )/ + (?P<id>[^/]+) + ''' + + _TEST = { + 'url': 'https://www.raywenderlich.com/3530-testing-in-ios', 'info_dict': { 'title': 'Testing in iOS', - 'id': '105-testing-in-ios', + 'id': '3530-testing-in-ios', }, 'params': { 'noplaylist': False, }, 'playlist_count': 29, - }] + } + + @classmethod + def suitable(cls, url): + return False if RayWenderlichIE.suitable(url) else super( + RayWenderlichCourseIE, cls).suitable(url) def _real_extract(self, url): - url, smuggled_data = unsmuggle_url(url, {}) + course_id = self._match_id(url) - mobj = re.match(self._VALID_URL, url) - course_id, lesson_id = mobj.group('course_id', 'id') - video_id = '%s/%s' % (course_id, lesson_id) - - webpage = self._download_webpage(url, video_id) - - no_playlist = self._downloader.params.get('noplaylist') - if no_playlist or smuggled_data.get('force_video', False): - if no_playlist: - self.to_screen( - 'Downloading just video %s because of --no-playlist' - % video_id) - if '>Subscribe to unlock' in webpage: - raise ExtractorError( - 'This content is only available for subscribers', - expected=True) - vimeo_id = self._search_regex( - r'data-vimeo-id=["\'](\d+)', webpage, 'video id') - return self.url_result( - VimeoIE._smuggle_referrer( - 'https://player.vimeo.com/video/%s' % vimeo_id, url), - ie=VimeoIE.ie_key(), video_id=vimeo_id) - - self.to_screen( - 'Downloading playlist %s - add --no-playlist to just download video' - % course_id) - - lesson_ids = set((lesson_id, )) - for lesson in re.findall( - r'(<a[^>]+\bclass=["\']lesson-link[^>]+>)', webpage): - attrs = extract_attributes(lesson) - if not attrs: - continue - lesson_url = attrs.get('href') - if not lesson_url: - continue - lesson_id = self._search_regex( - r'/lessons/(\d+)', lesson_url, 'lesson id', default=None) - if not lesson_id: - continue - lesson_ids.add(lesson_id) + webpage = self._download_webpage(url, course_id) entries = [] - for lesson_id in sorted(lesson_ids): + lesson_urls = set() + for lesson_url in re.findall( + 
r'<a[^>]+\bhref=["\'](/%s/lessons/\d+)' % course_id, webpage): + if lesson_url in lesson_urls: + continue + lesson_urls.add(lesson_url) entries.append(self.url_result( - smuggle_url(urljoin(url, lesson_id), {'force_video': True}), - ie=RayWenderlichIE.ie_key())) + urljoin(url, lesson_url), ie=RayWenderlichIE.ie_key())) - title = self._search_regex( - r'class=["\']course-title[^>]+>([^<]+)', webpage, 'course title', - default=None) + title = self._og_search_title( + webpage, default=None) or self._html_search_meta( + 'twitter:title', webpage, 'title', default=None) return self.playlist_result(entries, course_id, title) diff --git a/youtube_dl/extractor/rbmaradio.py b/youtube_dl/extractor/rbmaradio.py index afa7b9161..ae7413fb5 100644 --- a/youtube_dl/extractor/rbmaradio.py +++ b/youtube_dl/extractor/rbmaradio.py @@ -53,7 +53,8 @@ class RBMARadioIE(InfoExtractor): 'format_id': compat_str(abr), 'abr': abr, 'vcodec': 'none', - } for abr in (96, 128, 256)] + } for abr in (96, 128, 192, 256)] + self._check_formats(formats, episode_id) description = clean_html(episode.get('longTeaser')) thumbnail = self._proto_relative_url(episode.get('imageURL', {}).get('landscape')) diff --git a/youtube_dl/extractor/rds.py b/youtube_dl/extractor/rds.py index bf200ea4d..8c016a77d 100644 --- a/youtube_dl/extractor/rds.py +++ b/youtube_dl/extractor/rds.py @@ -19,7 +19,7 @@ class RDSIE(InfoExtractor): 'info_dict': { 'id': '604333', 'display_id': 'fowler-jr-prend-la-direction-de-jacksonville', - 'ext': 'mp4', + 'ext': 'flv', 'title': 'Fowler Jr. prend la direction de Jacksonville', 'description': 'Dante Fowler Jr. est le troisième choix du repêchage 2015 de la NFL. ', 'timestamp': 1430397346, diff --git a/youtube_dl/extractor/redbulltv.py b/youtube_dl/extractor/redbulltv.py index 243603676..7e8d58f38 100644 --- a/youtube_dl/extractor/redbulltv.py +++ b/youtube_dl/extractor/redbulltv.py @@ -10,7 +10,7 @@ from ..utils import ( class RedBullTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?redbull\.tv/video/(?P<id>AP-\w+)' + _VALID_URL = r'https?://(?:www\.)?redbull(?:\.tv|\.com/(?:[^/]+/)?tv)/video/(?P<id>AP-\w+)' _TESTS = [{ # film 'url': 'https://www.redbull.tv/video/AP-1Q6XCDTAN1W11', @@ -35,6 +35,9 @@ class RedBullTVIE(InfoExtractor): 'params': { 'skip_download': True, }, + }, { + 'url': 'https://www.redbull.com/int-en/tv/video/AP-1UWHCAR9S1W11/rob-meets-sam-gaze?playlist=playlists::3f81040a-2f31-4832-8e2e-545b1d39d173', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/redtube.py b/youtube_dl/extractor/redtube.py index 879bcf81d..10311a81a 100644 --- a/youtube_dl/extractor/redtube.py +++ b/youtube_dl/extractor/redtube.py @@ -3,12 +3,12 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( ExtractorError, int_or_none, str_to_int, unified_strdate, + url_or_none, ) @@ -71,8 +71,8 @@ class RedTubeIE(InfoExtractor): video_id, fatal=False) if medias and isinstance(medias, list): for media in medias: - format_url = media.get('videoUrl') - if not format_url or not isinstance(format_url, compat_str): + format_url = url_or_none(media.get('videoUrl')) + if not format_url: continue format_id = media.get('quality') formats.append({ diff --git a/youtube_dl/extractor/rentv.py b/youtube_dl/extractor/rentv.py index 8bcf87126..7c8909d95 100644 --- a/youtube_dl/extractor/rentv.py +++ b/youtube_dl/extractor/rentv.py @@ -6,6 +6,7 @@ from ..compat import compat_str from ..utils import ( 
determine_ext, int_or_none, + url_or_none, ) @@ -37,8 +38,8 @@ class RENTVIE(InfoExtractor): title = config['title'] formats = [] for video in config['src']: - src = video.get('src') - if not src or not isinstance(src, compat_str): + src = url_or_none(video.get('src')) + if not src: continue ext = determine_ext(src) if ext == 'm3u8': diff --git a/youtube_dl/extractor/roosterteeth.py b/youtube_dl/extractor/roosterteeth.py index 8b703800e..857434540 100644 --- a/youtube_dl/extractor/roosterteeth.py +++ b/youtube_dl/extractor/roosterteeth.py @@ -50,7 +50,7 @@ class RoosterTeethIE(InfoExtractor): }] def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: return diff --git a/youtube_dl/extractor/rtbf.py b/youtube_dl/extractor/rtbf.py index 28cc5522d..3b0f3080b 100644 --- a/youtube_dl/extractor/rtbf.py +++ b/youtube_dl/extractor/rtbf.py @@ -1,10 +1,14 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..utils import ( - int_or_none, ExtractorError, + float_or_none, + int_or_none, + strip_or_none, ) @@ -14,20 +18,19 @@ class RTBFIE(InfoExtractor): (?: video/[^?]+\?.*\bid=| ouftivi/(?:[^/]+/)*[^?]+\?.*\bvideoId=| - auvio/[^/]+\?.*id= + auvio/[^/]+\?.*\b(?P<live>l)?id= )(?P<id>\d+)''' _TESTS = [{ 'url': 'https://www.rtbf.be/video/detail_les-diables-au-coeur-episode-2?id=1921274', - 'md5': '799f334ddf2c0a582ba80c44655be570', + 'md5': '8c876a1cceeb6cf31b476461ade72384', 'info_dict': { 'id': '1921274', 'ext': 'mp4', 'title': 'Les Diables au coeur (épisode 2)', - 'description': 'Football - Diables Rouges', - 'duration': 3099, + 'description': '(du 25/04/2014)', + 'duration': 3099.54, 'upload_date': '20140425', - 'timestamp': 1398456336, - 'uploader': 'rtbfsport', + 'timestamp': 1398456300, } }, { # geo restricted @@ -39,6 +42,18 @@ class RTBFIE(InfoExtractor): }, { 'url': 'http://www.rtbf.be/auvio/detail_jeudi-en-prime-siegfried-bracke?id=2102996', 'only_matching': True, + }, { + # Live + 'url': 'https://www.rtbf.be/auvio/direct_pure-fm?lid=134775', + 'only_matching': True, + }, { + # Audio + 'url': 'https://www.rtbf.be/auvio/detail_cinq-heures-cinema?id=2360811', + 'only_matching': True, + }, { + # With Subtitle + 'url': 'https://www.rtbf.be/auvio/detail_les-carnets-du-bourlingueur?id=2361588', + 'only_matching': True, }] _IMAGE_HOST = 'http://ds1.ds.static.rtbf.be' _PROVIDERS = { @@ -53,46 +68,94 @@ class RTBFIE(InfoExtractor): ] def _real_extract(self, url): - video_id = self._match_id(url) - data = self._download_json( - 'http://www.rtbf.be/api/media/video?method=getVideoDetail&args[]=%s' % video_id, video_id) + live, media_id = re.match(self._VALID_URL, url).groups() + embed_page = self._download_webpage( + 'https://www.rtbf.be/auvio/embed/' + ('direct' if live else 'media'), + media_id, query={'id': media_id}) + data = self._parse_json(self._html_search_regex( + r'data-media="([^"]+)"', embed_page, 'media data'), media_id) error = data.get('error') if error: raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True) - data = data['data'] - provider = data.get('provider') if provider in self._PROVIDERS: return self.url_result(data['url'], self._PROVIDERS[provider]) + title = data['title'] + is_live = data.get('isLive') + if is_live: + title = self._live_title(title) + height_re = r'-(\d+)p\.' 
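# The substitution applied in the RTBF loop below, shown in isolation: each
# HLS variant's height is folded back into the progressive MP4 URL via the
# '-NNNp.' filename pattern captured by height_re. The sample URL is made up
# for the example.
import re

HEIGHT_RE = r'-(\d+)p\.'

def http_url_for_height(http_url, height):
    return re.sub(HEIGHT_RE, '-%dp.' % height, http_url)

# http_url_for_height('https://rtbf.example/video-720p.mp4', 1080)
#   -> 'https://rtbf.example/video-1080p.mp4'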
        formats = []
-       for key, format_id in self._QUALITIES:
-           format_url = data.get(key + 'Url')
-           if format_url:
+
+       m3u8_url = data.get('urlHlsAes128') or data.get('urlHls')
+       if m3u8_url:
+           formats.extend(self._extract_m3u8_formats(
+               m3u8_url, media_id, 'mp4', m3u8_id='hls', fatal=False))
+
+       fix_url = lambda x: x.replace('//rtbf-vod.', '//rtbf.') if '/geo/drm/' in x else x
+       http_url = data.get('url')
+       if formats and http_url and re.search(height_re, http_url):
+           http_url = fix_url(http_url)
+           for m3u8_f in formats[:]:
+               height = m3u8_f.get('height')
+               if not height:
+                   continue
+               f = m3u8_f.copy()
+               del f['protocol']
+               f.update({
+                   'format_id': m3u8_f['format_id'].replace('hls-', 'http-'),
+                   'url': re.sub(height_re, '-%dp.' % height, http_url),
+               })
+               formats.append(f)
+       else:
+           sources = data.get('sources') or {}
+           for key, format_id in self._QUALITIES:
+               format_url = sources.get(key)
+               if not format_url:
+                   continue
+               height = int_or_none(self._search_regex(
+                   height_re, format_url, 'height', default=None))
                formats.append({
                    'format_id': format_id,
-                   'url': format_url,
+                   'url': fix_url(format_url),
+                   'height': height,
                })

-       thumbnails = []
-       for thumbnail_id, thumbnail_url in data.get('thumbnail', {}).items():
-           if thumbnail_id != 'default':
-               thumbnails.append({
-                   'url': self._IMAGE_HOST + thumbnail_url,
-                   'id': thumbnail_id,
-               })
+       mpd_url = data.get('urlDash')
+       if not data.get('drm') and mpd_url:
+           formats.extend(self._extract_mpd_formats(
+               mpd_url, media_id, mpd_id='dash', fatal=False))
+
+       audio_url = data.get('urlAudio')
+       if audio_url:
+           formats.append({
+               'format_id': 'audio',
+               'url': audio_url,
+               'vcodec': 'none',
+           })
+       self._sort_formats(formats)
+
+       subtitles = {}
+       for track in (data.get('tracks') or {}).values():
+           sub_url = track.get('url')
+           if not sub_url:
+               continue
+           subtitles.setdefault(track.get('lang') or 'fr', []).append({
+               'url': sub_url,
+           })

        return {
-           'id': video_id,
+           'id': media_id,
            'formats': formats,
-           'title': data['title'],
-           'description': data.get('description') or data.get('subtitle'),
-           'thumbnails': thumbnails,
-           'duration': data.get('duration') or data.get('realDuration'),
-           'timestamp': int_or_none(data.get('created')),
-           'view_count': int_or_none(data.get('viewCount')),
-           'uploader': data.get('channel'),
-           'tags': data.get('tags'),
+           'title': title,
+           'description': strip_or_none(data.get('description')),
+           'thumbnail': data.get('thumbnail'),
+           'duration': float_or_none(data.get('realDuration')),
+           'timestamp': int_or_none(data.get('liveFrom')),
+           'series': data.get('programLabel'),
+           'subtitles': subtitles,
+           'is_live': is_live,
        }
diff --git a/youtube_dl/extractor/rutube.py b/youtube_dl/extractor/rutube.py
index 89d89b65a..261bcbb83 100644
--- a/youtube_dl/extractor/rutube.py
+++ b/youtube_dl/extractor/rutube.py
@@ -16,6 +16,7 @@ from ..utils import (
     int_or_none,
     try_get,
     unified_timestamp,
+    url_or_none,
 )
@@ -176,8 +177,8 @@ class RutubePlaylistBaseIE(RutubeBaseIE):
                break

            for result in results:
-               video_url = result.get('video_url')
-               if not video_url or not isinstance(video_url, compat_str):
+               video_url = url_or_none(result.get('video_url'))
+               if not video_url:
                    continue
                entry = self._extract_video(result, require_title=False)
                entry.update({
diff --git a/youtube_dl/extractor/safari.py b/youtube_dl/extractor/safari.py
index cc6698f88..30e2a38b4 100644
--- a/youtube_dl/extractor/safari.py
+++ b/youtube_dl/extractor/safari.py
@@ -27,7 +27,7 @@ class SafariBaseIE(InfoExtractor):
        self._login()

    def _login(self):
-       (username, password) = self._get_login_info()
+       username, password = self._get_login_info()
        if username is None:
            return

@@ -74,7 +74,14 @@ class SafariBaseIE(InfoExtractor):
class SafariIE(SafariBaseIE):
    IE_NAME = 'safari'
    IE_DESC = 'safaribooksonline.com online video'
-   _VALID_URL = r'https?://(?:www\.)?safaribooksonline\.com/library/view/[^/]+/(?P<course_id>[^/]+)/(?P<part>[^/?#&]+)\.html'
+   _VALID_URL = r'''(?x)
+                   https?://
+                       (?:www\.)?safaribooksonline\.com/
+                       (?:
+                           library/view/[^/]+/(?P<course_id>[^/]+)/(?P<part>[^/?\#&]+)\.html|
+                           videos/[^/]+/[^/]+/(?P<reference_id>[^-]+-[^/?\#&]+)
+                       )
+                   '''

    _TESTS = [{
        'url': 'https://www.safaribooksonline.com/library/view/hadoop-fundamentals-livelessons/9780133392838/part00.html',
@@ -94,22 +101,41 @@ class SafariIE(SafariBaseIE):
    }, {
        'url': 'https://www.safaribooksonline.com/library/view/learning-path-red/9780134664057/RHCE_Introduction.html',
        'only_matching': True,
+   }, {
+       'url': 'https://www.safaribooksonline.com/videos/python-programming-language/9780134217314/9780134217314-PYMC_13_00',
+       'only_matching': True,
    }]

+   _PARTNER_ID = '1926081'
+   _UICONF_ID = '29375172'
+
    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
-       video_id = '%s/%s' % (mobj.group('course_id'), mobj.group('part'))

-       webpage = self._download_webpage(url, video_id)
-       reference_id = self._search_regex(
-           r'data-reference-id=(["\'])(?P<id>(?:(?!\1).)+)\1',
-           webpage, 'kaltura reference id', group='id')
-       partner_id = self._search_regex(
-           r'data-partner-id=(["\'])(?P<id>(?:(?!\1).)+)\1',
-           webpage, 'kaltura widget id', group='id')
-       ui_id = self._search_regex(
-           r'data-ui-id=(["\'])(?P<id>(?:(?!\1).)+)\1',
-           webpage, 'kaltura uiconf id', group='id')
+       reference_id = mobj.group('reference_id')
+       if reference_id:
+           video_id = reference_id
+           partner_id = self._PARTNER_ID
+           ui_id = self._UICONF_ID
+       else:
+           video_id = '%s-%s' % (mobj.group('course_id'), mobj.group('part'))
+
+           webpage, urlh = self._download_webpage_handle(url, video_id)
+
+           mobj = re.match(self._VALID_URL, urlh.geturl())
+           reference_id = mobj.group('reference_id')
+           if not reference_id:
+               reference_id = self._search_regex(
+                   r'data-reference-id=(["\'])(?P<id>(?:(?!\1).)+)\1',
+                   webpage, 'kaltura reference id', group='id')
+           partner_id = self._search_regex(
+               r'data-partner-id=(["\'])(?P<id>(?:(?!\1).)+)\1',
+               webpage, 'kaltura widget id', default=self._PARTNER_ID,
+               group='id')
+           ui_id = self._search_regex(
+               r'data-ui-id=(["\'])(?P<id>(?:(?!\1).)+)\1',
+               webpage, 'kaltura uiconf id', default=self._UICONF_ID,
+               group='id')

        query = {
            'wid': '_%s' % partner_id,
@@ -159,10 +185,15 @@ class SafariCourseIE(SafariBaseIE):
    _VALID_URL = r'''(?x)
                    https?://
                        (?:
-                           (?:www\.)?safaribooksonline\.com/(?:library/view/[^/]+|api/v1/book)|
+                           (?:www\.)?safaribooksonline\.com/
+                           (?:
+                               library/view/[^/]+|
+                               api/v1/book|
+                               videos/[^/]+
+                           )|
                            techbus\.safaribooksonline\.com
                        )
-                       /(?P<id>[^/]+)/?(?:[#?]|$)
+                       /(?P<id>[^/]+)
                    '''

    _TESTS = [{
@@ -179,8 +210,16 @@ class SafariCourseIE(SafariBaseIE):
    }, {
        'url': 'http://techbus.safaribooksonline.com/9780134426365',
        'only_matching': True,
+   }, {
+       'url': 'https://www.safaribooksonline.com/videos/python-programming-language/9780134217314',
+       'only_matching': True,
    }]

+   @classmethod
+   def suitable(cls, url):
+       return (False if SafariIE.suitable(url) or SafariApiIE.suitable(url)
+               else super(SafariCourseIE, cls).suitable(url))
+
    def _real_extract(self, url):
        course_id = self._match_id(url)
diff --git a/youtube_dl/extractor/sina.py b/youtube_dl/extractor/sina.py
index 8fc66732a..07b766b4a 100644
--- a/youtube_dl/extractor/sina.py
+++ b/youtube_dl/extractor/sina.py
@@ -64,7 +64,7 @@ class SinaIE(InfoExtractor):
            # The video id is in the redirected url
            self.to_screen('Getting video id')
            request = HEADRequest(url)
-           (_, urlh) = self._download_webpage_handle(request, 'NA', False)
+           _, urlh = self._download_webpage_handle(request, 'NA', False)
            return self._real_extract(urlh.geturl())
        else:
            pseudo_id = mobj.group('pseudo_id')
diff --git a/youtube_dl/extractor/sixplay.py b/youtube_dl/extractor/sixplay.py
index 69951e387..a363221bc 100644
--- a/youtube_dl/extractor/sixplay.py
+++ b/youtube_dl/extractor/sixplay.py
@@ -19,29 +19,33 @@ from ..utils import (

class SixPlayIE(InfoExtractor):
    IE_NAME = '6play'
-   _VALID_URL = r'(?:6play:|https?://(?:www\.)?6play\.fr/.+?-c_)(?P<id>[0-9]+)'
-   _TEST = {
-       'url': 'http://www.6play.fr/le-meilleur-patissier-p_1807/le-meilleur-patissier-special-fetes-mercredi-a-21-00-sur-m6-c_11638450',
-       'md5': '42310bffe4ba3982db112b9cd3467328',
+   _VALID_URL = r'(?:6play:|https?://(?:www\.)?(?P<domain>6play\.fr|rtlplay\.be)/.+?-c_)(?P<id>[0-9]+)'
+   _TESTS = [{
+       'url': 'https://www.6play.fr/minute-par-minute-p_9533/le-but-qui-a-marque-lhistoire-du-football-francais-c_12041051',
+       'md5': '31fcd112637baa0c2ab92c4fcd8baf27',
        'info_dict': {
-           'id': '11638450',
+           'id': '12041051',
            'ext': 'mp4',
-           'title': 'Le Meilleur Pâtissier, spécial fêtes mercredi à 21:00 sur M6',
-           'description': 'md5:308853f6a5f9e2d55a30fc0654de415f',
-           'duration': 39,
-           'series': 'Le meilleur pâtissier',
+           'title': 'Le but qui a marqué l\'histoire du football français !',
+           'description': 'md5:b59e7e841d646ef1eb42a7868eb6a851',
        },
-       'params': {
-           'skip_download': True,
-       },
-   }
+   }, {
+       'url': 'https://www.rtlplay.be/rtl-info-13h-p_8551/les-titres-du-rtlinfo-13h-c_12045869',
+       'only_matching': True,
+   }]

    def _real_extract(self, url):
-       video_id = self._match_id(url)
+       domain, video_id = re.search(self._VALID_URL, url).groups()
+       service, consumer_name = {
+           '6play.fr': ('6play', 'm6web'),
+           'rtlplay.be': ('rtlbe_rtl_play', 'rtlbe'),
+       }.get(domain, ('6play', 'm6web'))

        data = self._download_json(
-           'https://pc.middleware.6play.fr/6play/v2/platforms/m6group_web/services/6play/videos/clip_%s' % video_id,
-           video_id, query={
+           'https://pc.middleware.6play.fr/6play/v2/platforms/m6group_web/services/%s/videos/clip_%s' % (service, video_id),
+           video_id, headers={
+               'x-customer-name': consumer_name
+           }, query={
                'csa': 5,
                'with': 'clips',
            })
@@ -65,7 +69,14 @@ class SixPlayIE(InfoExtractor):
                    subtitles.setdefault('fr', []).append({'url': asset_url})
                    continue
                if container == 'm3u8' or ext == 'm3u8':
-                   if protocol == 'usp' and not compat_parse_qs(compat_urllib_parse_urlparse(asset_url).query).get('token', [None])[0]:
+                   if protocol == 'usp':
+                       if compat_parse_qs(compat_urllib_parse_urlparse(asset_url).query).get('token', [None])[0]:
+                           urlh = self._request_webpage(
+                               asset_url, video_id, fatal=False,
+                               headers=self.geo_verification_headers())
+                           if not urlh:
+                               continue
+                           asset_url = urlh.geturl()
                        asset_url = re.sub(r'/([^/]+)\.ism/[^/]*\.m3u8', r'/\1.ism/\1.m3u8', asset_url)
                    formats.extend(self._extract_m3u8_formats(
                        asset_url, video_id, 'mp4', 'm3u8_native',
diff --git a/youtube_dl/extractor/slutload.py b/youtube_dl/extractor/slutload.py
index 6fc2ff60d..661f9e59d 100644
--- a/youtube_dl/extractor/slutload.py
+++ b/youtube_dl/extractor/slutload.py
@@ -1,12 +1,10 @@
from __future__ import unicode_literals

-import re
-
from .common import InfoExtractor


class SlutloadIE(InfoExtractor):
-   _VALID_URL = r'^https?://(?:\w+\.)?slutload\.com/video/[^/]+/(?P<id>[^/]+)/?$'
+   _VALID_URL = r'https?://(?:\w+\.)?slutload\.com/(?:video/[^/]+|embed_player|watch)/(?P<id>[^/]+)'
    _TESTS = [{
        'url': 'http://www.slutload.com/video/virginie-baisee-en-cam/TD73btpBqSxc/',
        'md5': '868309628ba00fd488cf516a113fd717',
@@ -16,33 +14,52 @@ class SlutloadIE(InfoExtractor):
            'title': 'virginie baisee en cam',
            'age_limit': 18,
            'thumbnail': r're:https?://.*?\.jpg'
-       }
+       },
    }, {
        # mobile site
        'url': 'http://mobile.slutload.com/video/masturbation-solo/fviFLmc6kzJ/',
        'only_matching': True,
+   }, {
+       'url': 'http://www.slutload.com/embed_player/TD73btpBqSxc/',
+       'only_matching': True,
+   }, {
+       'url': 'http://www.slutload.com/watch/TD73btpBqSxc/Virginie-Baisee-En-Cam.html',
+       'only_matching': True,
    }]

    def _real_extract(self, url):
        video_id = self._match_id(url)
-       desktop_url = re.sub(r'^(https?://)mobile\.', r'\1', url)
-       webpage = self._download_webpage(desktop_url, video_id)
+       embed_page = self._download_webpage(
+           'http://www.slutload.com/embed_player/%s' % video_id, video_id,
+           'Downloading embed page', fatal=False)

-       video_title = self._html_search_regex(r'<h1><strong>([^<]+)</strong>',
-                                             webpage, 'title').strip()
+       if embed_page:
+           def extract(what):
+               return self._html_search_regex(
+                   r'data-video-%s=(["\'])(?P<url>(?:(?!\1).)+)\1' % what,
+                   embed_page, 'video %s' % what, default=None, group='url')

-       video_url = self._html_search_regex(
-           r'(?s)<div id="vidPlayer"\s+data-url="([^"]+)"',
-           webpage, 'video URL')
-       thumbnail = self._html_search_regex(
-           r'(?s)<div id="vidPlayer"\s+.*?previewer-file="([^"]+)"',
-           webpage, 'thumbnail', fatal=False)
+           video_url = extract('url')
+           if video_url:
+               title = self._html_search_regex(
+                   r'<title>([^<]+)', embed_page, 'title', default=video_id)
+               return {
+                   'id': video_id,
+                   'url': video_url,
+                   'title': title,
+                   'thumbnail': extract('preview'),
+                   'age_limit': 18
+               }

-       return {
+       webpage = self._download_webpage(
+           'http://www.slutload.com/video/_/%s/' % video_id, video_id)
+       title = self._html_search_regex(
+           r'<h1><strong>([^<]+)</strong>', webpage, 'title').strip()
+       info = self._parse_html5_media_entries(url, webpage, video_id)[0]
+       info.update({
            'id': video_id,
-           'url': video_url,
-           'title': video_title,
-           'thumbnail': thumbnail,
-           'age_limit': 18
-       }
+           'title': title,
+           'age_limit': 18,
+       })
+       return info
diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py
index 46332e5c2..81c81c8d5 100644
--- a/youtube_dl/extractor/soundcloud.py
+++ b/youtube_dl/extractor/soundcloud.py
@@ -181,7 +181,6 @@ class SoundcloudIE(InfoExtractor):
        thumbnail = info.get('artwork_url') or info.get('user', {}).get('avatar_url')
        if isinstance(thumbnail, compat_str):
            thumbnail = thumbnail.replace('-large', '-t500x500')
-       ext = 'mp3'
        result = {
            'id': track_id,
            'uploader': info.get('user', {}).get('username'),
@@ -215,8 +214,11 @@ class SoundcloudIE(InfoExtractor):
                    track_id, 'Downloading track url', query=query)

                for key, stream_url in format_dict.items():
-                   abr = int_or_none(self._search_regex(
-                       r'_(\d+)_url', key, 'audio bitrate', default=None))
+                   ext, abr = 'mp3', None
+                   mobj = re.search(r'_([^_]+)_(\d+)_url', key)
+                   if mobj:
+                       ext, abr = mobj.groups()
+                       abr = int(abr)
                    if key.startswith('http'):
                        stream_formats = [{
                            'format_id': key,
@@ -234,13 +236,14 @@ class SoundcloudIE(InfoExtractor):
                        }]
                    elif key.startswith('hls'):
                        stream_formats = self._extract_m3u8_formats(
-                           stream_url, track_id, 'mp3', entry_protocol='m3u8_native',
+                           stream_url, track_id, ext, entry_protocol='m3u8_native',
                            m3u8_id=key, fatal=False)
                    else:
                        continue

-                   for f in stream_formats:
-                       f['abr'] = abr
+                   if abr:
+                       for f in stream_formats:
+                           f['abr'] = abr

                    formats.extend(stream_formats)

@@ -250,7 +253,7 @@ class SoundcloudIE(InfoExtractor):
            formats.append({
                'format_id': 'fallback',
                'url': update_url_query(info['stream_url'], query),
-               'ext': ext,
+               'ext': 'mp3',
            })

        for f in formats:
diff --git a/youtube_dl/extractor/spiegel.py b/youtube_dl/extractor/spiegel.py
index fc995e8c1..4df7f4ddc 100644
--- a/youtube_dl/extractor/spiegel.py
+++ b/youtube_dl/extractor/spiegel.py
@@ -11,9 +11,9 @@ from .nexx import (
from .spiegeltv import SpiegeltvIE
from ..compat import compat_urlparse
from ..utils import (
-   extract_attributes,
-   unified_strdate,
-   get_element_by_attribute,
+   parse_duration,
+   strip_or_none,
+   unified_timestamp,
)


@@ -21,35 +21,38 @@ class SpiegelIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<id>[0-9]+)(?:-embed|-iframe)?(?:\.html)?(?:#.*)?$'
    _TESTS = [{
        'url': 'http://www.spiegel.de/video/vulkan-tungurahua-in-ecuador-ist-wieder-aktiv-video-1259285.html',
-       'md5': '2c2754212136f35fb4b19767d242f66e',
+       'md5': 'b57399839d055fccfeb9a0455c439868',
        'info_dict': {
-           'id': '1259285',
+           'id': '563747',
            'ext': 'mp4',
            'title': 'Vulkanausbruch in Ecuador: Der "Feuerschlund" ist wieder aktiv',
            'description': 'md5:8029d8310232196eb235d27575a8b9f4',
            'duration': 49,
            'upload_date': '20130311',
+           'timestamp': 1362994320,
        },
    }, {
        'url': 'http://www.spiegel.de/video/schach-wm-videoanalyse-des-fuenften-spiels-video-1309159.html',
-       'md5': 'f2cdf638d7aa47654e251e1aee360af1',
+       'md5': '5b6c2f4add9d62912ed5fc78a1faed80',
        'info_dict': {
-           'id': '1309159',
+           'id': '580988',
            'ext': 'mp4',
            'title': 'Schach-WM in der Videoanalyse: Carlsen nutzt die Fehlgriffe des Titelverteidigers',
            'description': 'md5:c2322b65e58f385a820c10fa03b2d088',
            'duration': 983,
            'upload_date': '20131115',
+           'timestamp': 1384546642,
        },
    }, {
        'url': 'http://www.spiegel.de/video/astronaut-alexander-gerst-von-der-iss-station-beantwortet-fragen-video-1519126-embed.html',
-       'md5': 'd8eeca6bfc8f1cd6f490eb1f44695d51',
+       'md5': '97b91083a672d72976faa8433430afb9',
        'info_dict': {
-           'id': '1519126',
+           'id': '601883',
            'ext': 'mp4',
            'description': 'SPIEGEL ONLINE-Nutzer durften den deutschen Astronauten Alexander Gerst über sein Leben auf der ISS-Station befragen. Hier kommen seine Antworten auf die besten sechs Fragen.',
            'title': 'Fragen an Astronaut Alexander Gerst: "Bekommen Sie die Tageszeiten mit?"',
            'upload_date': '20140904',
+           'timestamp': 1409834160,
        }
    }, {
        'url': 'http://www.spiegel.de/video/astronaut-alexander-gerst-von-der-iss-station-beantwortet-fragen-video-1519126-iframe.html',
@@ -62,59 +65,28 @@ class SpiegelIE(InfoExtractor):

    def _real_extract(self, url):
        video_id = self._match_id(url)
-       webpage, handle = self._download_webpage_handle(url, video_id)
+       metadata_url = 'http://www.spiegel.de/video/metadata/video-%s.json' % video_id
+       handle = self._request_webpage(metadata_url, video_id)

        # 302 to spiegel.tv, like http://www.spiegel.de/video/der-film-zum-wochenende-die-wahrheit-ueber-maenner-video-99003272.html
        if SpiegeltvIE.suitable(handle.geturl()):
            return self.url_result(handle.geturl(), 'Spiegeltv')

-       nexx_id = self._search_regex(
-           r'nexxOmniaId\s*:\s*(\d+)', webpage, 'nexx id', default=None)
-       if nexx_id:
-           domain_id = NexxIE._extract_domain_id(webpage) or '748'
-           return self.url_result(
-               'nexx:%s:%s' % (domain_id, nexx_id), ie=NexxIE.ie_key(),
-               video_id=nexx_id)
-
-       video_data = extract_attributes(self._search_regex(r'(<div[^>]+id="spVideoElements"[^>]+>)', webpage, 'video element', default=''))
-
-       title = video_data.get('data-video-title') or get_element_by_attribute('class', 'module-title', webpage)
-       description = video_data.get('data-video-teaser') or self._html_search_meta('description', webpage, 'description')
-
-       base_url = self._search_regex(
-           [r'server\s*:\s*(["\'])(?P<url>.+?)\1', r'var\s+server\s*=\s*"(?P<url>[^"]+)\"'],
-           webpage, 'server URL', group='url')
-
-       xml_url = base_url + video_id + '.xml'
-       idoc = self._download_xml(xml_url, video_id)
-
-       formats = []
-       for n in list(idoc):
-           if n.tag.startswith('type') and n.tag != 'type6':
-               format_id = n.tag.rpartition('type')[2]
-               video_url = base_url + n.find('./filename').text
-               formats.append({
-                   'format_id': format_id,
-                   'url': video_url,
-                   'width': int(n.find('./width').text),
-                   'height': int(n.find('./height').text),
-                   'abr': int(n.find('./audiobitrate').text),
-                   'vbr': int(n.find('./videobitrate').text),
-                   'vcodec': n.find('./codec').text,
-                   'acodec': 'MP4A',
-               })
-       duration = float(idoc[0].findall('./duration')[0].text)
-
-       self._check_formats(formats, video_id)
-       self._sort_formats(formats)
+       video_data = self._parse_json(self._webpage_read_content(
+           handle, metadata_url, video_id), video_id)
+       title = video_data['title']
+       nexx_id = video_data['nexxOmniaId']
+       domain_id = video_data.get('nexxOmniaDomain') or '748'

        return {
+           '_type': 'url_transparent',
            'id': video_id,
+           'url': 'nexx:%s:%s' % (domain_id, nexx_id),
            'title': title,
-           'description': description.strip() if description else None,
-           'duration': duration,
-           'upload_date': unified_strdate(video_data.get('data-video-date')),
-           'formats': formats,
+           'description': strip_or_none(video_data.get('teaser')),
+           'duration': parse_duration(video_data.get('duration')),
+           'timestamp': unified_timestamp(video_data.get('datum')),
+           'ie_key': NexxIE.ie_key(),
        }
diff --git a/youtube_dl/extractor/spike.py b/youtube_dl/extractor/spike.py
index a7b1b3b5f..e76522b45 100644
--- a/youtube_dl/extractor/spike.py
+++ b/youtube_dl/extractor/spike.py
@@ -1,55 +1,46 @@
from __future__ import unicode_literals

-import re
-
from .mtv import MTVServicesInfoExtractor


-class SpikeIE(MTVServicesInfoExtractor):
-   _VALID_URL = r'https?://(?:[^/]+\.)?spike\.com/[^/]+/[\da-z]{6}(?:[/?#&]|$)'
+class BellatorIE(MTVServicesInfoExtractor):
+   _VALID_URL = r'https?://(?:www\.)?bellator\.com/[^/]+/[\da-z]{6}(?:[/?#&]|$)'
    _TESTS = [{
-       'url': 'http://www.spike.com/video-clips/lhtu8m/auction-hunters-can-allen-ride-a-hundred-year-old-motorcycle',
-       'md5': '1a9265f32b0c375793d6c4ce45255256',
+       'url': 'http://www.bellator.com/fight/atwr7k/bellator-158-michael-page-vs-evangelista-cyborg',
        'info_dict': {
-           'id': 'b9c8221a-4e50-479a-b86d-3333323e38ba',
+           'id': 'b55e434e-fde1-4a98-b7cc-92003a034de4',
            'ext': 'mp4',
-           'title': 'Auction Hunters|December 27, 2013|4|414|Can Allen Ride A Hundred Year-Old Motorcycle?',
-           'description': 'md5:fbed7e82ed5fad493615b3094a9499cb',
-           'timestamp': 1388120400,
-           'upload_date': '20131227',
+           'title': 'Douglas Lima vs. Paul Daley - Round 1',
+           'description': 'md5:805a8dd29310fd611d32baba2f767885',
+       },
+       'params': {
+           # m3u8 download
+           'skip_download': True,
        },
    }, {
-       'url': 'http://www.spike.com/full-episodes/j830qm/lip-sync-battle-joel-mchale-vs-jim-rash-season-2-ep-209',
-       'md5': 'b25c6f16418aefb9ad5a6cae2559321f',
+       'url': 'http://www.bellator.com/video-clips/bw6k7n/bellator-158-foundations-michael-venom-page',
+       'only_matching': True,
+   }]
+
+   _FEED_URL = 'http://www.spike.com/feeds/mrss/'
+   _GEO_COUNTRIES = ['US']
+
+
+class ParamountNetworkIE(MTVServicesInfoExtractor):
+   _VALID_URL = r'https?://(?:www\.)?paramountnetwork\.com/[^/]+/[\da-z]{6}(?:[/?#&]|$)'
+   _TESTS = [{
+       'url': 'http://www.paramountnetwork.com/episodes/j830qm/lip-sync-battle-joel-mchale-vs-jim-rash-season-2-ep-13',
        'info_dict': {
            'id': '37ace3a8-1df6-48be-85b8-38df8229e241',
            'ext': 'mp4',
            'title': 'Lip Sync Battle|April 28, 2016|2|209|Joel McHale Vs. Jim Rash|Act 1',
            'description': 'md5:a739ca8f978a7802f67f8016d27ce114',
        },
-   }, {
-       'url': 'http://www.spike.com/video-clips/lhtu8m/',
-       'only_matching': True,
-   }, {
-       'url': 'http://www.spike.com/video-clips/lhtu8m',
-       'only_matching': True,
-   }, {
-       'url': 'http://bellator.spike.com/fight/atwr7k/bellator-158-michael-page-vs-evangelista-cyborg',
-       'only_matching': True,
-   }, {
-       'url': 'http://bellator.spike.com/video-clips/bw6k7n/bellator-158-foundations-michael-venom-page',
-       'only_matching': True,
+       'params': {
+           # m3u8 download
+           'skip_download': True,
+       },
    }]

-   _FEED_URL = 'http://www.spike.com/feeds/mrss/'
-   _MOBILE_TEMPLATE = 'http://m.spike.com/videos/video.rbml?id=%s'
-   _CUSTOM_URL_REGEX = re.compile(r'spikenetworkapp://([^/]+/[-a-fA-F0-9]+)')
+   _FEED_URL = 'http://www.paramountnetwork.com/feeds/mrss/'
    _GEO_COUNTRIES = ['US']
-
-   def _extract_mgid(self, webpage):
-       mgid = super(SpikeIE, self)._extract_mgid(webpage)
-       if mgid is None:
-           url_parts = self._search_regex(self._CUSTOM_URL_REGEX, webpage, 'episode_id')
-           video_type, episode_id = url_parts.split('/', 1)
-           mgid = 'mgid:arc:{0}:spike.com:{1}'.format(video_type, episode_id)
-       return mgid
diff --git a/youtube_dl/extractor/streamcloud.py b/youtube_dl/extractor/streamcloud.py
index 6a6bb90c4..4a410611d 100644
--- a/youtube_dl/extractor/streamcloud.py
+++ b/youtube_dl/extractor/streamcloud.py
@@ -72,4 +72,7 @@ class StreamcloudIE(InfoExtractor):
            'title': title,
            'url': video_url,
            'thumbnail': thumbnail,
+           'http_headers': {
+               'Referer': url,
+           },
        }
diff --git a/youtube_dl/extractor/svt.py b/youtube_dl/extractor/svt.py
index f71eab8b2..0901c3163 100644
--- a/youtube_dl/extractor/svt.py
+++ b/youtube_dl/extractor/svt.py
@@ -12,6 +12,8 @@ from ..utils import (
    determine_ext,
    dict_get,
    int_or_none,
+   orderedSet,
+   strip_or_none,
    try_get,
    urljoin,
    compat_str,
)
@@ -137,7 +139,12 @@ class SVTPlayBaseIE(SVTBaseIE):

class SVTPlayIE(SVTPlayBaseIE):
    IE_DESC = 'SVT Play and Öppet arkiv'
-   _VALID_URL = r'https?://(?:www\.)?(?:svtplay|oppetarkiv)\.se/(?:video|klipp|kanaler)/(?P<id>[^/?#&]+)'
+   _VALID_URL = r'''(?x)
+                   (?:
+                       svt:(?P<svt_id>[^/?#&]+)|
+                       https?://(?:www\.)?(?:svtplay|oppetarkiv)\.se/(?:video|klipp|kanaler)/(?P<id>[^/?#&]+)
+                   )
+                   '''
    _TESTS = [{
        'url': 'http://www.svtplay.se/video/5996901/flygplan-till-haile-selassie/flygplan-till-haile-selassie-2',
        'md5': '2b6704fe4a28801e1a098bbf3c5ac611',
@@ -164,10 +171,40 @@ class SVTPlayIE(SVTPlayBaseIE):
    }, {
        'url': 'https://www.svtplay.se/kanaler/svt1',
        'only_matching': True,
+   }, {
+       'url': 'svt:1376446-003A',
+       'only_matching': True,
+   }, {
+       'url': 'svt:14278044',
+       'only_matching': True,
    }]

+   def _adjust_title(self, info):
+       if info['is_live']:
+           info['title'] = self._live_title(info['title'])
+
+   def _extract_by_video_id(self, video_id, webpage=None):
+       data = self._download_json(
+           'https://api.svt.se/videoplayer-api/video/%s' % video_id,
+           video_id, headers=self.geo_verification_headers())
+       info_dict = self._extract_video(data, video_id)
+       if not info_dict.get('title'):
+           title = dict_get(info_dict, ('episode', 'series'))
+           if not title and webpage:
+               title = re.sub(
+                   r'\s*\|\s*.+?$', '', self._og_search_title(webpage))
+           if not title:
+               title = video_id
+           info_dict['title'] = title
+       self._adjust_title(info_dict)
+       return info_dict
+
    def _real_extract(self, url):
-       video_id = self._match_id(url)
+       mobj = re.match(self._VALID_URL, url)
+       video_id, svt_id = mobj.group('id', 'svt_id')
+
+       if svt_id:
+           return self._extract_by_video_id(svt_id)

        webpage = self._download_webpage(url, video_id)

@@ -179,10 +216,6 @@ class SVTPlayIE(SVTPlayBaseIE):
        thumbnail = self._og_search_thumbnail(webpage)

-       def adjust_title(info):
-           if info['is_live']:
-               info['title'] = self._live_title(info['title'])
-
        if data:
            video_info = try_get(
                data, lambda x: x['context']['dispatcher']['stores']['VideoTitlePageStore']['data']['video'],
@@ -193,24 +226,14 @@ class SVTPlayIE(SVTPlayBaseIE):
                'title': data['context']['dispatcher']['stores']['MetaStore']['title'],
                'thumbnail': thumbnail,
            })
-           adjust_title(info_dict)
+           self._adjust_title(info_dict)
            return info_dict

-       video_id = self._search_regex(
+       svt_id = self._search_regex(
            r'<video[^>]+data-video-id=["\']([\da-zA-Z-]+)',
-           webpage, 'video id', default=None)
+           webpage, 'video id')

-       if video_id:
-           data = self._download_json(
-               'https://api.svt.se/videoplayer-api/video/%s' % video_id,
-               video_id, headers=self.geo_verification_headers())
-           info_dict = self._extract_video(data, video_id)
-           if not info_dict.get('title'):
-               info_dict['title'] = re.sub(
-                   r'\s*\|\s*.+?$', '',
-                   info_dict.get('episode') or self._og_search_title(webpage))
-           adjust_title(info_dict)
-           return info_dict
+       return self._extract_by_video_id(svt_id, webpage)


class SVTSeriesIE(SVTPlayBaseIE):
@@ -292,3 +315,57 @@ class SVTSeriesIE(SVTPlayBaseIE):
        return self.playlist_result(
            entries, series_id, title, metadata.get('description'))
+
+
+class SVTPageIE(InfoExtractor):
+   _VALID_URL = r'https?://(?:www\.)?svt\.se/(?:[^/]+/)*(?P<id>[^/?&#]+)'
+   _TESTS = [{
+       'url': 'https://www.svt.se/sport/oseedat/guide-sommartraningen-du-kan-gora-var-och-nar-du-vill',
+       'info_dict': {
+           'id': 'guide-sommartraningen-du-kan-gora-var-och-nar-du-vill',
+           'title': 'GUIDE: Sommarträning du kan göra var och när du vill',
+       },
+       'playlist_count': 7,
+   }, {
+       'url': 'https://www.svt.se/nyheter/inrikes/ebba-busch-thor-kd-har-delvis-ratt-om-no-go-zoner',
+       'info_dict': {
+           'id': 'ebba-busch-thor-kd-har-delvis-ratt-om-no-go-zoner',
+           'title': 'Ebba Busch Thor har bara delvis rätt om ”no-go-zoner”',
+       },
+       'playlist_count': 1,
+   }, {
+       # only programTitle
+       'url': 'http://www.svt.se/sport/ishockey/jagr-tacklar-giroux-under-intervjun',
+       'info_dict': {
+           'id': '2900353',
+           'ext': 'mp4',
+           'title': 'Stjärnorna skojar till det - under SVT-intervjun',
+           'duration': 27,
+           'age_limit': 0,
+       },
+   }, {
+       'url': 'https://www.svt.se/nyheter/lokalt/vast/svt-testar-tar-nagon-upp-skrapet-1',
+       'only_matching': True,
+   }, {
+       'url': 'https://www.svt.se/vader/manadskronikor/maj2018',
+       'only_matching': True,
+   }]
+
+   @classmethod
+   def suitable(cls, url):
+       return False if SVTIE.suitable(url) else super(SVTPageIE, cls).suitable(url)
+
+   def _real_extract(self, url):
+       playlist_id = self._match_id(url)
+
+       webpage = self._download_webpage(url, playlist_id)
+
+       entries = [
+           self.url_result(
+               'svt:%s' % video_id, ie=SVTPlayIE.ie_key(), video_id=video_id)
+           for video_id in orderedSet(re.findall(
+               r'data-video-id=["\'](\d+)', webpage))]
+
+       title = strip_or_none(self._og_search_title(webpage, default=None))
+
+       return self.playlist_result(entries, playlist_id, title)
diff --git a/youtube_dl/extractor/tbs.py b/youtube_dl/extractor/tbs.py
index edc31729d..784f8ed66 100644
--- a/youtube_dl/extractor/tbs.py
+++ b/youtube_dl/extractor/tbs.py
@@ -4,6 +4,10 @@ from __future__ import unicode_literals
import re

from .turner import TurnerBaseIE
+from ..compat import (
+   compat_urllib_parse_urlparse,
+   compat_parse_qs,
+)
from ..utils import (
    float_or_none,
    int_or_none,
@@ -38,48 +42,22 @@ class TBSIE(TurnerBaseIE):
    def _real_extract(self, url):
        site, display_id = re.match(self._VALID_URL, url).groups()
        webpage = self._download_webpage(url, display_id)
-       video_data = self._parse_json(self._search_regex(
+       drupal_settings = self._parse_json(self._search_regex(
            r'<script[^>]+?data-drupal-selector="drupal-settings-json"[^>]*?>({.+?})</script>',
-           webpage, 'drupal setting'), display_id)['turner_playlist'][0]
+           webpage, 'drupal setting'), display_id)
+       video_data = drupal_settings['turner_playlist'][0]

        media_id = video_data['mediaID']
        title = video_data['title']
+       tokenizer_query = compat_parse_qs(compat_urllib_parse_urlparse(
+           drupal_settings['ngtv_token_url']).query)

-       streams_data = self._download_json(
-           'http://medium.ngtv.io/media/%s/tv' % media_id,
-           media_id)['media']['tv']
-       duration = None
-       chapters = []
-       formats = []
-       for supported_type in ('unprotected', 'bulkaes'):
-           stream_data = streams_data.get(supported_type, {})
-           m3u8_url = stream_data.get('secureUrl') or stream_data.get('url')
-           if not m3u8_url:
-               continue
-           if stream_data.get('playlistProtection') == 'spe':
-               m3u8_url = self._add_akamai_spe_token(
-                   'http://token.vgtf.net/token/token_spe',
-                   m3u8_url, media_id, {
-                       'url': url,
-                       'site_name': site[:3].upper(),
-                       'auth_required': video_data.get('authRequired') == '1',
-                   })
-           formats.extend(self._extract_m3u8_formats(
-               m3u8_url, media_id, 'mp4', m3u8_id='hls', fatal=False))
-
-           duration = float_or_none(stream_data.get('totalRuntime') or video_data.get('duration'))
-
-           if not chapters:
-               for chapter in stream_data.get('contentSegments', []):
-                   start_time = float_or_none(chapter.get('start'))
-                   duration = float_or_none(chapter.get('duration'))
-                   if start_time is None or duration is None:
-                       continue
-                   chapters.append({
-                       'start_time': start_time,
-                       'end_time': start_time + duration,
-                   })
-       self._sort_formats(formats)
+       info = self._extract_ngtv_info(
+           media_id, tokenizer_query, {
+               'url': url,
+               'site_name': site[:3].upper(),
+               'auth_required': video_data.get('authRequired') == '1',
+           })

        thumbnails = []
        for image_id, image in video_data.get('images', {}).items():
@@ -98,15 +76,14 @@ class TBSIE(TurnerBaseIE):
            })
            thumbnails.append(i)

-       return {
+       info.update({
            'id': media_id,
            'title': title,
            'description': strip_or_none(video_data.get('descriptionNoTags') or video_data.get('shortDescriptionNoTags')),
-           'duration': duration,
+           'duration': float_or_none(video_data.get('duration')) or info.get('duration'),
            'timestamp': int_or_none(video_data.get('created')),
            'season_number': int_or_none(video_data.get('season')),
            'episode_number': int_or_none(video_data.get('episode')),
-           'cahpters': chapters,
            'thumbnails': thumbnails,
-           'formats': formats,
-       }
+       })
+       return info
diff --git a/youtube_dl/extractor/teamcoco.py b/youtube_dl/extractor/teamcoco.py
index f06e5b19a..73469cc5d 100644
--- a/youtube_dl/extractor/teamcoco.py
+++ b/youtube_dl/extractor/teamcoco.py
@@ -3,7 +3,7 @@ from __future__ import unicode_literals

import json

-from .common import InfoExtractor
+from .turner import TurnerBaseIE
from ..utils import (
    determine_ext,
    ExtractorError,
@@ -15,8 +15,8 @@ from ..utils import (
)


-class TeamcocoIE(InfoExtractor):
-   _VALID_URL = r'https?://teamcoco\.com/video/(?P<id>[^/?#]+)'
+class TeamcocoIE(TurnerBaseIE):
+   _VALID_URL = r'https?://teamcoco\.com/(?P<id>([^/]+/)*[^/?#]+)'
    _TESTS = [
        {
            'url': 'http://teamcoco.com/video/mary-kay-remote',
@@ -67,6 +67,18 @@ class TeamcocoIE(InfoExtractor):
                'skip_download': True,  # m3u8 downloads
            },
            'skip': 'This video is no longer available.',
+       }, {
+           'url': 'http://teamcoco.com/video/the-conan-audiencey-awards-for-04/25/18',
+           'only_matching': True,
+       }, {
+           'url': 'http://teamcoco.com/italy/conan-jordan-schlansky-hit-the-streets-of-florence',
+           'only_matching': True,
+       }, {
+           'url': 'http://teamcoco.com/haiti/conan-s-haitian-history-lesson',
+           'only_matching': True,
+       }, {
+           'url': 'http://teamcoco.com/israel/conan-hits-the-streets-beaches-of-tel-aviv',
+           'only_matching': True,
        }
    ]

@@ -81,7 +93,7 @@ class TeamcocoIE(InfoExtractor):
        display_id = self._match_id(url)
        response = self._graphql_call('''{
-  %s(slug: "video/%s") {
+  %s(slug: "%s") {
    ... on RecordSlug {
      record {
        id
@@ -91,10 +103,15 @@ class TeamcocoIE(InfoExtractor):
        thumb {
          preview
        }
+       file {
+         url
+       }
        tags {
          name
        }
        duration
+       turnerMediaId
+       turnerMediaAuthToken
      }
    }
    ... on NotFoundSlug {
@@ -108,50 +125,65 @@ class TeamcocoIE(InfoExtractor):
        record = response['record']
        video_id = record['id']

-       srcs = self._graphql_call('''{
-  %s(id: "%s") {
-    src
-  }
-}''', 'RecordVideoSource', video_id)['src']
-
-       formats = []
-       get_quality = qualities(['low', 'sd', 'hd', 'uhd'])
-       for format_id, src in srcs.items():
-           if not isinstance(src, dict):
-               continue
-           src_url = src.get('src')
-           if not src_url:
-               continue
-           ext = determine_ext(src_url, mimetype2ext(src.get('type')))
-           if format_id == 'hls' or ext == 'm3u8':
-               # compat_urllib_parse.urljoin does not work here
-               if src_url.startswith('/'):
-                   src_url = 'http://ht.cdn.turner.com/tbs/big/teamcoco' + src_url
-               formats.extend(self._extract_m3u8_formats(
-                   src_url, video_id, 'mp4', m3u8_id=format_id, fatal=False))
-           else:
-               if src_url.startswith('/mp4:protected/'):
-                   # TODO Correct extraction for these files
-                   continue
-               tbr = int_or_none(self._search_regex(
-                   r'(\d+)k\.mp4', src_url, 'tbr', default=None))
-
-               formats.append({
-                   'url': src_url,
-                   'ext': ext,
-                   'tbr': tbr,
-                   'format_id': format_id,
-                   'quality': get_quality(format_id),
-               })
-       self._sort_formats(formats)
-
-       return {
+       info = {
            'id': video_id,
            'display_id': display_id,
-           'formats': formats,
            'title': record['title'],
            'thumbnail': record.get('thumb', {}).get('preview'),
            'description': record.get('teaser'),
            'duration': parse_duration(record.get('duration')),
            'timestamp': parse_iso8601(record.get('publishOn')),
        }
+
+       media_id = record.get('turnerMediaId')
+       if media_id:
+           self._initialize_geo_bypass({
+               'countries': ['US'],
+           })
+           info.update(self._extract_ngtv_info(media_id, {
+               'accessToken': record['turnerMediaAuthToken'],
+               'accessTokenType': 'jws',
+           }))
+       else:
+           video_sources = self._graphql_call('''{
+  %s(id: "%s") {
+    src
+  }
+}''', 'RecordVideoSource', video_id) or {}
+
+           formats = []
+           get_quality = qualities(['low', 'sd', 'hd', 'uhd'])
+           for format_id, src in video_sources.get('src', {}).items():
+               if not isinstance(src, dict):
+                   continue
+               src_url = src.get('src')
+               if not src_url:
+                   continue
+               ext = determine_ext(src_url, mimetype2ext(src.get('type')))
+               if format_id == 'hls' or ext == 'm3u8':
+                   # compat_urllib_parse.urljoin does not work here
+                   if src_url.startswith('/'):
+                       src_url = 'http://ht.cdn.turner.com/tbs/big/teamcoco' + src_url
+                   formats.extend(self._extract_m3u8_formats(
+                       src_url, video_id, 'mp4', m3u8_id=format_id, fatal=False))
+               else:
+                   if src_url.startswith('/mp4:protected/'):
+                       # TODO Correct extraction for these files
+                       continue
+                   tbr = int_or_none(self._search_regex(
+                       r'(\d+)k\.mp4', src_url, 'tbr', default=None))
+
+                   formats.append({
+                       'url': src_url,
+                       'ext': ext,
+                       'tbr': tbr,
+                       'format_id': format_id,
+                       'quality': get_quality(format_id),
+                   })
+           if not formats:
+               formats = self._extract_m3u8_formats(
+                   record['file']['url'], video_id, 'mp4', fatal=False)
+           self._sort_formats(formats)
+           info['formats'] = formats
+
+       return info
diff --git a/youtube_dl/extractor/ted.py b/youtube_dl/extractor/ted.py
index 06a27fd04..212ac80ab 100644
--- a/youtube_dl/extractor/ted.py
+++ b/youtube_dl/extractor/ted.py
@@ -7,8 +7,10 @@
from .common import InfoExtractor

from ..compat import compat_str
from ..utils import (
+   float_or_none,
    int_or_none,
    try_get,
+   url_or_none,
)
@@ -30,7 +32,7 @@ class TEDIE(InfoExtractor):
    '''
    _TESTS = [{
        'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
-       'md5': '0de43ac406aa3e4ea74b66c9c7789b13',
+       'md5': 'b0ce2b05ca215042124fbc9e3886493a',
        'info_dict': {
            'id': '102',
            'ext': 'mp4',
@@ -42,24 +44,30 @@ class TEDIE(InfoExtractor):
            'uploader': 'Dan Dennett',
            'width': 853,
            'duration': 1308,
-       }
-   }, {
-       'url': 'http://www.ted.com/watch/ted-institute/ted-bcg/vishal-sikka-the-beauty-and-power-of-algorithms',
-       'md5': 'b899ac15e345fb39534d913f7606082b',
-       'info_dict': {
-           'id': 'tSVI8ta_P4w',
-           'ext': 'mp4',
-           'title': 'Vishal Sikka: The beauty and power of algorithms',
-           'thumbnail': r're:^https?://.+\.jpg',
-           'description': 'md5:6261fdfe3e02f4f579cbbfc00aff73f4',
-           'upload_date': '20140122',
-           'uploader_id': 'TEDInstitute',
-           'uploader': 'TED Institute',
+           'view_count': int,
+           'comment_count': int,
+           'tags': list,
+       },
+       'params': {
+           'skip_download': True,
+       },
+   }, {
+       # missing HTTP bitrates
+       'url': 'https://www.ted.com/talks/vishal_sikka_the_beauty_and_power_of_algorithms',
+       'info_dict': {
+           'id': '6069',
+           'ext': 'mp4',
+           'title': 'The beauty and power of algorithms',
+           'thumbnail': r're:^https?://.+\.jpg',
+           'description': 'md5:734e352710fb00d840ab87ae31aaf688',
+           'uploader': 'Vishal Sikka',
+       },
+       'params': {
+           'skip_download': True,
        },
-       'add_ie': ['Youtube'],
    }, {
        'url': 'http://www.ted.com/talks/gabby_giffords_and_mark_kelly_be_passionate_be_courageous_be_your_best',
-       'md5': '71b3ab2f4233012dce09d515c9c39ce2',
+       'md5': 'e6b9617c01a7970ceac8bb2c92c346c0',
        'info_dict': {
            'id': '1972',
            'ext': 'mp4',
@@ -68,6 +76,9 @@ class TEDIE(InfoExtractor):
            'description': 'md5:5174aed4d0f16021b704120360f72b92',
            'duration': 1128,
        },
+       'params': {
+           'skip_download': True,
+       },
    }, {
        'url': 'http://www.ted.com/playlists/who_are_the_hackers',
        'info_dict': {
@@ -92,17 +103,17 @@ class TEDIE(InfoExtractor):
            'skip_download': True,
        },
    }, {
-       # YouTube video
-       'url': 'http://www.ted.com/talks/jeffrey_kluger_the_sibling_bond',
-       'add_ie': ['Youtube'],
+       # no nativeDownloads
+       'url': 'https://www.ted.com/talks/tom_thum_the_orchestra_in_my_mouth',
        'info_dict': {
-           'id': 'aFBIPO-P7LM',
+           'id': '1792',
            'ext': 'mp4',
-           'title': 'The hidden power of siblings: Jeff Kluger at TEDxAsheville',
-           'description': 'md5:3d7a4f50d95ca5dd67104e2a20f43fe1',
-           'uploader': 'TEDx Talks',
-           'uploader_id': 'TEDxTalks',
-           'upload_date': '20111216',
+           'title': 'The orchestra in my mouth',
+           'description': 'md5:5d1d78650e2f8dfcbb8ebee2951ac29a',
+           'uploader': 'Tom Thum',
+           'view_count': int,
+           'comment_count': int,
+           'tags': list,
        },
        'params': {
            'skip_download': True,
@@ -161,27 +172,16 @@ class TEDIE(InfoExtractor):

        info = self._extract_info(webpage)

-       talk_info = try_get(
-           info, lambda x: x['__INITIAL_DATA__']['talks'][0],
-           dict) or info['talks'][0]
+       data = try_get(info, lambda x: x['__INITIAL_DATA__'], dict) or info
+       talk_info = data['talks'][0]

        title = talk_info['title'].strip()

-       external = talk_info.get('external')
-       if external:
-           service = external['service']
-           self.to_screen('Found video from %s' % service)
-           ext_url = None
-           if service.lower() == 'youtube':
-               ext_url = external.get('code')
-           return {
-               '_type': 'url',
-               'url': ext_url or external['uri'],
-           }
-
        native_downloads = try_get(
-           talk_info, lambda x: x['downloads']['nativeDownloads'],
-           dict) or talk_info['nativeDownloads']
+           talk_info,
+           (lambda x: x['downloads']['nativeDownloads'],
+            lambda x: x['nativeDownloads']),
+           dict) or {}

        formats = [{
            'url': format_url,
@@ -196,10 +196,24 @@ class TEDIE(InfoExtractor):

        player_talk = talk_info['player_talks'][0]

+       external = player_talk.get('external')
+       if isinstance(external, dict):
+           service = external.get('service')
+           if isinstance(service, compat_str):
+               ext_url = None
+               if service.lower() == 'youtube':
+                   ext_url = external.get('code')
+               return {
+                   '_type': 'url',
+                   'url': ext_url or external['uri'],
+               }
+
        resources_ = player_talk.get('resources') or talk_info.get('resources')

        http_url = None
        for format_id, resources in resources_.items():
+           if not isinstance(resources, dict):
+               continue
            if format_id == 'h264':
                for resource in resources:
                    h264_url = resource.get('file')
@@ -228,8 +242,12 @@ class TEDIE(InfoExtractor):
                    'tbr': int_or_none(resource.get('bitrate')),
                })
            elif format_id == 'hls':
+               stream_url = url_or_none(resources.get('stream'))
+               if not stream_url:
+                   continue
                formats.extend(self._extract_m3u8_formats(
-                   resources.get('stream'), video_name, 'mp4', m3u8_id=format_id, fatal=False))
+                   stream_url, video_name, 'mp4', m3u8_id=format_id,
+                   fatal=False))

        m3u8_formats = list(filter(
            lambda f: f.get('protocol') == 'm3u8' and f.get('vcodec') != 'none',
@@ -239,9 +257,13 @@ class TEDIE(InfoExtractor):
                bitrate = self._search_regex(r'(\d+k)', m3u8_format['url'], 'bitrate', default=None)
                if not bitrate:
                    continue
+               bitrate_url = re.sub(r'\d+k', bitrate, http_url)
+               if not self._is_valid_url(
+                       bitrate_url, video_name, '%s bitrate' % bitrate):
+                   continue
                f = m3u8_format.copy()
                f.update({
-                   'url': re.sub(r'\d+k', bitrate, http_url),
+                   'url': bitrate_url,
                    'format_id': m3u8_format['format_id'].replace('hls', 'http'),
                    'protocol': 'http',
                })
@@ -267,7 +289,11 @@ class TEDIE(InfoExtractor):
            'description': self._og_search_description(webpage),
            'subtitles': self._get_subtitles(video_id, talk_info),
            'formats': formats,
-           'duration': talk_info.get('duration'),
+           'duration': float_or_none(talk_info.get('duration')),
+           'view_count': int_or_none(data.get('viewed_count')),
+           'comment_count': int_or_none(
+               try_get(data, lambda x: x['comments']['count'])),
+           'tags': try_get(talk_info, lambda x: x['tags'], list),
        }

    def _get_subtitles(self, video_id, talk_info):
diff --git a/youtube_dl/extractor/telecinco.py b/youtube_dl/extractor/telecinco.py
index fdcc7d573..d37e1b055 100644
--- a/youtube_dl/extractor/telecinco.py
+++ b/youtube_dl/extractor/telecinco.py
@@ -1,26 +1,43 @@
# coding: utf-8
from __future__ import unicode_literals

-from .mitele import MiTeleBaseIE
+import json
+import re
+
+from .common import InfoExtractor
+from .ooyala import OoyalaIE
+from ..utils import (
+   clean_html,
+   determine_ext,
+   int_or_none,
+   str_or_none,
+   urljoin,
+)


-class TelecincoIE(MiTeleBaseIE):
+class TelecincoIE(InfoExtractor):
    IE_DESC = 'telecinco.es, cuatro.com and mediaset.es'
    _VALID_URL = r'https?://(?:www\.)?(?:telecinco\.es|cuatro\.com|mediaset\.es)/(?:[^/]+/)+(?P<id>.+?)\.html'
    _TESTS = [{
        'url': 'http://www.telecinco.es/robinfood/temporada-01/t01xp14/Bacalao-cocochas-pil-pil_0_1876350223.html',
-       'md5': '8d7b2d5f699ee2709d992a63d5cd1712',
        'info_dict': {
-           'id': 'JEA5ijCnF6p5W08A1rNKn7',
-           'ext': 'mp4',
+           'id': '1876350223',
            'title': 'Bacalao con kokotxas al pil-pil',
            'description': 'md5:1382dacd32dd4592d478cbdca458e5bb',
-           'duration': 662,
        },
+       'playlist': [{
+           'md5': 'adb28c37238b675dad0f042292f209a7',
+           'info_dict': {
+               'id': 'JEA5ijCnF6p5W08A1rNKn7',
+               'ext': 'mp4',
+               'title': 'Con Martín Berasategui, hacer un bacalao al pil-pil es fácil y divertido',
+               'duration': 662,
+           },
+       }]
    }, {
        'url': 'http://www.cuatro.com/deportes/futbol/barcelona/Leo_Messi-Champions-Roma_2_2052780128.html',
-       'md5': '284393e5387b3b947b77c613ef04749a',
+       'md5': '9468140ebc300fbb8b9d65dc6e5c4b43',
        'info_dict': {
            'id': 'jn24Od1zGLG4XUZcnUnZB6',
            'ext': 'mp4',
@@ -30,7 +47,7 @@ class TelecincoIE(MiTeleBaseIE):
        },
    }, {
        'url': 'http://www.mediaset.es/12meses/campanas/doylacara/conlatratanohaytrato/Ayudame-dar-cara-trata-trato_2_1986630220.html',
-       'md5': '749afab6ea5a136a8806855166ae46a2',
+       'md5': 'ae2dc6b7b50b2392076a51c0f70e01f6',
        'info_dict': {
            'id': 'aywerkD2Sv1vGNqq9b85Q2',
            'ext': 'mp4',
@@ -50,17 +67,90 @@ class TelecincoIE(MiTeleBaseIE):
        'only_matching': True,
    }]

+   def _parse_content(self, content, url):
+       video_id = content['dataMediaId']
+       if content.get('dataCmsId') == 'ooyala':
+           return self.url_result(
+               'ooyala:%s' % video_id, OoyalaIE.ie_key(), video_id)
+       config_url = urljoin(url, content['dataConfig'])
+       config = self._download_json(
+           config_url, video_id, 'Downloading config JSON')
+       title = config['info']['title']
+
+       def mmc_url(mmc_type):
+           return re.sub(
+               r'/(?:flash|html5)\.json', '/%s.json' % mmc_type,
+               config['services']['mmc'])
+
+       duration = None
+       formats = []
+       for mmc_type in ('flash', 'html5'):
+           mmc = self._download_json(
+               mmc_url(mmc_type), video_id,
+               'Downloading %s mmc JSON' % mmc_type, fatal=False)
+           if not mmc:
+               continue
+           if not duration:
+               duration = int_or_none(mmc.get('duration'))
+           for location in mmc['locations']:
+               gat = self._proto_relative_url(location.get('gat'), 'http:')
+               gcp = location.get('gcp')
+               ogn = location.get('ogn')
+               if None in (gat, gcp, ogn):
+                   continue
+               token_data = {
+                   'gcp': gcp,
+                   'ogn': ogn,
+                   'sta': 0,
+               }
+               media = self._download_json(
+                   gat, video_id, data=json.dumps(token_data).encode('utf-8'),
+                   headers={
+                       'Content-Type': 'application/json;charset=utf-8',
+                       'Referer': url,
+                   }, fatal=False) or {}
+               stream = media.get('stream') or media.get('file')
+               if not stream:
+                   continue
+               ext = determine_ext(stream)
+               if ext == 'f4m':
+                   formats.extend(self._extract_f4m_formats(
+                       stream + '&hdcore=3.2.0&plugin=aasp-3.2.0.77.18',
+                       video_id, f4m_id='hds', fatal=False))
+               elif ext == 'm3u8':
+                   formats.extend(self._extract_m3u8_formats(
+                       stream, video_id, 'mp4', 'm3u8_native',
+                       m3u8_id='hls', fatal=False))
+       self._sort_formats(formats)
+
+       return {
+           'id': video_id,
+           'title': title,
+           'formats': formats,
+           'thumbnail': content.get('dataPoster') or config.get('poster', {}).get('imageUrl'),
+           'duration': duration,
+       }
+
    def _real_extract(self, url):
        display_id = self._match_id(url)
        webpage = self._download_webpage(url, display_id)
-       title = self._html_search_meta(
-           ['og:title', 'twitter:title'], webpage, 'title')
-       info = self._get_player_info(url, webpage)
+       article = self._parse_json(self._search_regex(
+           r'window\.\$REACTBASE_STATE\.article\s*=\s*({.+})',
+           webpage, 'article'), display_id)['article']
+       title = article.get('title')
+       description = clean_html(article.get('leadParagraph'))
+       if article.get('editorialType') != 'VID':
+           entries = []
+           for p in article.get('body', []):
+               content = p.get('content')
+               if p.get('type') != 'video' or not content:
+                   continue
+               entries.append(self._parse_content(content, url))
+           return self.playlist_result(
+               entries, str_or_none(article.get('id')), title, description)
+       content = article['opening']['content']
+       info = self._parse_content(content, url)
        info.update({
-           'display_id': display_id,
-           'title': title,
-           'description': self._html_search_meta(
-               ['og:description', 'twitter:description'],
-               webpage, 'title', fatal=False),
+           'description': description,
        })
        return info
diff --git a/youtube_dl/extractor/tennistv.py b/youtube_dl/extractor/tennistv.py
index 0c6f70784..a586f30ad 100644
--- a/youtube_dl/extractor/tennistv.py
+++ b/youtube_dl/extractor/tennistv.py
@@ -32,7 +32,7 @@ class TennisTVIE(InfoExtractor):
    _NETRC_MACHINE = 'tennistv'

    def _login(self):
-       (username, password) = self._get_login_info()
+       username, password = self._get_login_info()
        if not username or not password:
            raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
diff --git a/youtube_dl/extractor/tf1.py b/youtube_dl/extractor/tf1.py
index e595c4a69..903f47380 100644
--- a/youtube_dl/extractor/tf1.py
+++ b/youtube_dl/extractor/tf1.py
@@ -19,6 +19,7 @@ class TF1IE(InfoExtractor):
            # Sometimes wat serves the whole file with the --test option
            'skip_download': True,
        },
+       'expected_warnings': ['HTTP Error 404'],
    }, {
        'url': 'http://www.tfou.fr/chuggington/videos/le-grand-mysterioso-chuggington-7085291-739.html',
        'info_dict': {
diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py
index b1a985ff6..ffef5bf06 100644
--- a/youtube_dl/extractor/theplatform.py
+++ b/youtube_dl/extractor/theplatform.py
@@ -32,13 +32,15 @@
_x = lambda p: xpath_with_ns(p, {'smil': default_ns})

class ThePlatformBaseIE(OnceIE):
+   _TP_TLD = 'com'
+
    def _extract_theplatform_smil(self, smil_url, video_id, note='Downloading SMIL data'):
        meta = self._download_xml(
            smil_url, video_id, note=note, query={'format': 'SMIL'},
            headers=self.geo_verification_headers())
        error_element = find_xpath_attr(meta, _x('.//smil:ref'), 'src')
        if error_element is not None and error_element.attrib['src'].startswith(
-               'http://link.theplatform.com/s/errorFiles/Unavailable.'):
+               'http://link.theplatform.%s/s/errorFiles/Unavailable.' % self._TP_TLD):
            raise ExtractorError(error_element.attrib['abstract'], expected=True)

        smil_formats = self._parse_smil_formats(
@@ -66,7 +68,7 @@ class ThePlatformBaseIE(OnceIE):
        return formats, subtitles

    def _download_theplatform_metadata(self, path, video_id):
-       info_url = 'http://link.theplatform.com/s/%s?format=preview' % path
+       info_url = 'http://link.theplatform.%s/s/%s?format=preview' % (self._TP_TLD, path)
        return self._download_json(info_url, video_id)

    def _parse_theplatform_metadata(self, info):
@@ -308,7 +310,7 @@ class ThePlatformIE(ThePlatformBaseIE, AdobePassIE):

class ThePlatformFeedIE(ThePlatformBaseIE):
    _URL_TEMPLATE = '%s//feed.theplatform.com/f/%s/%s?form=json&%s'
-   _VALID_URL = r'https?://feed\.theplatform\.com/f/(?P<provider_id>[^/]+)/(?P<feed_id>[^?/]+)\?(?:[^&]+&)*(?P<filter>by(?:Gui|I)d=(?P<id>[\w-]+))'
+   _VALID_URL = r'https?://feed\.theplatform\.com/f/(?P<provider_id>[^/]+)/(?P<feed_id>[^?/]+)\?(?:[^&]+&)*(?P<filter>by(?:Gui|I)d=(?P<id>[^&]+))'
    _TESTS = [{
        # From http://player.theplatform.com/p/7wvmTC/MSNBCEmbeddedOffSite?guid=n_hardball_5biden_140207
        'url': 'http://feed.theplatform.com/f/7wvmTC/msnbc_video-p-test?form=json&pretty=true&range=-40&byGuid=n_hardball_5biden_140207',
@@ -325,6 +327,9 @@ class ThePlatformFeedIE(ThePlatformBaseIE):
            'categories': ['MSNBC/Issues/Democrats', 'MSNBC/Issues/Elections/Election 2016'],
            'uploader': 'NBCU-NEWS',
        },
+   }, {
+       'url': 'http://feed.theplatform.com/f/2E2eJC/nnd_NBCNews?byGuid=nn_netcast_180306.Copy.01',
+       'only_matching': True,
    }]

    def _extract_feed_info(self, provider_id, feed_id, filter_query, video_id, custom_fields=None, asset_types_query={}, account_id=None):
diff --git a/youtube_dl/extractor/tubitv.py b/youtube_dl/extractor/tubitv.py
index 36f6c1673..a51fa6515 100644
--- a/youtube_dl/extractor/tubitv.py
+++ b/youtube_dl/extractor/tubitv.py
@@ -36,7 +36,7 @@ class TubiTvIE(InfoExtractor):
    }]

    def _login(self):
-       (username, password) = self._get_login_info()
+       username, password = self._get_login_info()
        if username is None:
            return
        self.report_login()
diff --git a/youtube_dl/extractor/tumblr.py b/youtube_dl/extractor/tumblr.py
index 786143525..edbb0aa69 100644
--- a/youtube_dl/extractor/tumblr.py
+++ b/youtube_dl/extractor/tumblr.py
@@ -4,11 +4,18 @@ from __future__ import unicode_literals
import re

from .common import InfoExtractor
-from ..utils import int_or_none
+from ..compat import compat_str
+from ..utils import (
+   ExtractorError,
+   int_or_none,
+   urlencode_postdata
+)


class TumblrIE(InfoExtractor):
    _VALID_URL = r'https?://(?P<blog_name>[^/?#&]+)\.tumblr\.com/(?:post|video)/(?P<id>[0-9]+)(?:$|[/?#])'
+   _NETRC_MACHINE = 'tumblr'
+   _LOGIN_URL = 'https://www.tumblr.com/login'
    _TESTS = [{
        'url': 'http://tatianamaslanydaily.tumblr.com/post/54196191430/orphan-black-dvd-extra-behind-the-scenes',
        'md5': '479bb068e5b16462f5176a6828829767',
@@ -97,6 +104,45 @@ class TumblrIE(InfoExtractor):
        'add_ie': ['Instagram'],
    }]

+   def _real_initialize(self):
+       self._login()
+
+   def _login(self):
+       username, password = self._get_login_info()
+       if username is None:
+           return
+
+       login_page = self._download_webpage(
+           self._LOGIN_URL, None, 'Downloading login page')
+
+       login_form = self._hidden_inputs(login_page)
+       login_form.update({
+           'user[email]': username,
+           'user[password]': password
+       })
+
+       response, urlh = self._download_webpage_handle(
+           self._LOGIN_URL, None, 'Logging in',
+           data=urlencode_postdata(login_form), headers={
+               'Content-Type': 'application/x-www-form-urlencoded',
+               'Referer': self._LOGIN_URL,
+           })
+
+       # Successful login
+       if '/dashboard' in urlh.geturl():
+           return
+
+       login_errors = self._parse_json(
+           self._search_regex(
+               r'RegistrationForm\.errors\s*=\s*(\[.+?\])\s*;', response,
+               'login errors', default='[]'),
+           None, fatal=False)
+       if login_errors:
+           raise ExtractorError(
+               'Unable to login: %s' % login_errors[0], expected=True)
+
+       self.report_warning('Login has probably failed')
+
    def _real_extract(self, url):
        m_url = re.match(self._VALID_URL, url)
        video_id = m_url.group('id')
@@ -105,11 +151,19 @@ class TumblrIE(InfoExtractor):
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage, urlh = self._download_webpage_handle(url, video_id)

+       redirect_url = compat_str(urlh.geturl())
+       if 'tumblr.com/safe-mode' in redirect_url or redirect_url.startswith('/safe-mode'):
+           raise ExtractorError(
+               'This Tumblr may contain sensitive media. '
+               'Disable safe mode in your account settings '
+               'at https://www.tumblr.com/settings/account#safe_mode',
+               expected=True)
+
        iframe_url = self._search_regex(
            r'src=\'(https?://www\.tumblr\.com/video/[^\']+)\'',
            webpage, 'iframe url', default=None)
        if iframe_url is None:
-           return self.url_result(urlh.geturl(), 'Generic')
+           return self.url_result(redirect_url, 'Generic')

        iframe = self._download_webpage(iframe_url, video_id,
                                        'Downloading iframe page')
diff --git a/youtube_dl/extractor/turner.py b/youtube_dl/extractor/turner.py
index e73b64aeb..4a6cbfbb8 100644
--- a/youtube_dl/extractor/turner.py
+++ b/youtube_dl/extractor/turner.py
@@ -9,11 +9,13 @@ from ..utils import (
    xpath_text,
    int_or_none,
    determine_ext,
+   float_or_none,
    parse_duration,
    xpath_attr,
    update_url_query,
    ExtractorError,
    strip_or_none,
+   url_or_none,
)


@@ -23,14 +25,17 @@ class TurnerBaseIE(AdobePassIE):
    def _extract_timestamp(self, video_data):
        return int_or_none(xpath_attr(video_data, 'dateCreated', 'uts'))

-   def _add_akamai_spe_token(self, tokenizer_src, video_url, content_id, ap_data):
+   def _add_akamai_spe_token(self, tokenizer_src, video_url, content_id, ap_data, custom_tokenizer_query=None):
        secure_path = self._search_regex(r'https?://[^/]+(.+/)', video_url, 'secure path') + '*'
        token = self._AKAMAI_SPE_TOKEN_CACHE.get(secure_path)
        if not token:
            query = {
                'path': secure_path,
-               'videoId': content_id,
            }
+           if custom_tokenizer_query:
+               query.update(custom_tokenizer_query)
+           else:
+               query['videoId'] = content_id
            if ap_data.get('auth_required'):
                query['accessToken'] = self._extract_mvpd_auth(ap_data['url'], content_id, ap_data['site_name'], ap_data['site_name'])
            auth = self._download_xml(
@@ -150,8 +155,8 @@ class TurnerBaseIE(AdobePassIE):
        subtitles = {}
        for source in video_data.findall('closedCaptions/source'):
            for track in source.findall('track'):
-               track_url = track.get('url')
-               if not isinstance(track_url, compat_str) or track_url.endswith('/big'):
+               track_url = url_or_none(track.get('url'))
+               if not track_url or track_url.endswith('/big'):
                    continue
                lang = track.get('lang') or track.get('label') or 'en'
                subtitles.setdefault(lang, []).append({
@@ -188,3 +193,42 @@ class TurnerBaseIE(AdobePassIE):
            'episode_number': int_or_none(xpath_text(video_data, 'episodeNumber')),
            'is_live': is_live,
        }
+
+   def _extract_ngtv_info(self, media_id, tokenizer_query, ap_data=None):
+       streams_data = self._download_json(
+           'http://medium.ngtv.io/media/%s/tv' % media_id,
+           media_id)['media']['tv']
+       duration = None
+       chapters = []
+       formats = []
+       for supported_type in ('unprotected', 'bulkaes'):
+           stream_data = streams_data.get(supported_type, {})
+           m3u8_url = stream_data.get('secureUrl') or stream_data.get('url')
+           if not m3u8_url:
+               continue
+           if stream_data.get('playlistProtection') == 'spe':
+               m3u8_url = self._add_akamai_spe_token(
+                   'http://token.ngtv.io/token/token_spe',
+                   m3u8_url, media_id, ap_data or {}, tokenizer_query)
+           formats.extend(self._extract_m3u8_formats(
+               m3u8_url, media_id, 'mp4', m3u8_id='hls', fatal=False))
+
+           duration = float_or_none(stream_data.get('totalRuntime'))
+
+           if not chapters:
+               for chapter in stream_data.get('contentSegments', []):
+                   start_time = float_or_none(chapter.get('start'))
+                   chapter_duration = float_or_none(chapter.get('duration'))
+                   if start_time is None or chapter_duration is None:
+                       continue
+                   chapters.append({
+                       'start_time': start_time,
+                       'end_time': start_time + chapter_duration,
+                   })
+       self._sort_formats(formats)
+
+       return {
+           'formats': formats,
+           'chapters': chapters,
+           'duration': duration,
+       }
diff --git a/youtube_dl/extractor/tv4.py b/youtube_dl/extractor/tv4.py
index cfcce020a..51923e44a 100644
--- a/youtube_dl/extractor/tv4.py
+++ b/youtube_dl/extractor/tv4.py
@@ -1,13 +1,12 @@
# coding: utf-8
from __future__ import unicode_literals

+import re
+
from .common import InfoExtractor
-from ..compat import compat_str
from ..utils import (
    int_or_none,
    parse_iso8601,
-   try_get,
-   determine_ext,
)


@@ -78,42 +77,25 @@ class TV4IE(InfoExtractor):

        title = info['title']

-       subtitles = {}
-       formats = []
-       # http formats are linked with unresolvable host
-       for kind in ('hls3', ''):
-           data = self._download_json(
-               'https://prima.tv4play.se/api/web/asset/%s/play.json' % video_id,
-               video_id, 'Downloading sources JSON', query={
-                   'protocol': kind,
-                   'videoFormat': 'MP4+WEBVTT',
-               })
-           items = try_get(data, lambda x: x['playback']['items']['item'])
-           if not items:
-               continue
-           if isinstance(items, dict):
-               items = [items]
-           for item in items:
-               manifest_url = item.get('url')
-               if not isinstance(manifest_url, compat_str):
-                   continue
-               ext = determine_ext(manifest_url)
-               if ext == 'm3u8':
-                   formats.extend(self._extract_m3u8_formats(
-                       manifest_url, video_id, 'mp4', entry_protocol='m3u8_native',
-                       m3u8_id=kind, fatal=False))
-               elif ext == 'f4m':
-                   formats.extend(self._extract_akamai_formats(
-                       manifest_url, video_id, {
-                           'hls': 'tv4play-i.akamaihd.net',
-                       }))
-               elif ext == 'webvtt':
-                   subtitles = self._merge_subtitles(
-                       subtitles, {
-                           'sv': [{
-                               'url': manifest_url,
-                               'ext': 'vtt',
-                           }]})
+       manifest_url = self._download_json(
+           'https://playback-api.b17g.net/media/' + video_id,
+           video_id, query={
+               'service': 'tv4',
+               'device': 'browser',
+               'protocol': 'hls',
+           })['playbackItem']['manifestUrl']
+       formats = self._extract_m3u8_formats(
+           manifest_url, video_id, 'mp4',
+           'm3u8_native', m3u8_id='hls', fatal=False)
+       formats.extend(self._extract_mpd_formats(
+           manifest_url.replace('.m3u8', '.mpd'),
+           video_id, mpd_id='dash', fatal=False))
+       formats.extend(self._extract_f4m_formats(
+           manifest_url.replace('.m3u8', '.f4m'),
+           video_id, f4m_id='hds', fatal=False))
+       formats.extend(self._extract_ism_formats(
+           re.sub(r'\.ism/.+?\.m3u8', r'.ism/Manifest', manifest_url),
+           video_id, ism_id='mss', fatal=False))

        if not formats and info.get('is_geo_restricted'):
            self.raise_geo_restricted(countries=self._GEO_COUNTRIES)
@@ -124,7 +106,7 @@ class TV4IE(InfoExtractor):
            'id': video_id,
            'title': title,
            'formats': formats,
-           'subtitles': subtitles,
+           # 'subtitles': subtitles,
            'description': info.get('description'),
            'timestamp': parse_iso8601(info.get('broadcast_date_time')),
            'duration': int_or_none(info.get('duration')),
diff --git a/youtube_dl/extractor/tvnet.py b/youtube_dl/extractor/tvnet.py
new file mode 100644
index 000000000..4222ff9ee
--- /dev/null
+++ b/youtube_dl/extractor/tvnet.py
@@ -0,0 +1,147 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+   int_or_none,
+   unescapeHTML,
+   url_or_none,
+)
+
+
+class TVNetIE(InfoExtractor):
+   _VALID_URL = r'https?://(?:[^/]+)\.tvnet\.gov\.vn/[^/]+/(?:\d+/)?(?P<id>\d+)(?:/|$)'
+   _TESTS = [{
+       # video
+       'url': 'http://de.tvnet.gov.vn/video/109788/vtv1---bac-tuyet-tai-lao-cai-va-ha-giang/tin-nong-24h',
+       'md5': 'b4d7abe0252c9b47774760b7519c7558',
+       'info_dict': {
+           'id': '109788',
+           'ext': 'mp4',
+           'title': 'VTV1 - Bắc tuyết tại Lào Cai và Hà Giang',
+           'thumbnail': r're:(?i)https?://.*\.(?:jpg|png)',
+           'is_live': False,
+           'view_count': int,
+       },
+   }, {
+       # audio
+       'url': 'http://vn.tvnet.gov.vn/radio/27017/vov1---ban-tin-chieu-10062018/doi-song-va-xa-hoi',
+       'md5': 'b5875ce9b0a2eecde029216d0e6db2ae',
+       'info_dict': {
+           'id': '27017',
+           'ext': 'm4a',
+           'title': 'VOV1 - Bản tin chiều (10/06/2018)',
+           'thumbnail': r're:(?i)https?://.*\.(?:jpg|png)',
+           'is_live': False,
+       },
+   }, {
+       'url': 'http://us.tvnet.gov.vn/video/118023/129999/ngay-0705',
+       'info_dict': {
+           'id': '129999',
+           'ext': 'mp4',
+           'title': 'VTV1 - Quốc hội với cử tri (11/06/2018)',
+           'thumbnail': r're:(?i)https?://.*\.(?:jpg|png)',
+           'is_live': False,
+       },
+       'params': {
+           'skip_download': True,
+       },
+   }, {
+       # live stream
+       'url': 'http://us.tvnet.gov.vn/kenh-truyen-hinh/1011/vtv1',
+       'info_dict': {
+           'id': '1011',
+           'ext': 'mp4',
+           'title': r're:^VTV1 \| LiveTV [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+           'thumbnail': r're:(?i)https?://.*\.(?:jpg|png)',
+           'is_live': True,
+       },
+       'params': {
+           'skip_download': True,
+       },
+   }, {
+       # radio live stream
+       'url': 'http://vn.tvnet.gov.vn/kenh-truyen-hinh/1014',
+       'info_dict': {
+           'id': '1014',
+           'ext': 'm4a',
+           'title': r're:VOV1 \| LiveTV [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+           'thumbnail': r're:(?i)https?://.*\.(?:jpg|png)',
+           'is_live': True,
+       },
+       'params': {
+           'skip_download': True,
+       },
+   }, {
+       'url': 'http://us.tvnet.gov.vn/phim/6136/25510/vtv3---ca-mot-doi-an-oan-tap-1-50/phim-truyen-hinh',
+       'only_matching': True,
+   }]
+
+   def _real_extract(self, url):
+       video_id = self._match_id(url)
+
+       webpage = self._download_webpage(url, video_id)
+
+       title = self._og_search_title(
+           webpage, default=None) or self._html_search_meta(
+           'title', webpage, default=None) or self._search_regex(
+           r'<title>([^<]+)<', webpage, 'title')
+       title = re.sub(r'\s*-\s*TV Net\s*$', '', title)
+
+       if '/video/' in url or '/radio/' in url:
+           is_live = False
+       elif '/kenh-truyen-hinh/' in url:
+           is_live = True
+       else:
+           is_live = None
+
+       data_file = unescapeHTML(self._search_regex(
+           r'data-file=(["\'])(?P<url>(?:https?:)?//.+?)\1', webpage,
+           'data file', group='url'))
+
+       stream_urls = set()
+       formats = []
+       for stream in self._download_json(data_file, video_id):
+           if not isinstance(stream, dict):
+               continue
+           stream_url = url_or_none(stream.get('url'))
+           if stream_url in stream_urls or not stream_url:
+               continue
+           stream_urls.add(stream_url)
+           formats.extend(self._extract_m3u8_formats(
+               stream_url, video_id, 'mp4',
+               entry_protocol='m3u8' if is_live else 'm3u8_native',
+               m3u8_id='hls', fatal=False))
+       self._sort_formats(formats)
+
+       # better support for radio streams
+       if title.startswith('VOV'):
+           for f in formats:
+               f.update({
+                   'ext': 'm4a',
+                   'vcodec': 'none',
+               })
+
+       thumbnail = self._og_search_thumbnail(
+           webpage, default=None) or unescapeHTML(
+           self._search_regex(
+               r'data-image=(["\'])(?P<url>(?:https?:)?//.+?)\1', webpage,
+               'thumbnail', default=None, group='url'))
+
+       if is_live:
+           title = self._live_title(title)
+
+       view_count = int_or_none(self._search_regex(
+           r'(?s)<div[^>]+\bclass=["\'].*?view-count[^>]+>.*?(\d+).*?</div>',
+           webpage, 'view count', default=None))
+
+       return {
+           'id': video_id,
+           'title': title,
+           'thumbnail': thumbnail,
+           'is_live': is_live,
+           'view_count': view_count,
+           'formats': formats,
+       }
diff --git a/youtube_dl/extractor/tvnow.py b/youtube_dl/extractor/tvnow.py
index 808571ece..60937616f 100644
--- a/youtube_dl/extractor/tvnow.py
+++ b/youtube_dl/extractor/tvnow.py
@@ -19,8 +19,8 @@ class TVNowBaseIE(InfoExtractor):
    _VIDEO_FIELDS = (
        'id', 'title', 'free', 'geoblocked', 'articleLong', 'articleShort',
'broadcastStartDate', 'isDrm', 'duration', 'season', 'episode', - 'manifest.dashclear', 'format.title', 'format.defaultImage169Format', - 'format.defaultImage169Logo') + 'manifest.dashclear', 'manifest.hlsclear', 'manifest.smoothclear', + 'format.title', 'format.defaultImage169Format', 'format.defaultImage169Logo') def _call_api(self, path, video_id, query): return self._download_json( @@ -31,27 +31,42 @@ class TVNowBaseIE(InfoExtractor): video_id = compat_str(info['id']) title = info['title'] - mpd_url = info['manifest']['dashclear'] - if not mpd_url: + paths = [] + for manifest_url in (info.get('manifest') or {}).values(): + if not manifest_url: + continue + manifest_url = update_url_query(manifest_url, {'filter': ''}) + path = self._search_regex(r'https?://[^/]+/(.+?)\.ism/', manifest_url, 'path') + if path in paths: + continue + paths.append(path) + + def url_repl(proto, suffix): + return re.sub( + r'(?:hls|dash|hss)([.-])', proto + r'\1', re.sub( + r'\.ism/(?:[^.]*\.(?:m3u8|mpd)|[Mm]anifest)', + '.ism/' + suffix, manifest_url)) + + formats = self._extract_mpd_formats( + url_repl('dash', '.mpd'), video_id, + mpd_id='dash', fatal=False) + formats.extend(self._extract_ism_formats( + url_repl('hss', 'Manifest'), + video_id, ism_id='mss', fatal=False)) + formats.extend(self._extract_m3u8_formats( + url_repl('hls', '.m3u8'), video_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False)) + if formats: + break + else: if info.get('isDrm'): raise ExtractorError( 'Video %s is DRM protected' % video_id, expected=True) if info.get('geoblocked'): - raise ExtractorError( - 'Video %s is not available from your location due to geo restriction' % video_id, - expected=True) + raise self.raise_geo_restricted() if not info.get('free', True): raise ExtractorError( 'Video %s is not available for free' % video_id, expected=True) - - mpd_url = update_url_query(mpd_url, {'filter': ''}) - formats = self._extract_mpd_formats(mpd_url, video_id, mpd_id='dash', fatal=False) - formats.extend(self._extract_ism_formats( - mpd_url.replace('dash.', 'hss.').replace('/.mpd', '/Manifest'), - video_id, ism_id='mss', fatal=False)) - formats.extend(self._extract_m3u8_formats( - mpd_url.replace('dash.', 'hls.').replace('/.mpd', '/.m3u8'), - video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) self._sort_formats(formats) description = info.get('articleLong') or info.get('articleShort') @@ -88,7 +103,7 @@ class TVNowBaseIE(InfoExtractor): class TVNowIE(TVNowBaseIE): _VALID_URL = r'''(?x) https?:// - (?:www\.)?tvnow\.(?:de|at|ch)/[^/]+/ + (?:www\.)?tvnow\.(?:de|at|ch)/(?P<station>[^/]+)/ (?P<show_id>[^/]+)/ (?!(?:list|jahr)(?:/|$))(?P<id>[^/?\#&]+) ''' @@ -140,11 +155,13 @@ class TVNowIE(TVNowBaseIE): }] def _real_extract(self, url): - display_id = '%s/%s' % re.match(self._VALID_URL, url).groups() + mobj = re.match(self._VALID_URL, url) + display_id = '%s/%s' % mobj.group(2, 3) info = self._call_api( 'movies/' + display_id, display_id, query={ 'fields': ','.join(self._VIDEO_FIELDS), + 'station': mobj.group(1), }) return self._extract_video(info, display_id) diff --git a/youtube_dl/extractor/tvplay.py b/youtube_dl/extractor/tvplay.py index e09b5f804..d3adab457 100644 --- a/youtube_dl/extractor/tvplay.py +++ b/youtube_dl/extractor/tvplay.py @@ -19,6 +19,7 @@ from ..utils import ( try_get, unsmuggle_url, update_url_query, + url_or_none, ) @@ -255,7 +256,8 @@ class TVPlayIE(InfoExtractor): quality = qualities(['hls', 'medium', 'high']) formats = [] for format_id, video_url in streams.get('streams', {}).items(): - if not 
video_url or not isinstance(video_url, compat_str): + video_url = url_or_none(video_url) + if not video_url: continue ext = determine_ext(video_url) if ext == 'f4m': diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index 3ee2af52e..b39972b1e 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -4,10 +4,10 @@ from __future__ import unicode_literals import itertools import re import random +import json from .common import InfoExtractor from ..compat import ( - compat_HTTPError, compat_kwargs, compat_parse_qs, compat_str, @@ -26,7 +26,7 @@ from ..utils import ( try_get, unified_timestamp, update_url_query, - urlencode_postdata, + url_or_none, urljoin, ) @@ -36,8 +36,9 @@ class TwitchBaseIE(InfoExtractor): _API_BASE = 'https://api.twitch.tv' _USHER_BASE = 'https://usher.ttvnw.net' - _LOGIN_URL = 'https://www.twitch.tv/login' - _CLIENT_ID = 'jzkbprff40iqj646a697cyrvl0zt2m6' + _LOGIN_FORM_URL = 'https://www.twitch.tv/login' + _LOGIN_POST_URL = 'https://passport.twitch.tv/login' + _CLIENT_ID = 'kimne78kx3ncx6brgo4mv6wki5h1ko' _NETRC_MACHINE = 'twitch' def _handle_error(self, response): @@ -61,7 +62,7 @@ class TwitchBaseIE(InfoExtractor): self._login() def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: return @@ -76,22 +77,21 @@ class TwitchBaseIE(InfoExtractor): page_url = urlh.geturl() post_url = self._search_regex( r'<form[^>]+action=(["\'])(?P<url>.+?)\1', page, - 'post url', default=page_url, group='url') + 'post url', default=self._LOGIN_POST_URL, group='url') post_url = urljoin(page_url, post_url) - headers = {'Referer': page_url} + headers = { + 'Referer': page_url, + 'Origin': page_url, + 'Content-Type': 'text/plain;charset=UTF-8', + } - try: - response = self._download_json( - post_url, None, note, - data=urlencode_postdata(form), - headers=headers) - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400: - response = self._parse_json( - e.cause.read().decode('utf-8'), None) - fail(response.get('message') or response['errors'][0]) - raise + response = self._download_json( + post_url, None, note, data=json.dumps(form).encode(), + headers=headers, expected_status=400) + error = response.get('error_description') or response.get('error_code') + if error: + fail(error) if 'Authenticated successfully' in response.get('message', ''): return None, None @@ -104,7 +104,7 @@ class TwitchBaseIE(InfoExtractor): headers=headers) login_page, handle = self._download_webpage_handle( - self._LOGIN_URL, None, 'Downloading login page') + self._LOGIN_FORM_URL, None, 'Downloading login page') # Some TOR nodes and public proxies are blocked completely if 'blacklist_message' in login_page: @@ -114,6 +114,7 @@ class TwitchBaseIE(InfoExtractor): login_page, handle, 'Logging in', { 'username': username, 'password': password, + 'client_id': self._CLIENT_ID, }) # Successful login @@ -239,7 +240,7 @@ class TwitchVodIE(TwitchItemBaseIE): _VALID_URL = r'''(?x) https?:// (?: - (?:(?:www|go|m)\.)?twitch\.tv/(?:[^/]+/v|videos)/| + (?:(?:www|go|m)\.)?twitch\.tv/(?:[^/]+/v(?:ideo)?|videos)/| player\.twitch\.tv/\?.*?\bvideo=v ) (?P<id>\d+) @@ -295,6 +296,9 @@ class TwitchVodIE(TwitchItemBaseIE): }, { 'url': 'https://m.twitch.tv/beagsandjam/v/247478721', 'only_matching': True, + }, { + 'url': 'https://www.twitch.tv/northernlion/video/291940395', + 'only_matching': True, }] def _real_extract(self, url): @@ -663,8 +667,8 @@ class 
TwitchClipsIE(TwitchBaseIE): for option in status['quality_options']: if not isinstance(option, dict): continue - source = option.get('source') - if not source or not isinstance(source, compat_str): + source = url_or_none(option.get('source')) + if not source: continue formats.append({ 'url': source, diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py index d7e425041..de41065d6 100644 --- a/youtube_dl/extractor/twitter.py +++ b/youtube_dl/extractor/twitter.py @@ -63,7 +63,7 @@ class TwitterCardIE(TwitterBaseIE): 'id': '623160978427936768', 'ext': 'mp4', 'title': 'Twitter web player', - 'thumbnail': r're:^https?://.*(?:\bformat=|\.)jpg', + 'thumbnail': r're:^https?://.*$', }, }, { @@ -108,6 +108,8 @@ class TwitterCardIE(TwitterBaseIE): }, ] + _API_BASE = 'https://api.twitter.com/1.1' + def _parse_media_info(self, media_info, video_id): formats = [] for media_variant in media_info.get('variants', []): @@ -149,7 +151,7 @@ class TwitterCardIE(TwitterBaseIE): main_script, 'bearer token') # https://developer.twitter.com/en/docs/tweets/post-and-engage/api-reference/get-statuses-show-id api_data = self._download_json( - 'https://api.twitter.com/1.1/statuses/show/%s.json' % video_id, + '%s/statuses/show/%s.json' % (self._API_BASE, video_id), video_id, 'Downloading API data', headers={ 'Authorization': 'Bearer ' + bearer_token, @@ -223,15 +225,49 @@ class TwitterCardIE(TwitterBaseIE): formats.extend(self._extract_mobile_formats(username, video_id)) if formats: + title = self._search_regex(r'<title>([^<]+)', webpage, 'title') + thumbnail = config.get('posterImageUrl') or config.get('image_src') + duration = float_or_none(config.get('duration'), scale=1000) or duration break + if not formats: + headers = { + 'Authorization': 'Bearer AAAAAAAAAAAAAAAAAAAAAPYXBAAAAAAACLXUNDekMxqa8h%2F40K4moUkGsoc%3DTYfbDKbT3jJPCEVnMYqilB28NHfOPqkca3qaAxGfsyKCs0wRbw', + 'Referer': url, + } + ct0 = self._get_cookies(url).get('ct0') + if ct0: + headers['csrf_token'] = ct0.value + guest_token = self._download_json( + '%s/guest/activate.json' % self._API_BASE, video_id, + 'Downloading guest token', data=b'', + headers=headers)['guest_token'] + headers['x-guest-token'] = guest_token + self._set_cookie('api.twitter.com', 'gt', guest_token) + config = self._download_json( + '%s/videos/tweet/config/%s.json' % (self._API_BASE, video_id), + video_id, headers=headers) + track = config['track'] + vmap_url = track.get('vmapUrl') + if vmap_url: + formats = self._extract_formats_from_vmap_url(vmap_url, video_id) + else: + playback_url = track['playbackUrl'] + if determine_ext(playback_url) == 'm3u8': + formats = self._extract_m3u8_formats( + playback_url, video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls') + else: + formats = [{ + 'url': playback_url, + }] + title = 'Twitter web player' + thumbnail = config.get('posterImage') + duration = float_or_none(track.get('durationMs'), scale=1000) + self._remove_duplicate_formats(formats) self._sort_formats(formats) - title = self._search_regex(r'([^<]+)', webpage, 'title') - thumbnail = config.get('posterImageUrl') or config.get('image_src') - duration = float_or_none(config.get('duration'), scale=1000) or duration - return { 'id': video_id, 'title': title, @@ -375,6 +411,22 @@ class TwitterIE(InfoExtractor): 'params': { 'skip_download': True, # requires ffmpeg }, + }, { + # card via api.twitter.com/1.1/videos/tweet/config + 'url': 'https://twitter.com/LisPower1/status/1001551623938805763', + 'info_dict': { + 'id': '1001551623938805763', + 'ext': 'mp4', + 
'title': 're:.*?Shep is on a roll today.*?', + 'thumbnail': r're:^https?://.*\.jpg', + 'description': 'md5:63b036c228772523ae1924d5f8e5ed6b', + 'uploader': 'Lis Power', + 'uploader_id': 'LisPower1', + 'duration': 111.278, + }, + 'params': { + 'skip_download': True, # requires ffmpeg + }, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/udemy.py b/youtube_dl/extractor/udemy.py index 0a74a9768..79c45f80e 100644 --- a/youtube_dl/extractor/udemy.py +++ b/youtube_dl/extractor/udemy.py @@ -20,6 +20,7 @@ from ..utils import ( sanitized_Request, try_get, unescapeHTML, + url_or_none, urlencode_postdata, ) @@ -151,7 +152,7 @@ class UdemyIE(InfoExtractor): self._login() def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: return @@ -265,8 +266,8 @@ class UdemyIE(InfoExtractor): if not isinstance(source_list, list): return for source in source_list: - video_url = source.get('file') or source.get('src') - if not video_url or not isinstance(video_url, compat_str): + video_url = url_or_none(source.get('file') or source.get('src')) + if not video_url: continue if source.get('type') == 'application/x-mpegURL' or determine_ext(video_url) == 'm3u8': formats.extend(self._extract_m3u8_formats( @@ -293,8 +294,8 @@ class UdemyIE(InfoExtractor): continue if track.get('kind') != 'captions': continue - src = track.get('src') - if not src or not isinstance(src, compat_str): + src = url_or_none(track.get('src')) + if not src: continue lang = track.get('language') or track.get( 'srclang') or track.get('label') @@ -314,8 +315,8 @@ class UdemyIE(InfoExtractor): for cc in captions: if not isinstance(cc, dict): continue - cc_url = cc.get('url') - if not cc_url or not isinstance(cc_url, compat_str): + cc_url = url_or_none(cc.get('url')) + if not cc_url: continue lang = try_get(cc, lambda x: x['locale']['locale'], compat_str) sub_dict = (automatic_captions if cc.get('source') == 'auto' diff --git a/youtube_dl/extractor/ufctv.py b/youtube_dl/extractor/ufctv.py index ab823814b..f3eaee6b3 100644 --- a/youtube_dl/extractor/ufctv.py +++ b/youtube_dl/extractor/ufctv.py @@ -3,13 +3,16 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( + ExtractorError, parse_duration, parse_iso8601, + urlencode_postdata, ) class UFCTVIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?ufc\.tv/video/(?P[^/]+)' + _NETRC_MACHINE = 'ufctv' _TEST = { 'url': 'https://www.ufc.tv/video/ufc-219-countdown-full-episode', 'info_dict': { @@ -26,6 +29,21 @@ class UFCTVIE(InfoExtractor): } } + def _real_initialize(self): + username, password = self._get_login_info() + if username is None: + return + + code = self._download_json( + 'https://www.ufc.tv/secure/authenticate', + None, 'Logging in', data=urlencode_postdata({ + 'username': username, + 'password': password, + 'format': 'json', + })).get('code') + if code and code != 'loginsuccess': + raise ExtractorError(code, expected=True) + def _real_extract(self, url): display_id = self._match_id(url) video_data = self._download_json(url, display_id, query={ diff --git a/youtube_dl/extractor/vessel.py b/youtube_dl/extractor/vessel.py index 80a643dfe..31eee0ba7 100644 --- a/youtube_dl/extractor/vessel.py +++ b/youtube_dl/extractor/vessel.py @@ -75,7 +75,7 @@ class VesselIE(InfoExtractor): 'Access to this content is restricted. 
(%s said: %s)' % (self.IE_NAME, err_code), expected=True) def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: return self.report_login() diff --git a/youtube_dl/extractor/vgtv.py b/youtube_dl/extractor/vgtv.py index c21a09c01..fe7a26b62 100644 --- a/youtube_dl/extractor/vgtv.py +++ b/youtube_dl/extractor/vgtv.py @@ -24,6 +24,7 @@ class VGTVIE(XstreamIE): 'aftenposten.no/webtv': 'aptv', 'ap.vgtv.no/webtv': 'aptv', 'tv.aftonbladet.se/abtv': 'abtv', + 'www.aftonbladet.se/tv': 'abtv', } _APP_NAME_TO_VENDOR = { @@ -44,7 +45,7 @@ class VGTVIE(XstreamIE): (?: (?:\#!/)?(?:video|live)/| embed?.*id=| - articles/ + a(?:rticles)?/ )| (?P %s @@ -143,6 +144,10 @@ class VGTVIE(XstreamIE): 'url': 'http://tv.aftonbladet.se/abtv/articles/36015', 'only_matching': True, }, + { + 'url': 'https://www.aftonbladet.se/tv/a/36015', + 'only_matching': True, + }, { 'url': 'abtv:140026', 'only_matching': True, @@ -178,13 +183,15 @@ class VGTVIE(XstreamIE): streams = data['streamUrls'] stream_type = data.get('streamType') - + is_live = stream_type == 'live' formats = [] hls_url = streams.get('hls') if hls_url: formats.extend(self._extract_m3u8_formats( - hls_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) + hls_url, video_id, 'mp4', + entry_protocol='m3u8' if is_live else 'm3u8_native', + m3u8_id='hls', fatal=False)) hds_url = streams.get('hds') if hds_url: @@ -229,13 +236,13 @@ class VGTVIE(XstreamIE): info.update({ 'id': video_id, - 'title': self._live_title(data['title']) if stream_type == 'live' else data['title'], + 'title': self._live_title(data['title']) if is_live else data['title'], 'description': data['description'], 'thumbnail': data['images']['main'] + '?t[]=900x506q80', 'timestamp': data['published'], 'duration': float_or_none(data['duration'], 1000), 'view_count': data['displays'], - 'is_live': True if stream_type == 'live' else False, + 'is_live': is_live, }) return info diff --git a/youtube_dl/extractor/vidme.py b/youtube_dl/extractor/vidme.py index 59adb2377..174e69cd6 100644 --- a/youtube_dl/extractor/vidme.py +++ b/youtube_dl/extractor/vidme.py @@ -3,15 +3,13 @@ from __future__ import unicode_literals import itertools from .common import InfoExtractor -from ..compat import ( - compat_HTTPError, - compat_str, -) +from ..compat import compat_HTTPError from ..utils import ( ExtractorError, int_or_none, float_or_none, parse_iso8601, + url_or_none, ) @@ -166,8 +164,8 @@ class VidmeIE(InfoExtractor): formats = [] for f in video.get('formats', []): - format_url = f.get('uri') - if not format_url or not isinstance(format_url, compat_str): + format_url = url_or_none(f.get('uri')) + if not format_url: continue format_type = f.get('type') if format_type == 'dash': diff --git a/youtube_dl/extractor/vidzi.py b/youtube_dl/extractor/vidzi.py index 9026e778c..d70283479 100644 --- a/youtube_dl/extractor/vidzi.py +++ b/youtube_dl/extractor/vidzi.py @@ -54,7 +54,8 @@ class VidziIE(InfoExtractor): self._search_regex( r'setup\(([^)]+)\)', code, 'jwplayer data', default=NO_DEFAULT if num == len(codes) else '{}'), - video_id, transform_source=js_to_json) + video_id, transform_source=lambda s: js_to_json( + re.sub(r'\s*\+\s*window\[.+?\]', '', s))) if jwplayer_data: break diff --git a/youtube_dl/extractor/viewlift.py b/youtube_dl/extractor/viewlift.py index 1f29c273f..c43d1a1e8 100644 --- a/youtube_dl/extractor/viewlift.py +++ b/youtube_dl/extractor/viewlift.py @@ -1,24 +1,27 @@ from __future__ import unicode_literals +import base64 import 
re from .common import InfoExtractor +from ..compat import compat_urllib_parse_unquote from ..utils import ( ExtractorError, clean_html, determine_ext, int_or_none, js_to_json, + parse_age_limit, parse_duration, ) class ViewLiftBaseIE(InfoExtractor): - _DOMAINS_REGEX = r'(?:snagfilms|snagxtreme|funnyforfree|kiddovid|winnersview|monumentalsportsnetwork|vayafilm)\.com|kesari\.tv' + _DOMAINS_REGEX = r'(?:snagfilms|snagxtreme|funnyforfree|kiddovid|winnersview|(?:monumental|lax)sportsnetwork|vayafilm)\.com|hoichoi\.tv' class ViewLiftEmbedIE(ViewLiftBaseIE): - _VALID_URL = r'https?://(?:(?:www|embed)\.)?(?:%s)/embed/player\?.*\bfilmId=(?P[\da-f-]{36})' % ViewLiftBaseIE._DOMAINS_REGEX + _VALID_URL = r'https?://(?:(?:www|embed)\.)?(?:%s)/embed/player\?.*\bfilmId=(?P[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12})' % ViewLiftBaseIE._DOMAINS_REGEX _TESTS = [{ 'url': 'http://embed.snagfilms.com/embed/player?filmId=74849a00-85a9-11e1-9660-123139220831&w=500', 'md5': '2924e9215c6eff7a55ed35b72276bd93', @@ -60,8 +63,10 @@ class ViewLiftEmbedIE(ViewLiftBaseIE): formats = [] has_bitrate = False - for source in self._parse_json(js_to_json(self._search_regex( - r'(?s)sources:\s*(\[.+?\]),', webpage, 'json')), video_id): + sources = self._parse_json(self._search_regex( + r'(?s)sources:\s*(\[.+?\]),', webpage, + 'sources', default='[]'), video_id, js_to_json) + for source in sources: file_ = source.get('file') if not file_: continue @@ -70,7 +75,8 @@ class ViewLiftEmbedIE(ViewLiftBaseIE): format_id = source.get('label') or ext if all(v in ('m3u8', 'hls') for v in (type_, ext)): formats.extend(self._extract_m3u8_formats( - file_, video_id, 'mp4', m3u8_id='hls')) + file_, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) else: bitrate = int_or_none(self._search_regex( [r'(\d+)kbps', r'_\d{1,2}x\d{1,2}_(\d{3,})\.%s' % ext], @@ -85,6 +91,13 @@ class ViewLiftEmbedIE(ViewLiftBaseIE): 'tbr': bitrate, 'height': height, }) + if not formats: + hls_url = self._parse_json(self._search_regex( + r'filmInfo\.src\s*=\s*({.+?});', + webpage, 'src'), video_id, js_to_json)['src'] + formats = self._extract_m3u8_formats( + hls_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False) field_preference = None if has_bitrate else ('height', 'tbr', 'format_id') self._sort_formats(formats, field_preference) @@ -109,10 +122,13 @@ class ViewLiftIE(ViewLiftBaseIE): 'display_id': 'lost_for_life', 'ext': 'mp4', 'title': 'Lost for Life', - 'description': 'md5:fbdacc8bb6b455e464aaf98bc02e1c82', + 'description': 'md5:ea10b5a50405ae1f7b5269a6ec594102', 'thumbnail': r're:^https?://.*\.jpg', 'duration': 4489, - 'categories': ['Documentary', 'Crime', 'Award Winning', 'Festivals'] + 'categories': 'mincount:3', + 'age_limit': 14, + 'upload_date': '20150421', + 'timestamp': 1429656819, } }, { 'url': 'http://www.snagfilms.com/show/the_world_cut_project/india', @@ -125,7 +141,9 @@ class ViewLiftIE(ViewLiftBaseIE): 'description': 'md5:5c168c5a8f4719c146aad2e0dfac6f5f', 'thumbnail': r're:^https?://.*\.jpg', 'duration': 979, - 'categories': ['Documentary', 'Sports', 'Politics'] + 'categories': 'mincount:2', + 'timestamp': 1399478279, + 'upload_date': '20140507', } }, { # Film is not playable in your area. 
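# --- Reviewer sketch, not part of the patch ---------------------------------
# The ViewLiftIE hunk below switches extraction to window.initialStoreState,
# which pages embed as JSON.parse(unescape(atob('...'))): JSON that has been
# URL-escaped and then base64-encoded. A minimal standalone sketch of the
# equivalent decode chain in Python; decode_initial_store_state is a
# hypothetical helper for illustration only, not code from this patch:
import base64
import json

from youtube_dl.compat import compat_urllib_parse_unquote


def decode_initial_store_state(blob):
    # atob() -> b64decode(), unescape() -> unquote(), JSON.parse() -> json.loads()
    return json.loads(compat_urllib_parse_unquote(
        base64.b64decode(blob).decode()))


# e.g. decode_initial_store_state('JTdCJTIyYSUyMiUzQTElN0Q=') == {'a': 1}
# ----------------------------------------------------------------------------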
@@ -138,9 +156,6 @@ class ViewLiftIE(ViewLiftBaseIE): }, { 'url': 'http://www.winnersview.com/videos/the-good-son', 'only_matching': True, - }, { - 'url': 'http://www.kesari.tv/news/video/1461919076414', - 'only_matching': True, }, { # Was once Kaltura embed 'url': 'https://www.monumentalsportsnetwork.com/videos/john-carlson-postgame-2-25-15', @@ -156,45 +171,96 @@ class ViewLiftIE(ViewLiftBaseIE): raise ExtractorError( 'Film %s is not available.' % display_id, expected=True) - film_id = self._search_regex(r'filmId=([\da-f-]{36})"', webpage, 'film id') + initial_store_state = self._search_regex( + r"window\.initialStoreState\s*=.*?JSON\.parse\(unescape\(atob\('([^']+)'\)\)\)", + webpage, 'Initial Store State', default=None) + if initial_store_state: + modules = self._parse_json(compat_urllib_parse_unquote(base64.b64decode( + initial_store_state).decode()), display_id)['page']['data']['modules'] + content_data = next(m['contentData'][0] for m in modules if m.get('moduleType') == 'VideoDetailModule') + gist = content_data['gist'] + film_id = gist['id'] + title = gist['title'] + video_assets = content_data['streamingInfo']['videoAssets'] - snag = self._parse_json( - self._search_regex( - r'Snag\.page\.data\s*=\s*(\[.+?\]);', webpage, 'snag'), - display_id) + formats = [] + mpeg_video_assets = video_assets.get('mpeg') or [] + for video_asset in mpeg_video_assets: + video_asset_url = video_asset.get('url') + if not video_asset: + continue + bitrate = int_or_none(video_asset.get('bitrate')) + height = int_or_none(self._search_regex( + r'^_?(\d+)[pP]$', video_asset.get('renditionValue'), + 'height', default=None)) + formats.append({ + 'url': video_asset_url, + 'format_id': 'http%s' % ('-%d' % bitrate if bitrate else ''), + 'tbr': bitrate, + 'height': height, + 'vcodec': video_asset.get('codec'), + }) - for item in snag: - if item.get('data', {}).get('film', {}).get('id') == film_id: - data = item['data']['film'] - title = data['title'] - description = clean_html(data.get('synopsis')) - thumbnail = data.get('image') - duration = int_or_none(data.get('duration') or data.get('runtime')) - categories = [ - category['title'] for category in data.get('categories', []) - if category.get('title')] - break + hls_url = video_assets.get('hls') + if hls_url: + formats.extend(self._extract_m3u8_formats( + hls_url, film_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) + self._sort_formats(formats, ('height', 'tbr', 'format_id')) + + info = { + 'id': film_id, + 'display_id': display_id, + 'title': title, + 'description': gist.get('description'), + 'thumbnail': gist.get('videoImageUrl'), + 'duration': int_or_none(gist.get('runtime')), + 'age_limit': parse_age_limit(content_data.get('parentalRating')), + 'timestamp': int_or_none(gist.get('publishDate'), 1000), + 'formats': formats, + } + for k in ('categories', 'tags'): + info[k] = [v['title'] for v in content_data.get(k, []) if v.get('title')] + return info else: - title = self._search_regex( - r'itemprop="title">([^<]+)<', webpage, 'title') - description = self._html_search_regex( - r'(?s)
<div itemprop="description" class="film-description">(.+?)</div>
', - webpage, 'description', default=None) or self._og_search_description(webpage) - thumbnail = self._og_search_thumbnail(webpage) - duration = parse_duration(self._search_regex( - r'([^<]+)<', - webpage, 'duration', fatal=False)) - categories = re.findall(r'([^<]+)', webpage) + film_id = self._search_regex(r'filmId=([\da-f-]{36})"', webpage, 'film id') - return { - '_type': 'url_transparent', - 'url': 'http://%s/embed/player?filmId=%s' % (domain, film_id), - 'id': film_id, - 'display_id': display_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'duration': duration, - 'categories': categories, - 'ie_key': 'ViewLiftEmbed', - } + snag = self._parse_json( + self._search_regex( + r'Snag\.page\.data\s*=\s*(\[.+?\]);', webpage, 'snag', default='[]'), + display_id) + + for item in snag: + if item.get('data', {}).get('film', {}).get('id') == film_id: + data = item['data']['film'] + title = data['title'] + description = clean_html(data.get('synopsis')) + thumbnail = data.get('image') + duration = int_or_none(data.get('duration') or data.get('runtime')) + categories = [ + category['title'] for category in data.get('categories', []) + if category.get('title')] + break + else: + title = self._search_regex( + r'itemprop="title">([^<]+)<', webpage, 'title') + description = self._html_search_regex( + r'(?s)
<div itemprop="description" class="film-description">(.+?)</div>
', + webpage, 'description', default=None) or self._og_search_description(webpage) + thumbnail = self._og_search_thumbnail(webpage) + duration = parse_duration(self._search_regex( + r'([^<]+)<', + webpage, 'duration', fatal=False)) + categories = re.findall(r'([^<]+)', webpage) + + return { + '_type': 'url_transparent', + 'url': 'http://%s/embed/player?filmId=%s' % (domain, film_id), + 'id': film_id, + 'display_id': display_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'categories': categories, + 'ie_key': 'ViewLiftEmbed', + } diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py index ad2a2a4b7..546de95d8 100644 --- a/youtube_dl/extractor/viki.py +++ b/youtube_dl/extractor/viki.py @@ -88,7 +88,7 @@ class VikiBaseIE(InfoExtractor): self._login() def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: return diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index a026526b2..e49b233f2 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -37,7 +37,7 @@ class VimeoBaseInfoExtractor(InfoExtractor): _LOGIN_URL = 'https://vimeo.com/log_in' def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: if self._LOGIN_REQUIRED: raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True) @@ -539,9 +539,10 @@ class VimeoIE(VimeoBaseInfoExtractor): # We try to find out to which variable is assigned the config dic m_variable_name = re.search(r'(\w)\.video\.id', webpage) if m_variable_name is not None: - config_re = r'%s=({[^}].+?});' % re.escape(m_variable_name.group(1)) + config_re = [r'%s=({[^}].+?});' % re.escape(m_variable_name.group(1))] else: config_re = [r' = {config:({.+?}),assets:', r'(?:[abc])=({.+?});'] + config_re.append(r'\bvar\s+r\s*=\s*({.+?})\s*;') config = self._search_regex(config_re, webpage, 'info section', flags=re.DOTALL) config = json.loads(config) @@ -989,10 +990,10 @@ class VimeoWatchLaterIE(VimeoChannelIE): class VimeoLikesIE(InfoExtractor): - _VALID_URL = r'https://(?:www\.)?vimeo\.com/user(?P[0-9]+)/likes/?(?:$|[?#]|sort:)' + _VALID_URL = r'https://(?:www\.)?vimeo\.com/(?P[^/]+)/likes/?(?:$|[?#]|sort:)' IE_NAME = 'vimeo:likes' IE_DESC = 'Vimeo user likes' - _TEST = { + _TESTS = [{ 'url': 'https://vimeo.com/user755559/likes/', 'playlist_mincount': 293, 'info_dict': { @@ -1000,7 +1001,10 @@ class VimeoLikesIE(InfoExtractor): 'description': 'See all the videos urza likes', 'title': 'Videos urza likes', }, - } + }, { + 'url': 'https://vimeo.com/stormlapse/likes', + 'only_matching': True, + }] def _real_extract(self, url): user_id = self._match_id(url) @@ -1009,7 +1013,7 @@ class VimeoLikesIE(InfoExtractor): self._search_regex( r'''(?x)
<li><a\s+href="[^"]+"\s+data-page="(\d+)"> - .*? - </a></li>
  • \s* - ''', webpage, 'page count'), + ''', webpage, 'page count', default=1), 'page count', fatal=True) PAGE_SIZE = 12 title = self._html_search_regex( @@ -1017,7 +1021,7 @@ class VimeoLikesIE(InfoExtractor): description = self._html_search_meta('description', webpage) def _get_page(idx): - page_url = 'https://vimeo.com/user%s/likes/page:%d/sort:date' % ( + page_url = 'https://vimeo.com/%s/likes/page:%d/sort:date' % ( user_id, idx + 1) webpage = self._download_webpage( page_url, user_id, @@ -1037,7 +1041,7 @@ class VimeoLikesIE(InfoExtractor): return { '_type': 'playlist', - 'id': 'user%s_likes' % user_id, + 'id': '%s_likes' % user_id, 'title': title, 'description': description, 'entries': pl, diff --git a/youtube_dl/extractor/viqeo.py b/youtube_dl/extractor/viqeo.py new file mode 100644 index 000000000..be7dfa814 --- /dev/null +++ b/youtube_dl/extractor/viqeo.py @@ -0,0 +1,99 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + str_or_none, + url_or_none, +) + + +class ViqeoIE(InfoExtractor): + _VALID_URL = r'''(?x) + (?: + viqeo:| + https?://cdn\.viqeo\.tv/embed/*\?.*?\bvid=| + https?://api\.viqeo\.tv/v\d+/data/startup?.*?\bvideo(?:%5B%5D|\[\])= + ) + (?P[\da-f]+) + ''' + _TESTS = [{ + 'url': 'https://cdn.viqeo.tv/embed/?vid=cde96f09d25f39bee837', + 'md5': 'a169dd1a6426b350dca4296226f21e76', + 'info_dict': { + 'id': 'cde96f09d25f39bee837', + 'ext': 'mp4', + 'title': 'cde96f09d25f39bee837', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 76, + }, + }, { + 'url': 'viqeo:cde96f09d25f39bee837', + 'only_matching': True, + }, { + 'url': 'https://api.viqeo.tv/v1/data/startup?video%5B%5D=71bbec412ade45c3216c&profile=112', + 'only_matching': True, + }] + + @staticmethod + def _extract_urls(webpage): + return [ + mobj.group('url') + for mobj in re.finditer( + r']+\bsrc=(["\'])(?P(?:https?:)?//cdn\.viqeo\.tv/embed/*\?.*?\bvid=[\da-f]+.*?)\1', + webpage)] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage( + 'https://cdn.viqeo.tv/embed/?vid=%s' % video_id, video_id) + + data = self._parse_json( + self._search_regex( + r'SLOT_DATA\s*=\s*({.+?})\s*;', webpage, 'slot data'), + video_id) + + formats = [] + thumbnails = [] + for media_file in data['mediaFiles']: + if not isinstance(media_file, dict): + continue + media_url = url_or_none(media_file.get('url')) + if not media_url or not media_url.startswith(('http', '//')): + continue + media_type = str_or_none(media_file.get('type')) + if not media_type: + continue + media_kind = media_type.split('/')[0].lower() + f = { + 'url': media_url, + 'width': int_or_none(media_file.get('width')), + 'height': int_or_none(media_file.get('height')), + } + format_id = str_or_none(media_file.get('quality')) + if media_kind == 'image': + f['id'] = format_id + thumbnails.append(f) + elif media_kind in ('video', 'audio'): + is_audio = media_kind == 'audio' + f.update({ + 'format_id': 'audio' if is_audio else format_id, + 'fps': int_or_none(media_file.get('fps')), + 'vcodec': 'none' if is_audio else None, + }) + formats.append(f) + self._sort_formats(formats) + + duration = int_or_none(data.get('duration')) + + return { + 'id': video_id, + 'title': video_id, + 'duration': duration, + 'thumbnails': thumbnails, + 'formats': formats, + } diff --git a/youtube_dl/extractor/viu.py b/youtube_dl/extractor/viu.py index 5cf93591c..3bd37525b 100644 --- a/youtube_dl/extractor/viu.py +++ b/youtube_dl/extractor/viu.py @@ -195,16 
+195,29 @@ class ViuOTTIE(InfoExtractor): 'skip': 'Geo-restricted to Hong Kong', }] + _AREA_ID = { + 'HK': 1, + 'SG': 2, + 'TH': 4, + 'PH': 5, + } + def _real_extract(self, url): country_code, video_id = re.match(self._VALID_URL, url).groups() + query = { + 'r': 'vod/ajax-detail', + 'platform_flag_label': 'web', + 'product_id': video_id, + } + + area_id = self._AREA_ID.get(country_code.upper()) + if area_id: + query['area_id'] = area_id + product_data = self._download_json( 'http://www.viu.com/ott/%s/index.php' % country_code, video_id, - 'Downloading video info', query={ - 'r': 'vod/ajax-detail', - 'platform_flag_label': 'web', - 'product_id': video_id, - })['data'] + 'Downloading video info', query=query)['data'] video_data = product_data.get('current_product') if not video_data: @@ -214,6 +227,9 @@ class ViuOTTIE(InfoExtractor): 'https://d1k2us671qcoau.cloudfront.net/distribute_web_%s.php' % country_code, video_id, 'Downloading stream info', query={ 'ccs_product_id': video_data['ccs_product_id'], + }, headers={ + 'Referer': url, + 'Origin': re.search(r'https?://[^/]+', url).group(0), })['data']['stream'] stream_sizes = stream_data.get('size', {}) diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index b50d4f170..ef8b9bcb7 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -17,9 +17,11 @@ from ..utils import ( int_or_none, orderedSet, remove_start, + str_or_none, str_to_int, unescapeHTML, unified_timestamp, + url_or_none, urlencode_postdata, ) from .dailymotion import DailymotionIE @@ -32,7 +34,7 @@ class VKBaseIE(InfoExtractor): _NETRC_MACHINE = 'vk' def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: return @@ -105,10 +107,10 @@ class VKIE(VKBaseIE): 'ext': 'mp4', 'title': 'ProtivoGunz - Хуёвая песня', 'uploader': 're:(?:Noize MC|Alexander Ilyashenko).*', + 'uploader_id': '-77521', 'duration': 195, - 'timestamp': 1329060660, + 'timestamp': 1329049880, 'upload_date': '20120212', - 'view_count': int, }, }, { @@ -117,12 +119,12 @@ class VKIE(VKBaseIE): 'info_dict': { 'id': '165548505', 'ext': 'mp4', - 'uploader': 'Tom Cruise', 'title': 'No name', + 'uploader': 'Tom Cruise', + 'uploader_id': '205387401', 'duration': 9, - 'timestamp': 1374374880, - 'upload_date': '20130721', - 'view_count': int, + 'timestamp': 1374364108, + 'upload_date': '20130720', } }, { @@ -206,10 +208,10 @@ class VKIE(VKBaseIE): 'id': 'V3K4mi0SYkc', 'ext': 'webm', 'title': "DSWD Awards 'Children's Joy Foundation, Inc.' 
Certificate of Registration and License to Operate", - 'description': 'md5:d9903938abdc74c738af77f527ca0596', - 'duration': 178, + 'description': 'md5:bf9c26cfa4acdfb146362682edd3827a', + 'duration': 179, 'upload_date': '20130116', - 'uploader': "Children's Joy Foundation", + 'uploader': "Children's Joy Foundation Inc.", 'uploader_id': 'thecjf', 'view_count': int, }, @@ -221,6 +223,7 @@ class VKIE(VKBaseIE): 'id': 'k3lz2cmXyRuJQSjGHUv', 'ext': 'mp4', 'title': 'md5:d52606645c20b0ddbb21655adaa4f56f', + # TODO: fix test by fixing dailymotion description extraction 'description': 'md5:c651358f03c56f1150b555c26d90a0fd', 'uploader': 'AniLibria.Tv', 'upload_date': '20160914', @@ -240,9 +243,12 @@ class VKIE(VKBaseIE): 'ext': 'mp4', 'title': 'S-Dance, репетиции к The way show', 'uploader': 'THE WAY SHOW | 17 апреля', - 'timestamp': 1454870100, + 'uploader_id': '-110305615', + 'timestamp': 1454859345, 'upload_date': '20160207', - 'view_count': int, + }, + 'params': { + 'skip_download': True, }, }, { @@ -295,7 +301,7 @@ class VKIE(VKBaseIE): video_id = mobj.group('videoid') if video_id: - info_url = 'https://vk.com/al_video.php?act=show&al=1&module=video&video=%s' % video_id + info_url = 'https://vk.com/al_video.php?act=show_inline&al=1&video=' + video_id # Some videos (removed?) can only be downloaded with list id specified list_id = mobj.group('list_id') if list_id: @@ -345,6 +351,9 @@ class VKIE(VKBaseIE): r'This video is no longer available, because its author has been blocked.': 'Video %s is no longer available, because its author has been blocked.', + + r'This video is no longer available, because it has been deleted.': + 'Video %s is no longer available, because it has been deleted.', } for error_re, error_msg in ERRORS.items(): @@ -393,7 +402,8 @@ class VKIE(VKBaseIE): if not data: data = self._parse_json( self._search_regex( - r'\s*({.+?})\s*', info_page, 'json', default='{}'), + [r'\s*({.+?})\s*', r'\s*({.+})'], + info_page, 'json', default='{}'), video_id) if data: data = data['player']['params'][0] @@ -415,7 +425,7 @@ class VKIE(VKBaseIE): timestamp = unified_timestamp(self._html_search_regex( r'class=["\']mv_info_date[^>]+>([^<]+)(?:<|from)', info_page, - 'upload date', fatal=False)) + 'upload date', default=None)) or int_or_none(data.get('date')) view_count = str_to_int(self._search_regex( r'class=["\']mv_views_count[^>]+>\s*([\d,.]+)', @@ -423,7 +433,8 @@ class VKIE(VKBaseIE): formats = [] for format_id, format_url in data.items(): - if not isinstance(format_url, compat_str) or not format_url.startswith(('http', '//', 'rtmp')): + format_url = url_or_none(format_url) + if not format_url or not format_url.startswith(('http', '//', 'rtmp')): continue if (format_id.startswith(('url', 'cache')) or format_id in ('extra_data', 'live_mp4', 'postlive_mp4')): @@ -452,9 +463,12 @@ class VKIE(VKBaseIE): 'title': title, 'thumbnail': data.get('jpg'), 'uploader': data.get('md_author'), + 'uploader_id': str_or_none(data.get('author_id')), 'duration': data.get('duration'), 'timestamp': timestamp, 'view_count': view_count, + 'like_count': int_or_none(data.get('liked')), + 'dislike_count': int_or_none(data.get('nolikes')), 'is_live': is_live, } diff --git a/youtube_dl/extractor/vlive.py b/youtube_dl/extractor/vlive.py index 64d0224e6..0b5165fd0 100644 --- a/youtube_dl/extractor/vlive.py +++ b/youtube_dl/extractor/vlive.py @@ -57,7 +57,7 @@ class VLiveIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage( - 'http://www.vlive.tv/video/%s' % video_id, video_id) + 
'https://www.vlive.tv/video/%s' % video_id, video_id) VIDEO_PARAMS_RE = r'\bvlive\.video\.init\(([^)]+)' VIDEO_PARAMS_FIELD = 'video params' @@ -108,11 +108,11 @@ class VLiveIE(InfoExtractor): def _live(self, video_id, webpage): init_page = self._download_webpage( - 'http://www.vlive.tv/video/init/view', + 'https://www.vlive.tv/video/init/view', video_id, note='Downloading live webpage', data=urlencode_postdata({'videoSeq': video_id}), headers={ - 'Referer': 'http://www.vlive.tv/video/%s' % video_id, + 'Referer': 'https://www.vlive.tv/video/%s' % video_id, 'Content-Type': 'application/x-www-form-urlencoded' }) diff --git a/youtube_dl/extractor/wat.py b/youtube_dl/extractor/wat.py index 20fef1f04..8ef3e0906 100644 --- a/youtube_dl/extractor/wat.py +++ b/youtube_dl/extractor/wat.py @@ -19,7 +19,6 @@ class WatIE(InfoExtractor): _TESTS = [ { 'url': 'http://www.wat.tv/video/soupe-figues-l-orange-aux-epices-6z1uz_2hvf7_.html', - 'md5': '83d882d9de5c9d97f0bb2c6273cde56a', 'info_dict': { 'id': '11713067', 'ext': 'mp4', @@ -28,10 +27,15 @@ class WatIE(InfoExtractor): 'upload_date': '20140819', 'duration': 120, }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'expected_warnings': ['HTTP Error 404'], }, { 'url': 'http://www.wat.tv/video/gregory-lemarchal-voix-ange-6z1v7_6ygkj_.html', - 'md5': '34bdfa5ca9fd3c7eb88601b635b0424c', + 'md5': 'b16574df2c3cd1a36ca0098f2a791925', 'info_dict': { 'id': '11713075', 'ext': 'mp4', @@ -98,38 +102,25 @@ class WatIE(InfoExtractor): formats = [] try: + alt_urls = lambda manifest_url: [re.sub(r'(?:wdv|ssm)?\.ism/', repl + '.ism/', manifest_url) for repl in ('', 'ssm')] manifest_urls = self._download_json( 'http://www.wat.tv/get/webhtml/' + video_id, video_id) m3u8_url = manifest_urls.get('hls') if m3u8_url: m3u8_url = remove_bitrate_limit(m3u8_url) - m3u8_formats = self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) - if m3u8_formats: - formats.extend(m3u8_formats) + for m3u8_alt_url in alt_urls(m3u8_url): + formats.extend(self._extract_m3u8_formats( + m3u8_alt_url, video_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False)) formats.extend(self._extract_f4m_formats( - m3u8_url.replace('ios', 'web').replace('.m3u8', '.f4m'), + m3u8_alt_url.replace('ios', 'web').replace('.m3u8', '.f4m'), video_id, f4m_id='hds', fatal=False)) - http_url = extract_url('android5/%s.mp4', 'http') - if http_url: - for m3u8_format in m3u8_formats: - vbr, abr = m3u8_format.get('vbr'), m3u8_format.get('abr') - if not vbr or not abr: - continue - format_id = m3u8_format['format_id'].replace('hls', 'http') - fmt_url = re.sub(r'%s-\d+00-\d+' % video_id, '%s-%d00-%d' % (video_id, round(vbr / 100), round(abr)), http_url) - if self._is_valid_url(fmt_url, video_id, format_id): - f = m3u8_format.copy() - f.update({ - 'url': fmt_url, - 'format_id': format_id, - 'protocol': 'http', - }) - formats.append(f) mpd_url = manifest_urls.get('mpd') if mpd_url: - formats.extend(self._extract_mpd_formats(remove_bitrate_limit( - mpd_url), video_id, mpd_id='dash', fatal=False)) + mpd_url = remove_bitrate_limit(mpd_url) + for mpd_alt_url in alt_urls(mpd_url): + formats.extend(self._extract_mpd_formats( + mpd_alt_url, video_id, mpd_id='dash', fatal=False)) self._sort_formats(formats) except ExtractorError: abr = 64 diff --git a/youtube_dl/extractor/watchbox.py b/youtube_dl/extractor/watchbox.py index be0bcba15..5a4e46e73 100644 --- a/youtube_dl/extractor/watchbox.py +++ b/youtube_dl/extractor/watchbox.py @@ -10,6 +10,7 @@ from ..utils import ( 
js_to_json, strip_or_none, try_get, + unescapeHTML, unified_timestamp, ) @@ -67,11 +68,20 @@ class WatchBoxIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - source = self._parse_json( + player_config = self._parse_json( self._search_regex( - r'(?s)source["\']?\s*:\s*({.+?})\s*[,}]', webpage, 'source', - default='{}'), - video_id, transform_source=js_to_json, fatal=False) or {} + r'data-player-conf=(["\'])(?P{.+?})\1', webpage, + 'player config', default='{}', group='data'), + video_id, transform_source=unescapeHTML, fatal=False) + + if not player_config: + player_config = self._parse_json( + self._search_regex( + r'playerConf\s*=\s*({.+?})\s*;', webpage, 'player config', + default='{}'), + video_id, transform_source=js_to_json, fatal=False) or {} + + source = player_config.get('source') or {} video_id = compat_str(source.get('videoId') or video_id) diff --git a/youtube_dl/extractor/wimp.py b/youtube_dl/extractor/wimp.py index c022fb33e..3dab9145b 100644 --- a/youtube_dl/extractor/wimp.py +++ b/youtube_dl/extractor/wimp.py @@ -36,7 +36,8 @@ class WimpIE(InfoExtractor): webpage = self._download_webpage(url, video_id) youtube_id = self._search_regex( - r"videoId\s*:\s*[\"']([0-9A-Za-z_-]{11})[\"']", + (r"videoId\s*:\s*[\"']([0-9A-Za-z_-]{11})[\"']", + r'data-id=["\']([0-9A-Za-z_-]{11})'), webpage, 'video URL', default=None) if youtube_id: return { diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py index d1bc992fd..68a48034e 100644 --- a/youtube_dl/extractor/xhamster.py +++ b/youtube_dl/extractor/xhamster.py @@ -13,6 +13,7 @@ from ..utils import ( parse_duration, try_get, unified_strdate, + url_or_none, ) @@ -137,7 +138,8 @@ class XHamsterIE(InfoExtractor): else: format_url = format_item filesize = None - if not isinstance(format_url, compat_str): + format_url = url_or_none(format_url) + if not format_url: continue formats.append({ 'format_id': '%s-%s' % (format_id, quality), @@ -198,7 +200,8 @@ class XHamsterIE(InfoExtractor): default='{}'), video_id, fatal=False) for format_id, format_url in sources.items(): - if not isinstance(format_url, compat_str): + format_url = url_or_none(format_url) + if not format_url: continue if format_url in format_urls: continue diff --git a/youtube_dl/extractor/yapfiles.py b/youtube_dl/extractor/yapfiles.py index 7fafbf596..cfb368de9 100644 --- a/youtube_dl/extractor/yapfiles.py +++ b/youtube_dl/extractor/yapfiles.py @@ -4,12 +4,12 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( ExtractorError, int_or_none, qualities, unescapeHTML, + url_or_none, ) @@ -80,9 +80,9 @@ class YapFilesIE(InfoExtractor): formats = [] for format_id in QUALITIES: is_hd = format_id == 'hd' - format_url = playlist.get( - 'file%s' % ('_hd' if is_hd else '')) - if not format_url or not isinstance(format_url, compat_str): + format_url = url_or_none(playlist.get( + 'file%s' % ('_hd' if is_hd else ''))) + if not format_url: continue formats.append({ 'url': format_url, diff --git a/youtube_dl/extractor/youjizz.py b/youtube_dl/extractor/youjizz.py index f33fabe19..dff69fcb7 100644 --- a/youtube_dl/extractor/youjizz.py +++ b/youtube_dl/extractor/youjizz.py @@ -3,11 +3,11 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( determine_ext, int_or_none, parse_duration, + url_or_none, ) @@ -50,8 +50,8 @@ class YouJizzIE(InfoExtractor): for encoding in encodings: if not 
isinstance(encoding, dict): continue - format_url = encoding.get('filename') - if not isinstance(format_url, compat_str): + format_url = url_or_none(encoding.get('filename')) + if not format_url: continue if determine_ext(format_url) == 'm3u8': formats.extend(self._extract_m3u8_formats( diff --git a/youtube_dl/extractor/youporn.py b/youtube_dl/extractor/youporn.py index 547adefeb..ea0bce784 100644 --- a/youtube_dl/extractor/youporn.py +++ b/youtube_dl/extractor/youporn.py @@ -3,13 +3,13 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( int_or_none, sanitized_Request, str_to_int, unescapeHTML, unified_strdate, + url_or_none, ) from ..aes import aes_decrypt_text @@ -88,8 +88,8 @@ class YouPornIE(InfoExtractor): for definition in definitions: if not isinstance(definition, dict): continue - video_url = definition.get('videoUrl') - if isinstance(video_url, compat_str) and video_url: + video_url = url_or_none(definition.get('videoUrl')) + if video_url: links.append(video_url) # Fallback #1, this also contains extra low quality 180p format diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 7f4298c08..117a57911 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -37,6 +37,7 @@ from ..utils import ( orderedSet, parse_codecs, parse_duration, + qualities, remove_quotes, remove_start, smuggle_url, @@ -84,7 +85,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised. """ - (username, password) = self._get_login_info() + username, password = self._get_login_info() # No authentication to be performed if username is None: if self._LOGIN_REQUIRED and self._downloader.params.get('cookiefile') is None: @@ -177,13 +178,13 @@ class YoutubeBaseInfoExtractor(InfoExtractor): warn('Unable to extract result entry') return False - tfa = try_get(res, lambda x: x[0][0], list) - if tfa: - tfa_str = try_get(tfa, lambda x: x[2], compat_str) - if tfa_str == 'TWO_STEP_VERIFICATION': + login_challenge = try_get(res, lambda x: x[0][0], list) + if login_challenge: + challenge_str = try_get(login_challenge, lambda x: x[2], compat_str) + if challenge_str == 'TWO_STEP_VERIFICATION': # SEND_SUCCESS - TFA code has been successfully sent to phone # QUOTA_EXCEEDED - reached the limit of TFA codes - status = try_get(tfa, lambda x: x[5], compat_str) + status = try_get(login_challenge, lambda x: x[5], compat_str) if status == 'QUOTA_EXCEEDED': warn('Exceeded the limit of TFA codes, try later') return False @@ -227,6 +228,17 @@ class YoutubeBaseInfoExtractor(InfoExtractor): check_cookie_url = try_get( tfa_results, lambda x: x[0][-1][2], compat_str) + else: + CHALLENGES = { + 'LOGIN_CHALLENGE': "This device isn't recognized. For your security, Google wants to make sure it's really you.", + 'USERNAME_RECOVERY': 'Please provide additional information to aid in the recovery process.', + 'REAUTH': "There is something unusual about your activity. For your security, Google wants to make sure it's really you.", + } + challenge = CHALLENGES.get( + challenge_str, + '%s returned error %s.' % (self.IE_NAME, challenge_str)) + warn('%s\nGo to https://accounts.google.com/, login and solve a challenge.' 
% challenge) + return False else: check_cookie_url = try_get(res, lambda x: x[2], compat_str) @@ -509,6 +521,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IconaPop', 'license': 'Standard YouTube License', 'creator': 'Icona Pop', + 'track': 'I Love It (feat. Charli XCX)', + 'artist': 'Icona Pop', } }, { @@ -527,6 +541,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/justintimberlakeVEVO', 'license': 'Standard YouTube License', 'creator': 'Justin Timberlake', + 'track': 'Tunnel Vision', + 'artist': 'Justin Timberlake', 'age_limit': 18, } }, @@ -596,7 +612,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'id': 'IB3lcPjvWLA', 'ext': 'm4a', 'title': 'Afrojack, Spree Wilson - The Spark ft. Spree Wilson', - 'description': 'md5:12e7067fa6735a77bdcbb58cb1187d2d', + 'description': 'md5:1900ed86ee514927b9e00fbead6969a5', 'duration': 244, 'uploader': 'AfrojackVEVO', 'uploader_id': 'AfrojackVEVO', @@ -637,7 +653,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'ext': 'mp4', 'duration': 219, 'upload_date': '20100909', - 'uploader': 'The Amazing Atheist', + 'uploader': 'TJ Kirk', 'uploader_id': 'TheAmazingAtheist', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheAmazingAtheist', 'license': 'Standard YouTube License', @@ -667,10 +683,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'url': 'https://www.youtube.com/watch?v=6kLq3WMV1nU', 'info_dict': { 'id': '6kLq3WMV1nU', - 'ext': 'mp4', + 'ext': 'webm', 'title': 'Dedication To My Ex (Miss That) (Lyric Video)', 'description': 'md5:33765bb339e1b47e7e72b5490139bb41', - 'duration': 247, + 'duration': 246, 'uploader': 'LloydVEVO', 'uploader_id': 'LloydVEVO', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/LloydVEVO', @@ -732,7 +748,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_id': 'AllenMeow', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/AllenMeow', 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯', - 'uploader': '孫艾倫', + 'uploader': '孫ᄋᄅ', 'license': 'Standard YouTube License', 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人', }, @@ -759,7 +775,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y', 'info_dict': { 'id': 'FIl7x6_3R5Y', - 'ext': 'mp4', + 'ext': 'webm', 'title': 'md5:7b81415841e02ecd4313668cde88737a', 'description': 'md5:116377fd2963b81ec4ce64b542173306', 'duration': 220, @@ -768,8 +784,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/dorappi2000', 'uploader': 'dorappi2000', 'license': 'Standard YouTube License', - 'formats': 'mincount:32', + 'formats': 'mincount:31', }, + 'skip': 'not actual anymore', }, # DASH manifest with segment_list { @@ -884,7 +901,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'id': 'lsguqyKfVQg', 'ext': 'mp4', 'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21', - 'alt_title': 'Dark Walk', + 'alt_title': 'Dark Walk - Position Music', 'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a', 'duration': 133, 'upload_date': '20151119', @@ -892,7 +909,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IronSoulElf', 'uploader': 'IronSoulElf', 'license': 'Standard YouTube License', - 'creator': 'Todd Haberman, Daniel Law Heath & Aaron Kaplan', + 'creator': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan', + 'track': 'Dark Walk - Position Music', + 'artist': 
             },
             'params': {
                 'skip_download': True,
@@ -949,7 +968,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                 'description': 'md5:dda0d780d5a6e120758d1711d062a867',
                 'duration': 4060,
                 'upload_date': '20151119',
-                'uploader': 'Bernie 2016',
+                'uploader': 'Bernie Sanders',
                 'uploader_id': 'UCH1dpzjCEiGAt8CXkryhkZg',
                 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCH1dpzjCEiGAt8CXkryhkZg',
                 'license': 'Creative Commons Attribution license (reuse allowed)',
@@ -984,6 +1003,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
             'params': {
                 'skip_download': True,
             },
+            'skip': 'This video is not available.',
         },
         {
             # YouTube Red video with episode data
@@ -992,7 +1012,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                 'id': 'iqKdEhx-dD4',
                 'ext': 'mp4',
                 'title': 'Isolation - Mind Field (Ep 1)',
-                'description': 'md5:8013b7ddea787342608f63a13ddc9492',
+                'description': 'md5:25b78d2f64ae81719f5c96319889b736',
                 'duration': 2085,
                 'upload_date': '20170118',
                 'uploader': 'Vsauce',
@@ -1025,7 +1045,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                 'uploader_id': 'UCEJYpZGqgUob0zVVEaLhvVg',
                 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCEJYpZGqgUob0zVVEaLhvVg',
                 'license': 'Standard YouTube License',
-                'view_count': int,
             },
             'params': {
                 'skip_download': True,
@@ -1693,128 +1712,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
         if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
             raise ExtractorError('"rental" videos not supported. See https://github.com/rg3/youtube-dl/issues/359 for more information.', expected=True)
 
-        # Start extracting information
-        self.report_information_extraction(video_id)
-
-        # uploader
-        video_uploader = try_get(video_info, lambda x: x['author'][0], compat_str)
-        if video_uploader:
-            video_uploader = compat_urllib_parse_unquote_plus(video_uploader)
-        else:
-            self._downloader.report_warning('unable to extract uploader name')
-
-        # uploader_id
-        video_uploader_id = None
-        video_uploader_url = None
-        mobj = re.search(
-            r'<link itemprop="url" href="(?P<uploader_url>https?://www\.youtube\.com/(?:user|channel)/(?P<uploader_id>[^"]+))">',
-            video_webpage)
-        if mobj is not None:
-            video_uploader_id = mobj.group('uploader_id')
-            video_uploader_url = mobj.group('uploader_url')
-        else:
-            self._downloader.report_warning('unable to extract uploader nickname')
-
-        # thumbnail image
-        # We try first to get a high quality image:
-        m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
-                            video_webpage, re.DOTALL)
-        if m_thumb is not None:
-            video_thumbnail = m_thumb.group(1)
-        elif 'thumbnail_url' not in video_info:
-            self._downloader.report_warning('unable to extract video thumbnail')
-            video_thumbnail = None
-        else:   # don't panic if we can't find it
-            video_thumbnail = compat_urllib_parse_unquote_plus(video_info['thumbnail_url'][0])
-
-        # upload date
-        upload_date = self._html_search_meta(
-            'datePublished', video_webpage, 'upload date', default=None)
-        if not upload_date:
-            upload_date = self._search_regex(
-                [r'(?s)id="eow-date.*?>(.*?)</span>',
-                 r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'],
-                video_webpage, 'upload date', default=None)
-        upload_date = unified_strdate(upload_date)
-
-        video_license = self._html_search_regex(
-            r'<h4[^>]+class="title"[^>]*>\s*License\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li',
-            video_webpage, 'license', default=None)
-
-        m_music = re.search(
-            r'''(?x)
-                <h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s*
-                <ul[^>]*>\s*
-                <li>(?P<title>.+?)
-                by (?P<creator>.+?)
-                (?:
-                    \(.+?\)|
-                    <a[^>]*
-                        (?:
-                            \bhref=["\']/red[^>]*>|             # drop possible
-                            >\s*Listen ad-free with YouTube Red # YouTube Red ad
-                        )
-                    .*?
-                )?</li
-            ''',
-            video_webpage)
-        if m_music:
-            video_alt_title = remove_quotes(unescapeHTML(m_music.group('title')))
-            video_creator = clean_html(m_music.group('creator'))
-        else:
-            video_alt_title = video_creator = None
-
-        m_episode = re.search(
-            r'<div[^>]+id="watch7-headline"[^>]*>\s*<span[^>]*>.*?>(?P<series>[^<]+)</a></b>\s*S(?P<season>\d+)\s*•\s*E(?P<episode>\d+)</span>',
-            video_webpage)
-        if m_episode:
-            series = m_episode.group('series')
-            season_number = int(m_episode.group('season'))
-            episode_number = int(m_episode.group('episode'))
-        else:
-            series = season_number = episode_number = None
-
-        m_cat_container = self._search_regex(
-            r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
-            video_webpage, 'categories', default=None)
-        if m_cat_container:
-            category = self._html_search_regex(
-                r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
-                default=None)
-            video_categories = None if category is None else [category]
-        else:
-            video_categories = None
-
-        video_tags = [
-            unescapeHTML(m.group('content'))
-            for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)]
-
-        def _extract_count(count_name):
-            return str_to_int(self._search_regex(
-                r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>'
-                % re.escape(count_name),
-                video_webpage, count_name, default=None))
-
-        like_count = _extract_count('like')
-        dislike_count = _extract_count('dislike')
-
-        # subtitles
-        video_subtitles = self.extract_subtitles(video_id, video_webpage)
-        automatic_captions = self.extract_automatic_captions(video_id, video_webpage)
-
-        video_duration = try_get(
-            video_info, lambda x: int_or_none(x['length_seconds'][0]))
-        if not video_duration:
-            video_duration = parse_duration(self._html_search_meta(
-                'duration', video_webpage, 'video duration'))
-
-        # annotations
-        video_annotations = None
-        if self._downloader.params.get('writeannotations', False):
-            video_annotations = self._extract_annotations(video_id)
-
-        chapters = self._extract_chapters(description_original, video_duration)
-
         def _extract_filesize(media_url):
             return int_or_none(self._search_regex(
                 r'\bclen[=/](\d+)', media_url, 'filesize', default=None))
@@ -1844,6 +1741,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                     'width': int_or_none(width_height[0]),
                     'height': int_or_none(width_height[1]),
                 }
+            q = qualities(['small', 'medium', 'hd720'])
             formats = []
             for url_data_str in encoded_url_map.split(','):
                 url_data = compat_parse_qs(url_data_str)
@@ -1926,13 +1824,16 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                     filesize = int_or_none(url_data.get(
                         'clen', [None])[0]) or _extract_filesize(url)
 
+                    quality = url_data.get('quality_label', [None])[0] or url_data.get('quality', [None])[0]
+
                     more_fields = {
                         'filesize': filesize,
                         'tbr': float_or_none(url_data.get('bitrate', [None])[0], 1000),
                         'width': width,
                         'height': height,
                         'fps': int_or_none(url_data.get('fps', [None])[0]),
-                        'format_note': url_data.get('quality_label', [None])[0] or url_data.get('quality', [None])[0],
+                        'format_note': quality,
+                        'quality': q(quality),
                     }
                     for key, value in more_fields.items():
                         if value:
@@ -1985,6 +1886,133 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                 raise ExtractorError(error_message, expected=True)
             raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
 
+        # uploader
+        video_uploader = try_get(video_info, lambda x: x['author'][0], compat_str)
+        if video_uploader:
+            video_uploader = compat_urllib_parse_unquote_plus(video_uploader)
+        else:
+            self._downloader.report_warning('unable to extract uploader name')
+
+        # uploader_id
+        video_uploader_id = None
+        video_uploader_url = None
+        mobj = re.search(
+            r'<link itemprop="url" href="(?P<uploader_url>https?://www\.youtube\.com/(?:user|channel)/(?P<uploader_id>[^"]+))">',
+            video_webpage)
+        if mobj is not None:
+            video_uploader_id = mobj.group('uploader_id')
+            video_uploader_url = mobj.group('uploader_url')
+        else:
+            self._downloader.report_warning('unable to extract uploader nickname')
+
+        # thumbnail image
+        # We try first to get a high quality image:
+        m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
+                            video_webpage, re.DOTALL)
+        if m_thumb is not None:
+            video_thumbnail = m_thumb.group(1)
+        elif 'thumbnail_url' not in video_info:
+            self._downloader.report_warning('unable to extract video thumbnail')
+            video_thumbnail = None
+        else:   # don't panic if we can't find it
+            video_thumbnail = compat_urllib_parse_unquote_plus(video_info['thumbnail_url'][0])
+
+        # upload date
+        upload_date = self._html_search_meta(
+            'datePublished', video_webpage, 'upload date', default=None)
+        if not upload_date:
+            upload_date = self._search_regex(
+                [r'(?s)id="eow-date.*?>(.*?)</span>',
+                 r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'],
+                video_webpage, 'upload date', default=None)
+        upload_date = unified_strdate(upload_date)
+
+        video_license = self._html_search_regex(
+            r'<h4[^>]+class="title"[^>]*>\s*License\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li',
+            video_webpage, 'license', default=None)
+
+        m_music = re.search(
+            r'''(?x)
+                <h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s*
+                <ul[^>]*>\s*
+                <li>(?P<title>.+?)
+                by (?P<creator>.+?)
+                (?:
+                    \(.+?\)|
+                    <a[^>]*
+                        (?:
+                            \bhref=["\']/red[^>]*>|             # drop possible
+                            >\s*Listen ad-free with YouTube Red # YouTube Red ad
+                        )
+                    .*?
+                )?</li
+            ''',
+            video_webpage)
+        if m_music:
+            video_alt_title = remove_quotes(unescapeHTML(m_music.group('title')))
+            video_creator = clean_html(m_music.group('creator'))
+        else:
+            video_alt_title = video_creator = None
+
+        def extract_meta(field):
+            return self._html_search_regex(
+                r'<h4[^>]+class="title"[^>]*>\s*%s\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li>\s*' % field,
+                video_webpage, field, default=None)
+
+        track = extract_meta('Song')
+        artist = extract_meta('Artist')
+
+        m_episode = re.search(
+            r'<div[^>]+id="watch7-headline"[^>]*>\s*<span[^>]*>.*?>(?P<series>[^<]+)</a></b>\s*S(?P<season>\d+)\s*•\s*E(?P<episode>\d+)</span>',
+            video_webpage)
+        if m_episode:
+            series = m_episode.group('series')
+            season_number = int(m_episode.group('season'))
+            episode_number = int(m_episode.group('episode'))
+        else:
+            series = season_number = episode_number = None
+
+        m_cat_container = self._search_regex(
+            r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
+            video_webpage, 'categories', default=None)
+        if m_cat_container:
+            category = self._html_search_regex(
+                r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
+                default=None)
+            video_categories = None if category is None else [category]
+        else:
+            video_categories = None
+
+        video_tags = [
+            unescapeHTML(m.group('content'))
+            for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)]
+
+        def _extract_count(count_name):
+            return str_to_int(self._search_regex(
+                r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>'
+                % re.escape(count_name),
+                video_webpage, count_name, default=None))
+
+        like_count = _extract_count('like')
+        dislike_count = _extract_count('dislike')
+
+        # subtitles
+        video_subtitles = self.extract_subtitles(video_id, video_webpage)
+        automatic_captions = self.extract_automatic_captions(video_id, video_webpage)
+
+        video_duration = try_get(
+            video_info, lambda x: int_or_none(x['length_seconds'][0]))
+        if not video_duration:
+            video_duration = parse_duration(self._html_search_meta(
+                'duration', video_webpage, 'video duration'))
+
+        # annotations
+        video_annotations = None
+        if self._downloader.params.get('writeannotations', False):
+            video_annotations = self._extract_annotations(video_id)
+
+        chapters = self._extract_chapters(description_original, video_duration)
+
         # Look for the DASH manifest
         if self._downloader.params.get('youtube_include_dash_manifest', True):
             dash_mpd_fatal = True
@@ -2050,9 +2078,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
             'uploader_url': video_uploader_url,
             'upload_date': upload_date,
             'license': video_license,
-            'creator': video_creator,
+            'creator': video_creator or artist,
             'title': video_title,
-            'alt_title': video_alt_title,
+            'alt_title': video_alt_title or track,
             'thumbnail': video_thumbnail,
             'description': video_description,
             'categories': video_categories,
@@ -2075,6 +2103,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
             'series': series,
             'season_number': season_number,
             'episode_number': episode_number,
+            'track': track,
+            'artist': artist,
         }
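The new 'quality' field above is computed with the existing qualities() helper from youtube_dl.utils, which turns a preference list into a ranking function. A minimal sketch of its behavior; the assertions are illustrative, not taken from the test suite:

    from youtube_dl.utils import qualities

    q = qualities(['small', 'medium', 'hd720'])
    assert q('hd720') == 2     # later entries in the list rank higher
    assert q('small') == 0
    assert q('unknown') == -1  # labels outside the list sort below everything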
diff --git a/youtube_dl/extractor/zattoo.py b/youtube_dl/extractor/zattoo.py
index 773073d85..fb167c198 100644
--- a/youtube_dl/extractor/zattoo.py
+++ b/youtube_dl/extractor/zattoo.py
@@ -13,6 +13,7 @@ from ..utils import (
     ExtractorError,
     int_or_none,
     try_get,
+    url_or_none,
     urlencode_postdata,
 )
 
@@ -24,7 +25,7 @@ class ZattooBaseIE(InfoExtractor):
     _power_guide_hash = None
 
     def _login(self):
-        (username, password) = self._get_login_info()
+        username, password = self._get_login_info()
         if not username or not password:
             self.raise_login_required(
                 'A valid %s account is needed to access this media.'
@@ -150,8 +151,8 @@ class ZattooBaseIE(InfoExtractor):
             for watch in watch_urls:
                 if not isinstance(watch, dict):
                     continue
-                watch_url = watch.get('url')
-                if not watch_url or not isinstance(watch_url, compat_str):
+                watch_url = url_or_none(watch.get('url'))
+                if not watch_url:
                     continue
                 format_id_list = [stream_type]
                 maxrate = watch.get('maxrate')
diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py
index bb9020c91..afa3f6c47 100644
--- a/youtube_dl/extractor/zdf.py
+++ b/youtube_dl/extractor/zdf.py
@@ -15,6 +15,7 @@ from ..utils import (
     try_get,
     unified_timestamp,
     update_url_query,
+    url_or_none,
     urljoin,
 )
 
@@ -67,8 +68,8 @@ class ZDFIE(ZDFBaseIE):
     def _extract_subtitles(src):
         subtitles = {}
         for caption in try_get(src, lambda x: x['captions'], list) or []:
-            subtitle_url = caption.get('uri')
-            if subtitle_url and isinstance(subtitle_url, compat_str):
+            subtitle_url = url_or_none(caption.get('uri'))
+            if subtitle_url:
                 lang = caption.get('language', 'deu')
                 subtitles.setdefault(lang, []).append({
                     'url': subtitle_url,
@@ -76,8 +77,8 @@ class ZDFIE(ZDFBaseIE):
         return subtitles
 
     def _extract_format(self, video_id, formats, format_urls, meta):
-        format_url = meta.get('url')
-        if not format_url or not isinstance(format_url, compat_str):
+        format_url = url_or_none(meta.get('url'))
+        if not format_url:
             return
         if format_url in format_urls:
             return
@@ -152,7 +153,8 @@ class ZDFIE(ZDFBaseIE):
             content, lambda x: x['teaserImageRef']['layouts'], dict)
         if layouts:
             for layout_key, layout_url in layouts.items():
-                if not isinstance(layout_url, compat_str):
+                layout_url = url_or_none(layout_url)
+                if not layout_url:
                     continue
                 thumbnail = {
                     'url': layout_url,
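Both extractors above now funnel candidate URLs through the new url_or_none() helper (introduced in the utils.py hunk further down) instead of hand-rolled isinstance checks. A minimal usage sketch; the payload dict is hypothetical:

    from youtube_dl.utils import url_or_none

    watch = {'url': ' https://cdn.example.com/stream.m3u8 '}   # hypothetical API response
    watch_url = url_or_none(watch.get('url'))  # strips whitespace, requires scheme:// or //
    if not watch_url:
        pass  # entry is skipped, exactly as in the extractor code above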
diff --git a/youtube_dl/options.py b/youtube_dl/options.py
index b692c6b3b..e7d8e8910 100644
--- a/youtube_dl/options.py
+++ b/youtube_dl/options.py
@@ -203,7 +203,7 @@ def parseOpts(overrideArguments=None):
     network.add_option(
         '--proxy', dest='proxy',
         default=None, metavar='URL',
-        help='Use the specified HTTP/HTTPS/SOCKS proxy. To enable experimental '
+        help='Use the specified HTTP/HTTPS/SOCKS proxy. To enable '
              'SOCKS proxy, specify a proper scheme. For example '
              'socks5://127.0.0.1:1080/. Pass in an empty string (--proxy "") '
              'for direct connection')
@@ -240,19 +240,19 @@ def parseOpts(overrideArguments=None):
     geo.add_option(
         '--geo-bypass',
         action='store_true', dest='geo_bypass', default=True,
-        help='Bypass geographic restriction via faking X-Forwarded-For HTTP header (experimental)')
+        help='Bypass geographic restriction via faking X-Forwarded-For HTTP header')
     geo.add_option(
         '--no-geo-bypass',
         action='store_false', dest='geo_bypass', default=True,
-        help='Do not bypass geographic restriction via faking X-Forwarded-For HTTP header (experimental)')
+        help='Do not bypass geographic restriction via faking X-Forwarded-For HTTP header')
     geo.add_option(
         '--geo-bypass-country', metavar='CODE',
         dest='geo_bypass_country', default=None,
-        help='Force bypass geographic restriction with explicitly provided two-letter ISO 3166-2 country code (experimental)')
+        help='Force bypass geographic restriction with explicitly provided two-letter ISO 3166-2 country code')
     geo.add_option(
         '--geo-bypass-ip-block', metavar='IP_BLOCK',
         dest='geo_bypass_ip_block', default=None,
-        help='Force bypass geographic restriction with explicitly provided IP block in CIDR notation (experimental)')
+        help='Force bypass geographic restriction with explicitly provided IP block in CIDR notation')
 
     selection = optparse.OptionGroup(parser, 'Video Selection')
     selection.add_option(
@@ -502,7 +502,7 @@ def parseOpts(overrideArguments=None):
     downloader.add_option(
         '--xattr-set-filesize',
         dest='xattr_set_filesize', action='store_true',
-        help='Set file xattribute ytdl.filesize with expected file size (experimental)')
+        help='Set file xattribute ytdl.filesize with expected file size')
     downloader.add_option(
         '--hls-prefer-native',
         dest='hls_prefer_native', action='store_true', default=None,
@@ -841,11 +841,11 @@ def parseOpts(overrideArguments=None):
     postproc.add_option(
         '--prefer-avconv',
         action='store_false', dest='prefer_ffmpeg',
-        help='Prefer avconv over ffmpeg for running the postprocessors (default)')
+        help='Prefer avconv over ffmpeg for running the postprocessors')
     postproc.add_option(
         '--prefer-ffmpeg',
         action='store_true', dest='prefer_ffmpeg',
-        help='Prefer ffmpeg over avconv for running the postprocessors')
+        help='Prefer ffmpeg over avconv for running the postprocessors (default)')
     postproc.add_option(
         '--ffmpeg-location', '--avconv-location', metavar='PATH',
         dest='ffmpeg_location',
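The geo-bypass and proxy switches that lose their '(experimental)' label above are also reachable through the embedding API. A sketch assuming the standard YoutubeDL params of this release; the option values are placeholders:

    import youtube_dl

    ydl_opts = {
        'proxy': 'socks5://127.0.0.1:1080/',  # SOCKS schemes, no longer experimental
        'geo_bypass_country': 'DE',           # two-letter ISO 3166-2 code
    }
    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        ydl.download(['https://www.youtube.com/watch?v=BaW_jenozKc'])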
diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py
index 3ea1afcf3..757b496a1 100644
--- a/youtube_dl/postprocessor/ffmpeg.py
+++ b/youtube_dl/postprocessor/ffmpeg.py
@@ -77,7 +77,7 @@ class FFmpegPostProcessor(PostProcessor):
 
     def _determine_executables(self):
         programs = ['avprobe', 'avconv', 'ffmpeg', 'ffprobe']
-        prefer_ffmpeg = False
+        prefer_ffmpeg = True
 
         self.basename = None
         self.probe_basename = None
@@ -85,7 +85,7 @@ class FFmpegPostProcessor(PostProcessor):
         self._paths = None
         self._versions = None
         if self._downloader:
-            prefer_ffmpeg = self._downloader.params.get('prefer_ffmpeg', False)
+            prefer_ffmpeg = self._downloader.params.get('prefer_ffmpeg', True)
             location = self._downloader.params.get('ffmpeg_location')
             if location is not None:
                 if not os.path.exists(location):
@@ -117,19 +117,19 @@ class FFmpegPostProcessor(PostProcessor):
                 (p, get_exe_version(p, args=['-version'])) for p in programs)
             self._paths = dict((p, p) for p in programs)
 
-        if prefer_ffmpeg:
-            prefs = ('ffmpeg', 'avconv')
-        else:
+        if prefer_ffmpeg is False:
             prefs = ('avconv', 'ffmpeg')
+        else:
+            prefs = ('ffmpeg', 'avconv')
         for p in prefs:
             if self._versions[p]:
                 self.basename = p
                 break
 
-        if prefer_ffmpeg:
-            prefs = ('ffprobe', 'avprobe')
-        else:
+        if prefer_ffmpeg is False:
             prefs = ('avprobe', 'ffprobe')
+        else:
+            prefs = ('ffprobe', 'avprobe')
         for p in prefs:
             if self._versions[p]:
                 self.probe_basename = p
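The net effect of the ffmpeg.py changes is that ffmpeg/ffprobe now win by default, and avconv is only tried first when the preference is explicitly disabled (prefer_ffmpeg=False, i.e. --prefer-avconv). A standalone sketch of the rewritten selection logic; the helper name and version map are hypothetical:

    def pick_basename(versions, prefer_ffmpeg=True):
        # Mirrors FFmpegPostProcessor._determine_executables above.
        if prefer_ffmpeg is False:
            prefs = ('avconv', 'ffmpeg')
        else:
            prefs = ('ffmpeg', 'avconv')
        for p in prefs:
            if versions.get(p):  # a detected version string means the binary exists
                return p
        return None

    assert pick_basename({'ffmpeg': '4.0', 'avconv': '12.3'}) == 'ffmpeg'
    assert pick_basename({'ffmpeg': '4.0', 'avconv': '12.3'}, prefer_ffmpeg=False) == 'avconv'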
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index 608226578..22a962dc2 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -82,7 +82,7 @@ def register_socks_protocols():
 compiled_regex_type = type(re.compile(''))
 
 std_headers = {
-    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:59.0) Gecko/20100101 Firefox/59.0 (Chrome)',
+    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:59.0) Gecko/20100101 Firefox/59.0',
     'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
     'Accept-Encoding': 'gzip, deflate',
@@ -184,6 +184,7 @@ DATE_FORMATS_MONTH_FIRST.extend([
 ])
 
 PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
+JSON_LD_RE = r'(?is)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>'
 
 
 def preferredencoding():
@@ -1230,7 +1231,7 @@ def unified_timestamp(date_str, day_first=True):
 
 
 def determine_ext(url, default_ext='unknown_video'):
-    if url is None:
+    if url is None or '.' not in url:
         return default_ext
     guess = url.partition('?')[0].rpartition('.')[2]
     if re.match(r'^[A-Za-z0-9]+$', guess):
@@ -1867,6 +1868,13 @@ def strip_or_none(v):
     return None if v is None else v.strip()
 
 
+def url_or_none(url):
+    if not url or not isinstance(url, compat_str):
+        return None
+    url = url.strip()
+    return url if re.match(r'^(?:[a-zA-Z][\da-zA-Z.+-]*:)?//', url) else None
+
+
 def parse_duration(s):
     if not isinstance(s, compat_basestring):
         return None
@@ -2274,13 +2282,16 @@ def parse_age_limit(s):
         return int(m.group('age'))
     if s in US_RATINGS:
         return US_RATINGS[s]
-    return TV_PARENTAL_GUIDELINES.get(s)
+    m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
+    if m:
+        return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
+    return None
 
 
 def strip_jsonp(code):
     return re.sub(
         r'''(?sx)^
-            (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]+)
+            (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
             (?:\s*&&\s*(?P=func_name))?
             \s*\(\s*(?P<callback_data>.*)\);?
             \s*?(?://[^\n]*)*$''',
@@ -2666,6 +2677,7 @@ def dfxp2srt(dfxp_data):
     ]
 
     _x = functools.partial(xpath_with_ns, ns_map={
+        'xml': 'http://www.w3.org/XML/1998/namespace',
         'ttml': 'http://www.w3.org/ns/ttml',
         'tts': 'http://www.w3.org/ns/ttml#styling',
     })
@@ -2757,7 +2769,9 @@ def dfxp2srt(dfxp_data):
     repeat = False
     while True:
         for style in dfxp.findall(_x('.//ttml:style')):
-            style_id = style.get('id')
+            style_id = style.get('id') or style.get(_x('xml:id'))
+            if not style_id:
+                continue
             parent_style_id = style.get('style')
             if parent_style_id:
                 if parent_style_id not in styles:
@@ -3557,7 +3571,7 @@ class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
                 setattr(self, '%s_open' % type,
                         lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
                             meth(r, proxy, type))
-        return compat_urllib_request.ProxyHandler.__init__(self, proxies)
+        compat_urllib_request.ProxyHandler.__init__(self, proxies)
 
     def proxy_open(self, req, proxy, type):
         req_proxy = req.headers.get('Ytdl-request-proxy')
diff --git a/youtube_dl/version.py b/youtube_dl/version.py
index 6f47b1795..67394fa01 100644
--- a/youtube_dl/version.py
+++ b/youtube_dl/version.py
@@ -1,3 +1,3 @@
 from __future__ import unicode_literals
 
-__version__ = '2018.05.09'
+__version__ = '2018.08.04'
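Taken together, the utils.py changes above tighten input validation. A few illustrative calls, with expected results inferred from the new code rather than from the project's test suite (the URLs are placeholders):

    from youtube_dl.utils import determine_ext, parse_age_limit, url_or_none

    url_or_none('//cdn.example.com/a.mp4')  # kept: protocol-relative URLs match
    url_or_none('not a url')                # None: no scheme:// or // prefix
    determine_ext('http://example')         # 'unknown_video': dot-less URLs now short-circuit
    parse_age_limit('TV_MA')                # 17: 'TV_MA' and 'TVMA' now map like 'TV-MA'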