diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index bf9494646..1fb878b59 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.04.06*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.04.06** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.05.10*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.05.10** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ $ youtube-dl -v [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2016.04.06 +[debug] youtube-dl version 2016.05.10 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.gitignore b/.gitignore index 72c10425d..d5f216b5f 100644 --- a/.gitignore +++ b/.gitignore @@ -31,7 +31,9 @@ updates_key.pem *.part *.swp test/testdata +test/local_parameters.json .tox youtube-dl.zsh .idea .idea/* +tmp/ diff --git a/.travis.yml b/.travis.yml index cc21fae8f..998995845 100644 --- a/.travis.yml +++ b/.travis.yml @@ -7,6 +7,9 @@ python: - "3.4" - "3.5" sudo: false +install: + - bash ./devscripts/install_srelay.sh + - export PATH=$PATH:$(pwd)/tmp/srelay-0.4.8b6 script: nosetests test --verbose notifications: email: diff --git a/AUTHORS b/AUTHORS index ea8d39978..5ca71ace7 100644 --- a/AUTHORS +++ b/AUTHORS @@ -167,3 +167,8 @@ Kacper Michajłow José Joaquín Atria Viťas Strádal Kagami Hiiragi +Philip Huppert +blahgeek +Kevin Deldycke +inondle +Tomáš Čech diff --git a/Makefile b/Makefile index 06cffcb71..5d7cd5a7e 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ all: youtube-dl README.md CONTRIBUTING.md README.txt youtube-dl.1 youtube-dl.bash-completion youtube-dl.zsh youtube-dl.fish supportedsites clean: - rm -rf youtube-dl.1.temp.md youtube-dl.1 youtube-dl.bash-completion README.txt MANIFEST build/ dist/ .coverage cover/ youtube-dl.tar.gz youtube-dl.zsh youtube-dl.fish youtube_dl/extractor/lazy_extractors.py *.dump *.part *.info.json *.mp4 *.flv *.mp3 *.avi CONTRIBUTING.md.tmp ISSUE_TEMPLATE.md.tmp youtube-dl youtube-dl.exe + rm -rf youtube-dl.1.temp.md youtube-dl.1 youtube-dl.bash-completion README.txt MANIFEST build/ dist/ .coverage cover/ youtube-dl.tar.gz youtube-dl.zsh youtube-dl.fish youtube_dl/extractor/lazy_extractors.py *.dump *.part *.info.json *.mp4 *.flv *.mp3 *.avi *.mkv *.webm CONTRIBUTING.md.tmp ISSUE_TEMPLATE.md.tmp youtube-dl youtube-dl.exe find . -name "*.pyc" -delete find . 
-name "*.class" -delete @@ -37,7 +37,7 @@ test: ot: offlinetest offlinetest: codetest - $(PYTHON) -m nose --verbose test --exclude test_download.py --exclude test_age_restriction.py --exclude test_subtitles.py --exclude test_write_annotations.py --exclude test_youtube_lists.py --exclude test_iqiyi_sdk_interpreter.py + $(PYTHON) -m nose --verbose test --exclude test_download.py --exclude test_age_restriction.py --exclude test_subtitles.py --exclude test_write_annotations.py --exclude test_youtube_lists.py --exclude test_iqiyi_sdk_interpreter.py --exclude test_socks.py tar: youtube-dl.tar.gz diff --git a/README.md b/README.md index cd18edd87..4ef6b6d5a 100644 --- a/README.md +++ b/README.md @@ -85,9 +85,11 @@ which means you can modify it, redistribute it or use it however you like. --no-color Do not emit color codes in output ## Network Options: - --proxy URL Use the specified HTTP/HTTPS proxy. Pass in - an empty string (--proxy "") for direct - connection + --proxy URL Use the specified HTTP/HTTPS/SOCKS proxy. + To enable experimental SOCKS proxy, specify + a proper scheme. For example + socks5://127.0.0.1:1080/. Pass in an empty + string (--proxy "") for direct connection --socket-timeout SECONDS Time to wait before giving up, in seconds --source-address IP Client-side IP address to bind to (experimental) @@ -176,7 +178,9 @@ which means you can modify it, redistribute it or use it however you like. --xattr-set-filesize Set file xattribute ytdl.filesize with expected filesize (experimental) --hls-prefer-native Use the native HLS downloader instead of - ffmpeg (experimental) + ffmpeg + --hls-prefer-ffmpeg Use ffmpeg instead of the native HLS + downloader --hls-use-mpegts Use the mpegts container for HLS videos, allowing to play the video while downloading (some players may not be able @@ -463,7 +467,7 @@ The basic usage is not to set any template arguments when downloading a single f - `display_id`: An alternative identifier for the video - `uploader`: Full name of the video uploader - `license`: License name the video is licensed under - - `creator`: The main artist who created the video + - `creator`: The creator of the video - `release_date`: The date (YYYYMMDD) when the video was released - `timestamp`: UNIX timestamp of the moment the video became available - `upload_date`: Video upload date (YYYYMMDD) @@ -515,6 +519,18 @@ Available for the video that is an episode of some series or programme: - `episode_number`: Number of the video episode within a season - `episode_id`: Id of the video episode +Available for the media that is a track or a part of a music album: + - `track`: Title of the track + - `track_number`: Number of the track within an album or a disc + - `track_id`: Id of the track + - `artist`: Artist(s) of the track + - `genre`: Genre(s) of the track + - `album`: Title of the album the track belongs to + - `album_type`: Type of the album + - `album_artist`: List of all artists appeared on the album + - `disc_number`: Number of the disc or other physical medium the track belongs to + - `release_year`: Year (YYYY) when the album was released + Each aforementioned sequence when referenced in output template will be replaced by the actual value corresponding to the sequence name. Note that some of the sequences are not guaranteed to be present since they depend on the metadata obtained by particular extractor, such sequences will be replaced with `NA`. 
For example for `-o %(title)s-%(id)s.%(ext)s` and mp4 video with title `youtube-dl test video` and id `BaW_jenozKcj` this will result in a `youtube-dl test video-BaW_jenozKcj.mp4` file created in the current directory. @@ -683,6 +699,10 @@ YouTube changed their playlist format in March 2014 and later on, so you'll need If you have installed youtube-dl with a package manager, pip, setup.py or a tarball, please use that to update. Note that Ubuntu packages do not seem to get updated anymore. Since we are not affiliated with Ubuntu, there is little we can do. Feel free to [report bugs](https://bugs.launchpad.net/ubuntu/+source/youtube-dl/+filebug) to the [Ubuntu packaging guys](mailto:ubuntu-motu@lists.ubuntu.com?subject=outdated%20version%20of%20youtube-dl) - all they have to do is update the package to a somewhat recent version. See above for a way to update. +### I'm getting an error when trying to use output template: `error: using output template conflicts with using title, video ID or auto number` + +Make sure you are not using `-o` with any of these options `-t`, `--title`, `--id`, `-A` or `--auto-number` set in command line or in a configuration file. Remove the latter if any. + ### Do I always have to pass `-citw`? By default, youtube-dl intends to have the best options (incidentally, if you have a convincing case that these should be different, [please file an issue where you explain that](https://yt-dl.org/bug)). Therefore, it is unnecessary and sometimes harmful to copy long option strings from webpages. In particular, the only option out of `-citw` that is regularly useful is `-i`. @@ -703,7 +723,7 @@ Videos or video formats streamed via RTMP protocol can only be downloaded when [ ### I have downloaded a video but how can I play it? -Once the video is fully downloaded, use any video player, such as [vlc](http://www.videolan.org) or [mplayer](http://www.mplayerhq.hu/). +Once the video is fully downloaded, use any video player, such as [mpv](https://mpv.io/), [vlc](http://www.videolan.org) or [mplayer](http://www.mplayerhq.hu/). ### I extracted a video URL with `-g`, but it does not play on another machine / in my webbrowser. 
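For embedding use, the new `--proxy` scheme and the `-o` output template shown above map directly onto `YoutubeDL` parameters (`proxy` and `outtmpl`); a minimal sketch, assuming a local SOCKS5 server listening on 127.0.0.1:1080:

```python
from __future__ import unicode_literals
import youtube_dl

ydl_opts = {
    # experimental SOCKS support: the URL scheme (socks4/socks4a/socks5) selects the proxy type
    'proxy': 'socks5://127.0.0.1:1080/',
    # fields missing from the extracted metadata are replaced with 'NA'
    'outtmpl': '%(title)s-%(id)s.%(ext)s',
}
with youtube_dl.YoutubeDL(ydl_opts) as ydl:
    ydl.download(['http://www.youtube.com/watch?v=BaW_jenozKcj'])
```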
diff --git a/devscripts/install_srelay.sh b/devscripts/install_srelay.sh new file mode 100755 index 000000000..33ce8a3f7 --- /dev/null +++ b/devscripts/install_srelay.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +mkdir -p tmp && cd tmp +wget -N http://downloads.sourceforge.net/project/socks-relay/socks-relay/srelay-0.4.8/srelay-0.4.8b6.tar.gz +tar zxvf srelay-0.4.8b6.tar.gz +cd srelay-0.4.8b6 +./configure +make diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 32e900660..c94c8b3ff 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -50,6 +50,7 @@ - **arte.tv:ddc** - **arte.tv:embed** - **arte.tv:future** + - **arte.tv:info** - **arte.tv:magazine** - **AtresPlayer** - **ATTTechChannel** @@ -76,6 +77,7 @@ - **Bild**: Bild.de - **BiliBili** - **BioBioChileTV** + - **BIQLE** - **BleacherReport** - **BleacherReportCMS** - **blinkx** @@ -115,6 +117,7 @@ - **Cinemassacre** - **Clipfish** - **cliphunter** + - **ClipRs** - **Clipsyndicate** - **cloudtime**: CloudTime - **Cloudy** @@ -143,6 +146,7 @@ - **culturebox.francetvinfo.fr** - **CultureUnplugged** - **CWTV** + - **DailyMail** - **dailymotion** - **dailymotion:playlist** - **dailymotion:user** @@ -161,6 +165,7 @@ - **defense.gouv.fr** - **democracynow** - **DHM**: Filmarchiv - Deutsches Historisches Museum + - **DigitallySpeaking** - **Digiteka** - **Discovery** - **Dotsub** @@ -172,7 +177,6 @@ - **Dropbox** - **DrTuber** - **DRTV** - - **Dump** - **Dumpert** - **dvtv**: http://video.aktualne.cz/ - **dw** @@ -286,7 +290,6 @@ - **ivi:compilation**: ivi.ru compilations - **ivideon**: Ivideon TV - **Izlesene** - - **JadoreCettePub** - **JeuxVideo** - **Jove** - **jpopsuki.tv** @@ -324,6 +327,7 @@ - **limelight** - **limelight:channel** - **limelight:channel_list** + - **LiTV** - **LiveLeak** - **livestream** - **livestream:original** @@ -337,26 +341,28 @@ - **mailru**: Видео@Mail.Ru - **MakersChannel** - **MakerTV** - - **Malemotion** - **MatchTV** - **MDR**: MDR.DE and KiKA - **media.ccc.de** - **metacafe** - **Metacritic** - **Mgoon** + - **MGTV**: 芒果TV - **Minhateca** - **MinistryGrid** - **Minoto** - **miomio.tv** - **MiTele**: mitele.es - **mixcloud** + - **mixcloud:playlist** + - **mixcloud:stream** + - **mixcloud:user** - **MLB** - **Mnet** - **MoeVideo**: LetitBit video services: moevideo.net, playreplay.net and videochart.net - **Mofosex** - **Mojvideo** - **Moniker**: allmyvideos.net and vidspot.net - - **mooshare**: Mooshare.biz - **Morningstar**: morningstar.com - **Motherless** - **Motorsport**: motorsport.com @@ -371,8 +377,10 @@ - **mtvservices:embedded** - **MuenchenTV**: münchen.tv - **MusicPlayOn** - - **muzu.tv** + - **mva**: Microsoft Virtual Academy videos + - **mva:course**: Microsoft Virtual Academy courses - **Mwave** + - **MwaveMeetGreet** - **MySpace** - **MySpace:album** - **MySpass** @@ -393,7 +401,6 @@ - **ndr:embed:base** - **NDTV** - **NerdCubedFeed** - - **Nerdist** - **netease:album**: 网易云音乐 - 专辑 - **netease:djradio**: 网易云音乐 - 电台 - **netease:mv**: 网易云音乐 - MV @@ -411,7 +418,8 @@ - **nfl.com** - **nhl.com** - **nhl.com:news**: NHL news - - **nhl.com:videocenter**: NHL videocenter category + - **nhl.com:videocenter** + - **nhl.com:videocenter:category**: NHL videocenter category - **nick.com** - **niconico**: ニコニコ動画 - **NiconicoPlaylist** @@ -459,13 +467,14 @@ - **Patreon** - **pbs**: Public Broadcasting Service (PBS) and member stations: PBS: Public Broadcasting Service, APT - Alabama Public Television (WBIQ), GPB/Georgia Public Broadcasting (WGTV), Mississippi Public Broadcasting (WMPN), 
Nashville Public Television (WNPT), WFSU-TV (WFSU), WSRE (WSRE), WTCI (WTCI), WPBA/Channel 30 (WPBA), Alaska Public Media (KAKM), Arizona PBS (KAET), KNME-TV/Channel 5 (KNME), Vegas PBS (KLVX), AETN/ARKANSAS ETV NETWORK (KETS), KET (WKLE), WKNO/Channel 10 (WKNO), LPB/LOUISIANA PUBLIC BROADCASTING (WLPB), OETA (KETA), Ozarks Public Television (KOZK), WSIU Public Broadcasting (WSIU), KEET TV (KEET), KIXE/Channel 9 (KIXE), KPBS San Diego (KPBS), KQED (KQED), KVIE Public Television (KVIE), PBS SoCal/KOCE (KOCE), ValleyPBS (KVPT), CONNECTICUT PUBLIC TELEVISION (WEDH), KNPB Channel 5 (KNPB), SOPTV (KSYS), Rocky Mountain PBS (KRMA), KENW-TV3 (KENW), KUED Channel 7 (KUED), Wyoming PBS (KCWC), Colorado Public Television / KBDI 12 (KBDI), KBYU-TV (KBYU), Thirteen/WNET New York (WNET), WGBH/Channel 2 (WGBH), WGBY (WGBY), NJTV Public Media NJ (WNJT), WLIW21 (WLIW), mpt/Maryland Public Television (WMPB), WETA Television and Radio (WETA), WHYY (WHYY), PBS 39 (WLVT), WVPT - Your Source for PBS and More! (WVPT), Howard University Television (WHUT), WEDU PBS (WEDU), WGCU Public Media (WGCU), WPBT2 (WPBT), WUCF TV (WUCF), WUFT/Channel 5 (WUFT), WXEL/Channel 42 (WXEL), WLRN/Channel 17 (WLRN), WUSF Public Broadcasting (WUSF), ETV (WRLK), UNC-TV (WUNC), PBS Hawaii - Oceanic Cable Channel 10 (KHET), Idaho Public Television (KAID), KSPS (KSPS), OPB (KOPB), KWSU/Channel 10 & KTNW/Channel 31 (KWSU), WILL-TV (WILL), Network Knowledge - WSEC/Springfield (WSEC), WTTW11 (WTTW), Iowa Public Television/IPTV (KDIN), Nine Network (KETC), PBS39 Fort Wayne (WFWA), WFYI Indianapolis (WFYI), Milwaukee Public Television (WMVS), WNIN (WNIN), WNIT Public Television (WNIT), WPT (WPNE), WVUT/Channel 22 (WVUT), WEIU/Channel 51 (WEIU), WQPT-TV (WQPT), WYCC PBS Chicago (WYCC), WIPB-TV (WIPB), WTIU (WTIU), CET (WCET), ThinkTVNetwork (WPTD), WBGU-TV (WBGU), WGVU TV (WGVU), NET1 (KUON), Pioneer Public Television (KWCM), SDPB Television (KUSD), TPT (KTCA), KSMQ (KSMQ), KPTS/Channel 8 (KPTS), KTWU/Channel 11 (KTWU), East Tennessee PBS (WSJK), WCTE-TV (WCTE), WLJT, Channel 11 (WLJT), WOSU TV (WOSU), WOUB/WOUC (WOUB), WVPB (WVPB), WKYU-PBS (WKYU), KERA 13 (KERA), MPBN (WCBB), Mountain Lake PBS (WCFE), NHPTV (WENH), Vermont PBS (WETK), witf (WITF), WQED Multimedia (WQED), WMHT Educational Telecommunications (WMHT), Q-TV (WDCQ), WTVS Detroit Public TV (WTVS), CMU Public Television (WCMU), WKAR-TV (WKAR), WNMU-TV Public TV 13 (WNMU), WDSE - WRPT (WDSE), WGTE TV (WGTE), Lakeland Public Television (KAWE), KMOS-TV - Channels 6.1, 6.2 and 6.3 (KMOS), MontanaPBS (KUSM), KRWG/Channel 22 (KRWG), KACV (KACV), KCOS/Channel 13 (KCOS), WCNY/Channel 24 (WCNY), WNED (WNED), WPBS (WPBS), WSKG Public TV (WSKG), WXXI (WXXI), WPSU (WPSU), WVIA Public Media Studios (WVIA), WTVI (WTVI), Western Reserve PBS (WNEO), WVIZ/PBS ideastream (WVIZ), KCTS 9 (KCTS), Basin PBS (KPBT), KUHT / Channel 8 (KUHT), KLRN (KLRN), KLRU (KLRU), WTJX Channel 12 (WTJX), WCVE PBS (WCVE), KBTC Public Television (KBTC) - **pcmag** - - **Periscope**: Periscope + - **People** + - **periscope**: Periscope + - **periscope:user**: Periscope user videos - **PhilharmonieDeParis**: Philharmonie de Paris - **phoenix.de** - **Photobucket** - **Pinkbike** - **Pladform** - - **PlanetaPlay** - **play.fm** - **played.to** - **PlaysTV** @@ -484,6 +493,7 @@ - **Pornotube** - **PornoVoisines** - **PornoXO** + - **PressTV** - **PrimeShareTV** - **PromptFile** - **prosiebensat1**: ProSiebenSat.1 Digital @@ -494,7 +504,6 @@ - **qqmusic:playlist**: QQ音乐 - 歌单 - **qqmusic:singer**: QQ音乐 - 歌手 - 
**qqmusic:toplist**: QQ音乐 - 排行榜 - - **QuickVid** - **R7** - **radio.de** - **radiobremen** @@ -550,7 +559,6 @@ - **SenateISVP** - **ServingSys** - **Sexu** - - **SexyKarma**: Sexy Karma and Watch Indian Porn - **Shahid** - **Shared**: shared.sx and vivo.sx - **ShareSix** @@ -563,8 +571,6 @@ - **smotri:broadcast**: Smotri.com broadcasts - **smotri:community**: Smotri.com community videos - **smotri:user**: Smotri.com user videos - - **SnagFilms** - - **SnagFilmsEmbed** - **Snotr** - **Sohu** - **soundcloud** @@ -606,8 +612,10 @@ - **Syfy** - **SztvHu** - **Tagesschau** + - **tagesschau:player** - **Tapely** - **Tass** + - **TDSLifeway** - **teachertube**: teachertube.com videos - **teachertube:user:collection**: teachertube.com user and collection videos - **TeachingChannel** @@ -624,7 +632,6 @@ - **TeleTask** - **TF1** - **TheIntercept** - - **TheOnion** - **ThePlatform** - **ThePlatformFeed** - **TheScene** @@ -684,7 +691,6 @@ - **twitter** - **twitter:amplify** - **twitter:card** - - **Ubu** - **udemy** - **udemy:course** - **UDNEmbed**: 聯合影音 @@ -701,6 +707,7 @@ - **Vessel** - **Vesti**: Вести.Ru - **Vevo** + - **VevoPlaylist** - **VGTV**: VGTV, BTTV, FTV, Aftenposten and Aftonbladet - **vh1.com** - **Vice** @@ -723,6 +730,8 @@ - **Vidzi** - **vier** - **vier:videos** + - **ViewLift** + - **ViewLiftEmbed** - **Viewster** - **Viidea** - **viki** @@ -754,7 +763,7 @@ - **Walla** - **WashingtonPost** - **wat.tv** - - **WayOfTheMaster** + - **WatchIndianPorn**: Watch Indian Porn - **WDR** - **wdr:mobile** - **WDRMaus**: Sendung mit der Maus @@ -771,9 +780,13 @@ - **WSJ**: Wall Street Journal - **XBef** - **XboxClips** - - **XFileShare**: XFileShare based sites: GorillaVid.in, daclips.in, movpod.in, fastvideo.in, realvid.net, filehoot.com and vidto.me + - **XFileShare**: XFileShare based sites: DaClips, FileHoot, GorillaVid, MovPod, PowerWatch, Rapidvideo.ws, TheVideoBee, Vidto, Streamin.To - **XHamster** - **XHamsterEmbed** + - **xiami:album**: 虾米音乐 - 专辑 + - **xiami:artist**: 虾米音乐 - 歌手 + - **xiami:collection**: 虾米音乐 - 精选集 + - **xiami:song**: 虾米音乐 - **XMinus** - **XNXX** - **Xstream** diff --git a/test/helper.py b/test/helper.py index f2d878212..dfee217a9 100644 --- a/test/helper.py +++ b/test/helper.py @@ -24,8 +24,13 @@ from youtube_dl.utils import ( def get_params(override=None): PARAMETERS_FILE = os.path.join(os.path.dirname(os.path.abspath(__file__)), "parameters.json") + LOCAL_PARAMETERS_FILE = os.path.join(os.path.dirname(os.path.abspath(__file__)), + "local_parameters.json") with io.open(PARAMETERS_FILE, encoding='utf-8') as pf: parameters = json.load(pf) + if os.path.exists(LOCAL_PARAMETERS_FILE): + with io.open(LOCAL_PARAMETERS_FILE, encoding='utf-8') as pf: + parameters.update(json.load(pf)) if override: parameters.update(override) return parameters @@ -143,6 +148,9 @@ def expect_value(self, got, expected, field): expect_value(self, item_got, item_expected, field) else: if isinstance(expected, compat_str) and expected.startswith('md5:'): + self.assertTrue( + isinstance(got, compat_str), + 'Expected field %s to be a unicode object, but got value %r of type %r' % (field, got, type(got))) got = 'md5:' + md5(got) elif isinstance(expected, compat_str) and expected.startswith('mincount:'): self.assertTrue( diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py index 938466a80..6404ac89f 100644 --- a/test/test_InfoExtractor.py +++ b/test/test_InfoExtractor.py @@ -11,6 +11,7 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from test.helper 
import FakeYDL from youtube_dl.extractor.common import InfoExtractor from youtube_dl.extractor import YoutubeIE, get_info_extractor +from youtube_dl.utils import encode_data_uri, strip_jsonp, ExtractorError class TestIE(InfoExtractor): @@ -66,5 +67,14 @@ class TestInfoExtractor(unittest.TestCase): self.assertEqual(ie._html_search_meta('e', html), '5') self.assertEqual(ie._html_search_meta('f', html), '6') + def test_download_json(self): + uri = encode_data_uri(b'{"foo": "blah"}', 'application/json') + self.assertEqual(self.ie._download_json(uri, None), {'foo': 'blah'}) + uri = encode_data_uri(b'callback({"foo": "blah"})', 'application/javascript') + self.assertEqual(self.ie._download_json(uri, None, transform_source=strip_jsonp), {'foo': 'blah'}) + uri = encode_data_uri(b'{"foo": invalid}', 'application/json') + self.assertRaises(ExtractorError, self.ie._download_json, uri, None) + self.assertEqual(self.ie._download_json(uri, None, fatal=False), None) + if __name__ == '__main__': unittest.main() diff --git a/test/test_compat.py b/test/test_compat.py index 618668210..539b30540 100644 --- a/test/test_compat.py +++ b/test/test_compat.py @@ -10,13 +10,14 @@ import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from youtube_dl.utils import get_filesystem_encoding from youtube_dl.compat import ( compat_getenv, + compat_setenv, compat_etree_fromstring, compat_expanduser, compat_shlex_split, compat_str, + compat_struct_unpack, compat_urllib_parse_unquote, compat_urllib_parse_unquote_plus, compat_urllib_parse_urlencode, @@ -26,19 +27,22 @@ from youtube_dl.compat import ( class TestCompat(unittest.TestCase): def test_compat_getenv(self): test_str = 'тест' - os.environ['YOUTUBE-DL-TEST'] = ( - test_str if sys.version_info >= (3, 0) - else test_str.encode(get_filesystem_encoding())) + compat_setenv('YOUTUBE-DL-TEST', test_str) self.assertEqual(compat_getenv('YOUTUBE-DL-TEST'), test_str) + def test_compat_setenv(self): + test_var = 'YOUTUBE-DL-TEST' + test_str = 'тест' + compat_setenv(test_var, test_str) + compat_getenv(test_var) + self.assertEqual(compat_getenv(test_var), test_str) + def test_compat_expanduser(self): old_home = os.environ.get('HOME') test_str = 'C:\Documents and Settings\тест\Application Data' - os.environ['HOME'] = ( - test_str if sys.version_info >= (3, 0) - else test_str.encode(get_filesystem_encoding())) + compat_setenv('HOME', test_str) self.assertEqual(compat_expanduser('~'), test_str) - os.environ['HOME'] = old_home + compat_setenv('HOME', old_home or '') def test_all_present(self): import youtube_dl.compat @@ -99,5 +103,9 @@ class TestCompat(unittest.TestCase): self.assertTrue(isinstance(doc.find('chinese').text, compat_str)) self.assertTrue(isinstance(doc.find('foo/bar').text, compat_str)) + def test_struct_unpack(self): + self.assertEqual(compat_struct_unpack('!B', b'\x00'), (0,)) + + if __name__ == '__main__': unittest.main() diff --git a/test/test_socks.py b/test/test_socks.py new file mode 100644 index 000000000..d07003ceb --- /dev/null +++ b/test/test_socks.py @@ -0,0 +1,107 @@ +#!/usr/bin/env python +# coding: utf-8 +from __future__ import unicode_literals + +# Allow direct execution +import os +import sys +import unittest +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +import random +import subprocess + +from test.helper import ( + FakeYDL, + get_params, +) +from youtube_dl.compat import ( + compat_str, + compat_urllib_request, +) + + +class TestMultipleSocks(unittest.TestCase): + 
@staticmethod + def _check_params(attrs): + params = get_params() + for attr in attrs: + if attr not in params: + print('Missing %s. Skipping.' % attr) + return + return params + + def test_proxy_http(self): + params = self._check_params(['primary_proxy', 'primary_server_ip']) + if params is None: + return + ydl = FakeYDL({ + 'proxy': params['primary_proxy'] + }) + self.assertEqual( + ydl.urlopen('http://yt-dl.org/ip').read().decode('utf-8'), + params['primary_server_ip']) + + def test_proxy_https(self): + params = self._check_params(['primary_proxy', 'primary_server_ip']) + if params is None: + return + ydl = FakeYDL({ + 'proxy': params['primary_proxy'] + }) + self.assertEqual( + ydl.urlopen('https://yt-dl.org/ip').read().decode('utf-8'), + params['primary_server_ip']) + + def test_secondary_proxy_http(self): + params = self._check_params(['secondary_proxy', 'secondary_server_ip']) + if params is None: + return + ydl = FakeYDL() + req = compat_urllib_request.Request('http://yt-dl.org/ip') + req.add_header('Ytdl-request-proxy', params['secondary_proxy']) + self.assertEqual( + ydl.urlopen(req).read().decode('utf-8'), + params['secondary_server_ip']) + + def test_secondary_proxy_https(self): + params = self._check_params(['secondary_proxy', 'secondary_server_ip']) + if params is None: + return + ydl = FakeYDL() + req = compat_urllib_request.Request('https://yt-dl.org/ip') + req.add_header('Ytdl-request-proxy', params['secondary_proxy']) + self.assertEqual( + ydl.urlopen(req).read().decode('utf-8'), + params['secondary_server_ip']) + + +class TestSocks(unittest.TestCase): + def setUp(self): + self.port = random.randint(20000, 30000) + self.server_process = subprocess.Popen([ + 'srelay', '-f', '-i', '127.0.0.1:%d' % self.port], + stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + + def tearDown(self): + self.server_process.terminate() + self.server_process.communicate() + + def _get_ip(self, protocol): + ydl = FakeYDL({ + 'proxy': '%s://127.0.0.1:%d' % (protocol, self.port), + }) + return ydl.urlopen('http://yt-dl.org/ip').read().decode('utf-8') + + def test_socks4(self): + self.assertTrue(isinstance(self._get_ip('socks4'), compat_str)) + + def test_socks4a(self): + self.assertTrue(isinstance(self._get_ip('socks4a'), compat_str)) + + def test_socks5(self): + self.assertTrue(isinstance(self._get_ip('socks5'), compat_str)) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/test_utils.py b/test/test_utils.py index a35debfe1..ca254779f 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -20,6 +20,7 @@ from youtube_dl.utils import ( args_to_str, encode_base_n, clean_html, + date_from_str, DateRange, detect_exe_version, determine_ext, @@ -54,7 +55,6 @@ from youtube_dl.utils import ( smuggle_url, str_to_int, strip_jsonp, - struct_unpack, timeconvert, unescapeHTML, unified_strdate, @@ -138,8 +138,8 @@ class TestUtil(unittest.TestCase): self.assertEqual('yes_no', sanitize_filename('yes? 
no', restricted=True)) self.assertEqual('this_-_that', sanitize_filename('this: that', restricted=True)) - tests = 'a\xe4b\u4e2d\u56fd\u7684c' - self.assertEqual(sanitize_filename(tests, restricted=True), 'a_b_c') + tests = 'aäb\u4e2d\u56fd\u7684c' + self.assertEqual(sanitize_filename(tests, restricted=True), 'aab_c') self.assertTrue(sanitize_filename('\xf6', restricted=True) != '') # No empty filename forbidden = '"\0\\/&!: \'\t\n()[]{}$;`^,#' @@ -154,6 +154,10 @@ class TestUtil(unittest.TestCase): self.assertTrue(sanitize_filename('-', restricted=True) != '') self.assertTrue(sanitize_filename(':', restricted=True) != '') + self.assertEqual(sanitize_filename( + 'ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØŒÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøœùúûüýþÿ', restricted=True), + 'AAAAAAAECEEEEIIIIDNOOOOOOOEUUUUYPssaaaaaaaeceeeeiiiionoooooooeuuuuypy') + def test_sanitize_ids(self): self.assertEqual(sanitize_filename('_n_cd26wFpw', is_id=True), '_n_cd26wFpw') self.assertEqual(sanitize_filename('_BD_eEpuzXw', is_id=True), '_BD_eEpuzXw') @@ -234,6 +238,13 @@ class TestUtil(unittest.TestCase): self.assertEqual(unescapeHTML('é'), 'é') self.assertEqual(unescapeHTML('�'), '�') + def test_date_from_str(self): + self.assertEqual(date_from_str('yesterday'), date_from_str('now-1day')) + self.assertEqual(date_from_str('now+7day'), date_from_str('now+1week')) + self.assertEqual(date_from_str('now+14day'), date_from_str('now+2week')) + self.assertEqual(date_from_str('now+365day'), date_from_str('now+1year')) + self.assertEqual(date_from_str('now+30day'), date_from_str('now+1month')) + def test_daterange(self): _20century = DateRange("19000101", "20000101") self.assertFalse("17890714" in _20century) @@ -405,6 +416,7 @@ class TestUtil(unittest.TestCase): self.assertEqual(parse_duration('01:02:03:04'), 93784) self.assertEqual(parse_duration('1 hour 3 minutes'), 3780) self.assertEqual(parse_duration('87 Min.'), 5220) + self.assertEqual(parse_duration('PT1H0.040S'), 3600.04) def test_fix_xml_ampersands(self): self.assertEqual( @@ -444,9 +456,6 @@ class TestUtil(unittest.TestCase): testPL(5, 2, (2, 99), [2, 3, 4]) testPL(5, 2, (20, 99), []) - def test_struct_unpack(self): - self.assertEqual(struct_unpack('!B', b'\x00'), (0,)) - def test_read_batch_urls(self): f = io.StringIO('''\xef\xbb\xbf foo bar\r diff --git a/test/test_youtube_lists.py b/test/test_youtube_lists.py index 47df0f348..af1c45421 100644 --- a/test/test_youtube_lists.py +++ b/test/test_youtube_lists.py @@ -44,7 +44,7 @@ class TestYoutubeLists(unittest.TestCase): ie = YoutubePlaylistIE(dl) result = ie.extract('https://www.youtube.com/watch?v=W01L70IGBgE&index=2&list=RDOQpdSVF_k_w') entries = result['entries'] - self.assertTrue(len(entries) >= 20) + self.assertTrue(len(entries) >= 50) original_video = entries[0] self.assertEqual(original_video['id'], 'OQpdSVF_k_w') diff --git a/tox.ini b/tox.ini index 2d7134005..9c4e4a3d1 100644 --- a/tox.ini +++ b/tox.ini @@ -9,5 +9,6 @@ passenv = HOME defaultargs = test --exclude test_download.py --exclude test_age_restriction.py --exclude test_subtitles.py --exclude test_write_annotations.py --exclude test_youtube_lists.py --exclude test_iqiyi_sdk_interpreter.py + --exclude test_socks.py commands = nosetests --verbose {posargs:{[testenv]defaultargs}} # --with-coverage --cover-package=youtube_dl --cover-html # test.test_download:TestDownload.test_NowVideo diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index a89a71a25..03a6a1890 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -64,6 +64,7 @@ from .utils 
import ( PostProcessingError, preferredencoding, prepend_extension, + register_socks_protocols, render_table, replace_extension, SameFileError, @@ -260,7 +261,9 @@ class YoutubeDL(object): The following options determine which downloader is picked: external_downloader: Executable of the external downloader to call. None or unset for standard (built-in) downloader. - hls_prefer_native: Use the native HLS downloader instead of ffmpeg/avconv. + hls_prefer_native: Use the native HLS downloader instead of ffmpeg/avconv + if True, otherwise use ffmpeg/avconv if False, otherwise + use downloader suggested by extractor if None. The following parameters are not used by YoutubeDL itself, they are used by the downloader (see youtube_dl/downloader/common.py): @@ -359,6 +362,8 @@ class YoutubeDL(object): for ph in self.params.get('progress_hooks', []): self.add_progress_hook(ph) + register_socks_protocols() + def warn_if_short_id(self, argv): # short YouTube ID starting with dash? idxs = [ @@ -578,7 +583,7 @@ class YoutubeDL(object): is_id=(k == 'id')) template_dict = dict((k, sanitize(k, v)) for k, v in template_dict.items() - if v is not None) + if v is not None and not isinstance(v, (list, tuple, dict))) template_dict = collections.defaultdict(lambda: 'NA', template_dict) outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL) @@ -715,6 +720,7 @@ class YoutubeDL(object): result_type = ie_result.get('_type', 'video') if result_type in ('url', 'url_transparent'): + ie_result['url'] = sanitize_url(ie_result['url']) extract_flat = self.params.get('extract_flat', False) if ((extract_flat == 'in_playlist' and 'playlist' in extra_info) or extract_flat is True): @@ -1637,7 +1643,7 @@ class YoutubeDL(object): # Just a single file success = dl(filename, info_dict) except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self.report_error('unable to download video data: %s' % str(err)) + self.report_error('unable to download video data: %s' % error_to_compat_str(err)) return except (OSError, IOError) as err: raise UnavailableVideoError(err) @@ -2016,6 +2022,7 @@ class YoutubeDL(object): if opts_cookiefile is None: self.cookiejar = compat_cookiejar.CookieJar() else: + opts_cookiefile = compat_expanduser(opts_cookiefile) self.cookiejar = compat_cookiejar.MozillaCookieJar( opts_cookiefile) if os.access(opts_cookiefile, os.R_OK): diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 737f6545d..5df965191 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -67,9 +67,9 @@ def _real_main(argv=None): # Custom HTTP headers if opts.headers is not None: for h in opts.headers: - if h.find(':', 1) < 0: + if ':' not in h: parser.error('wrong header formatting, it should be key:value, not "%s"' % h) - key, value = h.split(':', 2) + key, value = h.split(':', 1) if opts.verbose: write_string('[debug] Adding header from command line option %s:%s\n' % (key, value)) std_headers[key] = value @@ -86,7 +86,9 @@ def _real_main(argv=None): if opts.batchfile == '-': batchfd = sys.stdin else: - batchfd = io.open(opts.batchfile, 'r', encoding='utf-8', errors='ignore') + batchfd = io.open( + compat_expanduser(opts.batchfile), + 'r', encoding='utf-8', errors='ignore') batch_urls = read_batch_urls(batchfd) if opts.verbose: write_string('[debug] Batch file urls: ' + repr(batch_urls) + '\n') @@ -404,7 +406,7 @@ def _real_main(argv=None): try: if opts.load_info_filename is not None: - retcode = ydl.download_with_info_file(opts.load_info_filename) + retcode = 
ydl.download_with_info_file(compat_expanduser(opts.load_info_filename)) else: retcode = ydl.download(all_urls) except MaxDownloadsReached: diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index 0b6c5ca7a..1392361a1 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -11,6 +11,7 @@ import re import shlex import shutil import socket +import struct import subprocess import sys import itertools @@ -340,9 +341,9 @@ except ImportError: # Python 2 return parsed_result try: - from shlex import quote as shlex_quote + from shlex import quote as compat_shlex_quote except ImportError: # Python < 3.3 - def shlex_quote(s): + def compat_shlex_quote(s): if re.match(r'^[-_\w./]+$', s): return s else: @@ -373,6 +374,9 @@ compat_os_name = os._name if os.name == 'java' else os.name if sys.version_info >= (3, 0): compat_getenv = os.getenv compat_expanduser = os.path.expanduser + + def compat_setenv(key, value, env=os.environ): + env[key] = value else: # Environment variables should be decoded with filesystem encoding. # Otherwise it will fail if any non-ASCII characters present (see #3854 #3217 #2918) @@ -384,6 +388,12 @@ else: env = env.decode(get_filesystem_encoding()) return env + def compat_setenv(key, value, env=os.environ): + def encode(v): + from .utils import get_filesystem_encoding + return v.encode(get_filesystem_encoding()) if isinstance(v, compat_str) else v + env[encode(key)] = encode(value) + # HACK: The default implementations of os.path.expanduser from cpython do not decode # environment variables with filesystem encoding. We will work around this by # providing adjusted implementations. @@ -456,18 +466,6 @@ else: print(s) -try: - subprocess_check_output = subprocess.check_output -except AttributeError: - def subprocess_check_output(*args, **kwargs): - assert 'input' not in kwargs - p = subprocess.Popen(*args, stdout=subprocess.PIPE, **kwargs) - output, _ = p.communicate() - ret = p.poll() - if ret: - raise subprocess.CalledProcessError(ret, p.args, output=output) - return output - if sys.version_info < (3, 0) and sys.platform == 'win32': def compat_getpass(prompt, *args, **kwargs): if isinstance(prompt, compat_str): @@ -583,6 +581,26 @@ if sys.version_info >= (3, 0): else: from tokenize import generate_tokens as compat_tokenize_tokenize + +try: + struct.pack('!I', 0) +except TypeError: + # In Python 2.6 and 2.7.x < 2.7.7, struct requires a bytes argument + # See https://bugs.python.org/issue19099 + def compat_struct_pack(spec, *args): + if isinstance(spec, compat_str): + spec = spec.encode('ascii') + return struct.pack(spec, *args) + + def compat_struct_unpack(spec, *args): + if isinstance(spec, compat_str): + spec = spec.encode('ascii') + return struct.unpack(spec, *args) +else: + compat_struct_pack = struct.pack + compat_struct_unpack = struct.unpack + + __all__ = [ 'compat_HTMLParser', 'compat_HTTPError', @@ -604,9 +622,13 @@ __all__ = [ 'compat_os_name', 'compat_parse_qs', 'compat_print', + 'compat_setenv', + 'compat_shlex_quote', 'compat_shlex_split', 'compat_socket_create_connection', 'compat_str', + 'compat_struct_pack', + 'compat_struct_unpack', 'compat_subprocess_get_DEVNULL', 'compat_tokenize_tokenize', 'compat_urllib_error', @@ -623,7 +645,5 @@ __all__ = [ 'compat_urlretrieve', 'compat_xml_parse_error', 'compat_xpath', - 'shlex_quote', - 'subprocess_check_output', 'workaround_optparse_bug9161', ] diff --git a/youtube_dl/downloader/__init__.py b/youtube_dl/downloader/__init__.py index 73b34fdae..817591d97 100644 --- a/youtube_dl/downloader/__init__.py +++ 
b/youtube_dl/downloader/__init__.py @@ -41,9 +41,12 @@ def get_suitable_downloader(info_dict, params={}): if ed.can_download(info_dict): return ed - if protocol == 'm3u8' and params.get('hls_prefer_native'): + if protocol == 'm3u8' and params.get('hls_prefer_native') is True: return HlsFD + if protocol == 'm3u8_native' and params.get('hls_prefer_native') is False: + return FFmpegFD + return PROTOCOL_MAP.get(protocol, HttpFD) diff --git a/youtube_dl/downloader/external.py b/youtube_dl/downloader/external.py index 30277dc20..3a73cee1c 100644 --- a/youtube_dl/downloader/external.py +++ b/youtube_dl/downloader/external.py @@ -6,6 +6,7 @@ import sys import re from .common import FileDownloader +from ..compat import compat_setenv from ..postprocessor.ffmpeg import FFmpegPostProcessor, EXT_TO_OUT_FORMATS from ..utils import ( cli_option, @@ -198,6 +199,18 @@ class FFmpegFD(ExternalFD): '-headers', ''.join('%s: %s\r\n' % (key, val) for key, val in headers.items())] + env = None + proxy = self.params.get('proxy') + if proxy: + if not re.match(r'^[\da-zA-Z]+://', proxy): + proxy = 'http://%s' % proxy + # Since December 2015 ffmpeg supports -http_proxy option (see + # http://git.videolan.org/?p=ffmpeg.git;a=commit;h=b4eb1f29ebddd60c41a2eb39f5af701e38e0d3fd) + # We could switch to the following code if we are able to detect version properly + # args += ['-http_proxy', proxy] + env = os.environ.copy() + compat_setenv('HTTP_PROXY', proxy, env=env) + protocol = info_dict.get('protocol') if protocol == 'rtmp': @@ -224,8 +237,8 @@ class FFmpegFD(ExternalFD): args += ['-rtmp_live', 'live'] args += ['-i', url, '-c', 'copy'] - if protocol == 'm3u8': - if self.params.get('hls_use_mpegts', False): + if protocol in ('m3u8', 'm3u8_native'): + if self.params.get('hls_use_mpegts', False) or tmpfilename == '-': args += ['-f', 'mpegts'] else: args += ['-f', 'mp4', '-bsf:a', 'aac_adtstoasc'] @@ -239,7 +252,7 @@ class FFmpegFD(ExternalFD): self._debug_cmd(args) - proc = subprocess.Popen(args, stdin=subprocess.PIPE) + proc = subprocess.Popen(args, stdin=subprocess.PIPE, env=env) try: retval = proc.wait() except KeyboardInterrupt: diff --git a/youtube_dl/downloader/f4m.py b/youtube_dl/downloader/f4m.py index 664d87543..314def4cb 100644 --- a/youtube_dl/downloader/f4m.py +++ b/youtube_dl/downloader/f4m.py @@ -12,37 +12,49 @@ from ..compat import ( compat_urlparse, compat_urllib_error, compat_urllib_parse_urlparse, + compat_struct_pack, + compat_struct_unpack, ) from ..utils import ( encodeFilename, fix_xml_ampersands, sanitize_open, - struct_pack, - struct_unpack, xpath_text, ) +class DataTruncatedError(Exception): + pass + + class FlvReader(io.BytesIO): """ Reader for Flv files The file format is documented in https://www.adobe.com/devnet/f4v.html """ + def read_bytes(self, n): + data = self.read(n) + if len(data) < n: + raise DataTruncatedError( + 'FlvReader error: need %d bytes while only %d bytes got' % ( + n, len(data))) + return data + # Utility functions for reading numbers and strings def read_unsigned_long_long(self): - return struct_unpack('!Q', self.read(8))[0] + return compat_struct_unpack('!Q', self.read_bytes(8))[0] def read_unsigned_int(self): - return struct_unpack('!I', self.read(4))[0] + return compat_struct_unpack('!I', self.read_bytes(4))[0] def read_unsigned_char(self): - return struct_unpack('!B', self.read(1))[0] + return compat_struct_unpack('!B', self.read_bytes(1))[0] def read_string(self): res = b'' while True: - char = self.read(1) + char = self.read_bytes(1) if char == b'\x00': break res += 
char @@ -53,18 +65,18 @@ class FlvReader(io.BytesIO): Read a box and return the info as a tuple: (box_size, box_type, box_data) """ real_size = size = self.read_unsigned_int() - box_type = self.read(4) + box_type = self.read_bytes(4) header_end = 8 if size == 1: real_size = self.read_unsigned_long_long() header_end = 16 - return real_size, box_type, self.read(real_size - header_end) + return real_size, box_type, self.read_bytes(real_size - header_end) def read_asrt(self): # version self.read_unsigned_char() # flags - self.read(3) + self.read_bytes(3) quality_entry_count = self.read_unsigned_char() # QualityEntryCount for i in range(quality_entry_count): @@ -85,7 +97,7 @@ class FlvReader(io.BytesIO): # version self.read_unsigned_char() # flags - self.read(3) + self.read_bytes(3) # time scale self.read_unsigned_int() @@ -119,7 +131,7 @@ class FlvReader(io.BytesIO): # version self.read_unsigned_char() # flags - self.read(3) + self.read_bytes(3) self.read_unsigned_int() # BootstrapinfoVersion # Profile,Live,Update,Reserved @@ -194,11 +206,11 @@ def build_fragments_list(boot_info): def write_unsigned_int(stream, val): - stream.write(struct_pack('!I', val)) + stream.write(compat_struct_pack('!I', val)) def write_unsigned_int_24(stream, val): - stream.write(struct_pack('!I', val)[1:]) + stream.write(compat_struct_pack('!I', val)[1:]) def write_flv_header(stream): @@ -374,7 +386,17 @@ class F4mFD(FragmentFD): down.close() reader = FlvReader(down_data) while True: - _, box_type, box_data = reader.read_box_info() + try: + _, box_type, box_data = reader.read_box_info() + except DataTruncatedError: + if test: + # In tests, segments may be truncated, and thus + # FlvReader may not be able to parse the whole + # chunk. If so, write the segment as is + # See https://github.com/rg3/youtube-dl/issues/9214 + dest_stream.write(down_data) + break + raise if box_type == b'mdat': dest_stream.write(box_data) break diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py index a01dac031..62136ee54 100644 --- a/youtube_dl/downloader/hls.py +++ b/youtube_dl/downloader/hls.py @@ -4,6 +4,7 @@ import os.path import re from .fragment import FragmentFD +from .external import FFmpegFD from ..compat import compat_urlparse from ..utils import ( @@ -17,12 +18,39 @@ class HlsFD(FragmentFD): FD_NAME = 'hlsnative' + @staticmethod + def can_download(manifest): + UNSUPPORTED_FEATURES = ( + r'#EXT-X-KEY:METHOD=(?!NONE)', # encrypted streams [1] + r'#EXT-X-BYTERANGE', # playlists composed of byte ranges of media files [2] + # Live streams heuristic does not always work (e.g. geo restricted to Germany + # http://hls-geo.daserste.de/i/videoportal/Film/c_620000/622873/format,716451,716457,716450,716458,716459,.mp4.csmil/index_4_av.m3u8?null=0) + # r'#EXT-X-MEDIA-SEQUENCE:(?!0$)', # live streams [3] + r'#EXT-X-PLAYLIST-TYPE:EVENT', # media segments may be appended to the end of + # event media playlists [4] + # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.2.4 + # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.2.2 + # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.2 + # 4. 
https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.5 + ) + return all(not re.search(feature, manifest) for feature in UNSUPPORTED_FEATURES) + def real_download(self, filename, info_dict): man_url = info_dict['url'] self.to_screen('[%s] Downloading m3u8 manifest' % self.FD_NAME) manifest = self.ydl.urlopen(man_url).read() s = manifest.decode('utf-8', 'ignore') + + if not self.can_download(s): + self.report_warning( + 'hlsnative has detected features it does not support, ' + 'extraction will be delegated to ffmpeg') + fd = FFmpegFD(self.ydl, self.params) + for ph in self._progress_hooks: + fd.add_progress_hook(ph) + return fd.real_download(filename, info_dict) + fragment_urls = [] for line in s.splitlines(): line = line.strip() diff --git a/youtube_dl/downloader/rtsp.py b/youtube_dl/downloader/rtsp.py index 3eb29526c..939358b2a 100644 --- a/youtube_dl/downloader/rtsp.py +++ b/youtube_dl/downloader/rtsp.py @@ -27,6 +27,8 @@ class RtspFD(FileDownloader): self.report_error('MMS or RTSP download detected but neither "mplayer" nor "mpv" could be run. Please install any.') return False + self._debug_cmd(args) + retval = subprocess.call(args) if retval == 0: fsize = os.path.getsize(encodeFilename(tmpfilename)) diff --git a/youtube_dl/extractor/aol.py b/youtube_dl/extractor/aol.py index 95a99c6b0..42c21bf41 100644 --- a/youtube_dl/extractor/aol.py +++ b/youtube_dl/extractor/aol.py @@ -1,26 +1,113 @@ +# coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor +from ..utils import ( + ExtractorError, + int_or_none, +) class AolIE(InfoExtractor): IE_NAME = 'on.aol.com' - _VALID_URL = r'(?:aol-video:|https?://on\.aol\.com/video/.*-)(?P<id>[0-9]+)(?:$|\?)' + _VALID_URL = r'(?:aol-video:|https?://on\.aol\.com/(?:[^/]+/)*(?:[^/?#&]+-)?)(?P<id>[^/?#&]+)' _TESTS = [{ + # video with 5min ID 'url': 'http://on.aol.com/video/u-s--official-warns-of-largest-ever-irs-phone-scam-518167793?icid=OnHomepageC2Wide_MustSee_Img', 'md5': '18ef68f48740e86ae94b98da815eec42', 'info_dict': { 'id': '518167793', 'ext': 'mp4', 'title': 'U.S. Official Warns Of \'Largest Ever\' IRS Phone Scam', + 'description': 'A major phone scam has cost thousands of taxpayers more than $1 million, with less than a month until income tax returns are due to the IRS.', + 'timestamp': 1395405060, + 'upload_date': '20140321', + 'uploader': 'Newsy Studio', }, - 'add_ie': ['FiveMin'], + 'params': { + # m3u8 download + 'skip_download': True, + } + }, { + # video with vidible ID + 'url': 'http://on.aol.com/video/netflix-is-raising-rates-5707d6b8e4b090497b04f706?context=PC:homepage:PL1944:1460189336183', + 'info_dict': { + 'id': '5707d6b8e4b090497b04f706', + 'ext': 'mp4', + 'title': 'Netflix is Raising Rates', + 'description': 'Netflix is rewarding millions of it’s long-standing members with an increase in cost.
Veuer’s Carly Figueroa has more.', + 'upload_date': '20160408', + 'timestamp': 1460123280, + 'uploader': 'Veuer', + }, + 'params': { + # m3u8 download + 'skip_download': True, + } + }, { + 'url': 'http://on.aol.com/partners/abc-551438d309eab105804dbfe8/sneak-peek-was-haley-really-framed-570eaebee4b0448640a5c944', + 'only_matching': True, + }, { + 'url': 'http://on.aol.com/shows/park-bench-shw518173474-559a1b9be4b0c3bfad3357a7?context=SH:SHW518173474:PL4327:1460619712763', + 'only_matching': True, + }, { + 'url': 'http://on.aol.com/video/519442220', + 'only_matching': True, + }, { + 'url': 'aol-video:5707d6b8e4b090497b04f706', + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) - return self.url_result('5min:%s' % video_id) + + response = self._download_json( + 'https://feedapi.b2c.on.aol.com/v1.0/app/videos/aolon/%s/details' % video_id, + video_id)['response'] + if response['statusText'] != 'Ok': + raise ExtractorError('%s said: %s' % (self.IE_NAME, response['statusText']), expected=True) + + video_data = response['data'] + formats = [] + m3u8_url = video_data.get('videoMasterPlaylist') + if m3u8_url: + formats.extend(self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) + for rendition in video_data.get('renditions', []): + video_url = rendition.get('url') + if not video_url: + continue + ext = rendition.get('format') + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + video_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) + else: + f = { + 'url': video_url, + 'format_id': rendition.get('quality'), + } + mobj = re.search(r'(\d+)x(\d+)', video_url) + if mobj: + f.update({ + 'width': int(mobj.group(1)), + 'height': int(mobj.group(2)), + }) + formats.append(f) + self._sort_formats(formats, ('width', 'height', 'tbr', 'format_id')) + + return { + 'id': video_id, + 'title': video_data['title'], + 'duration': int_or_none(video_data.get('duration')), + 'timestamp': int_or_none(video_data.get('publishDate')), + 'view_count': int_or_none(video_data.get('views')), + 'description': video_data.get('description'), + 'uploader': video_data.get('videoOwner'), + 'formats': formats, + } class AolFeaturesIE(InfoExtractor): diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index 9fb84911a..26446c2fe 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -83,7 +83,7 @@ class ARDMediathekIE(InfoExtractor): subtitle_url = media_info.get('_subtitleUrl') if subtitle_url: subtitles['de'] = [{ - 'ext': 'srt', + 'ext': 'ttml', 'url': subtitle_url, }] diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index ae0f27dcb..e37fdae13 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -63,7 +63,7 @@ class ArteTvIE(InfoExtractor): class ArteTVPlus7IE(InfoExtractor): IE_NAME = 'arte.tv:+7' - _VALID_URL = r'https?://(?:www\.)?arte\.tv/guide/(?Pfr|de|en|es)/(?:(?:sendungen|emissions|embed)/)?(?P[^/]+)/(?P[^/?#&+])' + _VALID_URL = r'https?://(?:www\.)?arte\.tv/guide/(?Pfr|de|en|es)/(?:(?:sendungen|emissions|embed)/)?(?P[^/]+)/(?P[^/?#&]+)' @classmethod def _extract_url_info(cls, url): @@ -161,24 +161,53 @@ class ArteTVPlus7IE(InfoExtractor): 'es': 'E[ESP]', } + langcode = LANGS.get(lang, lang) + formats = [] for format_id, format_dict in player_info['VSR'].items(): f = dict(format_dict) versionCode = f.get('versionCode') - langcode = LANGS.get(lang, lang) - lang_rexs = [r'VO?%s-' % re.escape(langcode), r'VO?.-ST%s$' % re.escape(langcode)] - lang_pref 
= None - if versionCode: - matched_lang_rexs = [r for r in lang_rexs if re.match(r, versionCode)] - lang_pref = -10 if not matched_lang_rexs else 10 * len(matched_lang_rexs) - source_pref = 0 - if versionCode is not None: - # The original version with subtitles has lower relevance - if re.match(r'VO-ST(F|A|E)', versionCode): - source_pref -= 10 - # The version with sourds/mal subtitles has also lower relevance - elif re.match(r'VO?(F|A|E)-STM\1', versionCode): - source_pref -= 9 + l = re.escape(langcode) + + # Language preference from most to least priority + # Reference: section 5.6.3 of + # http://www.arte.tv/sites/en/corporate/files/complete-technical-guidelines-arte-geie-v1-05.pdf + PREFERENCES = ( + # original version in requested language, without subtitles + r'VO{0}$'.format(l), + # original version in requested language, with partial subtitles in requested language + r'VO{0}-ST{0}$'.format(l), + # original version in requested language, with subtitles for the deaf and hard-of-hearing in requested language + r'VO{0}-STM{0}$'.format(l), + # non-original (dubbed) version in requested language, without subtitles + r'V{0}$'.format(l), + # non-original (dubbed) version in requested language, with subtitles partial subtitles in requested language + r'V{0}-ST{0}$'.format(l), + # non-original (dubbed) version in requested language, with subtitles for the deaf and hard-of-hearing in requested language + r'V{0}-STM{0}$'.format(l), + # original version in requested language, with partial subtitles in different language + r'VO{0}-ST(?!{0}).+?$'.format(l), + # original version in requested language, with subtitles for the deaf and hard-of-hearing in different language + r'VO{0}-STM(?!{0}).+?$'.format(l), + # original version in different language, with partial subtitles in requested language + r'VO(?:(?!{0}).+?)?-ST{0}$'.format(l), + # original version in different language, with subtitles for the deaf and hard-of-hearing in requested language + r'VO(?:(?!{0}).+?)?-STM{0}$'.format(l), + # original version in different language, without subtitles + r'VO(?:(?!{0}))?$'.format(l), + # original version in different language, with partial subtitles in different language + r'VO(?:(?!{0}).+?)?-ST(?!{0}).+?$'.format(l), + # original version in different language, with subtitles for the deaf and hard-of-hearing in different language + r'VO(?:(?!{0}).+?)?-STM(?!{0}).+?$'.format(l), + ) + + for pref, p in enumerate(PREFERENCES): + if re.match(p, versionCode): + lang_pref = len(PREFERENCES) - pref + break + else: + lang_pref = -1 + format = { 'format_id': format_id, 'preference': -10 if f.get('videoFormat') == 'M3U8' else None, @@ -188,7 +217,6 @@ class ArteTVPlus7IE(InfoExtractor): 'height': int_or_none(f.get('height')), 'tbr': int_or_none(f.get('bitrate')), 'quality': qfunc(f.get('quality')), - 'source_preference': source_pref, } if f.get('mediaType') == 'rtmp': @@ -210,7 +238,7 @@ class ArteTVPlus7IE(InfoExtractor): # It also uses the arte_vp_url url from the webpage to extract the information class ArteTVCreativeIE(ArteTVPlus7IE): IE_NAME = 'arte.tv:creative' - _VALID_URL = r'https?://creative\.arte\.tv/(?Pfr|de|en|es)/(?:magazine?/)?(?P[^/?#&]+)' + _VALID_URL = r'https?://creative\.arte\.tv/(?Pfr|de|en|es)/(?:[^/]+/)*(?P[^/?#&]+)' _TESTS = [{ 'url': 'http://creative.arte.tv/de/magazin/agentur-amateur-corporate-design', @@ -229,9 +257,27 @@ class ArteTVCreativeIE(ArteTVPlus7IE): 'description': 'Événement ! 
Quarante-cinq ans après leurs premiers succès, les légendaires Monty Python remontent sur scène.\n', 'upload_date': '20140805', } + }, { + 'url': 'http://creative.arte.tv/de/episode/agentur-amateur-4-der-erste-kunde', + 'only_matching': True, }] +class ArteTVInfoIE(ArteTVPlus7IE): + IE_NAME = 'arte.tv:info' + _VALID_URL = r'https?://info\.arte\.tv/(?P<lang>fr|de|en|es)/(?:[^/]+/)*(?P<id>[^/?#&]+)' + + _TEST = { + 'url': 'http://info.arte.tv/fr/service-civique-un-cache-misere', + 'info_dict': { + 'id': '067528-000-A', + 'ext': 'mp4', + 'title': 'Service civique, un cache misère ?', + 'upload_date': '20160403', + }, + } + + class ArteTVFutureIE(ArteTVPlus7IE): IE_NAME = 'arte.tv:future' _VALID_URL = r'https?://future\.arte\.tv/(?P<lang>fr|de|en|es)/(?P<id>[^/?#&]+)' @@ -337,7 +383,7 @@ class ArteTVEmbedIE(ArteTVPlus7IE): IE_NAME = 'arte.tv:embed' _VALID_URL = r'''(?x) http://www\.arte\.tv - /playerv2/embed\.php\?json_url= + /(?:playerv2/embed|arte_vp/index)\.php\?json_url= (?P<json_url> http://arte\.tv/papi/tvguide/videos/stream/player/ (?P<lang>[^/]+)/(?P<id>[^/]+)[^&]* diff --git a/youtube_dl/extractor/audiomack.py b/youtube_dl/extractor/audiomack.py index 3eed91279..a52d26cec 100644 --- a/youtube_dl/extractor/audiomack.py +++ b/youtube_dl/extractor/audiomack.py @@ -30,14 +30,14 @@ class AudiomackIE(InfoExtractor): # audiomack wrapper around soundcloud song { 'add_ie': ['Soundcloud'], - 'url': 'http://www.audiomack.com/song/xclusiveszone/take-kare', + 'url': 'http://www.audiomack.com/song/hip-hop-daily/black-mamba-freestyle', 'info_dict': { - 'id': '172419696', + 'id': '258901379', 'ext': 'mp3', - 'description': 'md5:1fc3272ed7a635cce5be1568c2822997', - 'title': 'Young Thug ft Lil Wayne - Take Kare', - 'uploader': 'Young Thug World', - 'upload_date': '20141016', + 'description': 'mamba day freestyle for the legend Kobe Bryant ', + 'title': 'Black Mamba Freestyle [Prod.
By Danny Wolf]', + 'uploader': 'ILOVEMAKONNEN', + 'upload_date': '20160414', } }, ] diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 425f08f2b..74c4510f9 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -671,6 +671,7 @@ class BBCIE(BBCCoUkIE): 'info_dict': { 'id': '34475836', 'title': 'Jurgen Klopp: Furious football from a witty and winning coach', + 'description': 'Fast-paced football, wit, wisdom and a ready smile - why Liverpool fans should come to love new boss Jurgen Klopp.', }, 'playlist_count': 3, }, { diff --git a/youtube_dl/extractor/biqle.py b/youtube_dl/extractor/biqle.py new file mode 100644 index 000000000..ae4579b33 --- /dev/null +++ b/youtube_dl/extractor/biqle.py @@ -0,0 +1,39 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class BIQLEIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?biqle\.(?:com|org|ru)/watch/(?P<id>-?\d+_\d+)' + _TESTS = [{ + 'url': 'http://www.biqle.ru/watch/847655_160197695', + 'md5': 'ad5f746a874ccded7b8f211aeea96637', + 'info_dict': { + 'id': '160197695', + 'ext': 'mp4', + 'title': 'Foo Fighters - The Pretender (Live at Wembley Stadium)', + 'uploader': 'Andrey Rogozin', + 'upload_date': '20110605', + } + }, { + 'url': 'https://biqle.org/watch/-44781847_168547604', + 'md5': '7f24e72af1db0edf7c1aaba513174f97', + 'info_dict': { + 'id': '168547604', + 'ext': 'mp4', + 'title': 'Ребенок в шоке от автоматической мойки', + 'uploader': 'Dmitry Kotov', + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + embed_url = self._proto_relative_url(self._search_regex( + r'<iframe.+?src="((?:http:)?//daxab\.com/[^"]+)".*?></iframe>', webpage, 'embed url')) + + return { + '_type': 'url_transparent', + 'url': embed_url, + } diff --git a/youtube_dl/extractor/bloomberg.py b/youtube_dl/extractor/bloomberg.py index 13343bc25..bd538be50 100644 --- a/youtube_dl/extractor/bloomberg.py +++ b/youtube_dl/extractor/bloomberg.py @@ -17,6 +17,9 @@ class BloombergIE(InfoExtractor): 'title': 'Shah\'s Presentation on Foreign-Exchange Strategies', 'description': 'md5:a8ba0302912d03d246979735c17d2761', }, + 'params': { + 'format': 'best[format_id^=hds]', + }, }, { 'url': 'http://www.bloomberg.com/news/articles/2015-11-12/five-strange-things-that-have-been-happening-in-financial-markets', 'only_matching': True, diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index c718cf385..fc7fc5b16 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -307,9 +307,10 @@ class BrightcoveLegacyIE(InfoExtractor): playlist_title=playlist_info['mediaCollectionDTO']['displayName']) def _extract_video_info(self, video_info): + video_id = compat_str(video_info['id']) publisher_id = video_info.get('publisherId') info = { - 'id': compat_str(video_info['id']), + 'id': video_id, 'title': video_info['displayName'].strip(), 'description': video_info.get('shortDescription'), 'thumbnail': video_info.get('videoStillURL') or video_info.get('thumbnailURL'), @@ -331,7 +332,8 @@ class BrightcoveLegacyIE(InfoExtractor): url_comp = compat_urllib_parse_urlparse(url) if url_comp.path.endswith('.m3u8'): formats.extend( - self._extract_m3u8_formats(url, info['id'], 'mp4')) + self._extract_m3u8_formats( + url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) continue elif 'akamaihd.net' in url_comp.netloc: # This type of renditions are served through + ext =
'flv' if ext is None: ext = determine_ext(url) - tbr = int_or_none(rend.get('encodingRate'), 1000), + tbr = int_or_none(rend.get('encodingRate'), 1000) a_format = { 'format_id': 'http%s' % ('-%s' % tbr if tbr else ''), 'url': url, @@ -365,7 +367,7 @@ class BrightcoveLegacyIE(InfoExtractor): a_format.update({ 'format_id': 'hls%s' % ('-%s' % tbr if tbr else ''), 'ext': 'mp4', - 'protocol': 'm3u8', + 'protocol': 'm3u8_native', }) formats.append(a_format) @@ -395,7 +397,7 @@ class BrightcoveLegacyIE(InfoExtractor): return ad_info if 'url' not in info and not info.get('formats'): - raise ExtractorError('Unable to extract video url for %s' % info['id']) + raise ExtractorError('Unable to extract video url for %s' % video_id) return info @@ -527,7 +529,7 @@ class BrightcoveNewIE(InfoExtractor): if not src: continue formats.extend(self._extract_m3u8_formats( - src, video_id, 'mp4', m3u8_id='hls', fatal=False)) + src, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) elif source_type == 'application/dash+xml': if not src: continue diff --git a/youtube_dl/extractor/cbc.py b/youtube_dl/extractor/cbc.py index d8aa31038..68a0633b6 100644 --- a/youtube_dl/extractor/cbc.py +++ b/youtube_dl/extractor/cbc.py @@ -33,6 +33,7 @@ class CBCIE(InfoExtractor): 'title': 'Robin Williams freestyles on 90 Minutes Live', 'description': 'Wacky American comedian Robin Williams shows off his infamous "freestyle" comedic talents while being interviewed on CBC\'s 90 Minutes Live.', 'upload_date': '19700101', + 'uploader': 'CBCC-NEW', }, 'params': { # rtmp download diff --git a/youtube_dl/extractor/cbs.py b/youtube_dl/extractor/cbs.py index c621a08d5..051d783a2 100644 --- a/youtube_dl/extractor/cbs.py +++ b/youtube_dl/extractor/cbs.py @@ -5,7 +5,6 @@ from ..utils import ( xpath_text, xpath_element, int_or_none, - ExtractorError, find_xpath_attr, ) @@ -64,7 +63,7 @@ class CBSIE(CBSBaseIE): 'url': 'http://www.colbertlateshow.com/podcasts/dYSwjqPs_X1tvbV_P2FcPWRa_qT6akTC/in-the-bad-room-with-stephen/', 'only_matching': True, }] - TP_RELEASE_URL_TEMPLATE = 'http://link.theplatform.com/s/dJ5BDC/%s?manifest=m3u&mbr=true' + TP_RELEASE_URL_TEMPLATE = 'http://link.theplatform.com/s/dJ5BDC/%s?mbr=true' def _real_extract(self, url): display_id = self._match_id(url) @@ -84,11 +83,11 @@ class CBSIE(CBSBaseIE): pid = xpath_text(item, 'pid') if not pid: continue - try: - tp_formats, tp_subtitles = self._extract_theplatform_smil( - self.TP_RELEASE_URL_TEMPLATE % pid, content_id, 'Downloading %s SMIL data' % pid) - except ExtractorError: - continue + tp_release_url = self.TP_RELEASE_URL_TEMPLATE % pid + if '.m3u8' in xpath_text(item, 'contentUrl', default=''): + tp_release_url += '&manifest=m3u' + tp_formats, tp_subtitles = self._extract_theplatform_smil( + tp_release_url, content_id, 'Downloading %s SMIL data' % pid) formats.extend(tp_formats) subtitles = self._merge_subtitles(subtitles, tp_subtitles) self._sort_formats(formats) diff --git a/youtube_dl/extractor/ccc.py b/youtube_dl/extractor/ccc.py index dda2c0959..8f7f09e22 100644 --- a/youtube_dl/extractor/ccc.py +++ b/youtube_dl/extractor/ccc.py @@ -1,13 +1,9 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..utils import ( int_or_none, - parse_duration, - qualities, - unified_strdate, + parse_iso8601, ) @@ -19,14 +15,14 @@ class CCCIE(InfoExtractor): 'url': 'https://media.ccc.de/v/30C3_-_5443_-_en_-_saal_g_-_201312281830_-_introduction_to_processor_design_-_byterazor#video', 'md5': '3a1eda8f3a29515d27f5adb967d7e740', 'info_dict': 
{ - 'id': '30C3_-_5443_-_en_-_saal_g_-_201312281830_-_introduction_to_processor_design_-_byterazor', + 'id': '1839', 'ext': 'mp4', 'title': 'Introduction to Processor Design', - 'description': 'md5:80be298773966f66d56cb11260b879af', + 'description': 'md5:df55f6d073d4ceae55aae6f2fd98a0ac', 'thumbnail': 're:^https?://.*\.jpg$', - 'view_count': int, 'upload_date': '20131228', - 'duration': 3660, + 'timestamp': 1388188800, + 'duration': 3710, } }, { 'url': 'https://media.ccc.de/v/32c3-7368-shopshifting#download', @@ -34,79 +30,48 @@ class CCCIE(InfoExtractor): }] def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + event_id = self._search_regex("data-id='(\d+)'", webpage, 'event id') + event_data = self._download_json('https://media.ccc.de/public/events/%s' % event_id, event_id) - if self._downloader.params.get('prefer_free_formats'): - preference = qualities(['mp3', 'opus', 'mp4-lq', 'webm-lq', 'h264-sd', 'mp4-sd', 'webm-sd', 'mp4', 'webm', 'mp4-hd', 'h264-hd', 'webm-hd']) - else: - preference = qualities(['opus', 'mp3', 'webm-lq', 'mp4-lq', 'webm-sd', 'h264-sd', 'mp4-sd', 'webm', 'mp4', 'webm-hd', 'mp4-hd', 'h264-hd']) - - title = self._html_search_regex( - r'(?s)

<h1>(.*?)</h1>', webpage, 'title') - description = self._html_search_regex( - r'(?s)<h3>About</h3>(.+?)<h3>
', - webpage, 'description', fatal=False) - upload_date = unified_strdate(self._html_search_regex( - r"(?s)]+class='[^']*fa-calendar-o'[^>]*>(.+?)", - webpage, 'upload date', fatal=False)) - view_count = int_or_none(self._html_search_regex( - r"(?s)(.*?)", - webpage, 'view count', fatal=False)) - duration = parse_duration(self._html_search_regex( - r'(?s)]+class=(["\']).*?fa-clock-o.*?\1[^>]*>(?P.+?)(?P[^<]*)\s* - <(?:span|div)\s+class='label\s+filetype'>(?P[^<]*)\s* - [^']+)'>\s* - (?: - .*? - [^']+\.torrent)' - )?''', webpage) formats = [] - for m in matches: - format = m.group('format') - format_id = self._search_regex( - r'.*/([a-z0-9_-]+)/[^/]*$', - m.group('http_url'), 'format id', default=None) - if format_id: - format_id = m.group('lang') + '-' + format_id - vcodec = 'h264' if 'h264' in format_id else ( - 'none' if format_id in ('mp3', 'opus') else None + for recording in event_data.get('recordings', []): + recording_url = recording.get('recording_url') + if not recording_url: + continue + language = recording.get('language') + folder = recording.get('folder') + format_id = None + if language: + format_id = language + if folder: + if language: + format_id += '-' + folder + else: + format_id = folder + vcodec = 'h264' if 'h264' in folder else ( + 'none' if folder in ('mp3', 'opus') else None ) formats.append({ 'format_id': format_id, - 'format': format, - 'language': m.group('lang'), - 'url': m.group('http_url'), + 'url': recording_url, + 'width': int_or_none(recording.get('width')), + 'height': int_or_none(recording.get('height')), + 'filesize': int_or_none(recording.get('size'), invscale=1024 * 1024), + 'language': language, 'vcodec': vcodec, - 'preference': preference(format_id), }) - - if m.group('torrent_url'): - formats.append({ - 'format_id': 'torrent-%s' % (format if format_id is None else format_id), - 'format': '%s (torrent)' % format, - 'proto': 'torrent', - 'format_note': '(unsupported; will just download the .torrent file)', - 'vcodec': vcodec, - 'preference': -100 + preference(format_id), - 'url': m.group('torrent_url'), - }) self._sort_formats(formats) - thumbnail = self._html_search_regex( - r"]+src="(?P(?:https?:)?//(?:[^.]+\.)?youtube\.com/.+?)"', - ], - webpage, 'player data URL', default=None, group='url') - if not playerdata_url: - raise ExtractorError('Unable to find player data') - - video_title = self._html_search_regex( - r'(?P<title>.+?)\|', webpage, 'title') - video_description = self._html_search_regex( - r'<div class="entry-content">(?P<description>.+?)</div>', - webpage, 'description', flags=re.DOTALL, fatal=False) - video_thumbnail = self._og_search_thumbnail(webpage) - - return { - '_type': 'url_transparent', - 'display_id': display_id, - 'title': video_title, - 'description': video_description, - 'upload_date': video_date, - 'thumbnail': video_thumbnail, - 'url': playerdata_url, - } diff --git a/youtube_dl/extractor/cliprs.py b/youtube_dl/extractor/cliprs.py new file mode 100644 index 000000000..4f9320ea5 --- /dev/null +++ b/youtube_dl/extractor/cliprs.py @@ -0,0 +1,90 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + float_or_none, + int_or_none, + parse_iso8601, +) + + +class ClipRsIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?clip\.rs/(?P<id>[^/]+)/\d+' + _TEST = { + 'url': 'http://www.clip.rs/premijera-frajle-predstavljaju-novi-spot-za-pesmu-moli-me-moli/3732', + 'md5': 'c412d57815ba07b56f9edc7b5d6a14e5', + 'info_dict': { + 'id': 
'1488842.1399140381', + 'ext': 'mp4', + 'title': 'PREMIJERA Frajle predstavljaju novi spot za pesmu Moli me, moli', + 'description': 'md5:56ce2c3b4ab31c5a2e0b17cb9a453026', + 'duration': 229, + 'timestamp': 1459850243, + 'upload_date': '20160405', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + video_id = self._search_regex( + r'id=(["\'])mvp:(?P<id>.+?)\1', webpage, 'mvp id', group='id') + + response = self._download_json( + 'http://qi.ckm.onetapi.pl/', video_id, + query={ + 'body[id]': video_id, + 'body[jsonrpc]': '2.0', + 'body[method]': 'get_asset_detail', + 'body[params][ID_Publikacji]': video_id, + 'body[params][Service]': 'www.onet.pl', + 'content-type': 'application/jsonp', + 'x-onet-app': 'player.front.onetapi.pl', + }) + + error = response.get('error') + if error: + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, error['message']), expected=True) + + video = response['result'].get('0') + + formats = [] + for _, formats_dict in video['formats'].items(): + if not isinstance(formats_dict, dict): + continue + for format_id, format_list in formats_dict.items(): + if not isinstance(format_list, list): + continue + for f in format_list: + if not f.get('url'): + continue + formats.append({ + 'url': f['url'], + 'format_id': format_id, + 'height': int_or_none(f.get('vertical_resolution')), + 'width': int_or_none(f.get('horizontal_resolution')), + 'abr': float_or_none(f.get('audio_bitrate')), + 'vbr': float_or_none(f.get('video_bitrate')), + }) + self._sort_formats(formats) + + meta = video.get('meta', {}) + + title = self._og_search_title(webpage, default=None) or meta['title'] + description = self._og_search_description(webpage, default=None) or meta.get('description') + duration = meta.get('length') or meta.get('lenght') + timestamp = parse_iso8601(meta.get('addDate'), ' ') + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'duration': duration, + 'timestamp': timestamp, + 'formats': formats, + } diff --git a/youtube_dl/extractor/cloudy.py b/youtube_dl/extractor/cloudy.py index 9e267e6c0..9a28ef354 100644 --- a/youtube_dl/extractor/cloudy.py +++ b/youtube_dl/extractor/cloudy.py @@ -19,7 +19,7 @@ from ..utils import ( class CloudyIE(InfoExtractor): _IE_DESC = 'cloudy.ec and videoraj.ch' _VALID_URL = r'''(?x) - https?://(?:www\.)?(?P<host>cloudy\.ec|videoraj\.ch)/ + https?://(?:www\.)?(?P<host>cloudy\.ec|videoraj\.(?:ch|to))/ (?:v/|embed\.php\?id=) (?P<id>[A-Za-z0-9]+) ''' @@ -37,7 +37,7 @@ class CloudyIE(InfoExtractor): } }, { - 'url': 'http://www.videoraj.ch/v/47f399fd8bb60', + 'url': 'http://www.videoraj.to/v/47f399fd8bb60', 'md5': '7d0f8799d91efd4eda26587421c3c3b0', 'info_dict': { 'id': '47f399fd8bb60', diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 17d00721c..0843d89af 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -163,7 +163,7 @@ class InfoExtractor(object): description: Full video description. uploader: Full name of the video uploader. license: License name the video is licensed under. - creator: The main artist who created the video. + creator: The creator of the video. release_date: The date (YYYYMMDD) when the video was released. timestamp: UNIX timestamp of the moment the video became available. upload_date: Video upload date (YYYYMMDD). 
@@ -376,14 +376,13 @@ class InfoExtractor(object): self.to_screen('%s' % (note,)) else: self.to_screen('%s: %s' % (video_id, note)) - # data, headers and query params will be ignored for `Request` objects if isinstance(url_or_request, compat_urllib_request.Request): url_or_request = update_Request( url_or_request, data=data, headers=headers, query=query) else: if query: url_or_request = update_url_query(url_or_request, query) - if data or headers: + if data is not None or headers: url_or_request = sanitized_Request(url_or_request, data, headers) try: return self._downloader.urlopen(url_or_request) @@ -1007,6 +1006,13 @@ class InfoExtractor(object): def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None, transform_source=lambda s: fix_xml_ampersands(s).strip(), fatal=True): + # currently youtube-dl cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy + akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0') + if akamai_pv is not None and ';' in akamai_pv.text: + playerVerificationChallenge = akamai_pv.text.split(';')[0] + if playerVerificationChallenge.strip() != '': + return [] + formats = [] manifest_version = '1.0' media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media') @@ -1055,7 +1061,7 @@ class InfoExtractor(object): def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None, entry_protocol='m3u8', preference=None, m3u8_id=None, note=None, errnote=None, - fatal=True): + fatal=True, live=False): formats = [{ 'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])), @@ -1133,7 +1139,11 @@ class InfoExtractor(object): if m3u8_id: format_id.append(m3u8_id) last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') != 'SUBTITLES' else None - format_id.append(last_media_name if last_media_name else '%d' % (tbr if tbr else len(formats))) + # Bandwidth of live streams may differ over time thus making + # format_id unpredictable. So it's better to keep provided + # format_id intact. 
+ if not live: + format_id.append(last_media_name if last_media_name else '%d' % (tbr if tbr else len(formats))) f = { 'format_id': '-'.join(format_id), 'url': format_url(line.strip()), diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index 8ae3f2890..90a64303d 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -11,7 +11,6 @@ from math import pow, sqrt, floor from .common import InfoExtractor from ..compat import ( compat_etree_fromstring, - compat_urllib_parse_unquote, compat_urllib_parse_urlencode, compat_urllib_request, compat_urlparse, @@ -27,6 +26,7 @@ from ..utils import ( unified_strdate, urlencode_postdata, xpath_text, + extract_attributes, ) from ..aes import ( aes_cbc_decrypt, @@ -306,28 +306,36 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text r'<a[^>]+href="/publisher/[^"]+"[^>]*>([^<]+)</a>', webpage, 'video_uploader', fatal=False) - playerdata_url = compat_urllib_parse_unquote(self._html_search_regex(r'"config_url":"([^"]+)', webpage, 'playerdata_url')) - playerdata_req = sanitized_Request(playerdata_url) - playerdata_req.data = urlencode_postdata({'current_page': webpage_url}) - playerdata_req.add_header('Content-Type', 'application/x-www-form-urlencoded') - playerdata = self._download_webpage(playerdata_req, video_id, note='Downloading media info') - - stream_id = self._search_regex(r'<media_id>([^<]+)', playerdata, 'stream_id') - video_thumbnail = self._search_regex(r'<episode_image_url>([^<]+)', playerdata, 'thumbnail', fatal=False) - + available_fmts = [] + for a, fmt in re.findall(r'(<a[^>]+token=["\']showmedia\.([0-9]{3,4})p["\'][^>]+>)', webpage): + attrs = extract_attributes(a) + href = attrs.get('href') + if href and '/freetrial' in href: + continue + available_fmts.append(fmt) + if not available_fmts: + for p in (r'token=["\']showmedia\.([0-9]{3,4})p"', r'showmedia\.([0-9]{3,4})p'): + available_fmts = re.findall(p, webpage) + if available_fmts: + break + video_encode_ids = [] formats = [] - for fmt in re.findall(r'showmedia\.([0-9]{3,4})p', webpage): + for fmt in available_fmts: stream_quality, stream_format = self._FORMAT_IDS[fmt] video_format = fmt + 'p' streamdata_req = sanitized_Request( 'http://www.crunchyroll.com/xml/?req=RpcApiVideoPlayer_GetStandardConfig&media_id=%s&video_format=%s&video_quality=%s' - % (stream_id, stream_format, stream_quality), + % (video_id, stream_format, stream_quality), compat_urllib_parse_urlencode({'current_page': url}).encode('utf-8')) streamdata_req.add_header('Content-Type', 'application/x-www-form-urlencoded') streamdata = self._download_xml( streamdata_req, video_id, note='Downloading media info for %s' % video_format) stream_info = streamdata.find('./{default}preload/stream_info') + video_encode_id = xpath_text(stream_info, './video_encode_id') + if video_encode_id in video_encode_ids: + continue + video_encode_ids.append(video_encode_id) video_url = xpath_text(stream_info, './host') video_play_path = xpath_text(stream_info, './file') if not video_url or not video_play_path: @@ -359,6 +367,14 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text 'ext': 'flv', }) formats.append(format_info) + self._sort_formats(formats) + + metadata = self._download_xml( + 'http://www.crunchyroll.com/xml', video_id, + note='Downloading media info', query={ + 'req': 'RpcApiVideoPlayer_GetMediaMetadata', + 'media_id': video_id, + }) subtitles = self.extract_subtitles(video_id, webpage) @@ -366,9 +382,12 
@@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text 'id': video_id, 'title': video_title, 'description': video_description, - 'thumbnail': video_thumbnail, + 'thumbnail': xpath_text(metadata, 'episode_image_url'), 'uploader': video_uploader, 'upload_date': video_upload_date, + 'series': xpath_text(metadata, 'series_title'), + 'episode': xpath_text(metadata, 'episode_title'), + 'episode_number': int_or_none(xpath_text(metadata, 'episode_number')), 'subtitles': subtitles, 'formats': formats, } diff --git a/youtube_dl/extractor/cwtv.py b/youtube_dl/extractor/cwtv.py index f5cefd966..ebd14cb16 100644 --- a/youtube_dl/extractor/cwtv.py +++ b/youtube_dl/extractor/cwtv.py @@ -9,7 +9,7 @@ from ..utils import ( class CWTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?cw(?:tv|seed)\.com/shows/(?:[^/]+/){2}\?play=(?P<id>[a-z0-9]{8}-[a-z0-9]{4}-[a-z0-9]{4}-[a-z0-9]{4}-[a-z0-9]{12})' + _VALID_URL = r'https?://(?:www\.)?cw(?:tv|seed)\.com/(?:shows/)?(?:[^/]+/){2}\?.*\bplay=(?P<id>[a-z0-9]{8}-[a-z0-9]{4}-[a-z0-9]{4}-[a-z0-9]{4}-[a-z0-9]{12})' _TESTS = [{ 'url': 'http://cwtv.com/shows/arrow/legends-of-yesterday/?play=6b15e985-9345-4f60-baf8-56e96be57c63', 'info_dict': { @@ -48,6 +48,9 @@ class CWTVIE(InfoExtractor): # m3u8 download 'skip_download': True, } + }, { + 'url': 'http://cwtv.com/thecw/chroniclesofcisco/?play=8adebe35-f447-465f-ab52-e863506ff6d6', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/dailymail.py b/youtube_dl/extractor/dailymail.py new file mode 100644 index 000000000..b60a1d813 --- /dev/null +++ b/youtube_dl/extractor/dailymail.py @@ -0,0 +1,61 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + determine_protocol, +) + + +class DailyMailIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?dailymail\.co\.uk/video/[^/]+/video-(?P<id>[0-9]+)' + _TEST = { + 'url': 'http://www.dailymail.co.uk/video/sciencetech/video-1288527/Turn-video-impressionist-masterpiece.html', + 'md5': '2f639d446394f53f3a33658b518b6615', + 'info_dict': { + 'id': '1288527', + 'ext': 'mp4', + 'title': 'Turn any video into an impressionist masterpiece', + 'description': 'md5:88ddbcb504367987b2708bb38677c9d2', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + video_data = self._parse_json(self._search_regex( + r"data-opts='({.+?})'", webpage, 'video data'), video_id) + title = video_data['title'] + video_sources = self._download_json(video_data.get( + 'sources', {}).get('url') or 'http://www.dailymail.co.uk/api/player/%s/video-sources.json' % video_id, video_id) + + formats = [] + for rendition in video_sources['renditions']: + rendition_url = rendition.get('url') + if not rendition_url: + continue + tbr = int_or_none(rendition.get('encodingRate'), 1000) + container = rendition.get('videoContainer') + is_hls = container == 'M2TS' + protocol = 'm3u8_native' if is_hls else determine_protocol({'url': rendition_url}) + formats.append({ + 'format_id': ('hls' if is_hls else protocol) + ('-%d' % tbr if tbr else ''), + 'url': rendition_url, + 'width': int_or_none(rendition.get('frameWidth')), + 'height': int_or_none(rendition.get('frameHeight')), + 'tbr': tbr, + 'vcodec': rendition.get('videoCodec'), + 'container': container, + 'protocol': protocol, + 'ext': 'mp4' if is_hls else None, + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': 
video_data.get('descr'), + 'thumbnail': video_data.get('poster') or video_data.get('thumbnail'), + 'formats': formats, + } diff --git a/youtube_dl/extractor/dfb.py b/youtube_dl/extractor/dfb.py index cdfeccacb..a4d0448c2 100644 --- a/youtube_dl/extractor/dfb.py +++ b/youtube_dl/extractor/dfb.py @@ -12,39 +12,46 @@ class DFBIE(InfoExtractor): _TEST = { 'url': 'http://tv.dfb.de/video/u-19-em-stimmen-zum-spiel-gegen-russland/11633/', - # The md5 is different each time + 'md5': 'ac0f98a52a330f700b4b3034ad240649', 'info_dict': { 'id': '11633', 'display_id': 'u-19-em-stimmen-zum-spiel-gegen-russland', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'U 19-EM: Stimmen zum Spiel gegen Russland', 'upload_date': '20150714', }, } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - display_id = mobj.group('display_id') + display_id, video_id = re.match(self._VALID_URL, url).groups() - webpage = self._download_webpage(url, display_id) player_info = self._download_xml( 'http://tv.dfb.de/server/hd_video.php?play=%s' % video_id, display_id) video_info = player_info.find('video') + stream_access_url = self._proto_relative_url(video_info.find('url').text.strip()) - f4m_info = self._download_xml( - self._proto_relative_url(video_info.find('url').text.strip()), display_id) - token_el = f4m_info.find('token') - manifest_url = token_el.attrib['url'] + '?' + 'hdnea=' + token_el.attrib['auth'] + '&hdcore=3.2.0' - formats = self._extract_f4m_formats(manifest_url, display_id) + formats = [] + # see http://tv.dfb.de/player/js/ajax.js for the method to extract m3u8 formats + for sa_url in (stream_access_url, stream_access_url + '&area=&format=iphone'): + stream_access_info = self._download_xml(sa_url, display_id) + token_el = stream_access_info.find('token') + manifest_url = token_el.attrib['url'] + '?' 
+ 'hdnea=' + token_el.attrib['auth'] + if '.f4m' in manifest_url: + formats.extend(self._extract_f4m_formats( + manifest_url + '&hdcore=3.2.0', + display_id, f4m_id='hds', fatal=False)) + else: + formats.extend(self._extract_m3u8_formats( + manifest_url, display_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False)) self._sort_formats(formats) return { 'id': video_id, 'display_id': display_id, 'title': video_info.find('title').text, - 'thumbnail': self._og_search_thumbnail(webpage), + 'thumbnail': 'http://tv.dfb.de/images/%s_640x360.jpg' % video_id, 'upload_date': unified_strdate(video_info.find('time_date').text), 'formats': formats, } diff --git a/youtube_dl/extractor/discovery.py b/youtube_dl/extractor/discovery.py index 5f1275b39..55853f76f 100644 --- a/youtube_dl/extractor/discovery.py +++ b/youtube_dl/extractor/discovery.py @@ -33,6 +33,7 @@ class DiscoveryIE(InfoExtractor): 'duration': 156, 'timestamp': 1302032462, 'upload_date': '20110405', + 'uploader_id': '103207', }, 'params': { 'skip_download': True, # requires ffmpeg @@ -54,7 +55,11 @@ class DiscoveryIE(InfoExtractor): 'upload_date': '20140725', 'timestamp': 1406246400, 'duration': 116, + 'uploader_id': '103207', }, + 'params': { + 'skip_download': True, # requires ffmpeg + } }] def _real_extract(self, url): @@ -66,13 +71,19 @@ class DiscoveryIE(InfoExtractor): entries = [] for idx, video_info in enumerate(info['playlist']): - formats = self._extract_m3u8_formats( - video_info['src'], display_id, 'mp4', 'm3u8_native', m3u8_id='hls', - note='Download m3u8 information for video %d' % (idx + 1)) - self._sort_formats(formats) + subtitles = {} + caption_url = video_info.get('captionsUrl') + if caption_url: + subtitles = { + 'en': [{ + 'url': caption_url, + }] + } + entries.append({ + '_type': 'url_transparent', + 'url': 'http://players.brightcove.net/103207/default_default/index.html?videoId=ref:%s' % video_info['referenceId'], 'id': compat_str(video_info['id']), - 'formats': formats, 'title': video_info['title'], 'description': video_info.get('description'), 'duration': parse_duration(video_info.get('video_length')), @@ -80,6 +91,7 @@ class DiscoveryIE(InfoExtractor): 'thumbnail': video_info.get('thumbnailURL'), 'alt_title': video_info.get('secondary_title'), 'timestamp': parse_iso8601(video_info.get('publishedDate')), + 'subtitles': subtitles, }) return self.playlist_result(entries, display_id, video_title) diff --git a/youtube_dl/extractor/dispeak.py b/youtube_dl/extractor/dispeak.py new file mode 100644 index 000000000..a78cb8a2a --- /dev/null +++ b/youtube_dl/extractor/dispeak.py @@ -0,0 +1,114 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_duration, + remove_end, + xpath_element, + xpath_text, +) + + +class DigitallySpeakingIE(InfoExtractor): + _VALID_URL = r'https?://(?:evt\.dispeak|events\.digitallyspeaking)\.com/(?:[^/]+/)+xml/(?P<id>[^.]+)\.xml' + + _TESTS = [{ + # From http://gdcvault.com/play/1023460/Tenacious-Design-and-The-Interface + 'url': 'http://evt.dispeak.com/ubm/gdc/sf16/xml/840376_BQRC.xml', + 'md5': 'a8efb6c31ed06ca8739294960b2dbabd', + 'info_dict': { + 'id': '840376_BQRC', + 'ext': 'mp4', + 'title': 'Tenacious Design and The Interface of \'Destiny\'', + }, + }, { + # From http://www.gdcvault.com/play/1014631/Classic-Game-Postmortem-PAC + 'url': 'http://events.digitallyspeaking.com/gdc/sf11/xml/12396_1299111843500GMPX.xml', + 'only_matching': True, + }] + + def _parse_mp4(self, metadata): + video_formats = [] + 
video_root = None + + mp4_video = xpath_text(metadata, './mp4video', default=None) + if mp4_video is not None: + mobj = re.match(r'(?P<root>https?://.*?/).*', mp4_video) + video_root = mobj.group('root') + if video_root is None: + http_host = xpath_text(metadata, 'httpHost', default=None) + if http_host: + video_root = 'http://%s/' % http_host + if video_root is None: + # Hard-coded in http://evt.dispeak.com/ubm/gdc/sf16/custom/player2.js + # Works for GPUTechConf, too + video_root = 'http://s3-2u.digitallyspeaking.com/' + + formats = metadata.findall('./MBRVideos/MBRVideo') + if not formats: + return None + for a_format in formats: + stream_name = xpath_text(a_format, 'streamName', fatal=True) + video_path = re.match(r'mp4\:(?P<path>.*)', stream_name).group('path') + url = video_root + video_path + vbr = xpath_text(a_format, 'bitrate') + video_formats.append({ + 'url': url, + 'vbr': int_or_none(vbr), + }) + return video_formats + + def _parse_flv(self, metadata): + formats = [] + akamai_url = xpath_text(metadata, './akamaiHost', fatal=True) + audios = metadata.findall('./audios/audio') + for audio in audios: + formats.append({ + 'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url, + 'play_path': remove_end(audio.get('url'), '.flv'), + 'ext': 'flv', + 'vcodec': 'none', + 'format_id': audio.get('code'), + }) + slide_video_path = xpath_text(metadata, './slideVideo', fatal=True) + formats.append({ + 'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url, + 'play_path': remove_end(slide_video_path, '.flv'), + 'ext': 'flv', + 'format_note': 'slide deck video', + 'quality': -2, + 'preference': -2, + 'format_id': 'slides', + }) + speaker_video_path = xpath_text(metadata, './speakerVideo', fatal=True) + formats.append({ + 'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url, + 'play_path': remove_end(speaker_video_path, '.flv'), + 'ext': 'flv', + 'format_note': 'speaker video', + 'quality': -1, + 'preference': -1, + 'format_id': 'speaker', + }) + return formats + + def _real_extract(self, url): + video_id = self._match_id(url) + + xml_description = self._download_xml(url, video_id) + metadata = xpath_element(xml_description, 'metadata') + + video_formats = self._parse_mp4(metadata) + if video_formats is None: + video_formats = self._parse_flv(metadata) + + return { + 'id': video_id, + 'formats': video_formats, + 'title': xpath_text(metadata, 'title', fatal=True), + 'duration': parse_duration(xpath_text(metadata, 'endTime')), + 'creator': xpath_text(metadata, 'speaker'), + } diff --git a/youtube_dl/extractor/douyutv.py b/youtube_dl/extractor/douyutv.py index 3915cb182..ce6962755 100644 --- a/youtube_dl/extractor/douyutv.py +++ b/youtube_dl/extractor/douyutv.py @@ -18,7 +18,7 @@ class DouyuTVIE(InfoExtractor): 'display_id': 'iseven', 'ext': 'flv', 'title': 're:^清晨醒脑!T-ara根本停不下来! [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', - 'description': 'md5:f34981259a03e980a3c6404190a3ed61', + 'description': 're:.*m7show@163\.com.*', 'thumbnail': 're:^https?://.*\.jpg$', 'uploader': '7师傅', 'uploader_id': '431925', @@ -43,7 +43,7 @@ class DouyuTVIE(InfoExtractor): 'params': { 'skip_download': True, }, - 'skip': 'Romm not found', + 'skip': 'Room not found', }, { 'url': 'http://www.douyutv.com/17732', 'info_dict': { @@ -51,7 +51,7 @@ class DouyuTVIE(InfoExtractor): 'display_id': '17732', 'ext': 'flv', 'title': 're:^清晨醒脑!T-ara根本停不下来! 
[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', - 'description': 'md5:f34981259a03e980a3c6404190a3ed61', + 'description': 're:.*m7show@163\.com.*', 'thumbnail': 're:^https?://.*\.jpg$', 'uploader': '7师傅', 'uploader_id': '431925', @@ -75,13 +75,28 @@ class DouyuTVIE(InfoExtractor): room_id = self._html_search_regex( r'"room_id"\s*:\s*(\d+),', page, 'room id') - prefix = 'room/%s?aid=android&client_sys=android&time=%d' % ( - room_id, int(time.time())) + config = None + # Douyu API sometimes returns error "Unable to load the requested class: eticket_redis_cache" + # Retry with different parameters - same parameters cause same errors + for i in range(5): + prefix = 'room/%s?aid=android&client_sys=android&time=%d' % ( + room_id, int(time.time())) + auth = hashlib.md5((prefix + '1231').encode('ascii')).hexdigest() - auth = hashlib.md5((prefix + '1231').encode('ascii')).hexdigest() - config = self._download_json( - 'http://www.douyutv.com/api/v1/%s&auth=%s' % (prefix, auth), - video_id) + config_page = self._download_webpage( + 'http://www.douyutv.com/api/v1/%s&auth=%s' % (prefix, auth), + video_id) + try: + config = self._parse_json(config_page, video_id, fatal=False) + except ExtractorError: + # Wait some time before retrying to get a different time() value + self._sleep(1, video_id, msg_template='%(video_id)s: Error occurs. ' + 'Waiting for %(timeout)s seconds before retrying') + continue + else: + break + if config is None: + raise ExtractorError('Unable to fetch API result') data = config['data'] diff --git a/youtube_dl/extractor/dplay.py b/youtube_dl/extractor/dplay.py index 66bbfc6ca..5790553f3 100644 --- a/youtube_dl/extractor/dplay.py +++ b/youtube_dl/extractor/dplay.py @@ -6,13 +6,18 @@ import re import time from .common import InfoExtractor -from ..utils import int_or_none +from ..compat import compat_urlparse +from ..utils import ( + int_or_none, + update_url_query, +) class DPlayIE(InfoExtractor): _VALID_URL = r'https?://(?P<domain>it\.dplay\.com|www\.dplay\.(?:dk|se|no))/[^/]+/(?P<id>[^/?#]+)' _TESTS = [{ + # geo restricted, via direct unsigned hls URL 'url': 'http://it.dplay.com/take-me-out/stagione-1-episodio-25/', 'info_dict': { 'id': '1255600', @@ -31,11 +36,12 @@ class DPlayIE(InfoExtractor): }, 'expected_warnings': ['Unable to download f4m manifest'], }, { + # non geo restricted, via secure api, unsigned download hls URL 'url': 'http://www.dplay.se/nugammalt-77-handelser-som-format-sverige/season-1-svensken-lar-sig-njuta-av-livet/', 'info_dict': { 'id': '3172', 'display_id': 'season-1-svensken-lar-sig-njuta-av-livet', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Svensken lär sig njuta av livet', 'description': 'md5:d3819c9bccffd0fe458ca42451dd50d8', 'duration': 2650, @@ -48,23 +54,25 @@ class DPlayIE(InfoExtractor): 'age_limit': 0, }, }, { + # geo restricted, via secure api, unsigned download hls URL 'url': 'http://www.dplay.dk/mig-og-min-mor/season-6-episode-12/', 'info_dict': { 'id': '70816', 'display_id': 'season-6-episode-12', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Episode 12', 'description': 'md5:9c86e51a93f8a4401fc9641ef9894c90', 'duration': 2563, 'timestamp': 1429696800, 'upload_date': '20150422', - 'creator': 'Kanal 4', + 'creator': 'Kanal 4 (Home)', 'series': 'Mig og min mor', 'season_number': 6, 'episode_number': 12, 'age_limit': 0, }, }, { + # geo restricted, via direct unsigned hls URL 'url': 'http://www.dplay.no/pga-tour/season-1-hoydepunkter-18-21-februar/', 'only_matching': True, }] @@ -90,17 +98,24 @@ class DPlayIE(InfoExtractor): def extract_formats(protocol, 
manifest_url): if protocol == 'hls': - formats.extend(self._extract_m3u8_formats( + m3u8_formats = self._extract_m3u8_formats( manifest_url, video_id, ext='mp4', - entry_protocol='m3u8_native', m3u8_id=protocol, fatal=False)) + entry_protocol='m3u8_native', m3u8_id=protocol, fatal=False) + # Sometimes final URLs inside m3u8 are unsigned, let's fix this + # ourselves + query = compat_urlparse.parse_qs(compat_urlparse.urlparse(manifest_url).query) + for m3u8_format in m3u8_formats: + m3u8_format['url'] = update_url_query(m3u8_format['url'], query) + formats.extend(m3u8_formats) elif protocol == 'hds': formats.extend(self._extract_f4m_formats( manifest_url + '&hdcore=3.8.0&plugin=flowplayer-3.8.0.0', video_id, f4m_id=protocol, fatal=False)) domain_tld = domain.split('.')[-1] - if domain_tld in ('se', 'dk'): + if domain_tld in ('se', 'dk', 'no'): for protocol in PROTOCOLS: + # Providing dsc-geo allows to bypass geo restriction in some cases self._set_cookie( 'secure.dplay.%s' % domain_tld, 'dsc-geo', json.dumps({ @@ -113,13 +128,24 @@ class DPlayIE(InfoExtractor): 'Downloading %s stream JSON' % protocol, fatal=False) if stream and stream.get(protocol): extract_formats(protocol, stream[protocol]) - else: + + # The last resort is to try direct unsigned hls/hds URLs from info dictionary. + # Sometimes this does work even when secure API with dsc-geo has failed (e.g. + # http://www.dplay.no/pga-tour/season-1-hoydepunkter-18-21-februar/). + if not formats: for protocol in PROTOCOLS: if info.get(protocol): extract_formats(protocol, info[protocol]) self._sort_formats(formats) + subtitles = {} + for lang in ('se', 'sv', 'da', 'nl', 'no'): + for format_id in ('web_vtt', 'vtt', 'srt'): + subtitle_url = info.get('subtitles_%s_%s' % (lang, format_id)) + if subtitle_url: + subtitles.setdefault(lang, []).append({'url': subtitle_url}) + return { 'id': video_id, 'display_id': display_id, @@ -133,4 +159,5 @@ class DPlayIE(InfoExtractor): 'episode_number': int_or_none(info.get('episode')), 'age_limit': int_or_none(info.get('minimum_age')), 'formats': formats, + 'subtitles': subtitles, } diff --git a/youtube_dl/extractor/dump.py b/youtube_dl/extractor/dump.py deleted file mode 100644 index ff78d4fd2..000000000 --- a/youtube_dl/extractor/dump.py +++ /dev/null @@ -1,39 +0,0 @@ -# encoding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor - - -class DumpIE(InfoExtractor): - _VALID_URL = r'^https?://(?:www\.)?dump\.com/(?P<id>[a-zA-Z0-9]+)/' - - _TEST = { - 'url': 'http://www.dump.com/oneus/', - 'md5': 'ad71704d1e67dfd9e81e3e8b42d69d99', - 'info_dict': { - 'id': 'oneus', - 'ext': 'flv', - 'title': "He's one of us.", - 'thumbnail': 're:^https?://.*\.jpg$', - }, - } - - def _real_extract(self, url): - m = re.match(self._VALID_URL, url) - video_id = m.group('id') - - webpage = self._download_webpage(url, video_id) - video_url = self._search_regex( - r's1.addVariable\("file",\s*"([^"]+)"', webpage, 'video URL') - - title = self._og_search_title(webpage) - thumbnail = self._og_search_thumbnail(webpage) - - return { - 'id': video_id, - 'title': title, - 'url': video_url, - 'thumbnail': thumbnail, - } diff --git a/youtube_dl/extractor/eagleplatform.py b/youtube_dl/extractor/eagleplatform.py index 7bbf617d4..113a4966f 100644 --- a/youtube_dl/extractor/eagleplatform.py +++ b/youtube_dl/extractor/eagleplatform.py @@ -4,9 +4,11 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_HTTPError from ..utils import ( 
ExtractorError, int_or_none, + url_basename, ) @@ -21,7 +23,7 @@ class EaglePlatformIE(InfoExtractor): _TESTS = [{ # http://lenta.ru/news/2015/03/06/navalny/ 'url': 'http://lentaru.media.eagleplatform.com/index/player?player=new&record_id=227304&player_template_id=5201', - 'md5': '70f5187fb620f2c1d503b3b22fd4efe3', + # Not checking MD5 as sometimes the direct HTTP link results in 404 and HLS is used 'info_dict': { 'id': '227304', 'ext': 'mp4', @@ -36,7 +38,7 @@ class EaglePlatformIE(InfoExtractor): # http://muz-tv.ru/play/7129/ # http://media.clipyou.ru/index/player?record_id=12820&width=730&height=415&autoplay=true 'url': 'eagleplatform:media.clipyou.ru:12820', - 'md5': '90b26344ba442c8e44aa4cf8f301164a', + 'md5': '358597369cf8ba56675c1df15e7af624', 'info_dict': { 'id': '12820', 'ext': 'mp4', @@ -55,8 +57,13 @@ class EaglePlatformIE(InfoExtractor): raise ExtractorError(' '.join(response['errors']), expected=True) def _download_json(self, url_or_request, video_id, note='Downloading JSON metadata'): - response = super(EaglePlatformIE, self)._download_json(url_or_request, video_id, note) - self._handle_error(response) + try: + response = super(EaglePlatformIE, self)._download_json(url_or_request, video_id, note) + except ExtractorError as ee: + if isinstance(ee.cause, compat_HTTPError): + response = self._parse_json(ee.cause.read().decode('utf-8'), video_id) + self._handle_error(response) + raise return response def _get_video_url(self, url_or_request, video_id, note='Downloading JSON metadata'): @@ -84,17 +91,33 @@ class EaglePlatformIE(InfoExtractor): secure_m3u8 = self._proto_relative_url(media['sources']['secure_m3u8']['auto'], 'http:') + formats = [] + m3u8_url = self._get_video_url(secure_m3u8, video_id, 'Downloading m3u8 JSON') - formats = self._extract_m3u8_formats( + m3u8_formats = self._extract_m3u8_formats( m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls') + formats.extend(m3u8_formats) mp4_url = self._get_video_url( # Secure mp4 URL is constructed according to Player.prototype.mp4 from # http://lentaru.media.eagleplatform.com/player/player.js re.sub(r'm3u8|hlsvod|hls|f4m', 'mp4', secure_m3u8), video_id, 'Downloading mp4 JSON') - formats.append({'url': mp4_url, 'format_id': 'mp4'}) + mp4_url_basename = url_basename(mp4_url) + for m3u8_format in m3u8_formats: + mobj = re.search('/([^/]+)/index\.m3u8', m3u8_format['url']) + if mobj: + http_format = m3u8_format.copy() + video_url = mp4_url.replace(mp4_url_basename, mobj.group(1)) + if not self._is_valid_url(video_url, video_id): + continue + http_format.update({ + 'url': video_url, + 'format_id': m3u8_format['format_id'].replace('hls', 'http'), + 'protocol': 'http', + }) + formats.append(http_format) self._sort_formats(formats) diff --git a/youtube_dl/extractor/ebaumsworld.py b/youtube_dl/extractor/ebaumsworld.py index b6bfd2b2d..c97682cd3 100644 --- a/youtube_dl/extractor/ebaumsworld.py +++ b/youtube_dl/extractor/ebaumsworld.py @@ -4,10 +4,10 @@ from .common import InfoExtractor class EbaumsWorldIE(InfoExtractor): - _VALID_URL = r'https?://www\.ebaumsworld\.com/video/watch/(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?ebaumsworld\.com/videos/[^/]+/(?P<id>\d+)' _TEST = { - 'url': 'http://www.ebaumsworld.com/video/watch/83367677/', + 'url': 'http://www.ebaumsworld.com/videos/a-giant-python-opens-the-door/83367677/', 'info_dict': { 'id': '83367677', 'ext': 'mp4', diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 5fb5acb4b..98fe46153 100644 --- 
a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -46,6 +46,7 @@ from .arte import ( ArteTVPlus7IE, ArteTVCreativeIE, ArteTVConcertIE, + ArteTVInfoIE, ArteTVFutureIE, ArteTVCinemaIE, ArteTVDDCIE, @@ -74,6 +75,7 @@ from .bigflix import BigflixIE from .bild import BildIE from .bilibili import BiliBiliIE from .biobiochiletv import BioBioChileTVIE +from .biqle import BIQLEIE from .bleacherreport import ( BleacherReportIE, BleacherReportCMSIE, @@ -122,7 +124,7 @@ from .chirbit import ( ChirbitProfileIE, ) from .cinchcast import CinchcastIE -from .cinemassacre import CinemassacreIE +from .cliprs import ClipRsIE from .clipfish import ClipfishIE from .cliphunter import CliphunterIE from .clipsyndicate import ClipsyndicateIE @@ -155,6 +157,7 @@ from .cspan import CSpanIE from .ctsnews import CtsNewsIE from .cultureunplugged import CultureUnpluggedIE from .cwtv import CWTVIE +from .dailymail import DailyMailIE from .dailymotion import ( DailymotionIE, DailymotionPlaylistIE, @@ -191,10 +194,10 @@ from .drbonanza import DRBonanzaIE from .drtuber import DrTuberIE from .drtv import DRTVIE from .dvtv import DVTVIE -from .dump import DumpIE from .dumpert import DumpertIE from .defense import DefenseGouvFrIE from .discovery import DiscoveryIE +from .dispeak import DigitallySpeakingIE from .dropbox import DropboxIE from .dw import ( DWIE, @@ -335,7 +338,6 @@ from .ivi import ( ) from .ivideon import IvideonIE from .izlesene import IzleseneIE -from .jadorecettepub import JadoreCettePubIE from .jeuxvideo import JeuxVideoIE from .jove import JoveIE from .jwplatform import JWPlatformIE @@ -381,6 +383,7 @@ from .limelight import ( LimelightChannelIE, LimelightChannelListIE, ) +from .litv import LiTVIE from .liveleak import LiveLeakIE from .livestream import ( LivestreamIE, @@ -399,19 +402,28 @@ from .macgamestore import MacGameStoreIE from .mailru import MailRuIE from .makerschannel import MakersChannelIE from .makertv import MakerTVIE -from .malemotion import MalemotionIE from .matchtv import MatchTVIE from .mdr import MDRIE from .metacafe import MetacafeIE from .metacritic import MetacriticIE from .mgoon import MgoonIE +from .mgtv import MGTVIE +from .microsoftvirtualacademy import ( + MicrosoftVirtualAcademyIE, + MicrosoftVirtualAcademyCourseIE, +) from .minhateca import MinhatecaIE from .ministrygrid import MinistryGridIE from .minoto import MinotoIE from .miomio import MioMioIE from .mit import TechTVMITIE, MITIE, OCWMITIE from .mitele import MiTeleIE -from .mixcloud import MixcloudIE +from .mixcloud import ( + MixcloudIE, + MixcloudUserIE, + MixcloudPlaylistIE, + MixcloudStreamIE, +) from .mlb import MLBIE from .mnet import MnetIE from .mpora import MporaIE @@ -419,7 +431,6 @@ from .moevideo import MoeVideoIE from .mofosex import MofosexIE from .mojvideo import MojvideoIE from .moniker import MonikerIE -from .mooshare import MooshareIE from .morningstar import MorningstarIE from .motherless import MotherlessIE from .motorsport import MotorsportIE @@ -433,8 +444,7 @@ from .mtv import ( ) from .muenchentv import MuenchenTVIE from .musicplayon import MusicPlayOnIE -from .muzu import MuzuTVIE -from .mwave import MwaveIE +from .mwave import MwaveIE, MwaveMeetGreetIE from .myspace import MySpaceIE, MySpaceAlbumIE from .myspass import MySpassIE from .myvi import MyviIE @@ -464,7 +474,6 @@ from .ndr import ( from .ndtv import NDTVIE from .netzkino import NetzkinoIE from .nerdcubed import NerdCubedFeedIE -from .nerdist import NerdistIE from .neteasemusic import ( NetEaseMusicIE, 
NetEaseMusicAlbumIE, @@ -485,9 +494,10 @@ from .nextmovie import NextMovieIE from .nfb import NFBIE from .nfl import NFLIE from .nhl import ( - NHLIE, - NHLNewsIE, NHLVideocenterIE, + NHLNewsIE, + NHLVideocenterCategoryIE, + NHLIE, ) from .nick import NickIE from .niconico import NiconicoIE, NiconicoPlaylistIE @@ -555,12 +565,15 @@ from .pandoratv import PandoraTVIE from .parliamentliveuk import ParliamentLiveUKIE from .patreon import PatreonIE from .pbs import PBSIE -from .periscope import PeriscopeIE +from .people import PeopleIE +from .periscope import ( + PeriscopeIE, + PeriscopeUserIE, +) from .philharmoniedeparis import PhilharmonieDeParisIE from .phoenix import PhoenixIE from .photobucket import PhotobucketIE from .pinkbike import PinkbikeIE -from .planetaplay import PlanetaPlayIE from .pladform import PladformIE from .played import PlayedIE from .playfm import PlayFMIE @@ -583,6 +596,7 @@ from .pornhub import ( from .pornotube import PornotubeIE from .pornovoisines import PornoVoisinesIE from .pornoxo import PornoXOIE +from .presstv import PressTVIE from .primesharetv import PrimeShareTVIE from .promptfile import PromptFileIE from .prosiebensat1 import ProSiebenSat1IE @@ -595,7 +609,6 @@ from .qqmusic import ( QQMusicToplistIE, QQMusicPlaylistIE, ) -from .quickvid import QuickVidIE from .r7 import R7IE from .radiode import RadioDeIE from .radiojavan import RadioJavanIE @@ -653,7 +666,6 @@ from .screenwavemedia import ScreenwaveMediaIE, TeamFourIE from .senateisvp import SenateISVPIE from .servingsys import ServingSysIE from .sexu import SexuIE -from .sexykarma import SexyKarmaIE from .shahid import ShahidIE from .shared import SharedIE from .sharesix import ShareSixIE @@ -670,10 +682,6 @@ from .smotri import ( SmotriUserIE, SmotriBroadcastIE, ) -from .snagfilms import ( - SnagFilmsIE, - SnagFilmsEmbedIE, -) from .snotr import SnotrIE from .sohu import SohuIE from .soundcloud import ( @@ -725,9 +733,13 @@ from .svt import ( from .swrmediathek import SWRMediathekIE from .syfy import SyfyIE from .sztvhu import SztvHuIE -from .tagesschau import TagesschauIE +from .tagesschau import ( + TagesschauPlayerIE, + TagesschauIE, +) from .tapely import TapelyIE from .tass import TassIE +from .tdslifeway import TDSLifewayIE from .teachertube import ( TeacherTubeIE, TeacherTubeUserIE, @@ -745,7 +757,6 @@ from .teletask import TeleTaskIE from .testurl import TestURLIE from .tf1 import TF1IE from .theintercept import TheInterceptIE -from .theonion import TheOnionIE from .theplatform import ( ThePlatformIE, ThePlatformFeedIE, @@ -823,7 +834,6 @@ from .twitch import ( TwitchVodIE, TwitchProfileIE, TwitchPastBroadcastsIE, - TwitchBookmarksIE, TwitchStreamIE, ) from .twitter import ( @@ -831,7 +841,6 @@ from .twitter import ( TwitterIE, TwitterAmplifyIE, ) -from .ubu import UbuIE from .udemy import ( UdemyIE, UdemyCourseIE @@ -842,14 +851,20 @@ from .unistra import UnistraIE from .urort import UrortIE from .usatoday import USATodayIE from .ustream import UstreamIE, UstreamChannelIE -from .ustudio import UstudioIE +from .ustudio import ( + UstudioIE, + UstudioEmbedIE, +) from .varzesh3 import Varzesh3IE from .vbox7 import Vbox7IE from .veehd import VeeHDIE from .veoh import VeohIE from .vessel import VesselIE from .vesti import VestiIE -from .vevo import VevoIE +from .vevo import ( + VevoIE, + VevoPlaylistIE, +) from .vgtv import ( BTArticleIE, BTVestlendingenIE, @@ -878,6 +893,10 @@ from .vidme import ( ) from .vidzi import VidziIE from .vier import VierIE, VierVideosIE +from .viewlift import ( + 
ViewLiftIE, + ViewLiftEmbedIE, +) from .viewster import ViewsterIE from .viidea import ViideaIE from .vimeo import ( @@ -916,7 +935,7 @@ from .vulture import VultureIE from .walla import WallaIE from .washingtonpost import WashingtonPostIE from .wat import WatIE -from .wayofthemaster import WayOfTheMasterIE +from .watchindianporn import WatchIndianPornIE from .wdr import ( WDRIE, WDRMobileIE, @@ -940,6 +959,12 @@ from .xhamster import ( XHamsterIE, XHamsterEmbedIE, ) +from .xiami import ( + XiamiSongIE, + XiamiAlbumIE, + XiamiArtistIE, + XiamiCollectionIE +) from .xminus import XMinusIE from .xnxx import XNXXIE from .xstream import XstreamIE diff --git a/youtube_dl/extractor/fczenit.py b/youtube_dl/extractor/fczenit.py index f1f150ef2..8d1010b88 100644 --- a/youtube_dl/extractor/fczenit.py +++ b/youtube_dl/extractor/fczenit.py @@ -1,20 +1,19 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor +from ..compat import compat_urlparse class FczenitIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?fc-zenit\.ru/video/gl(?P<id>[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?fc-zenit\.ru/video/(?P<id>[0-9]+)' _TEST = { - 'url': 'http://fc-zenit.ru/video/gl6785/', - 'md5': '458bacc24549173fe5a5aa29174a5606', + 'url': 'http://fc-zenit.ru/video/41044/', + 'md5': '0e3fab421b455e970fa1aa3891e57df0', 'info_dict': { - 'id': '6785', + 'id': '41044', 'ext': 'mp4', - 'title': '«Зенит-ТВ»: как Олег Шатов играл против «Урала»', + 'title': 'Так пишется история: казанский разгром ЦСКА на «Зенит-ТВ»', }, } @@ -22,15 +21,23 @@ class FczenitIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - video_title = self._html_search_regex(r'<div class=\"photoalbum__title\">([^<]+)', webpage, 'title') + video_title = self._html_search_regex( + r'<[^>]+class=\"photoalbum__title\">([^<]+)', webpage, 'title') - bitrates_raw = self._html_search_regex(r'bitrates:.*\n(.*)\]', webpage, 'video URL') - bitrates = re.findall(r'url:.?\'(.+?)\'.*?bitrate:.?([0-9]{3}?)', bitrates_raw) + video_items = self._parse_json(self._search_regex( + r'arrPath\s*=\s*JSON\.parse\(\'(.+)\'\)', webpage, 'video items'), + video_id) + + def merge_dicts(*dicts): + ret = {} + for a_dict in dicts: + ret.update(a_dict) + return ret formats = [{ - 'url': furl, - 'tbr': tbr, - } for furl, tbr in bitrates] + 'url': compat_urlparse.urljoin(url, video_url), + 'tbr': int(tbr), + } for tbr, video_url in merge_dicts(*video_items).items()] self._sort_formats(formats) diff --git a/youtube_dl/extractor/firsttv.py b/youtube_dl/extractor/firsttv.py index 98b165143..88bca1007 100644 --- a/youtube_dl/extractor/firsttv.py +++ b/youtube_dl/extractor/firsttv.py @@ -2,78 +2,133 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import int_or_none +from ..compat import compat_xpath +from ..utils import ( + int_or_none, + qualities, + unified_strdate, + xpath_attr, + xpath_element, + xpath_text, + xpath_with_ns, +) class FirstTVIE(InfoExtractor): IE_NAME = '1tv' IE_DESC = 'Первый канал' - _VALID_URL = r'https?://(?:www\.)?1tv\.ru/(?:[^/]+/)+(?P<id>.+)' + _VALID_URL = r'https?://(?:www\.)?1tv\.ru/(?:[^/]+/)+p?(?P<id>\d+)' _TESTS = [{ - 'url': 'http://www.1tv.ru/videoarchive/73390', - 'md5': '777f525feeec4806130f4f764bc18a4f', - 'info_dict': { - 'id': '73390', - 'ext': 'mp4', - 'title': 'Олимпийские канатные дороги', - 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', - 'thumbnail': 're:^https?://.*\.(?:jpg|JPG)$', - 'duration': 149, - 
'like_count': int, - 'dislike_count': int, - }, - 'skip': 'Only works from Russia', - }, { + # single format via video_materials.json API 'url': 'http://www.1tv.ru/prj/inprivate/vypusk/35930', - 'md5': 'a1b6b60d530ebcf8daacf4565762bbaf', + 'md5': '82a2777648acae812d58b3f5bd42882b', 'info_dict': { 'id': '35930', 'ext': 'mp4', - 'title': 'Наедине со всеми. Людмила Сенчина', - 'description': 'md5:89553aed1d641416001fe8d450f06cb9', + 'title': 'Гость Людмила Сенчина. Наедине со всеми. Выпуск от 12.02.2015', + 'description': 'md5:357933adeede13b202c7c21f91b871b2', 'thumbnail': 're:^https?://.*\.(?:jpg|JPG)$', + 'upload_date': '20150212', 'duration': 2694, }, - 'skip': 'Only works from Russia', + }, { + # multiple formats via video_materials.json API + 'url': 'http://www.1tv.ru/video_archive/projects/dobroeutro/p113641', + 'info_dict': { + 'id': '113641', + 'ext': 'mp4', + 'title': 'Весенняя аллергия. Доброе утро. Фрагмент выпуска от 07.04.2016', + 'description': 'md5:8dcebb3dded0ff20fade39087fd1fee2', + 'thumbnail': 're:^https?://.*\.(?:jpg|JPG)$', + 'upload_date': '20160407', + 'duration': 179, + 'formats': 'mincount:3', + }, + 'params': { + 'skip_download': True, + }, + }, { + # single format only available via ONE_ONLINE_VIDEOS.archive_single_xml API + 'url': 'http://www.1tv.ru/video_archive/series/f7552/p47038', + 'md5': '519d306c5b5669761fd8906c39dbee23', + 'info_dict': { + 'id': '47038', + 'ext': 'mp4', + 'title': '"Побег". Второй сезон. 3 серия', + 'description': 'md5:3abf8f6b9bce88201c33e9a3d794a00b', + 'thumbnail': 're:^https?://.*\.(?:jpg|JPG)$', + 'upload_date': '20120516', + 'duration': 3080, + }, + }, { + 'url': 'http://www.1tv.ru/videoarchive/9967', + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id, 'Downloading page') + # Videos with multiple formats only available via this API + video = self._download_json( + 'http://www.1tv.ru/video_materials.json?legacy_id=%s' % video_id, + video_id, fatal=False) - video_url = self._html_search_regex( - r'''(?s)(?:jwplayer\('flashvideoportal_1'\)\.setup\({|var\s+playlistObj\s*=).*?'file'\s*:\s*'([^']+)'.*?}\);''', - webpage, 'video URL') + description, thumbnail, upload_date, duration = [None] * 4 - title = self._html_search_regex( - [r'<div class="tv_translation">\s*<h1><a href="[^"]+">([^<]*)</a>', - r"'title'\s*:\s*'([^']+)'"], webpage, 'title') - description = self._html_search_regex( - r'<div class="descr">\s*<div> </div>\s*<p>([^<]*)</p></div>', - webpage, 'description', default=None) or self._html_search_meta( + if video: + item = video[0] + title = item['title'] + quality = qualities(('ld', 'sd', 'hd', )) + formats = [{ + 'url': f['src'], + 'format_id': f.get('name'), + 'quality': quality(f.get('name')), + } for f in item['mbr'] if f.get('src')] + thumbnail = item.get('poster') + else: + # Some videos are not available via video_materials.json + video = self._download_xml( + 'http://www.1tv.ru/owa/win/ONE_ONLINE_VIDEOS.archive_single_xml?pid=%s' % video_id, + video_id) + + NS_MAP = { + 'media': 'http://search.yahoo.com/mrss/', + } + + item = xpath_element(video, './channel/item', fatal=True) + title = xpath_text(item, './title', fatal=True) + formats = [{ + 'url': content.attrib['url'], + } for content in item.findall( + compat_xpath(xpath_with_ns('./media:content', NS_MAP))) if content.attrib.get('url')] + thumbnail = xpath_attr( + item, xpath_with_ns('./media:thumbnail', NS_MAP), 'url') + + self._sort_formats(formats) + + webpage = 
self._download_webpage(url, video_id, 'Downloading page', fatal=False) + if webpage: + title = self._html_search_regex( + (r'<div class="tv_translation">\s*<h1><a href="[^"]+">([^<]*)</a>', + r"'title'\s*:\s*'([^']+)'"), + webpage, 'title', default=None) or title + description = self._html_search_regex( + r'<div class="descr">\s*<div> </div>\s*<p>([^<]*)</p></div>', + webpage, 'description', default=None) or self._html_search_meta( 'description', webpage, 'description') - - thumbnail = self._og_search_thumbnail(webpage) - duration = self._og_search_property( - 'video:duration', webpage, - 'video duration', fatal=False) - - like_count = self._html_search_regex( - r'title="Понравилось".*?/></label> \[(\d+)\]', - webpage, 'like count', default=None) - dislike_count = self._html_search_regex( - r'title="Не понравилось".*?/></label> \[(\d+)\]', - webpage, 'dislike count', default=None) + thumbnail = thumbnail or self._og_search_thumbnail(webpage) + duration = int_or_none(self._html_search_meta( + 'video:duration', webpage, 'video duration', fatal=False)) + upload_date = unified_strdate(self._html_search_meta( + 'ya:ovs:upload_date', webpage, 'upload date', fatal=False)) return { 'id': video_id, - 'url': video_url, 'thumbnail': thumbnail, 'title': title, 'description': description, + 'upload_date': upload_date, 'duration': int_or_none(duration), - 'like_count': int_or_none(like_count), - 'dislike_count': int_or_none(dislike_count), + 'formats': formats } diff --git a/youtube_dl/extractor/flickr.py b/youtube_dl/extractor/flickr.py index 0a3de1498..a8e1bf42a 100644 --- a/youtube_dl/extractor/flickr.py +++ b/youtube_dl/extractor/flickr.py @@ -24,13 +24,28 @@ class FlickrIE(InfoExtractor): 'upload_date': '20110423', 'uploader_id': '10922353@N03', 'uploader': 'Forest Wander', + 'uploader_url': 'https://www.flickr.com/photos/forestwander-nature-pictures/', 'comment_count': int, 'view_count': int, 'tags': list, + 'license': 'Attribution-ShareAlike', } } - _API_BASE_URL = 'https://api.flickr.com/services/rest?' 
+ # https://help.yahoo.com/kb/flickr/SLN25525.html + _LICENSES = { + '0': 'All Rights Reserved', + '1': 'Attribution-NonCommercial-ShareAlike', + '2': 'Attribution-NonCommercial', + '3': 'Attribution-NonCommercial-NoDerivs', + '4': 'Attribution', + '5': 'Attribution-ShareAlike', + '6': 'Attribution-NoDerivs', + '7': 'No known copyright restrictions', + '8': 'United States government work', + '9': 'Public Domain Dedication (CC0)', + '10': 'Public Domain Work', + } def _call_api(self, method, video_id, api_key, note, secret=None): query = { @@ -75,6 +90,9 @@ class FlickrIE(InfoExtractor): self._sort_formats(formats) owner = video_info.get('owner', {}) + uploader_id = owner.get('nsid') + uploader_path = owner.get('path_alias') or uploader_id + uploader_url = 'https://www.flickr.com/photos/%s/' % uploader_path if uploader_path else None return { 'id': video_id, @@ -83,11 +101,13 @@ class FlickrIE(InfoExtractor): 'formats': formats, 'timestamp': int_or_none(video_info.get('dateuploaded')), 'duration': int_or_none(video_info.get('video', {}).get('duration')), - 'uploader_id': owner.get('nsid'), + 'uploader_id': uploader_id, 'uploader': owner.get('realname'), + 'uploader_url': uploader_url, 'comment_count': int_or_none(video_info.get('comments', {}).get('_content')), 'view_count': int_or_none(video_info.get('views')), - 'tags': [tag.get('_content') for tag in video_info.get('tags', {}).get('tag', [])] + 'tags': [tag.get('_content') for tag in video_info.get('tags', {}).get('tag', [])], + 'license': self._LICENSES.get(video_info.get('license')), } else: raise ExtractorError('not a video', expected=True) diff --git a/youtube_dl/extractor/funimation.py b/youtube_dl/extractor/funimation.py index 1eb528f31..0ad0d9b6a 100644 --- a/youtube_dl/extractor/funimation.py +++ b/youtube_dl/extractor/funimation.py @@ -2,6 +2,10 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..compat import ( + compat_HTTPError, + compat_urllib_parse_unquote_plus, +) from ..utils import ( clean_html, determine_ext, @@ -27,6 +31,7 @@ class FunimationIE(InfoExtractor): 'description': 'md5:1769f43cd5fc130ace8fd87232207892', 'thumbnail': 're:https?://.*\.jpg', }, + 'skip': 'Access without user interaction is forbidden by CloudFlare, and video removed', }, { 'url': 'http://www.funimation.com/shows/hacksign/videos/official/role-play', 'info_dict': { @@ -37,6 +42,7 @@ class FunimationIE(InfoExtractor): 'description': 'md5:b602bdc15eef4c9bbb201bb6e6a4a2dd', 'thumbnail': 're:https?://.*\.jpg', }, + 'skip': 'Access without user interaction is forbidden by CloudFlare', }, { 'url': 'http://www.funimation.com/shows/attack-on-titan-junior-high/videos/promotional/broadcast-dub-preview', 'info_dict': { @@ -47,8 +53,36 @@ class FunimationIE(InfoExtractor): 'description': 'md5:f8ec49c0aff702a7832cd81b8a44f803', 'thumbnail': 're:https?://.*\.(?:jpg|png)', }, + 'skip': 'Access without user interaction is forbidden by CloudFlare', }] + _LOGIN_URL = 'http://www.funimation.com/login' + + def _download_webpage(self, *args, **kwargs): + try: + return super(FunimationIE, self)._download_webpage(*args, **kwargs) + except ExtractorError as ee: + if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403: + response = ee.cause.read() + if b'>Please complete the security check to access<' in response: + raise ExtractorError( + 'Access to funimation.com is blocked by CloudFlare. 
' + 'Please browse to http://www.funimation.com/, solve ' + 'the reCAPTCHA, export browser cookies to a text file,' + ' and then try again with --cookies YOUR_COOKIE_FILE.', + expected=True) + raise + + def _extract_cloudflare_session_ua(self, url): + ci_session_cookie = self._get_cookies(url).get('ci_session') + if ci_session_cookie: + ci_session = compat_urllib_parse_unquote_plus(ci_session_cookie.value) + # ci_session is a string serialized by PHP function serialize() + # This case is simple enough to use regular expressions only + return self._search_regex( + r'"user_agent";s:\d+:"([^"]+)"', ci_session, 'user agent', + default=None) + def _login(self): (username, password) = self._get_login_info() if username is None: @@ -57,8 +91,11 @@ class FunimationIE(InfoExtractor): 'email_field': username, 'password_field': password, }) - login_request = sanitized_Request('http://www.funimation.com/login', data, headers={ - 'User-Agent': 'Mozilla/5.0 (Windows NT 5.2; WOW64; rv:42.0) Gecko/20100101 Firefox/42.0', + user_agent = self._extract_cloudflare_session_ua(self._LOGIN_URL) + if not user_agent: + user_agent = 'Mozilla/5.0 (Windows NT 5.2; WOW64; rv:42.0) Gecko/20100101 Firefox/42.0' + login_request = sanitized_Request(self._LOGIN_URL, data, headers={ + 'User-Agent': user_agent, 'Content-Type': 'application/x-www-form-urlencoded' }) login_page = self._download_webpage( @@ -103,11 +140,16 @@ class FunimationIE(InfoExtractor): ('mobile', 'Mozilla/5.0 (Linux; Android 4.4.2; Nexus 4 Build/KOT49H) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.114 Mobile Safari/537.36'), ) + user_agent = self._extract_cloudflare_session_ua(url) + if user_agent: + USER_AGENTS = ((None, user_agent),) + for kind, user_agent in USER_AGENTS: request = sanitized_Request(url) request.add_header('User-Agent', user_agent) webpage = self._download_webpage( - request, display_id, 'Downloading %s webpage' % kind) + request, display_id, + 'Downloading %s webpage' % kind if kind else 'Downloading webpage') playlist = self._parse_json( self._search_regex( diff --git a/youtube_dl/extractor/funnyordie.py b/youtube_dl/extractor/funnyordie.py index 4c4a87e2a..8c5ffc9e8 100644 --- a/youtube_dl/extractor/funnyordie.py +++ b/youtube_dl/extractor/funnyordie.py @@ -46,8 +46,8 @@ class FunnyOrDieIE(InfoExtractor): links.sort(key=lambda link: 1 if link[1] == 'mp4' else 0) m3u8_url = self._search_regex( - r'<source[^>]+src=(["\'])(?P<url>.+?/master\.m3u8)\1', - webpage, 'm3u8 url', default=None, group='url') + r'<source[^>]+src=(["\'])(?P<url>.+?/master\.m3u8[^"\']*)\1', + webpage, 'm3u8 url', group='url') formats = [] diff --git a/youtube_dl/extractor/gazeta.py b/youtube_dl/extractor/gazeta.py index ea32b621c..18ef5c252 100644 --- a/youtube_dl/extractor/gazeta.py +++ b/youtube_dl/extractor/gazeta.py @@ -7,7 +7,7 @@ from .common import InfoExtractor class GazetaIE(InfoExtractor): - _VALID_URL = r'(?P<url>https?://(?:www\.)?gazeta\.ru/(?:[^/]+/)?video/(?:(?:main|\d{4}/\d{2}/\d{2})/)?(?P<id>[A-Za-z0-9-_.]+)\.s?html)' + _VALID_URL = r'(?P<url>https?://(?:www\.)?gazeta\.ru/(?:[^/]+/)?video/(?:main/)*(?:\d{4}/\d{2}/\d{2}/)?(?P<id>[A-Za-z0-9-_.]+)\.s?html)' _TESTS = [{ 'url': 'http://www.gazeta.ru/video/main/zadaite_vopros_vladislavu_yurevichu.shtml', 'md5': 'd49c9bdc6e5a7888f27475dc215ee789', @@ -18,9 +18,19 @@ class GazetaIE(InfoExtractor): 'description': 'md5:38617526050bd17b234728e7f9620a71', 'thumbnail': 're:^https?://.*\.jpg', }, + 'skip': 'video not found', }, { 'url': 
'http://www.gazeta.ru/lifestyle/video/2015/03/08/master-klass_krasivoi_byt._delaem_vesennii_makiyazh.shtml', 'only_matching': True, + }, { + 'url': 'http://www.gazeta.ru/video/main/main/2015/06/22/platit_ili_ne_platit_po_isku_yukosa.shtml', + 'md5': '37f19f78355eb2f4256ee1688359f24c', + 'info_dict': { + 'id': '252048', + 'ext': 'mp4', + 'title': '"Если по иску ЮКОСа придется платить, это будет большой удар по бюджету"', + }, + 'add_ie': ['EaglePlatform'], }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/gdcvault.py b/youtube_dl/extractor/gdcvault.py index 25e93c9a4..3136427db 100644 --- a/youtube_dl/extractor/gdcvault.py +++ b/youtube_dl/extractor/gdcvault.py @@ -4,7 +4,6 @@ import re from .common import InfoExtractor from ..utils import ( - remove_end, HEADRequest, sanitized_Request, urlencode_postdata, @@ -51,63 +50,33 @@ class GDCVaultIE(InfoExtractor): { 'url': 'http://gdcvault.com/play/1020791/', 'only_matching': True, - } + }, + { + # Hard-coded hostname + 'url': 'http://gdcvault.com/play/1023460/Tenacious-Design-and-The-Interface', + 'md5': 'a8efb6c31ed06ca8739294960b2dbabd', + 'info_dict': { + 'id': '1023460', + 'ext': 'mp4', + 'display_id': 'Tenacious-Design-and-The-Interface', + 'title': 'Tenacious Design and The Interface of \'Destiny\'', + }, + }, + { + # Multiple audios + 'url': 'http://www.gdcvault.com/play/1014631/Classic-Game-Postmortem-PAC', + 'info_dict': { + 'id': '1014631', + 'ext': 'flv', + 'title': 'How to Create a Good Game - From My Experience of Designing Pac-Man', + }, + 'params': { + 'skip_download': True, # Requires rtmpdump + 'format': 'jp', # The japanese audio + } + }, ] - def _parse_mp4(self, xml_description): - video_formats = [] - mp4_video = xml_description.find('./metadata/mp4video') - if mp4_video is None: - return None - - mobj = re.match(r'(?P<root>https?://.*?/).*', mp4_video.text) - video_root = mobj.group('root') - formats = xml_description.findall('./metadata/MBRVideos/MBRVideo') - for format in formats: - mobj = re.match(r'mp4\:(?P<path>.*)', format.find('streamName').text) - url = video_root + mobj.group('path') - vbr = format.find('bitrate').text - video_formats.append({ - 'url': url, - 'vbr': int(vbr), - }) - return video_formats - - def _parse_flv(self, xml_description): - formats = [] - akamai_url = xml_description.find('./metadata/akamaiHost').text - audios = xml_description.find('./metadata/audios') - if audios is not None: - for audio in audios: - formats.append({ - 'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url, - 'play_path': remove_end(audio.get('url'), '.flv'), - 'ext': 'flv', - 'vcodec': 'none', - 'format_id': audio.get('code'), - }) - slide_video_path = xml_description.find('./metadata/slideVideo').text - formats.append({ - 'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url, - 'play_path': remove_end(slide_video_path, '.flv'), - 'ext': 'flv', - 'format_note': 'slide deck video', - 'quality': -2, - 'preference': -2, - 'format_id': 'slides', - }) - speaker_video_path = xml_description.find('./metadata/speakerVideo').text - formats.append({ - 'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url, - 'play_path': remove_end(speaker_video_path, '.flv'), - 'ext': 'flv', - 'format_note': 'speaker video', - 'quality': -1, - 'preference': -1, - 'format_id': 'speaker', - }) - return formats - def _login(self, webpage_url, display_id): (username, password) = self._get_login_info() if username is None or password is None: @@ -183,17 +152,10 @@ class GDCVaultIE(InfoExtractor): r'<iframe 
src=".*?\?xmlURL=xml/(?P<xml_file>.+?\.xml).*?".*?</iframe>', start_page, 'xml filename') - xml_description = self._download_xml( - '%s/xml/%s' % (xml_root, xml_name), display_id) - - video_title = xml_description.find('./metadata/title').text - video_formats = self._parse_mp4(xml_description) - if video_formats is None: - video_formats = self._parse_flv(xml_description) - return { + '_type': 'url_transparent', 'id': video_id, 'display_id': display_id, - 'title': video_title, - 'formats': video_formats, + 'url': '%s/xml/%s' % (xml_root, xml_name), + 'ie_key': 'DigitallySpeaking', } diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 589d1e152..0f1eb7fa6 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -51,7 +51,7 @@ from .tnaflix import TNAFlixNetworkEmbedIE from .vimeo import VimeoIE from .dailymotion import DailymotionCloudIE from .onionstudios import OnionStudiosIE -from .snagfilms import SnagFilmsEmbedIE +from .viewlift import ViewLiftEmbedIE from .screenwavemedia import ScreenwaveMediaIE from .mtv import MTVServicesEmbeddedIE from .pladform import PladformIE @@ -60,6 +60,7 @@ from .googledrive import GoogleDriveIE from .jwplatform import JWPlatformIE from .digiteka import DigitekaIE from .instagram import InstagramIE +from .liveleak import LiveLeakIE class GenericIE(InfoExtractor): @@ -104,7 +105,8 @@ class GenericIE(InfoExtractor): 'skip_download': True, # infinite live stream }, 'expected_warnings': [ - r'501.*Not Implemented' + r'501.*Not Implemented', + r'400.*Bad Request', ], }, # Direct link with incorrect MIME type @@ -235,6 +237,7 @@ class GenericIE(InfoExtractor): 'ext': 'mp4', 'title': 'car-20120827-manifest', 'formats': 'mincount:9', + 'upload_date': '20130904', }, 'params': { 'format': 'bestvideo', @@ -594,7 +597,11 @@ class GenericIE(InfoExtractor): 'id': 'k2mm4bCdJ6CQ2i7c8o2', 'ext': 'mp4', 'title': 'Le Zap de Spi0n n°216 - Zapping du Web', + 'description': 'md5:faf028e48a461b8b7fad38f1e104b119', 'uploader': 'Spi0n', + 'uploader_id': 'xgditw', + 'upload_date': '20140425', + 'timestamp': 1398441542, }, 'add_ie': ['Dailymotion'], }, @@ -727,8 +734,11 @@ class GenericIE(InfoExtractor): 'id': 'uxjb0lwrcz', 'ext': 'mp4', 'title': 'Conversation about Hexagonal Rails Part 1 - ThoughtWorks', + 'description': 'a Martin Fowler video from ThoughtWorks', 'duration': 1715.0, 'uploader': 'thoughtworks.wistia.com', + 'upload_date': '20140603', + 'timestamp': 1401832161, }, }, # Soundcloud embed @@ -877,6 +887,7 @@ class GenericIE(InfoExtractor): # Eagle.Platform embed (generic URL) { 'url': 'http://lenta.ru/news/2015/03/06/navalny/', + # Not checking MD5 as sometimes the direct HTTP link results in 404 and HLS is used 'info_dict': { 'id': '227304', 'ext': 'mp4', @@ -891,6 +902,7 @@ class GenericIE(InfoExtractor): # ClipYou (Eagle.Platform) embed (custom URL) { 'url': 'http://muz-tv.ru/play/7129/', + # Not checking MD5 as sometimes the direct HTTP link results in 404 and HLS is used 'info_dict': { 'id': '12820', 'ext': 'mp4', @@ -979,6 +991,9 @@ class GenericIE(InfoExtractor): 'ext': 'flv', 'title': "PFT Live: New leader in the 'new-look' defense", 'description': 'md5:65a19b4bbfb3b0c0c5768bed1dfad74e', + 'uploader': 'NBCU-SPORTS', + 'upload_date': '20140107', + 'timestamp': 1389118457, }, }, # UDN embed @@ -1031,6 +1046,9 @@ class GenericIE(InfoExtractor): 'title': 'SN Presents: Russell Martin, World Citizen', 'description': 'To understand why he was the Toronto Blue Jays’ top off-season priority is to appreciate his 
background and upbringing in Montreal, where he first developed his baseball skills. Written and narrated by Stephen Brunt.', 'uploader': 'Rogers Sportsnet', + 'uploader_id': '1704050871', + 'upload_date': '20150525', + 'timestamp': 1432570283, }, }, # Dailymotion Cloud video @@ -1122,12 +1140,39 @@ class GenericIE(InfoExtractor): 'title': 'The Cardinal Pell Interview', 'description': 'Sky News Contributor Andrew Bolt interviews George Pell in Rome, following the Cardinal\'s evidence before the Royal Commission into Child Abuse. ', 'uploader': 'GlobeCast Australia - GlobeStream', + 'uploader_id': '2733773828001', + 'upload_date': '20160304', + 'timestamp': 1457083087, }, 'params': { # m3u8 downloads 'skip_download': True, }, }, + # Another form of arte.tv embed + { + 'url': 'http://www.tv-replay.fr/redirection/09-04-16/arte-reportage-arte-11508975.html', + 'md5': '850bfe45417ddf221288c88a0cffe2e2', + 'info_dict': { + 'id': '030273-562_PLUS7-F', + 'ext': 'mp4', + 'title': 'ARTE Reportage - Nulle part, en France', + 'description': 'md5:e3a0e8868ed7303ed509b9e3af2b870d', + 'upload_date': '20160409', + }, + }, + # LiveLeak embed + { + 'url': 'http://www.wykop.pl/link/3088787/', + 'md5': 'ace83b9ed19b21f68e1b50e844fdf95d', + 'info_dict': { + 'id': '874_1459135191', + 'ext': 'mp4', + 'title': 'Man shows poor quality of new apartment building', + 'description': 'The wall is like a sand pile.', + 'uploader': 'Lake8737', + } + }, ] def report_following_redirect(self, new_url): @@ -1702,7 +1747,7 @@ class GenericIE(InfoExtractor): # Look for embedded arte.tv player mobj = re.search( - r'<script [^>]*?src="(?P<url>http://www\.arte\.tv/playerv2/embed[^"]+)"', + r'<(?:script|iframe) [^>]*?src="(?P<url>http://www\.arte\.tv/(?:playerv2/embed|arte_vp/index)[^"]+)"', webpage) if mobj is not None: return self.url_result(mobj.group('url'), 'ArteTVEmbed') @@ -1879,10 +1924,10 @@ class GenericIE(InfoExtractor): if onionstudios_url: return self.url_result(onionstudios_url) - # Look for SnagFilms embeds - snagfilms_url = SnagFilmsEmbedIE._extract_url(webpage) - if snagfilms_url: - return self.url_result(snagfilms_url) + # Look for ViewLift embeds + viewlift_url = ViewLiftEmbedIE._extract_url(webpage) + if viewlift_url: + return self.url_result(viewlift_url) # Look for JWPlatform embeds jwplatform_url = JWPlatformIE._extract_url(webpage) @@ -1930,7 +1975,13 @@ class GenericIE(InfoExtractor): # Look for Instagram embeds instagram_embed_url = InstagramIE._extract_embed_url(webpage) if instagram_embed_url is not None: - return self.url_result(instagram_embed_url, InstagramIE.ie_key()) + return self.url_result( + self._proto_relative_url(instagram_embed_url), InstagramIE.ie_key()) + + # Look for LiveLeak embeds + liveleak_url = LiveLeakIE._extract_url(webpage) + if liveleak_url: + return self.url_result(liveleak_url, 'LiveLeak') def check_video(vurl): if YoutubeIE.suitable(vurl): @@ -2013,6 +2064,7 @@ class GenericIE(InfoExtractor): entries = [] for video_url in found: + video_url = unescapeHTML(video_url) video_url = video_url.replace('\\/', '/') video_url = compat_urlparse.urljoin(url, video_url) video_id = compat_urllib_parse_unquote(os.path.basename(video_url)) diff --git a/youtube_dl/extractor/glide.py b/youtube_dl/extractor/glide.py index 9561ed5fb..62ff84835 100644 --- a/youtube_dl/extractor/glide.py +++ b/youtube_dl/extractor/glide.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..utils import unified_strdate class GlideIE(InfoExtractor): @@ -15,26 +16,38 @@ 
class GlideIE(InfoExtractor): 'ext': 'mp4', 'title': 'Damon Timm\'s Glide message', 'thumbnail': 're:^https?://.*?\.cloudfront\.net/.*\.jpg$', + 'uploader': 'Damon Timm', + 'upload_date': '20140919', } } def _real_extract(self, url): video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + title = self._html_search_regex( - r'<title>(.*?)', webpage, 'title') - video_url = self.http_scheme() + self._search_regex( - r'', webpage, 'video URL') - thumbnail_url = self._search_regex( - r'(.+?)', webpage, 'title') + video_url = self._proto_relative_url(self._search_regex( + r']+src=(["\'])(?P.+?)\1', + webpage, 'video URL', default=None, + group='url')) or self._og_search_video_url(webpage) + thumbnail = self._proto_relative_url(self._search_regex( + r']+id=["\']video-thumbnail["\'][^>]+src=(["\'])(?P.+?)\1', + webpage, 'thumbnail url', default=None, + group='url')) or self._og_search_thumbnail(webpage) + uploader = self._search_regex( + r']+class=["\']info-name["\'][^>]*>([^<]+)', + webpage, 'uploader', fatal=False) + upload_date = unified_strdate(self._search_regex( + r']+class="info-date"[^>]*>([^<]+)', + webpage, 'upload date', fatal=False)) return { 'id': video_id, 'title': title, 'url': video_url, 'thumbnail': thumbnail, + 'uploader': uploader, + 'upload_date': upload_date, } diff --git a/youtube_dl/extractor/goshgay.py b/youtube_dl/extractor/goshgay.py index 1d9166455..0c015141f 100644 --- a/youtube_dl/extractor/goshgay.py +++ b/youtube_dl/extractor/goshgay.py @@ -14,13 +14,13 @@ class GoshgayIE(InfoExtractor): _VALID_URL = r'https?://www\.goshgay\.com/video(?P\d+?)($|/)' _TEST = { 'url': 'http://www.goshgay.com/video299069/diesel_sfw_xxx_video', - 'md5': '027fcc54459dff0feb0bc06a7aeda680', + 'md5': '4b6db9a0a333142eb9f15913142b0ed1', 'info_dict': { 'id': '299069', 'ext': 'flv', 'title': 'DIESEL SFW XXX Video', 'thumbnail': 're:^http://.*\.jpg$', - 'duration': 79, + 'duration': 80, 'age_limit': 18, } } @@ -47,5 +47,5 @@ class GoshgayIE(InfoExtractor): 'title': title, 'thumbnail': thumbnail, 'duration': duration, - 'age_limit': self._family_friendly_search(webpage), + 'age_limit': 18, } diff --git a/youtube_dl/extractor/gputechconf.py b/youtube_dl/extractor/gputechconf.py index 145b55bf3..73dc62c49 100644 --- a/youtube_dl/extractor/gputechconf.py +++ b/youtube_dl/extractor/gputechconf.py @@ -2,12 +2,6 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import ( - xpath_element, - xpath_text, - int_or_none, - parse_duration, -) class GPUTechConfIE(InfoExtractor): @@ -27,29 +21,15 @@ class GPUTechConfIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - root_path = self._search_regex(r'var\s+rootPath\s*=\s*"([^"]+)', webpage, 'root path', 'http://evt.dispeak.com/nvidia/events/gtc15/') - xml_file_id = self._search_regex(r'var\s+xmlFileId\s*=\s*"([^"]+)', webpage, 'xml file id') - - doc = self._download_xml('%sxml/%s.xml' % (root_path, xml_file_id), video_id) - - metadata = xpath_element(doc, 'metadata') - http_host = xpath_text(metadata, 'httpHost', 'http host', True) - mbr_videos = xpath_element(metadata, 'MBRVideos') - - formats = [] - for mbr_video in mbr_videos.findall('MBRVideo'): - stream_name = xpath_text(mbr_video, 'streamName') - if stream_name: - formats.append({ - 'url': 'http://%s/%s' % (http_host, stream_name.replace('mp4:', '')), - 'tbr': int_or_none(xpath_text(mbr_video, 'bitrate')), - }) - self._sort_formats(formats) + root_path = self._search_regex( + 
r'var\s+rootPath\s*=\s*"([^"]+)', webpage, 'root path', + default='http://evt.dispeak.com/nvidia/events/gtc15/') + xml_file_id = self._search_regex( + r'var\s+xmlFileId\s*=\s*"([^"]+)', webpage, 'xml file id') return { + '_type': 'url_transparent', 'id': video_id, - 'title': xpath_text(metadata, 'title'), - 'duration': parse_duration(xpath_text(metadata, 'endTime')), - 'creator': xpath_text(metadata, 'speaker'), - 'formats': formats, + 'url': '%sxml/%s.xml' % (root_path, xml_file_id), + 'ie_key': 'DigitallySpeaking', } diff --git a/youtube_dl/extractor/groupon.py b/youtube_dl/extractor/groupon.py index 63c05b6a6..f6b69662b 100644 --- a/youtube_dl/extractor/groupon.py +++ b/youtube_dl/extractor/groupon.py @@ -16,14 +16,14 @@ class GrouponIE(InfoExtractor): 'playlist': [{ 'info_dict': { 'id': 'tubGNycTo_9Uxg82uESj4i61EYX8nyuf', - 'ext': 'mp4', + 'ext': 'flv', 'title': 'Bikram Yoga Huntington Beach | Orange County', 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', 'duration': 44.961, }, }], 'params': { - 'skip_download': 'HLS', + 'skip_download': 'HDS', } } @@ -32,7 +32,7 @@ class GrouponIE(InfoExtractor): webpage = self._download_webpage(url, playlist_id) payload = self._parse_json(self._search_regex( - r'var\s+payload\s*=\s*(.*?);\n', webpage, 'payload'), playlist_id) + r'(?:var\s+|window\.)payload\s*=\s*(.*?);\n', webpage, 'payload'), playlist_id) videos = payload['carousel'].get('dealVideos', []) entries = [] for v in videos: diff --git a/youtube_dl/extractor/howstuffworks.py b/youtube_dl/extractor/howstuffworks.py index 76b74c51d..65ba2a48b 100644 --- a/youtube_dl/extractor/howstuffworks.py +++ b/youtube_dl/extractor/howstuffworks.py @@ -24,6 +24,7 @@ class HowStuffWorksIE(InfoExtractor): 'thumbnail': 're:^https?://.*\.jpg$', 'duration': 161, }, + 'skip': 'Video broken', }, { 'url': 'http://adventure.howstuffworks.com/7199-survival-zone-food-and-water-in-the-savanna-video.htm', diff --git a/youtube_dl/extractor/huffpost.py b/youtube_dl/extractor/huffpost.py index a38eae421..059073749 100644 --- a/youtube_dl/extractor/huffpost.py +++ b/youtube_dl/extractor/huffpost.py @@ -4,6 +4,7 @@ import re from .common import InfoExtractor from ..utils import ( + determine_ext, parse_duration, unified_strdate, ) @@ -29,7 +30,12 @@ class HuffPostIE(InfoExtractor): 'description': 'This week on Legalese It, Mike talks to David Bosco about his new book on the ICC, "Rough Justice," he also discusses the Virginia AG\'s historic stance on gay marriage, the execution of Edgar Tamayo, the ICC\'s delay of Kenya\'s President and more. 
', 'duration': 1549, 'upload_date': '20140124', - } + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'expected_warnings': ['HTTP Error 404: Not Found'], } def _real_extract(self, url): @@ -45,7 +51,7 @@ class HuffPostIE(InfoExtractor): description = data.get('description') thumbnails = [] - for url in data['images'].values(): + for url in filter(None, data['images'].values()): m = re.match('.*-([0-9]+x[0-9]+)\.', url) if not m: continue @@ -54,13 +60,25 @@ class HuffPostIE(InfoExtractor): 'resolution': m.group(1), }) - formats = [{ - 'format': key, - 'format_id': key.replace('/', '.'), - 'ext': 'mp4', - 'url': url, - 'vcodec': 'none' if key.startswith('audio/') else None, - } for key, url in data.get('sources', {}).get('live', {}).items()] + formats = [] + sources = data.get('sources', {}) + live_sources = list(sources.get('live', {}).items()) + list(sources.get('live_again', {}).items()) + for key, url in live_sources: + ext = determine_ext(url) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + url, video_id, ext='mp4', m3u8_id='hls', fatal=False)) + elif ext == 'f4m': + formats.extend(self._extract_f4m_formats( + url + '?hdcore=2.9.5', video_id, f4m_id='hds', fatal=False)) + else: + formats.append({ + 'format': key, + 'format_id': key.replace('/', '.'), + 'ext': 'mp4', + 'url': url, + 'vcodec': 'none' if key.startswith('audio/') else None, + }) if not formats and data.get('fivemin_id'): return self.url_result('5min:%s' % data['fivemin_id']) diff --git a/youtube_dl/extractor/imdb.py b/youtube_dl/extractor/imdb.py index 8bed8ccd0..3a2b7cec5 100644 --- a/youtube_dl/extractor/imdb.py +++ b/youtube_dl/extractor/imdb.py @@ -1,10 +1,10 @@ from __future__ import unicode_literals import re -import json from .common import InfoExtractor from ..utils import ( + mimetype2ext, qualities, ) @@ -12,9 +12,9 @@ from ..utils import ( class ImdbIE(InfoExtractor): IE_NAME = 'imdb' IE_DESC = 'Internet Movie Database trailers' - _VALID_URL = r'https?://(?:www|m)\.imdb\.com/video/imdb/vi(?P\d+)' + _VALID_URL = r'https?://(?:www|m)\.imdb\.com/video/[^/]+/vi(?P\d+)' - _TEST = { + _TESTS = [{ 'url': 'http://www.imdb.com/video/imdb/vi2524815897', 'info_dict': { 'id': '2524815897', @@ -22,7 +22,10 @@ class ImdbIE(InfoExtractor): 'title': 'Ice Age: Continental Drift Trailer (No. 
2) - IMDb', 'description': 'md5:9061c2219254e5d14e03c25c98e96a81', } - } + }, { + 'url': 'http://www.imdb.com/video/_/vi2524815897', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) @@ -48,13 +51,27 @@ class ImdbIE(InfoExtractor): json_data = self._search_regex( r']+class="imdb-player-data"[^>]*?>(.*?)', format_page, 'json data', flags=re.DOTALL) - info = json.loads(json_data) - format_info = info['videoPlayerObject']['video'] - f_id = format_info['ffname'] + info = self._parse_json(json_data, video_id, fatal=False) + if not info: + continue + format_info = info.get('videoPlayerObject', {}).get('video', {}) + if not format_info: + continue + video_info_list = format_info.get('videoInfoList') + if not video_info_list or not isinstance(video_info_list, list): + continue + video_info = video_info_list[0] + if not video_info or not isinstance(video_info, dict): + continue + video_url = video_info.get('videoUrl') + if not video_url: + continue + format_id = format_info.get('ffname') formats.append({ - 'format_id': f_id, - 'url': format_info['videoInfoList'][0]['videoUrl'], - 'quality': quality(f_id), + 'format_id': format_id, + 'url': video_url, + 'ext': mimetype2ext(video_info.get('videoMimeType')), + 'quality': quality(format_id), }) self._sort_formats(formats) diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py index 11bb58d8a..3cbe77ad8 100644 --- a/youtube_dl/extractor/instagram.py +++ b/youtube_dl/extractor/instagram.py @@ -12,7 +12,7 @@ from ..utils import ( class InstagramIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?instagram\.com/p/(?P[^/?#&]+)' + _VALID_URL = r'(?Phttps?://(?:www\.)?instagram\.com/p/(?P[^/?#&]+))' _TESTS = [{ 'url': 'https://instagram.com/p/aye83DjauH/?foo=bar#abc', 'md5': '0d2da106a9d2631273e192b372806516', @@ -38,10 +38,19 @@ class InstagramIE(InfoExtractor): }, { 'url': 'https://instagram.com/p/-Cmh1cukG2/', 'only_matching': True, + }, { + 'url': 'http://instagram.com/p/9o6LshA7zy/embed/', + 'only_matching': True, }] @staticmethod def _extract_embed_url(webpage): + mobj = re.search( + r']+src=(["\'])(?P(?:https?:)?//(?:www\.)?instagram\.com/p/[^/]+/embed.*?)\1', + webpage) + if mobj: + return mobj.group('url') + blockquote_el = get_element_by_attribute( 'class', 'instagram-media', webpage) if blockquote_el is None: @@ -53,7 +62,9 @@ class InstagramIE(InfoExtractor): return mobj.group('link') def _real_extract(self, url): - video_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + url = mobj.group('url') webpage = self._download_webpage(url, video_id) uploader_id = self._search_regex(r'"owner":{"username":"(.+?)"', diff --git a/youtube_dl/extractor/internetvideoarchive.py b/youtube_dl/extractor/internetvideoarchive.py index e60145b3d..45add007f 100644 --- a/youtube_dl/extractor/internetvideoarchive.py +++ b/youtube_dl/extractor/internetvideoarchive.py @@ -1,93 +1,91 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..compat import ( + compat_parse_qs, compat_urlparse, - compat_urllib_parse_urlencode, ) from ..utils import ( - xpath_with_ns, + determine_ext, + int_or_none, + xpath_text, ) class InternetVideoArchiveIE(InfoExtractor): - _VALID_URL = r'https?://video\.internetvideoarchive\.net/flash/players/.*?\?.*?publishedid.*?' + _VALID_URL = r'https?://video\.internetvideoarchive\.net/(?:player|flash/players)/.*?\?.*?publishedid.*?' 
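The imdb.py rewrite above swaps hard json.loads plus direct indexing for defensive traversal: the player JSON is parsed leniently (fatal=False), each nested field is read with .get() and an isinstance check, and any format page missing a videoUrl is skipped with continue instead of aborting the whole extraction. A small self-contained sketch of that idiom, reusing the key names from the hunk (videoPlayerObject / video / videoInfoList / videoUrl / ffname) but with plain json and made-up sample data rather than the extractor's helpers:

    import json

    def extract_format(json_data):
        # Lenient parse: malformed input yields None, mirroring _parse_json(..., fatal=False).
        try:
            info = json.loads(json_data)
        except ValueError:
            return None
        if not isinstance(info, dict):
            return None
        # Walk the nested structure with .get() defaults so missing keys
        # fall through to None instead of raising KeyError/IndexError.
        format_info = info.get('videoPlayerObject', {}).get('video', {})
        video_info_list = format_info.get('videoInfoList')
        if not video_info_list or not isinstance(video_info_list, list):
            return None
        video_info = video_info_list[0]
        if not isinstance(video_info, dict):
            return None
        video_url = video_info.get('videoUrl')
        if not video_url:
            return None
        return {'format_id': format_info.get('ffname'), 'url': video_url}

    sample = json.dumps({'videoPlayerObject': {'video': {
        'ffname': 'HD 720',
        'videoInfoList': [{'videoUrl': 'https://example.com/clip.mp4'}],
    }}})
    print(extract_format(sample))      # format dict with url and format_id
    print(extract_format('not json'))  # None: this entry is skipped, the loop moves on

The payoff is that one broken or empty format page degrades to a skipped entry rather than an error for the whole video.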
_TEST = { - 'url': 'http://video.internetvideoarchive.net/flash/players/flashconfiguration.aspx?customerid=69249&publishedid=452693&playerid=247', + 'url': 'http://video.internetvideoarchive.net/player/6/configuration.ashx?customerid=69249&publishedid=194487&reporttag=vdbetatitle&playerid=641&autolist=0&domain=www.videodetective.com&maxrate=high&minrate=low&socialplayer=false', 'info_dict': { - 'id': '452693', + 'id': '194487', 'ext': 'mp4', - 'title': 'SKYFALL', - 'description': 'In SKYFALL, Bond\'s loyalty to M is tested as her past comes back to haunt her. As MI6 comes under attack, 007 must track down and destroy the threat, no matter how personal the cost.', - 'duration': 152, + 'title': 'KICK-ASS 2', + 'description': 'md5:c189d5b7280400630a1d3dd17eaa8d8a', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, } @staticmethod - def _build_url(query): - return 'http://video.internetvideoarchive.net/flash/players/flashconfiguration.aspx?' + query + def _build_json_url(query): + return 'http://video.internetvideoarchive.net/player/6/configuration.ashx?' + query @staticmethod - def _clean_query(query): - NEEDED_ARGS = ['publishedid', 'customerid'] - query_dic = compat_urlparse.parse_qs(query) - cleaned_dic = dict((k, v[0]) for (k, v) in query_dic.items() if k in NEEDED_ARGS) - # Other player ids return m3u8 urls - cleaned_dic['playerid'] = '247' - cleaned_dic['videokbrate'] = '100000' - return compat_urllib_parse_urlencode(cleaned_dic) + def _build_xml_url(query): + return 'http://video.internetvideoarchive.net/flash/players/flashconfiguration.aspx?' + query def _real_extract(self, url): query = compat_urlparse.urlparse(url).query - query_dic = compat_urlparse.parse_qs(query) + query_dic = compat_parse_qs(query) video_id = query_dic['publishedid'][0] - url = self._build_url(query) - flashconfiguration = self._download_xml(url, video_id, - 'Downloading flash configuration') - file_url = flashconfiguration.find('file').text - file_url = file_url.replace('/playlist.aspx', '/mrssplaylist.aspx') - # Replace some of the parameters in the query to get the best quality - # and http links (no m3u8 manifests) - file_url = re.sub(r'(?<=\?)(.+)$', - lambda m: self._clean_query(m.group()), - file_url) - info = self._download_xml(file_url, video_id, - 'Downloading video info') - item = info.find('channel/item') + if '/player/' in url: + configuration = self._download_json(url, video_id) - def _bp(p): - return xpath_with_ns( - p, - { - 'media': 'http://search.yahoo.com/mrss/', - 'jwplayer': 'http://developer.longtailvideo.com/trac/wiki/FlashFormats', - } - ) - formats = [] - for content in item.findall(_bp('media:group/media:content')): - attr = content.attrib - f_url = attr['url'] - width = int(attr['width']) - bitrate = int(attr['bitrate']) - format_id = '%d-%dk' % (width, bitrate) - formats.append({ - 'format_id': format_id, - 'url': f_url, - 'width': width, - 'tbr': bitrate, - }) + # There are multiple videos in the playlist while only the first one + # matches the video played in browsers + video_info = configuration['playlist'][0] - self._sort_formats(formats) + formats = [] + for source in video_info['sources']: + file_url = source['file'] + if determine_ext(file_url) == 'm3u8': + formats.extend(self._extract_m3u8_formats( + file_url, video_id, ext='mp4', m3u8_id='hls')) + else: + a_format = { + 'url': file_url, + } + + if source.get('label') and source['label'][-4:] == ' kbs': + tbr = int_or_none(source['label'][:-4]) + a_format.update({ + 'tbr': tbr, + 'format_id': 'http-%d' % tbr, + 
}) + formats.append(a_format) + + self._sort_formats(formats) + + title = video_info['title'] + description = video_info.get('description') + thumbnail = video_info.get('image') + else: + configuration = self._download_xml(url, video_id) + formats = [{ + 'url': xpath_text(configuration, './file', 'file URL', fatal=True), + }] + thumbnail = xpath_text(configuration, './image', 'thumbnail') + title = 'InternetVideoArchive video %s' % video_id + description = None return { 'id': video_id, - 'title': item.find('title').text, + 'title': title, 'formats': formats, - 'thumbnail': item.find(_bp('media:thumbnail')).attrib['url'], - 'description': item.find('description').text, - 'duration': int(attr['duration']), + 'thumbnail': thumbnail, + 'description': description, } diff --git a/youtube_dl/extractor/iqiyi.py b/youtube_dl/extractor/iqiyi.py index 88570f261..ddcb3c916 100644 --- a/youtube_dl/extractor/iqiyi.py +++ b/youtube_dl/extractor/iqiyi.py @@ -165,7 +165,7 @@ class IqiyiIE(InfoExtractor): IE_NAME = 'iqiyi' IE_DESC = '爱奇艺' - _VALID_URL = r'https?://(?:[^.]+\.)?iqiyi\.com/.+\.html' + _VALID_URL = r'https?://(?:(?:[^.]+\.)?iqiyi\.com|www\.pps\.tv)/.+\.html' _NETRC_MACHINE = 'iqiyi' @@ -273,6 +273,9 @@ class IqiyiIE(InfoExtractor): 'title': '灌篮高手 国语版', }, 'playlist_count': 101, + }, { + 'url': 'http://www.pps.tv/w_19rrbav0ph.html', + 'only_matching': True, }] _FORMATS_MAP = [ @@ -284,6 +287,13 @@ class IqiyiIE(InfoExtractor): ('10', 'h1'), ] + AUTH_API_ERRORS = { + # No preview available (不允许试看鉴权失败) + 'Q00505': 'This video requires a VIP account', + # End of preview time (试看结束鉴权失败) + 'Q00506': 'Needs a VIP account for full video', + } + def _real_initialize(self): self._login() @@ -369,14 +379,18 @@ class IqiyiIE(InfoExtractor): note='Downloading video authentication JSON', errnote='Unable to download video authentication JSON') - if auth_result['code'] == 'Q00505': # No preview available (不允许试看鉴权失败) - raise ExtractorError('This video requires a VIP account', expected=True) - if auth_result['code'] == 'Q00506': # End of preview time (试看结束鉴权失败) + code = auth_result.get('code') + msg = self.AUTH_API_ERRORS.get(code) or auth_result.get('msg') or code + if code == 'Q00506': if do_report_warning: - self.report_warning('Needs a VIP account for full video') + self.report_warning(msg) return False + if 'data' not in auth_result: + if msg is not None: + raise ExtractorError('%s said: %s' % (self.IE_NAME, msg), expected=True) + raise ExtractorError('Unexpected error from Iqiyi auth API') - return auth_result + return auth_result['data'] def construct_video_urls(self, data, video_id, _uuid, tvid): def do_xor(x, y): @@ -452,11 +466,11 @@ class IqiyiIE(InfoExtractor): need_vip_warning_report = False break param.update({ - 't': auth_result['data']['t'], + 't': auth_result['t'], # cid is hard-coded in com/qiyi/player/core/player/RuntimeData.as 'cid': 'afbe8fd3d73448c9', 'vid': video_id, - 'QY00001': auth_result['data']['u'], + 'QY00001': auth_result['u'], }) api_video_url += '?' if '?' not in api_video_url else '&' api_video_url += compat_urllib_parse_urlencode(param) @@ -491,7 +505,10 @@ class IqiyiIE(InfoExtractor): 'enc': md5_text(enc_key + tail), 'qyid': _uuid, 'tn': random.random(), - 'um': 0, + # In iQiyi's flash player, um is set to 1 if there's a logged user + # Some 1080P formats are only available with a logged user. 
+ # Here force um=1 to trick the iQiyi server + 'um': 1, 'authkey': md5_text(md5_text('') + tail), 'k_tag': 1, } diff --git a/youtube_dl/extractor/izlesene.py b/youtube_dl/extractor/izlesene.py index bc226fa67..aa0728abc 100644 --- a/youtube_dl/extractor/izlesene.py +++ b/youtube_dl/extractor/izlesene.py @@ -29,7 +29,7 @@ class IzleseneIE(InfoExtractor): 'ext': 'mp4', 'title': 'Sevinçten Çıldırtan Doğum Günü Hediyesi', 'description': 'md5:253753e2655dde93f59f74b572454f6d', - 'thumbnail': 're:^http://.*\.jpg', + 'thumbnail': 're:^https?://.*\.jpg', 'uploader_id': 'pelikzzle', 'timestamp': int, 'upload_date': '20140702', @@ -44,8 +44,7 @@ class IzleseneIE(InfoExtractor): 'id': '17997', 'ext': 'mp4', 'title': 'Tarkan Dortmund 2006 Konseri', - 'description': 'Tarkan Dortmund 2006 Konseri', - 'thumbnail': 're:^http://.*\.jpg', + 'thumbnail': 're:^https://.*\.jpg', 'uploader_id': 'parlayankiz', 'timestamp': int, 'upload_date': '20061112', @@ -62,7 +61,7 @@ class IzleseneIE(InfoExtractor): webpage = self._download_webpage(url, video_id) title = self._og_search_title(webpage) - description = self._og_search_description(webpage) + description = self._og_search_description(webpage, default=None) thumbnail = self._proto_relative_url( self._og_search_thumbnail(webpage), scheme='http:') diff --git a/youtube_dl/extractor/jadorecettepub.py b/youtube_dl/extractor/jadorecettepub.py deleted file mode 100644 index 158c09a33..000000000 --- a/youtube_dl/extractor/jadorecettepub.py +++ /dev/null @@ -1,47 +0,0 @@ -# coding: utf-8 - -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from .youtube import YoutubeIE - - -class JadoreCettePubIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?jadorecettepub\.com/[0-9]{4}/[0-9]{2}/(?P.*?)\.html' - - _TEST = { - 'url': 'http://www.jadorecettepub.com/2010/12/star-wars-massacre-par-les-japonais.html', - 'md5': '401286a06067c70b44076044b66515de', - 'info_dict': { - 'id': 'jLMja3tr7a4', - 'ext': 'mp4', - 'title': 'La pire utilisation de Star Wars', - 'description': "Jadorecettepub.com vous a gratifié de plusieurs pubs géniales utilisant Star Wars et Dark Vador plus particulièrement... Mais l'heure est venue de vous proposer une version totalement massacrée, venue du Japon. Quand les Japonais détruisent l'image de Star Wars pour vendre du thon en boite, ça promet...", - }, - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - display_id = mobj.group('id') - - webpage = self._download_webpage(url, display_id) - - title = self._html_search_regex( - r'(.*?)', - webpage, 'title') - description = self._html_search_regex( - r'(?s)
(.*?)', + r'\s*
\s*]+>([^<]+)', webpage, 'JS code') + decoded = self.openload_decode(code) + video_url = self._search_regex( - r'return\s+"(https?://[^"]+)"', self.openload_decode(code), 'video URL') + r'return\s+"(https?://[^"]+)"', decoded, 'video URL') + + title = self._og_search_title(webpage, default=None) or self._search_regex( + r']+class=["\']title["\'][^>]*>([^<]+)', webpage, + 'title', default=None) or self._html_search_meta( + 'description', webpage, 'title', fatal=True) + + ext = mimetype2ext(self._search_regex( + r'window\.vt\s*=\s*(["\'])(?P.+?)\1', decoded, + 'mimetype', default=None, group='mimetype')) or determine_ext( + video_url, 'mp4') return { 'id': video_id, - 'title': self._og_search_title(webpage), - 'thumbnail': self._og_search_thumbnail(webpage), + 'title': title, + 'ext': ext, + 'thumbnail': self._og_search_thumbnail(webpage, default=None), 'url': video_url, } diff --git a/youtube_dl/extractor/orf.py b/youtube_dl/extractor/orf.py index 66c75f8b3..4e3864f0d 100644 --- a/youtube_dl/extractor/orf.py +++ b/youtube_dl/extractor/orf.py @@ -185,6 +185,7 @@ class ORFFM4IE(InfoExtractor): 'timestamp': 1452456073, 'upload_date': '20160110', }, + 'skip': 'Live streams on FM4 got deleted soon', } def _real_extract(self, url): diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index f43e3a146..81918ac6e 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -196,7 +196,7 @@ class PBSIE(InfoExtractor): _TESTS = [ { 'url': 'http://www.pbs.org/tpt/constitution-usa-peter-sagal/watch/a-more-perfect-union/', - 'md5': 'ce1888486f0908d555a8093cac9a7362', + 'md5': '173dc391afd361fa72eab5d3d918968d', 'info_dict': { 'id': '2365006249', 'ext': 'mp4', @@ -204,13 +204,10 @@ class PBSIE(InfoExtractor): 'description': 'md5:36f341ae62e251b8f5bd2b754b95a071', 'duration': 3190, }, - 'params': { - 'skip_download': True, # requires ffmpeg - }, }, { 'url': 'http://www.pbs.org/wgbh/pages/frontline/losing-iraq/', - 'md5': '143c98aa54a346738a3d78f54c925321', + 'md5': '6f722cb3c3982186d34b0f13374499c7', 'info_dict': { 'id': '2365297690', 'ext': 'mp4', @@ -218,9 +215,6 @@ class PBSIE(InfoExtractor): 'description': 'md5:4d3eaa01f94e61b3e73704735f1196d9', 'duration': 5050, }, - 'params': { - 'skip_download': True, # requires ffmpeg - } }, { 'url': 'http://www.pbs.org/newshour/bb/education-jan-june12-cyberschools_02-23/', @@ -244,9 +238,6 @@ class PBSIE(InfoExtractor): 'duration': 6559, 'thumbnail': 're:^https?://.*\.jpg$', }, - 'params': { - 'skip_download': True, # requires ffmpeg - }, }, { 'url': 'http://www.pbs.org/wgbh/nova/earth/killer-typhoon.html', @@ -262,9 +253,6 @@ class PBSIE(InfoExtractor): 'upload_date': '20140122', 'age_limit': 10, }, - 'params': { - 'skip_download': True, # requires ffmpeg - }, }, { 'url': 'http://www.pbs.org/wgbh/pages/frontline/united-states-of-secrets/', @@ -290,6 +278,7 @@ class PBSIE(InfoExtractor): }, { 'url': 'http://www.pbs.org/video/2365245528/', + 'md5': '115223d41bd55cda8ae5cd5ed4e11497', 'info_dict': { 'id': '2365245528', 'display_id': '2365245528', @@ -299,15 +288,13 @@ class PBSIE(InfoExtractor): 'duration': 6851, 'thumbnail': 're:^https?://.*\.jpg$', }, - 'params': { - 'skip_download': True, # requires ffmpeg - }, }, { # Video embedded in iframe containing angle brackets as attribute's value (e.g. # "