diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md new file mode 100644 index 000000000..128ba2fc0 --- /dev/null +++ b/.github/ISSUE_TEMPLATE.md @@ -0,0 +1,58 @@ +## Please follow the guide below + +- You will be asked some questions and requested to provide some information, please read them **carefully** and answer honestly +- Put an `x` into all the boxes [ ] relevant to your *issue* (like that [x]) +- Use *Preview* tab to see how your issue will actually look like + +--- + +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.04.01*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.04.01** + +### Before submitting an *issue* make sure you have: +- [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections +- [ ] [Searched](https://github.com/rg3/youtube-dl/search?type=Issues) the bugtracker for similar issues including closed ones + +### What is the purpose of your *issue*? +- [ ] Bug report (encountered problems with youtube-dl) +- [ ] Site support request (request for adding support for a new site) +- [ ] Feature request (request for a new functionality) +- [ ] Question +- [ ] Other + +--- + +### The following sections concretize particular purposed issues, you can erase any section (the contents between triple ---) not applicable to your *issue* + +--- + +### If the purpose of this *issue* is a *bug report*, *site support request* or you are not completely sure provide the full verbose output as follows: + +Add `-v` flag to **your command line** you run youtube-dl with, copy the **whole** output and insert it here. It should look similar to one below (replace it with **your** log inserted between triple ```): +``` +$ youtube-dl -v +[debug] System config: [] +[debug] User config: [] +[debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] +[debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 +[debug] youtube-dl version 2016.04.01 +[debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 +[debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 +[debug] Proxy map: {} +... + +``` + +--- + +### If the purpose of this *issue* is a *site support request* please provide all kinds of example URLs support for which should be included (replace following example URLs by **yours**): +- Single video: https://www.youtube.com/watch?v=BaW_jenozKc +- Single video: https://youtu.be/BaW_jenozKc +- Playlist: https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc + +--- + +### Description of your *issue*, suggested solution and other information + +Explanation of your *issue* in arbitrary form goes here. Please make sure the [description is worded well enough to be understood](https://github.com/rg3/youtube-dl#is-the-description-of-the-issue-itself-sufficient). Provide as much context and examples as possible. +If work on your *issue* required an account credentials please provide them or explain how one can obtain them. diff --git a/.github/ISSUE_TEMPLATE_tmpl.md b/.github/ISSUE_TEMPLATE_tmpl.md new file mode 100644 index 000000000..a5e6a4233 --- /dev/null +++ b/.github/ISSUE_TEMPLATE_tmpl.md @@ -0,0 +1,58 @@ +## Please follow the guide below + +- You will be asked some questions and requested to provide some information, please read them **carefully** and answer honestly +- Put an `x` into all the boxes [ ] relevant to your *issue* (like that [x]) +- Use *Preview* tab to see how your issue will actually look like + +--- + +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *%(version)s*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **%(version)s** + +### Before submitting an *issue* make sure you have: +- [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections +- [ ] [Searched](https://github.com/rg3/youtube-dl/search?type=Issues) the bugtracker for similar issues including closed ones + +### What is the purpose of your *issue*? +- [ ] Bug report (encountered problems with youtube-dl) +- [ ] Site support request (request for adding support for a new site) +- [ ] Feature request (request for a new functionality) +- [ ] Question +- [ ] Other + +--- + +### The following sections concretize particular purposed issues, you can erase any section (the contents between triple ---) not applicable to your *issue* + +--- + +### If the purpose of this *issue* is a *bug report*, *site support request* or you are not completely sure provide the full verbose output as follows: + +Add `-v` flag to **your command line** you run youtube-dl with, copy the **whole** output and insert it here. It should look similar to one below (replace it with **your** log inserted between triple ```): +``` +$ youtube-dl -v +[debug] System config: [] +[debug] User config: [] +[debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] +[debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 +[debug] youtube-dl version %(version)s +[debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 +[debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 +[debug] Proxy map: {} +... + +``` + +--- + +### If the purpose of this *issue* is a *site support request* please provide all kinds of example URLs support for which should be included (replace following example URLs by **yours**): +- Single video: https://www.youtube.com/watch?v=BaW_jenozKc +- Single video: https://youtu.be/BaW_jenozKc +- Playlist: https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc + +--- + +### Description of your *issue*, suggested solution and other information + +Explanation of your *issue* in arbitrary form goes here. Please make sure the [description is worded well enough to be understood](https://github.com/rg3/youtube-dl#is-the-description-of-the-issue-itself-sufficient). Provide as much context and examples as possible. +If work on your *issue* required an account credentials please provide them or explain how one can obtain them. diff --git a/AUTHORS b/AUTHORS index aa48cd5a6..ea8d39978 100644 --- a/AUTHORS +++ b/AUTHORS @@ -163,3 +163,7 @@ Patrick Griffis Aidan Rowe mutantmonkey Ben Congdon +Kacper Michajłow +José Joaquín Atria +Viťas Strádal +Kagami Hiiragi diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index c996f03ab..0df6193fb 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -85,7 +85,7 @@ To run the test, simply invoke your favorite test runner, or execute a test file If you want to create a build of youtube-dl yourself, you'll need * python -* make +* make (both GNU make and BSD make are supported) * pandoc * zip * nosetests diff --git a/Makefile b/Makefile index e98806791..3a6c37944 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ -all: youtube-dl README.md CONTRIBUTING.md README.txt youtube-dl.1 youtube-dl.bash-completion youtube-dl.zsh youtube-dl.fish supportedsites +all: youtube-dl README.md CONTRIBUTING.md ISSUE_TEMPLATE.md README.txt youtube-dl.1 youtube-dl.bash-completion youtube-dl.zsh youtube-dl.fish supportedsites clean: - rm -rf youtube-dl.1.temp.md youtube-dl.1 youtube-dl.bash-completion README.txt MANIFEST build/ dist/ .coverage cover/ youtube-dl.tar.gz youtube-dl.zsh youtube-dl.fish *.dump *.part *.info.json *.mp4 *.flv *.mp3 *.avi CONTRIBUTING.md.tmp youtube-dl youtube-dl.exe + rm -rf youtube-dl.1.temp.md youtube-dl.1 youtube-dl.bash-completion README.txt MANIFEST build/ dist/ .coverage cover/ youtube-dl.tar.gz youtube-dl.zsh youtube-dl.fish *.dump *.part *.info.json *.mp4 *.flv *.mp3 *.avi CONTRIBUTING.md.tmp ISSUE_TEMPLATE.md.tmp youtube-dl youtube-dl.exe find . -name "*.pyc" -delete find . -name "*.class" -delete @@ -12,15 +12,7 @@ SHAREDIR ?= $(PREFIX)/share PYTHON ?= /usr/bin/env python # set SYSCONFDIR to /etc if PREFIX=/usr or PREFIX=/usr/local -ifeq ($(PREFIX),/usr) - SYSCONFDIR=/etc -else - ifeq ($(PREFIX),/usr/local) - SYSCONFDIR=/etc - else - SYSCONFDIR=$(PREFIX)/etc - endif -endif +SYSCONFDIR != if [ $(PREFIX) = /usr -o $(PREFIX) = /usr/local ]; then echo /etc; else echo $(PREFIX)/etc; fi install: youtube-dl youtube-dl.1 youtube-dl.bash-completion youtube-dl.zsh youtube-dl.fish install -d $(DESTDIR)$(BINDIR) @@ -67,6 +59,9 @@ README.md: youtube_dl/*.py youtube_dl/*/*.py CONTRIBUTING.md: README.md $(PYTHON) devscripts/make_contributing.py README.md CONTRIBUTING.md +ISSUE_TEMPLATE.md: + $(PYTHON) devscripts/make_issue_template.py .github/ISSUE_TEMPLATE_tmpl.md .github/ISSUE_TEMPLATE.md + supportedsites: $(PYTHON) devscripts/make_supportedsites.py docs/supportedsites.md diff --git a/README.md b/README.md index 68db546ef..e972bf69f 100644 --- a/README.md +++ b/README.md @@ -164,6 +164,8 @@ which means you can modify it, redistribute it or use it however you like. (e.g. 50K or 4.2M) -R, --retries RETRIES Number of retries (default is 10), or "infinite". + --fragment-retries RETRIES Number of retries for a fragment (default + is 10), or "infinite" (DASH only) --buffer-size SIZE Size of download buffer (e.g. 1024 or 16K) (default is 1024) --no-resize-buffer Do not automatically adjust the buffer @@ -376,8 +378,8 @@ which means you can modify it, redistribute it or use it however you like. --no-post-overwrites Do not overwrite post-processed files; the post-processed files are overwritten by default - --embed-subs Embed subtitles in the video (only for mkv - and mp4 videos) + --embed-subs Embed subtitles in the video (only for mp4, + webm and mkv videos) --embed-thumbnail Embed thumbnail in the audio as cover art --add-metadata Write metadata to the video file --metadata-from-title FORMAT Parse additional metadata like song title / @@ -598,6 +600,7 @@ Also filtering work for comparisons `=` (equals), `!=` (not equals), `^=` (begin - `vcodec`: Name of the video codec in use - `container`: Name of the container format - `protocol`: The protocol that will be used for the actual download, lower-case. `http`, `https`, `rtsp`, `rtmp`, `rtmpe`, `m3u8`, or `m3u8_native` + - `format_id`: A short description of the format Note that none of the aforementioned meta fields are guaranteed to be present since this solely depends on the metadata obtained by particular extractor, i.e. the metadata offered by video hoster. @@ -831,7 +834,7 @@ To run the test, simply invoke your favorite test runner, or execute a test file If you want to create a build of youtube-dl yourself, you'll need * python -* make +* make (both GNU make and BSD make are supported) * pandoc * zip * nosetests diff --git a/devscripts/make_issue_template.py b/devscripts/make_issue_template.py new file mode 100644 index 000000000..b7ad23d83 --- /dev/null +++ b/devscripts/make_issue_template.py @@ -0,0 +1,29 @@ +#!/usr/bin/env python +from __future__ import unicode_literals + +import io +import optparse + + +def main(): + parser = optparse.OptionParser(usage='%prog INFILE OUTFILE') + options, args = parser.parse_args() + if len(args) != 2: + parser.error('Expected an input and an output filename') + + infile, outfile = args + + with io.open(infile, encoding='utf-8') as inf: + issue_template_tmpl = inf.read() + + # Get the version from youtube_dl/version.py without importing the package + exec(compile(open('youtube_dl/version.py').read(), + 'youtube_dl/version.py', 'exec')) + + out = issue_template_tmpl % {'version': locals()['__version__']} + + with io.open(outfile, 'w', encoding='utf-8') as outf: + outf.write(out) + +if __name__ == '__main__': + main() diff --git a/devscripts/release.sh b/devscripts/release.sh index 61806961c..6718ce39b 100755 --- a/devscripts/release.sh +++ b/devscripts/release.sh @@ -45,9 +45,9 @@ fi /bin/echo -e "\n### Changing version in version.py..." sed -i "s/__version__ = '.*'/__version__ = '$version'/" youtube_dl/version.py -/bin/echo -e "\n### Committing documentation and youtube_dl/version.py..." -make README.md CONTRIBUTING.md supportedsites -git add README.md CONTRIBUTING.md docs/supportedsites.md youtube_dl/version.py +/bin/echo -e "\n### Committing documentation, templates and youtube_dl/version.py..." +make README.md CONTRIBUTING.md ISSUE_TEMPLATE.md supportedsites +git add README.md CONTRIBUTING.md .github/ISSUE_TEMPLATE.md docs/supportedsites.md youtube_dl/version.py git commit -m "release $version" /bin/echo -e "\n### Now tagging, signing and pushing..." diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 19681a3fb..60cc8ed4a 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -74,6 +74,7 @@ - **Bigflix** - **Bild**: Bild.de - **BiliBili** + - **BioBioChileTV** - **BleacherReport** - **BleacherReportCMS** - **blinkx** @@ -81,6 +82,7 @@ - **BokeCC** - **Bpb**: Bundeszentrale für politische Bildung - **BR**: Bayerischer Rundfunk Mediathek + - **BravoTV** - **Break** - **brightcove:legacy** - **brightcove:new** @@ -99,6 +101,7 @@ - **CBSNews**: CBS News - **CBSNewsLiveVideo**: CBS News Live Videos - **CBSSports** + - **CDA** - **CeskaTelevize** - **channel9**: Channel 9 - **Chaturbate** @@ -115,6 +118,7 @@ - **Clubic** - **Clyp** - **cmt.com** + - **CNBC** - **CNET** - **CNN** - **CNNArticle** @@ -131,6 +135,7 @@ - **CrooksAndLiars** - **Crunchyroll** - **crunchyroll:playlist** + - **CSNNE** - **CSpan**: C-SPAN - **CtsNews**: 華視新聞 - **culturebox.francetvinfo.fr** @@ -243,6 +248,7 @@ - **GPUTechConf** - **Groupon** - **Hark** + - **HBO** - **HearThisAt** - **Heise** - **HellPorno** @@ -343,6 +349,7 @@ - **MiTele**: mitele.es - **mixcloud** - **MLB** + - **Mnet** - **MoeVideo**: LetitBit video services: moevideo.net, playreplay.net and videochart.net - **Mofosex** - **Mojvideo** @@ -371,7 +378,8 @@ - **myvideo** (Currently broken) - **MyVidster** - **n-tv.de** - - **NationalGeographic** + - **natgeo** + - **natgeo:channel** - **Naver** - **NBA** - **NBC** @@ -439,6 +447,7 @@ - **OnionStudios** - **Ooyala** - **OoyalaExternal** + - **Openload** - **OraTV** - **orf:fm4**: radio FM4 - **orf:iptv**: iptv.ORF.at @@ -499,6 +508,7 @@ - **Restudy** - **ReverbNation** - **Revision3** + - **RICE** - **RingTV** - **RottenTomatoes** - **Roxwel** @@ -523,6 +533,7 @@ - **RUTV**: RUTV.RU - **Ruutu** - **safari**: safaribooksonline.com online video + - **safari:api** - **safari:course**: safaribooksonline.com online courses - **Sandia**: Sandia National Laboratories - **Sapo**: SAPO Vídeos @@ -610,13 +621,14 @@ - **Telegraaf** - **TeleMB** - **TeleTask** - - **TenPlay** - **TF1** - **TheIntercept** - **TheOnion** - **ThePlatform** - **ThePlatformFeed** + - **TheScene** - **TheSixtyOne** + - **TheStar** - **ThisAmericanLife** - **ThisAV** - **THVideo** @@ -650,6 +662,7 @@ - **tv.dfb.de** - **TV2** - **TV2Article** + - **TV3** - **TV4**: tv4.se and tv4play.se - **TVC** - **TVCArticle** @@ -730,6 +743,7 @@ - **vlive** - **Vodlocker** - **VoiceRepublic** + - **VoxMedia** - **Vporn** - **vpro**: npo.nl and ntr.nl - **VRT** @@ -783,6 +797,7 @@ - **youtube:channel**: YouTube.com channels - **youtube:favorites**: YouTube.com favourite videos, ":ytfav" for short (requires authentication) - **youtube:history**: Youtube watch history, ":ythistory" for short (requires authentication) + - **youtube:live**: YouTube.com live streams - **youtube:playlist**: YouTube.com playlists - **youtube:playlists**: YouTube.com user/channel playlists - **youtube:recommended**: YouTube.com recommended videos, ":ytrec" for short (requires authentication) diff --git a/setup.cfg b/setup.cfg index 26857750c..5760112d4 100644 --- a/setup.cfg +++ b/setup.cfg @@ -2,5 +2,5 @@ universal = True [flake8] -exclude = youtube_dl/extractor/__init__.py,devscripts/buildserver.py,setup.py,build,.git +exclude = youtube_dl/extractor/__init__.py,devscripts/buildserver.py,devscripts/make_issue_template.py,setup.py,build,.git ignore = E402,E501,E731 diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index efbee3b71..ca25025e2 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -222,6 +222,11 @@ class TestFormatSelection(unittest.TestCase): downloaded = ydl.downloaded_info_dicts[0] self.assertEqual(downloaded['format_id'], 'dash-video-low') + ydl = YDL({'format': 'bestvideo[format_id^=dash][format_id$=low]'}) + ydl.process_ie_result(info_dict.copy()) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded['format_id'], 'dash-video-low') + formats = [ {'format_id': 'vid-vcodec-dot', 'ext': 'mp4', 'preference': 1, 'vcodec': 'avc1.123456', 'acodec': 'none', 'url': TEST_URL}, ] diff --git a/test/test_compat.py b/test/test_compat.py index b6bfad05e..cc105807a 100644 --- a/test/test_compat.py +++ b/test/test_compat.py @@ -19,6 +19,7 @@ from youtube_dl.compat import ( compat_str, compat_urllib_parse_unquote, compat_urllib_parse_unquote_plus, + compat_urllib_parse_urlencode, ) @@ -70,6 +71,12 @@ class TestCompat(unittest.TestCase): self.assertEqual(compat_urllib_parse_unquote_plus('abc%20def'), 'abc def') self.assertEqual(compat_urllib_parse_unquote_plus('%7e/abc+def'), '~/abc def') + def test_compat_urllib_parse_urlencode(self): + self.assertEqual(compat_urllib_parse_urlencode({'abc': 'def'}), 'abc=def') + self.assertEqual(compat_urllib_parse_urlencode({'abc': b'def'}), 'abc=def') + self.assertEqual(compat_urllib_parse_urlencode({b'abc': 'def'}), 'abc=def') + self.assertEqual(compat_urllib_parse_urlencode({b'abc': b'def'}), 'abc=def') + def test_compat_shlex_split(self): self.assertEqual(compat_shlex_split('-option "one two"'), ['-option', 'one two']) diff --git a/test/test_http.py b/test/test_http.py index fc59b1aed..15e0ad369 100644 --- a/test/test_http.py +++ b/test/test_http.py @@ -1,4 +1,5 @@ #!/usr/bin/env python +# coding: utf-8 from __future__ import unicode_literals # Allow direct execution @@ -120,5 +121,14 @@ class TestProxy(unittest.TestCase): response = ydl.urlopen(req).read().decode('utf-8') self.assertEqual(response, 'cn: {0}'.format(url)) + def test_proxy_with_idn(self): + ydl = YoutubeDL({ + 'proxy': 'localhost:{0}'.format(self.port), + }) + url = 'http://中文.tw/' + response = ydl.urlopen(url).read().decode('utf-8') + # b'xn--fiq228c' is '中文'.encode('idna') + self.assertEqual(response, 'normal: http://xn--fiq228c.tw/') + if __name__ == '__main__': unittest.main() diff --git a/test/test_utils.py b/test/test_utils.py index bc28ceb34..a35debfe1 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -28,6 +28,7 @@ from youtube_dl.utils import ( encodeFilename, escape_rfc3986, escape_url, + extract_attributes, ExtractorError, find_xpath_attr, fix_xml_ampersands, @@ -77,6 +78,7 @@ from youtube_dl.utils import ( cli_bool_option, ) from youtube_dl.compat import ( + compat_chr, compat_etree_fromstring, compat_urlparse, compat_parse_qs, @@ -575,11 +577,11 @@ class TestUtil(unittest.TestCase): ) self.assertEqual( escape_url('http://тест.рф/фрагмент'), - 'http://тест.рф/%D1%84%D1%80%D0%B0%D0%B3%D0%BC%D0%B5%D0%BD%D1%82' + 'http://xn--e1aybc.xn--p1ai/%D1%84%D1%80%D0%B0%D0%B3%D0%BC%D0%B5%D0%BD%D1%82' ) self.assertEqual( escape_url('http://тест.рф/абв?абв=абв#абв'), - 'http://тест.рф/%D0%B0%D0%B1%D0%B2?%D0%B0%D0%B1%D0%B2=%D0%B0%D0%B1%D0%B2#%D0%B0%D0%B1%D0%B2' + 'http://xn--e1aybc.xn--p1ai/%D0%B0%D0%B1%D0%B2?%D0%B0%D0%B1%D0%B2=%D0%B0%D0%B1%D0%B2#%D0%B0%D0%B1%D0%B2' ) self.assertEqual(escape_url('http://vimeo.com/56015672#at=0'), 'http://vimeo.com/56015672#at=0') @@ -629,6 +631,44 @@ class TestUtil(unittest.TestCase): on = js_to_json('{"abc": "def",}') self.assertEqual(json.loads(on), {'abc': 'def'}) + def test_extract_attributes(self): + self.assertEqual(extract_attributes(''), {'x': 'y'}) + self.assertEqual(extract_attributes(""), {'x': 'y'}) + self.assertEqual(extract_attributes(''), {'x': 'y'}) + self.assertEqual(extract_attributes(''), {'x': "a 'b' c"}) + self.assertEqual(extract_attributes(''), {'x': 'a "b" c'}) + self.assertEqual(extract_attributes(''), {'x': 'y'}) + self.assertEqual(extract_attributes(''), {'x': 'y'}) + self.assertEqual(extract_attributes(''), {'x': '&'}) # XML + self.assertEqual(extract_attributes(''), {'x': '"'}) + self.assertEqual(extract_attributes(''), {'x': '£'}) # HTML 3.2 + self.assertEqual(extract_attributes(''), {'x': 'λ'}) # HTML 4.0 + self.assertEqual(extract_attributes(''), {'x': '&foo'}) + self.assertEqual(extract_attributes(''), {'x': "'"}) + self.assertEqual(extract_attributes(''), {'x': '"'}) + self.assertEqual(extract_attributes(''), {'x': None}) + self.assertEqual(extract_attributes(''), {'x': 'y', 'a': None}) + self.assertEqual(extract_attributes(''), {'x': 'y'}) + self.assertEqual(extract_attributes(''), {'y': '2', 'x': '3'}) + self.assertEqual(extract_attributes(''), {'x': 'y'}) + self.assertEqual(extract_attributes(''), {'x': 'y'}) + self.assertEqual(extract_attributes(""), {'x': 'y'}) + self.assertEqual(extract_attributes(''), {'x': '\ny\n'}) + self.assertEqual(extract_attributes(''), {'caps': 'x'}) # Names lowercased + self.assertEqual(extract_attributes(''), {'x': '2'}) + self.assertEqual(extract_attributes(''), {'x': '2'}) + self.assertEqual(extract_attributes(''), {'_:funny-name1': '1'}) + self.assertEqual(extract_attributes(''), {'x': 'Fáilte 世界 \U0001f600'}) + self.assertEqual(extract_attributes(''), {'x': 'décompose\u0301'}) + # "Narrow" Python builds don't support unicode code points outside BMP. + try: + compat_chr(0x10000) + supports_outside_bmp = True + except ValueError: + supports_outside_bmp = False + if supports_outside_bmp: + self.assertEqual(extract_attributes(''), {'x': 'Smile \U0001f600!'}) + def test_clean_html(self): self.assertEqual(clean_html('a:\nb'), 'a: b') self.assertEqual(clean_html('a:\n "b"'), 'a: "b"') @@ -662,6 +702,8 @@ class TestUtil(unittest.TestCase): self.assertEqual(parse_count('1.000'), 1000) self.assertEqual(parse_count('1.1k'), 1100) self.assertEqual(parse_count('1.1kk'), 1100000) + self.assertEqual(parse_count('1.1kk '), 1100000) + self.assertEqual(parse_count('1.1kk views'), 1100000) def test_version_tuple(self): self.assertEqual(version_tuple('1'), (1,)) diff --git a/tox.ini b/tox.ini index 48504329f..2d7134005 100644 --- a/tox.ini +++ b/tox.ini @@ -8,6 +8,6 @@ deps = passenv = HOME defaultargs = test --exclude test_download.py --exclude test_age_restriction.py --exclude test_subtitles.py --exclude test_write_annotations.py - --exclude test_youtube_lists.py + --exclude test_youtube_lists.py --exclude test_iqiyi_sdk_interpreter.py commands = nosetests --verbose {posargs:{[testenv]defaultargs}} # --with-coverage --cover-package=youtube_dl --cover-html # test.test_download:TestDownload.test_NowVideo diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 8c651cd52..d7aa951ff 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -39,6 +39,8 @@ from .compat import ( compat_urllib_request_DataHandler, ) from .utils import ( + age_restricted, + args_to_str, ContentTooShortError, date_from_str, DateRange, @@ -58,13 +60,16 @@ from .utils import ( PagedList, parse_filesize, PerRequestProxyHandler, - PostProcessingError, platform_name, + PostProcessingError, preferredencoding, + prepend_extension, render_table, + replace_extension, SameFileError, sanitize_filename, sanitize_path, + sanitize_url, sanitized_Request, std_headers, subtitles_filename, @@ -75,10 +80,6 @@ from .utils import ( write_string, YoutubeDLCookieProcessor, YoutubeDLHandler, - prepend_extension, - replace_extension, - args_to_str, - age_restricted, ) from .cache import Cache from .extractor import get_info_extractor, gen_extractors @@ -905,7 +906,7 @@ class YoutubeDL(object): '*=': lambda attr, value: value in attr, } str_operator_rex = re.compile(r'''(?x) - \s*(?Pext|acodec|vcodec|container|protocol) + \s*(?Pext|acodec|vcodec|container|protocol|format_id) \s*(?P%s)(?P\s*\?)? \s*(?P[a-zA-Z0-9._-]+) \s*$ @@ -1229,6 +1230,7 @@ class YoutubeDL(object): t.get('preference'), t.get('width'), t.get('height'), t.get('id'), t.get('url'))) for i, t in enumerate(thumbnails): + t['url'] = sanitize_url(t['url']) if t.get('width') and t.get('height'): t['resolution'] = '%dx%d' % (t['width'], t['height']) if t.get('id') is None: @@ -1263,6 +1265,8 @@ class YoutubeDL(object): if subtitles: for _, subtitle in subtitles.items(): for subtitle_format in subtitle: + if subtitle_format.get('url'): + subtitle_format['url'] = sanitize_url(subtitle_format['url']) if 'ext' not in subtitle_format: subtitle_format['ext'] = determine_ext(subtitle_format['url']).lower() @@ -1292,6 +1296,8 @@ class YoutubeDL(object): if 'url' not in format: raise ExtractorError('Missing "url" key in result (index %d)' % i) + format['url'] = sanitize_url(format['url']) + if format.get('format_id') is None: format['format_id'] = compat_str(i) else: @@ -1836,7 +1842,7 @@ class YoutubeDL(object): if fdict.get('language'): if res: res += ' ' - res += '[%s]' % fdict['language'] + res += '[%s] ' % fdict['language'] if fdict.get('format_note') is not None: res += fdict['format_note'] + ' ' if fdict.get('tbr') is not None: diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 79b389840..737f6545d 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -144,14 +144,20 @@ def _real_main(argv=None): if numeric_limit is None: parser.error('invalid max_filesize specified') opts.max_filesize = numeric_limit - if opts.retries is not None: - if opts.retries in ('inf', 'infinite'): - opts_retries = float('inf') + + def parse_retries(retries): + if retries in ('inf', 'infinite'): + parsed_retries = float('inf') else: try: - opts_retries = int(opts.retries) + parsed_retries = int(retries) except (TypeError, ValueError): parser.error('invalid retry count specified') + return parsed_retries + if opts.retries is not None: + opts.retries = parse_retries(opts.retries) + if opts.fragment_retries is not None: + opts.fragment_retries = parse_retries(opts.fragment_retries) if opts.buffersize is not None: numeric_buffersize = FileDownloader.parse_bytes(opts.buffersize) if numeric_buffersize is None: @@ -299,7 +305,8 @@ def _real_main(argv=None): 'force_generic_extractor': opts.force_generic_extractor, 'ratelimit': opts.ratelimit, 'nooverwrites': opts.nooverwrites, - 'retries': opts_retries, + 'retries': opts.retries, + 'fragment_retries': opts.fragment_retries, 'buffersize': opts.buffersize, 'noresizebuffer': opts.noresizebuffer, 'continuedl': opts.continue_dl, diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index 2771fb5fa..76b6b0e38 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -77,6 +77,11 @@ try: except ImportError: # Python 2 from urllib import urlretrieve as compat_urlretrieve +try: + from html.parser import HTMLParser as compat_HTMLParser +except ImportError: # Python 2 + from HTMLParser import HTMLParser as compat_HTMLParser + try: from subprocess import DEVNULL @@ -164,6 +169,31 @@ except ImportError: # Python 2 string = string.replace('+', ' ') return compat_urllib_parse_unquote(string, encoding, errors) +try: + from urllib.parse import urlencode as compat_urllib_parse_urlencode +except ImportError: # Python 2 + # Python 2 will choke in urlencode on mixture of byte and unicode strings. + # Possible solutions are to either port it from python 3 with all + # the friends or manually ensure input query contains only byte strings. + # We will stick with latter thus recursively encoding the whole query. + def compat_urllib_parse_urlencode(query, doseq=0, encoding='utf-8'): + def encode_elem(e): + if isinstance(e, dict): + e = encode_dict(e) + elif isinstance(e, (list, tuple,)): + e = encode_list(e) + elif isinstance(e, compat_str): + e = e.encode(encoding) + return e + + def encode_dict(d): + return dict((encode_elem(k), encode_elem(v)) for k, v in d.items()) + + def encode_list(l): + return [encode_elem(e) for e in l] + + return compat_urllib_parse.urlencode(encode_elem(query), doseq=doseq) + try: from urllib.request import DataHandler as compat_urllib_request_DataHandler except ImportError: # Python < 3.4 @@ -251,6 +281,16 @@ else: el.text = el.text.decode('utf-8') return doc +if sys.version_info < (2, 7): + # Here comes the crazy part: In 2.6, if the xpath is a unicode, + # .//node does not match if a node is a direct child of . ! + def compat_xpath(xpath): + if isinstance(xpath, compat_str): + xpath = xpath.encode('ascii') + return xpath +else: + compat_xpath = lambda xpath: xpath + try: from urllib.parse import parse_qs as compat_parse_qs except ImportError: # Python 2 @@ -543,6 +583,7 @@ else: from tokenize import generate_tokens as compat_tokenize_tokenize __all__ = [ + 'compat_HTMLParser', 'compat_HTTPError', 'compat_basestring', 'compat_chr', @@ -572,6 +613,7 @@ __all__ = [ 'compat_urllib_parse_unquote', 'compat_urllib_parse_unquote_plus', 'compat_urllib_parse_unquote_to_bytes', + 'compat_urllib_parse_urlencode', 'compat_urllib_parse_urlparse', 'compat_urllib_request', 'compat_urllib_request_DataHandler', @@ -579,6 +621,7 @@ __all__ = [ 'compat_urlparse', 'compat_urlretrieve', 'compat_xml_parse_error', + 'compat_xpath', 'shlex_quote', 'subprocess_check_output', 'workaround_optparse_bug9161', diff --git a/youtube_dl/downloader/common.py b/youtube_dl/downloader/common.py index f39db58f6..1dba9f49a 100644 --- a/youtube_dl/downloader/common.py +++ b/youtube_dl/downloader/common.py @@ -115,6 +115,10 @@ class FileDownloader(object): return '%10s' % '---b/s' return '%10s' % ('%s/s' % format_bytes(speed)) + @staticmethod + def format_retries(retries): + return 'inf' if retries == float('inf') else '%.0f' % retries + @staticmethod def best_block_size(elapsed_time, bytes): new_min = max(bytes / 2.0, 1.0) @@ -297,7 +301,9 @@ class FileDownloader(object): def report_retry(self, count, retries): """Report retry in case of HTTP error 5xx""" - self.to_screen('[download] Got server HTTP error. Retrying (attempt %d of %.0f)...' % (count, retries)) + self.to_screen( + '[download] Got server HTTP error. Retrying (attempt %d of %s)...' + % (count, self.format_retries(retries))) def report_file_already_downloaded(self, file_name): """Report file has already been fully downloaded.""" diff --git a/youtube_dl/downloader/dash.py b/youtube_dl/downloader/dash.py index 8b1b17c6e..8bbab9dbc 100644 --- a/youtube_dl/downloader/dash.py +++ b/youtube_dl/downloader/dash.py @@ -4,6 +4,7 @@ import os import re from .fragment import FragmentFD +from ..compat import compat_urllib_error from ..utils import ( sanitize_open, encodeFilename, @@ -36,20 +37,41 @@ class DashSegmentsFD(FragmentFD): segments_filenames = [] - def append_url_to_file(target_url, target_filename): - success = ctx['dl'].download(target_filename, {'url': combine_url(base_url, target_url)}) - if not success: + fragment_retries = self.params.get('fragment_retries', 0) + + def append_url_to_file(target_url, tmp_filename, segment_name): + target_filename = '%s-%s' % (tmp_filename, segment_name) + count = 0 + while count <= fragment_retries: + try: + success = ctx['dl'].download(target_filename, {'url': combine_url(base_url, target_url)}) + if not success: + return False + down, target_sanitized = sanitize_open(target_filename, 'rb') + ctx['dest_stream'].write(down.read()) + down.close() + segments_filenames.append(target_sanitized) + break + except (compat_urllib_error.HTTPError, ) as err: + # YouTube may often return 404 HTTP error for a fragment causing the + # whole download to fail. However if the same fragment is immediately + # retried with the same request data this usually succeeds (1-2 attemps + # is usually enough) thus allowing to download the whole file successfully. + # So, we will retry all fragments that fail with 404 HTTP error for now. + if err.code != 404: + raise + # Retry fragment + count += 1 + if count <= fragment_retries: + self.report_retry_fragment(segment_name, count, fragment_retries) + if count > fragment_retries: + self.report_error('giving up after %s fragment retries' % fragment_retries) return False - down, target_sanitized = sanitize_open(target_filename, 'rb') - ctx['dest_stream'].write(down.read()) - down.close() - segments_filenames.append(target_sanitized) if initialization_url: - append_url_to_file(initialization_url, ctx['tmpfilename'] + '-Init') + append_url_to_file(initialization_url, ctx['tmpfilename'], 'Init') for i, segment_url in enumerate(segment_urls): - segment_filename = '%s-Seg%d' % (ctx['tmpfilename'], i) - append_url_to_file(segment_url, segment_filename) + append_url_to_file(segment_url, ctx['tmpfilename'], 'Seg%d' % i) self._finish_frag_download(ctx) diff --git a/youtube_dl/downloader/f4m.py b/youtube_dl/downloader/f4m.py index fc9642905..664d87543 100644 --- a/youtube_dl/downloader/f4m.py +++ b/youtube_dl/downloader/f4m.py @@ -223,6 +223,12 @@ def write_metadata_tag(stream, metadata): write_unsigned_int(stream, FLV_TAG_HEADER_LEN + len(metadata)) +def remove_encrypted_media(media): + return list(filter(lambda e: 'drmAdditionalHeaderId' not in e.attrib and + 'drmAdditionalHeaderSetId' not in e.attrib, + media)) + + def _add_ns(prop): return '{http://ns.adobe.com/f4m/1.0}%s' % prop @@ -244,9 +250,7 @@ class F4mFD(FragmentFD): # without drmAdditionalHeaderId or drmAdditionalHeaderSetId attribute if 'id' not in e.attrib: self.report_error('Missing ID in f4m DRM') - media = list(filter(lambda e: 'drmAdditionalHeaderId' not in e.attrib and - 'drmAdditionalHeaderSetId' not in e.attrib, - media)) + media = remove_encrypted_media(media) if not media: self.report_error('Unsupported DRM') return media diff --git a/youtube_dl/downloader/fragment.py b/youtube_dl/downloader/fragment.py index a5bae9669..ba903ae10 100644 --- a/youtube_dl/downloader/fragment.py +++ b/youtube_dl/downloader/fragment.py @@ -19,8 +19,17 @@ class HttpQuietDownloader(HttpFD): class FragmentFD(FileDownloader): """ A base file downloader class for fragmented media (e.g. f4m/m3u8 manifests). + + Available options: + + fragment_retries: Number of times to retry a fragment for HTTP error (DASH only) """ + def report_retry_fragment(self, fragment_name, count, retries): + self.to_screen( + '[download] Got server HTTP error. Retrying fragment %s (attempt %d of %s)...' + % (fragment_name, count, self.format_retries(retries))) + def _prepare_and_start_frag_download(self, ctx): self._prepare_frag_download(ctx) self._start_frag_download(ctx) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 2a29451eb..f4a2ecaa9 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -72,6 +72,7 @@ from .bet import BetIE from .bigflix import BigflixIE from .bild import BildIE from .bilibili import BiliBiliIE +from .biobiochiletv import BioBioChileTVIE from .bleacherreport import ( BleacherReportIE, BleacherReportCMSIE, @@ -81,6 +82,7 @@ from .bloomberg import BloombergIE from .bokecc import BokeCCIE from .bpb import BpbIE from .br import BRIE +from .bravotv import BravoTVIE from .breakcom import BreakIE from .brightcove import ( BrightcoveLegacyIE, @@ -93,6 +95,7 @@ from .camdemy import ( CamdemyIE, CamdemyFolderIE ) +from .camwithher import CamWithHerIE from .canalplus import CanalplusIE from .canalc2 import Canalc2IE from .canvas import CanvasIE @@ -101,12 +104,14 @@ from .cbc import ( CBCPlayerIE, ) from .cbs import CBSIE +from .cbsinteractive import CBSInteractiveIE from .cbsnews import ( CBSNewsIE, CBSNewsLiveVideoIE, ) from .cbssports import CBSSportsIE from .ccc import CCCIE +from .cda import CDAIE from .ceskatelevize import CeskaTelevizeIE from .channel9 import Channel9IE from .chaturbate import ChaturbateIE @@ -124,7 +129,7 @@ from .cloudy import CloudyIE from .clubic import ClubicIE from .clyp import ClypIE from .cmt import CMTIE -from .cnet import CNETIE +from .cnbc import CNBCIE from .cnn import ( CNNIE, CNNBlogsIE, @@ -135,6 +140,7 @@ from .collegerama import CollegeRamaIE from .comedycentral import ComedyCentralIE, ComedyCentralShowsIE from .comcarcoff import ComCarCoffIE from .commonmistakes import CommonMistakesIE, UnicodeBOMIE +from .commonprotocols import RtmpIE from .condenast import CondeNastIE from .cracked import CrackedIE from .crackle import CrackleIE @@ -282,6 +288,7 @@ from .goshgay import GoshgayIE from .gputechconf import GPUTechConfIE from .groupon import GrouponIE from .hark import HarkIE +from .hbo import HBOIE from .hearthisat import HearThisAtIE from .heise import HeiseIE from .hellporno import HellPornoIE @@ -405,6 +412,7 @@ from .mit import TechTVMITIE, MITIE, OCWMITIE from .mitele import MiTeleIE from .mixcloud import MixcloudIE from .mlb import MLBIE +from .mnet import MnetIE from .mpora import MporaIE from .moevideo import MoeVideoIE from .mofosex import MofosexIE @@ -431,10 +439,14 @@ from .myspass import MySpassIE from .myvi import MyviIE from .myvideo import MyVideoIE from .myvidster import MyVidsterIE -from .nationalgeographic import NationalGeographicIE +from .nationalgeographic import ( + NationalGeographicIE, + NationalGeographicChannelIE, +) from .naver import NaverIE from .nba import NBAIE from .nbc import ( + CSNNEIE, NBCIE, NBCNewsIE, NBCSportsIE, @@ -530,6 +542,7 @@ from .ooyala import ( OoyalaIE, OoyalaExternalIE, ) +from .openload import OpenloadIE from .ora import OraTVIE from .orf import ( ORFTVthekIE, @@ -625,6 +638,7 @@ from .ruutu import RuutuIE from .sandia import SandiaIE from .safari import ( SafariIE, + SafariApiIE, SafariCourseIE, ) from .sapo import SapoIE @@ -727,7 +741,6 @@ from .telecinco import TelecincoIE from .telegraaf import TelegraafIE from .telemb import TeleMBIE from .teletask import TeleTaskIE -from .tenplay import TenPlayIE from .testurl import TestURLIE from .tf1 import TF1IE from .theintercept import TheInterceptIE @@ -736,7 +749,9 @@ from .theplatform import ( ThePlatformIE, ThePlatformFeedIE, ) +from .thescene import TheSceneIE from .thesixtyone import TheSixtyOneIE +from .thestar import TheStarIE from .thisamericanlife import ThisAmericanLifeIE from .thisav import ThisAVIE from .tinypic import TinyPicIE @@ -783,6 +798,7 @@ from .tv2 import ( TV2IE, TV2ArticleIE, ) +from .tv3 import TV3IE from .tv4 import TV4IE from .tvc import ( TVCIE, @@ -890,6 +906,7 @@ from .vk import ( from .vlive import VLiveIE from .vodlocker import VodlockerIE from .voicerepublic import VoiceRepublicIE +from .voxmedia import VoxMediaIE from .vporn import VpornIE from .vrt import VRTIE from .vube import VubeIE @@ -951,7 +968,9 @@ from .youtube import ( YoutubeChannelIE, YoutubeFavouritesIE, YoutubeHistoryIE, + YoutubeLiveIE, YoutubePlaylistIE, + YoutubePlaylistsIE, YoutubeRecommendedIE, YoutubeSearchDateIE, YoutubeSearchIE, @@ -961,7 +980,6 @@ from .youtube import ( YoutubeTruncatedIDIE, YoutubeTruncatedURLIE, YoutubeUserIE, - YoutubePlaylistsIE, YoutubeWatchLaterIE, ) from .zapiks import ZapiksIE diff --git a/youtube_dl/extractor/abc.py b/youtube_dl/extractor/abc.py index 6a29e587f..b584277be 100644 --- a/youtube_dl/extractor/abc.py +++ b/youtube_dl/extractor/abc.py @@ -12,7 +12,7 @@ from ..utils import ( class ABCIE(InfoExtractor): IE_NAME = 'abc.net.au' - _VALID_URL = r'http://www\.abc\.net\.au/news/(?:[^/]+/){1,2}(?P\d+)' + _VALID_URL = r'https?://www\.abc\.net\.au/news/(?:[^/]+/){1,2}(?P\d+)' _TESTS = [{ 'url': 'http://www.abc.net.au/news/2014-11-05/australia-to-staff-ebola-treatment-centre-in-sierra-leone/5868334', diff --git a/youtube_dl/extractor/abc7news.py b/youtube_dl/extractor/abc7news.py index 122dc9099..c04949c21 100644 --- a/youtube_dl/extractor/abc7news.py +++ b/youtube_dl/extractor/abc7news.py @@ -44,6 +44,7 @@ class Abc7NewsIE(InfoExtractor): 'contentURL', webpage, 'm3u8 url', fatal=True) formats = self._extract_m3u8_formats(m3u8, display_id, 'mp4') + self._sort_formats(formats) title = self._og_search_title(webpage).strip() description = self._og_search_description(webpage).strip() diff --git a/youtube_dl/extractor/addanime.py b/youtube_dl/extractor/addanime.py index e3e6d2113..55a9322a7 100644 --- a/youtube_dl/extractor/addanime.py +++ b/youtube_dl/extractor/addanime.py @@ -6,7 +6,7 @@ from .common import InfoExtractor from ..compat import ( compat_HTTPError, compat_str, - compat_urllib_parse, + compat_urllib_parse_urlencode, compat_urllib_parse_urlparse, ) from ..utils import ( @@ -16,7 +16,7 @@ from ..utils import ( class AddAnimeIE(InfoExtractor): - _VALID_URL = r'http://(?:\w+\.)?add-anime\.net/(?:watch_video\.php\?(?:.*?)v=|video/)(?P[\w_]+)' + _VALID_URL = r'https?://(?:\w+\.)?add-anime\.net/(?:watch_video\.php\?(?:.*?)v=|video/)(?P[\w_]+)' _TESTS = [{ 'url': 'http://www.add-anime.net/watch_video.php?v=24MR3YO5SAS9', 'md5': '72954ea10bc979ab5e2eb288b21425a0', @@ -60,7 +60,7 @@ class AddAnimeIE(InfoExtractor): confirm_url = ( parsed_url.scheme + '://' + parsed_url.netloc + action + '?' + - compat_urllib_parse.urlencode({ + compat_urllib_parse_urlencode({ 'jschl_vc': vc, 'jschl_answer': compat_str(av_val)})) self._download_webpage( confirm_url, video_id, diff --git a/youtube_dl/extractor/aenetworks.py b/youtube_dl/extractor/aenetworks.py index 6018ae79a..1bbfe2641 100644 --- a/youtube_dl/extractor/aenetworks.py +++ b/youtube_dl/extractor/aenetworks.py @@ -1,13 +1,19 @@ from __future__ import unicode_literals +import re + from .common import InfoExtractor -from ..utils import smuggle_url +from ..utils import ( + smuggle_url, + update_url_query, + unescapeHTML, +) class AENetworksIE(InfoExtractor): IE_NAME = 'aenetworks' IE_DESC = 'A+E Networks: A&E, Lifetime, History.com, FYI Network' - _VALID_URL = r'https?://(?:www\.)?(?:(?:history|aetv|mylifetime)\.com|fyi\.tv)/(?:[^/]+/)+(?P[^/]+?)(?:$|[?#])' + _VALID_URL = r'https?://(?:www\.)?(?:(?:history|aetv|mylifetime)\.com|fyi\.tv)/(?P[^/]+)/(?:[^/]+/)+(?P[^/]+?)(?:$|[?#])' _TESTS = [{ 'url': 'http://www.history.com/topics/valentines-day/history-of-valentines-day/videos/bet-you-didnt-know-valentines-day?m=528e394da93ae&s=undefined&f=1&free=false', @@ -16,6 +22,9 @@ class AENetworksIE(InfoExtractor): 'ext': 'mp4', 'title': "Bet You Didn't Know: Valentine's Day", 'description': 'md5:7b57ea4829b391995b405fa60bd7b5f7', + 'timestamp': 1375819729, + 'upload_date': '20130806', + 'uploader': 'AENE-NEW', }, 'params': { # m3u8 download @@ -25,15 +34,15 @@ class AENetworksIE(InfoExtractor): 'expected_warnings': ['JSON-LD'], }, { 'url': 'http://www.history.com/shows/mountain-men/season-1/episode-1', + 'md5': '8ff93eb073449f151d6b90c0ae1ef0c7', 'info_dict': { 'id': 'eg47EERs_JsZ', 'ext': 'mp4', 'title': 'Winter Is Coming', 'description': 'md5:641f424b7a19d8e24f26dea22cf59d74', - }, - 'params': { - # m3u8 download - 'skip_download': True, + 'timestamp': 1338306241, + 'upload_date': '20120529', + 'uploader': 'AENE-NEW', }, 'add_ie': ['ThePlatform'], }, { @@ -48,7 +57,7 @@ class AENetworksIE(InfoExtractor): }] def _real_extract(self, url): - video_id = self._match_id(url) + page_type, video_id = re.match(self._VALID_URL, url).groups() webpage = self._download_webpage(url, video_id) @@ -56,11 +65,23 @@ class AENetworksIE(InfoExtractor): r'data-href="[^"]*/%s"[^>]+data-release-url="([^"]+)"' % video_id, r"media_url\s*=\s*'([^']+)'" ] - video_url = self._search_regex(video_url_re, webpage, 'video url') + video_url = unescapeHTML(self._search_regex(video_url_re, webpage, 'video url')) + query = {'mbr': 'true'} + if page_type == 'shows': + query['assetTypes'] = 'medium_video_s3' + if 'switch=hds' in video_url: + query['switch'] = 'hls' info = self._search_json_ld(webpage, video_id, fatal=False) info.update({ '_type': 'url_transparent', - 'url': smuggle_url(video_url, {'sig': {'key': 'crazyjava', 'secret': 's3cr3t'}}), + 'url': smuggle_url( + update_url_query(video_url, query), + { + 'sig': { + 'key': 'crazyjava', + 'secret': 's3cr3t'}, + 'force_smil_url': True + }), }) return info diff --git a/youtube_dl/extractor/aftonbladet.py b/youtube_dl/extractor/aftonbladet.py index e0518cf26..d548592fe 100644 --- a/youtube_dl/extractor/aftonbladet.py +++ b/youtube_dl/extractor/aftonbladet.py @@ -6,7 +6,7 @@ from ..utils import int_or_none class AftonbladetIE(InfoExtractor): - _VALID_URL = r'http://tv\.aftonbladet\.se/abtv/articles/(?P[0-9]+)' + _VALID_URL = r'https?://tv\.aftonbladet\.se/abtv/articles/(?P[0-9]+)' _TEST = { 'url': 'http://tv.aftonbladet.se/abtv/articles/36015', 'info_dict': { diff --git a/youtube_dl/extractor/aljazeera.py b/youtube_dl/extractor/aljazeera.py index 5b2c0dc9a..b081695d8 100644 --- a/youtube_dl/extractor/aljazeera.py +++ b/youtube_dl/extractor/aljazeera.py @@ -4,7 +4,7 @@ from .common import InfoExtractor class AlJazeeraIE(InfoExtractor): - _VALID_URL = r'http://www\.aljazeera\.com/programmes/.*?/(?P[^/]+)\.html' + _VALID_URL = r'https?://www\.aljazeera\.com/programmes/.*?/(?P[^/]+)\.html' _TEST = { 'url': 'http://www.aljazeera.com/programmes/the-slum/2014/08/deliverance-201482883754237240.html', @@ -13,24 +13,18 @@ class AlJazeeraIE(InfoExtractor): 'ext': 'mp4', 'title': 'The Slum - Episode 1: Deliverance', 'description': 'As a birth attendant advocating for family planning, Remy is on the frontline of Tondo\'s battle with overcrowding.', - 'uploader': 'Al Jazeera English', + 'uploader_id': '665003303001', + 'timestamp': 1411116829, + 'upload_date': '20140919', }, - 'add_ie': ['BrightcoveLegacy'], + 'add_ie': ['BrightcoveNew'], 'skip': 'Not accessible from Travis CI server', } + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/665003303001/default_default/index.html?videoId=%s' def _real_extract(self, url): program_name = self._match_id(url) webpage = self._download_webpage(url, program_name) brightcove_id = self._search_regex( r'RenderPagesVideo\(\'(.+?)\'', webpage, 'brightcove id') - - return { - '_type': 'url', - 'url': ( - 'brightcove:' - 'playerKey=AQ~~%2CAAAAmtVJIFk~%2CTVGOQ5ZTwJbeMWnq5d_H4MOM57xfzApc' - '&%40videoPlayer={0}'.format(brightcove_id) - ), - 'ie_key': 'BrightcoveLegacy', - } + return self.url_result(self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, 'BrightcoveNew', brightcove_id) diff --git a/youtube_dl/extractor/amp.py b/youtube_dl/extractor/amp.py index 69e6baff7..138fa0808 100644 --- a/youtube_dl/extractor/amp.py +++ b/youtube_dl/extractor/amp.py @@ -69,12 +69,14 @@ class AMPIE(InfoExtractor): self._sort_formats(formats) + timestamp = parse_iso8601(item.get('pubDate'), ' ') or parse_iso8601(item.get('dc-date')) + return { 'id': video_id, 'title': get_media_node('title'), 'description': get_media_node('description'), 'thumbnails': thumbnails, - 'timestamp': parse_iso8601(item.get('pubDate'), ' '), + 'timestamp': timestamp, 'duration': int_or_none(media_content[0].get('@attributes', {}).get('duration')), 'subtitles': subtitles, 'formats': formats, diff --git a/youtube_dl/extractor/animeondemand.py b/youtube_dl/extractor/animeondemand.py index a7d8daf7b..9b01e38f5 100644 --- a/youtube_dl/extractor/animeondemand.py +++ b/youtube_dl/extractor/animeondemand.py @@ -3,10 +3,13 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_urlparse +from ..compat import ( + compat_urlparse, + compat_str, +) from ..utils import ( determine_ext, - encode_dict, + extract_attributes, ExtractorError, sanitized_Request, urlencode_postdata, @@ -18,7 +21,7 @@ class AnimeOnDemandIE(InfoExtractor): _LOGIN_URL = 'https://www.anime-on-demand.de/users/sign_in' _APPLY_HTML5_URL = 'https://www.anime-on-demand.de/html5apply' _NETRC_MACHINE = 'animeondemand' - _TEST = { + _TESTS = [{ 'url': 'https://www.anime-on-demand.de/anime/161', 'info_dict': { 'id': '161', @@ -26,7 +29,19 @@ class AnimeOnDemandIE(InfoExtractor): 'description': 'md5:6681ce3c07c7189d255ac6ab23812d31', }, 'playlist_mincount': 4, - } + }, { + # Film wording is used instead of Episode + 'url': 'https://www.anime-on-demand.de/anime/39', + 'only_matching': True, + }, { + # Episodes without titles + 'url': 'https://www.anime-on-demand.de/anime/162', + 'only_matching': True, + }, { + # ger/jap, Dub/OmU, account required + 'url': 'https://www.anime-on-demand.de/anime/169', + 'only_matching': True, + }] def _login(self): (username, password) = self._get_login_info() @@ -36,6 +51,10 @@ class AnimeOnDemandIE(InfoExtractor): login_page = self._download_webpage( self._LOGIN_URL, None, 'Downloading login page') + if '>Our licensing terms allow the distribution of animes only to German-speaking countries of Europe' in login_page: + self.raise_geo_restricted( + '%s is only available in German-speaking countries of Europe' % self.IE_NAME) + login_form = self._form_hidden_inputs('new_user', login_page) login_form.update({ @@ -51,7 +70,7 @@ class AnimeOnDemandIE(InfoExtractor): post_url = compat_urlparse.urljoin(self._LOGIN_URL, post_url) request = sanitized_Request( - post_url, urlencode_postdata(encode_dict(login_form))) + post_url, urlencode_postdata(login_form)) request.add_header('Referer', self._LOGIN_URL) response = self._download_webpage( @@ -91,14 +110,22 @@ class AnimeOnDemandIE(InfoExtractor): entries = [] - for episode_html in re.findall(r'(?s)]+class="episodebox-title".+?>Episodeninhalt<', webpage): - m = re.search( - r'class="episodebox-title"[^>]+title="Episode (?P\d+) - (?P.+?)"', episode_html) - if not m: + for num, episode_html in enumerate(re.findall( + r'(?s)<h3[^>]+class="episodebox-title".+?>Episodeninhalt<', webpage), 1): + episodebox_title = self._search_regex( + (r'class="episodebox-title"[^>]+title=(["\'])(?P<title>.+?)\1', + r'class="episodebox-title"[^>]+>(?P<title>.+?)<'), + episode_html, 'episodebox title', default=None, group='title') + if not episodebox_title: continue - episode_number = int(m.group('number')) - episode_title = m.group('title') + episode_number = int(self._search_regex( + r'(?:Episode|Film)\s*(\d+)', + episodebox_title, 'episode number', default=num)) + episode_title = self._search_regex( + r'(?:Episode|Film)\s*\d+\s*-\s*(.+)', + episodebox_title, 'episode title', default=None) + video_id = 'episode-%d' % episode_number common_info = { @@ -110,33 +137,86 @@ class AnimeOnDemandIE(InfoExtractor): formats = [] - playlist_url = self._search_regex( - r'data-playlist=(["\'])(?P<url>.+?)\1', - episode_html, 'data playlist', default=None, group='url') - if playlist_url: - request = sanitized_Request( - compat_urlparse.urljoin(url, playlist_url), - headers={ - 'X-Requested-With': 'XMLHttpRequest', - 'X-CSRF-Token': csrf_token, - 'Referer': url, - 'Accept': 'application/json, text/javascript, */*; q=0.01', - }) + for input_ in re.findall( + r'<input[^>]+class=["\'].*?streamstarter_html5[^>]+>', episode_html): + attributes = extract_attributes(input_) + playlist_urls = [] + for playlist_key in ('data-playlist', 'data-otherplaylist'): + playlist_url = attributes.get(playlist_key) + if isinstance(playlist_url, compat_str) and re.match( + r'/?[\da-zA-Z]+', playlist_url): + playlist_urls.append(attributes[playlist_key]) + if not playlist_urls: + continue - playlist = self._download_json( - request, video_id, 'Downloading playlist JSON', fatal=False) - if playlist: - playlist = playlist['playlist'][0] - title = playlist['title'] + lang = attributes.get('data-lang') + lang_note = attributes.get('value') + + for playlist_url in playlist_urls: + kind = self._search_regex( + r'videomaterialurl/\d+/([^/]+)/', + playlist_url, 'media kind', default=None) + format_id_list = [] + if lang: + format_id_list.append(lang) + if kind: + format_id_list.append(kind) + if not format_id_list: + format_id_list.append(compat_str(num)) + format_id = '-'.join(format_id_list) + format_note = ', '.join(filter(None, (kind, lang_note))) + request = sanitized_Request( + compat_urlparse.urljoin(url, playlist_url), + headers={ + 'X-Requested-With': 'XMLHttpRequest', + 'X-CSRF-Token': csrf_token, + 'Referer': url, + 'Accept': 'application/json, text/javascript, */*; q=0.01', + }) + playlist = self._download_json( + request, video_id, 'Downloading %s playlist JSON' % format_id, + fatal=False) + if not playlist: + continue + start_video = playlist.get('startvideo', 0) + playlist = playlist.get('playlist') + if not playlist or not isinstance(playlist, list): + continue + playlist = playlist[start_video] + title = playlist.get('title') + if not title: + continue description = playlist.get('description') for source in playlist.get('sources', []): file_ = source.get('file') - if file_ and determine_ext(file_) == 'm3u8': - formats = self._extract_m3u8_formats( + if not file_: + continue + ext = determine_ext(file_) + format_id_list = [lang, kind] + if ext == 'm3u8': + format_id_list.append('hls') + elif source.get('type') == 'video/dash' or ext == 'mpd': + format_id_list.append('dash') + format_id = '-'.join(filter(None, format_id_list)) + if ext == 'm3u8': + file_formats = self._extract_m3u8_formats( file_, video_id, 'mp4', - entry_protocol='m3u8_native', m3u8_id='hls') + entry_protocol='m3u8_native', m3u8_id=format_id, fatal=False) + elif source.get('type') == 'video/dash' or ext == 'mpd': + continue + file_formats = self._extract_mpd_formats( + file_, video_id, mpd_id=format_id, fatal=False) + else: + continue + for f in file_formats: + f.update({ + 'language': lang, + 'format_note': format_note, + }) + formats.extend(file_formats) if formats: + self._sort_formats(formats) f = common_info.copy() f.update({ 'title': title, @@ -145,16 +225,18 @@ class AnimeOnDemandIE(InfoExtractor): }) entries.append(f) - m = re.search( - r'data-dialog-header=(["\'])(?P<title>.+?)\1[^>]+href=(["\'])(?P<href>.+?)\3[^>]*>Teaser<', - episode_html) - if m: - f = common_info.copy() - f.update({ - 'id': '%s-teaser' % f['id'], - 'title': m.group('title'), - 'url': compat_urlparse.urljoin(url, m.group('href')), - }) - entries.append(f) + # Extract teaser only when full episode is not available + if not formats: + m = re.search( + r'data-dialog-header=(["\'])(?P<title>.+?)\1[^>]+href=(["\'])(?P<href>.+?)\3[^>]*>Teaser<', + episode_html) + if m: + f = common_info.copy() + f.update({ + 'id': '%s-teaser' % f['id'], + 'title': m.group('title'), + 'url': compat_urlparse.urljoin(url, m.group('href')), + }) + entries.append(f) return self.playlist_result(entries, anime_id, anime_title, anime_description) diff --git a/youtube_dl/extractor/aol.py b/youtube_dl/extractor/aol.py index b761b2cc4..95a99c6b0 100644 --- a/youtube_dl/extractor/aol.py +++ b/youtube_dl/extractor/aol.py @@ -5,7 +5,7 @@ from .common import InfoExtractor class AolIE(InfoExtractor): IE_NAME = 'on.aol.com' - _VALID_URL = r'(?:aol-video:|http://on\.aol\.com/video/.*-)(?P<id>[0-9]+)(?:$|\?)' + _VALID_URL = r'(?:aol-video:|https?://on\.aol\.com/video/.*-)(?P<id>[0-9]+)(?:$|\?)' _TESTS = [{ 'url': 'http://on.aol.com/video/u-s--official-warns-of-largest-ever-irs-phone-scam-518167793?icid=OnHomepageC2Wide_MustSee_Img', @@ -25,7 +25,7 @@ class AolIE(InfoExtractor): class AolFeaturesIE(InfoExtractor): IE_NAME = 'features.aol.com' - _VALID_URL = r'http://features\.aol\.com/video/(?P<id>[^/?#]+)' + _VALID_URL = r'https?://features\.aol\.com/video/(?P<id>[^/?#]+)' _TESTS = [{ 'url': 'http://features.aol.com/video/behind-secret-second-careers-late-night-talk-show-hosts', diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index 3e119e21b..ae0f27dcb 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -23,7 +23,7 @@ from ..utils import ( class ArteTvIE(InfoExtractor): - _VALID_URL = r'http://videos\.arte\.tv/(?P<lang>fr|de|en|es)/.*-(?P<id>.*?)\.html' + _VALID_URL = r'https?://videos\.arte\.tv/(?P<lang>fr|de|en|es)/.*-(?P<id>.*?)\.html' IE_NAME = 'arte.tv' def _real_extract(self, url): diff --git a/youtube_dl/extractor/atresplayer.py b/youtube_dl/extractor/atresplayer.py index b8f9ae005..d2f388964 100644 --- a/youtube_dl/extractor/atresplayer.py +++ b/youtube_dl/extractor/atresplayer.py @@ -6,16 +6,14 @@ import hashlib import re from .common import InfoExtractor -from ..compat import ( - compat_str, - compat_urllib_parse, -) +from ..compat import compat_str from ..utils import ( - int_or_none, - float_or_none, - sanitized_Request, - xpath_text, ExtractorError, + float_or_none, + int_or_none, + sanitized_Request, + urlencode_postdata, + xpath_text, ) @@ -86,7 +84,7 @@ class AtresPlayerIE(InfoExtractor): } request = sanitized_Request( - self._LOGIN_URL, compat_urllib_parse.urlencode(login_form).encode('utf-8')) + self._LOGIN_URL, urlencode_postdata(login_form)) request.add_header('Content-Type', 'application/x-www-form-urlencoded') response = self._download_webpage( request, None, 'Logging in as %s' % username) diff --git a/youtube_dl/extractor/azubu.py b/youtube_dl/extractor/azubu.py index 011edf128..efa624de1 100644 --- a/youtube_dl/extractor/azubu.py +++ b/youtube_dl/extractor/azubu.py @@ -98,7 +98,7 @@ class AzubuIE(InfoExtractor): class AzubuLiveIE(InfoExtractor): - _VALID_URL = r'http://www.azubu.tv/(?P<id>[^/]+)$' + _VALID_URL = r'https?://www.azubu.tv/(?P<id>[^/]+)$' _TEST = { 'url': 'http://www.azubu.tv/MarsTVMDLen', @@ -120,6 +120,7 @@ class AzubuLiveIE(InfoExtractor): bc_info = self._download_json(req, user) m3u8_url = next(source['src'] for source in bc_info['sources'] if source['container'] == 'M2TS') formats = self._extract_m3u8_formats(m3u8_url, user, ext='mp4') + self._sort_formats(formats) return { 'id': info['id'], diff --git a/youtube_dl/extractor/baidu.py b/youtube_dl/extractor/baidu.py index 76b21e596..234a661d3 100644 --- a/youtube_dl/extractor/baidu.py +++ b/youtube_dl/extractor/baidu.py @@ -9,7 +9,7 @@ from ..utils import unescapeHTML class BaiduVideoIE(InfoExtractor): IE_DESC = '百度视频' - _VALID_URL = r'http://v\.baidu\.com/(?P<type>[a-z]+)/(?P<id>\d+)\.htm' + _VALID_URL = r'https?://v\.baidu\.com/(?P<type>[a-z]+)/(?P<id>\d+)\.htm' _TESTS = [{ 'url': 'http://v.baidu.com/comic/1069.htm?frp=bdbrand&q=%E4%B8%AD%E5%8D%8E%E5%B0%8F%E5%BD%93%E5%AE%B6', 'info_dict': { diff --git a/youtube_dl/extractor/bambuser.py b/youtube_dl/extractor/bambuser.py index da986e063..0eb1930c2 100644 --- a/youtube_dl/extractor/bambuser.py +++ b/youtube_dl/extractor/bambuser.py @@ -4,15 +4,13 @@ import re import itertools from .common import InfoExtractor -from ..compat import ( - compat_urllib_parse, - compat_str, -) +from ..compat import compat_str from ..utils import ( ExtractorError, - int_or_none, float_or_none, + int_or_none, sanitized_Request, + urlencode_postdata, ) @@ -58,7 +56,7 @@ class BambuserIE(InfoExtractor): } request = sanitized_Request( - self._LOGIN_URL, compat_urllib_parse.urlencode(login_form).encode('utf-8')) + self._LOGIN_URL, urlencode_postdata(login_form)) request.add_header('Referer', self._LOGIN_URL) response = self._download_webpage( request, None, 'Logging in as %s' % username) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index e62b3860e..425f08f2b 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -328,6 +328,7 @@ class BBCCoUkIE(InfoExtractor): 'format_id': '%s_%s' % (service, format['format_id']), 'abr': abr, 'acodec': acodec, + 'vcodec': 'none', }) formats.extend(conn_formats) return formats @@ -688,6 +689,10 @@ class BBCIE(BBCCoUkIE): # custom redirection to www.bbc.com 'url': 'http://www.bbc.co.uk/news/science-environment-33661876', 'only_matching': True, + }, { + # single video article embedded with data-media-vpid + 'url': 'http://www.bbc.co.uk/sport/rowing/35908187', + 'only_matching': True, }] @classmethod @@ -817,7 +822,7 @@ class BBCIE(BBCCoUkIE): # single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret) programme_id = self._search_regex( - [r'data-video-player-vpid="(%s)"' % self._ID_REGEX, + [r'data-(?:video-player|media)-vpid="(%s)"' % self._ID_REGEX, r'<param[^>]+name="externalIdentifier"[^>]+value="(%s)"' % self._ID_REGEX, r'videoId\s*:\s*["\'](%s)["\']' % self._ID_REGEX], webpage, 'vpid', default=None) @@ -942,7 +947,7 @@ class BBCIE(BBCCoUkIE): class BBCCoUkArticleIE(InfoExtractor): - _VALID_URL = 'http://www.bbc.co.uk/programmes/articles/(?P<id>[a-zA-Z0-9]+)' + _VALID_URL = r'https?://www.bbc.co.uk/programmes/articles/(?P<id>[a-zA-Z0-9]+)' IE_NAME = 'bbc.co.uk:article' IE_DESC = 'BBC articles' diff --git a/youtube_dl/extractor/beeg.py b/youtube_dl/extractor/beeg.py index 34c2a756f..9072949dd 100644 --- a/youtube_dl/extractor/beeg.py +++ b/youtube_dl/extractor/beeg.py @@ -34,7 +34,7 @@ class BeegIE(InfoExtractor): video_id = self._match_id(url) video = self._download_json( - 'https://api.beeg.com/api/v5/video/%s' % video_id, video_id) + 'https://api.beeg.com/api/v6/1738/video/%s' % video_id, video_id) def split(o, e): def cut(s, x): @@ -50,8 +50,8 @@ class BeegIE(InfoExtractor): return n def decrypt_key(key): - # Reverse engineered from http://static.beeg.com/cpl/1105.js - a = '5ShMcIQlssOd7zChAIOlmeTZDaUxULbJRnywYaiB' + # Reverse engineered from http://static.beeg.com/cpl/1738.js + a = 'GUuyodcfS8FW8gQp4OKLMsZBcX0T7B' e = compat_urllib_parse_unquote(key) o = ''.join([ compat_chr(compat_ord(e[n]) - compat_ord(a[n % len(a)]) % 21) diff --git a/youtube_dl/extractor/behindkink.py b/youtube_dl/extractor/behindkink.py index 1bdc25812..9bca853b3 100644 --- a/youtube_dl/extractor/behindkink.py +++ b/youtube_dl/extractor/behindkink.py @@ -8,7 +8,7 @@ from ..utils import url_basename class BehindKinkIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?behindkink\.com/(?P<year>[0-9]{4})/(?P<month>[0-9]{2})/(?P<day>[0-9]{2})/(?P<id>[^/#?_]+)' + _VALID_URL = r'https?://(?:www\.)?behindkink\.com/(?P<year>[0-9]{4})/(?P<month>[0-9]{2})/(?P<day>[0-9]{2})/(?P<id>[^/#?_]+)' _TEST = { 'url': 'http://www.behindkink.com/2014/12/05/what-are-you-passionate-about-marley-blaze/', 'md5': '507b57d8fdcd75a41a9a7bdb7989c762', diff --git a/youtube_dl/extractor/bet.py b/youtube_dl/extractor/bet.py index 03dad4636..986245bf0 100644 --- a/youtube_dl/extractor/bet.py +++ b/youtube_dl/extractor/bet.py @@ -94,6 +94,7 @@ class BetIE(InfoExtractor): xpath_with_ns('./media:thumbnail', NS_MAP)).get('url') formats = self._extract_smil_formats(smil_url, display_id) + self._sort_formats(formats) return { 'id': video_id, diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py index 59beb11bc..8baff2041 100644 --- a/youtube_dl/extractor/bilibili.py +++ b/youtube_dl/extractor/bilibili.py @@ -14,7 +14,7 @@ from ..utils import ( class BiliBiliIE(InfoExtractor): - _VALID_URL = r'http://www\.bilibili\.(?:tv|com)/video/av(?P<id>\d+)(?:/index_(?P<page_num>\d+).html)?' + _VALID_URL = r'https?://www\.bilibili\.(?:tv|com)/video/av(?P<id>\d+)(?:/index_(?P<page_num>\d+).html)?' _TESTS = [{ 'url': 'http://www.bilibili.tv/video/av1074402/', diff --git a/youtube_dl/extractor/biobiochiletv.py b/youtube_dl/extractor/biobiochiletv.py new file mode 100644 index 000000000..133228133 --- /dev/null +++ b/youtube_dl/extractor/biobiochiletv.py @@ -0,0 +1,86 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import remove_end + + +class BioBioChileTVIE(InfoExtractor): + _VALID_URL = r'https?://tv\.biobiochile\.cl/notas/(?:[^/]+/)+(?P<id>[^/]+)\.shtml' + + _TESTS = [{ + 'url': 'http://tv.biobiochile.cl/notas/2015/10/21/sobre-camaras-y-camarillas-parlamentarias.shtml', + 'md5': '26f51f03cf580265defefb4518faec09', + 'info_dict': { + 'id': 'sobre-camaras-y-camarillas-parlamentarias', + 'ext': 'mp4', + 'title': 'Sobre Cámaras y camarillas parlamentarias', + 'thumbnail': 're:^https?://.*\.jpg$', + 'uploader': 'Fernando Atria', + }, + }, { + # different uploader layout + 'url': 'http://tv.biobiochile.cl/notas/2016/03/18/natalia-valdebenito-repasa-a-diputado-hasbun-paso-a-la-categoria-de-hablar-brutalidades.shtml', + 'md5': 'edc2e6b58974c46d5b047dea3c539ff3', + 'info_dict': { + 'id': 'natalia-valdebenito-repasa-a-diputado-hasbun-paso-a-la-categoria-de-hablar-brutalidades', + 'ext': 'mp4', + 'title': 'Natalia Valdebenito repasa a diputado Hasbún: Pasó a la categoría de hablar brutalidades', + 'thumbnail': 're:^https?://.*\.jpg$', + 'uploader': 'Piangella Obrador', + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://tv.biobiochile.cl/notas/2015/10/22/ninos-transexuales-de-quien-es-la-decision.shtml', + 'only_matching': True, + }, { + 'url': 'http://tv.biobiochile.cl/notas/2015/10/21/exclusivo-hector-pinto-formador-de-chupete-revela-version-del-ex-delantero-albo.shtml', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + title = remove_end(self._og_search_title(webpage), ' - BioBioChile TV') + + file_url = self._search_regex( + r'loadFWPlayerVideo\([^,]+,\s*(["\'])(?P<url>.+?)\1', + webpage, 'file url', group='url') + + base_url = self._search_regex( + r'file\s*:\s*(["\'])(?P<url>.+?)\1\s*\+\s*fileURL', webpage, + 'base url', default='http://unlimited2-cl.digitalproserver.com/bbtv/', + group='url') + + formats = self._extract_m3u8_formats( + '%s%s/playlist.m3u8' % (base_url, file_url), video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls', fatal=False) + f = { + 'url': '%s%s' % (base_url, file_url), + 'format_id': 'http', + 'protocol': 'http', + 'preference': 1, + } + if formats: + f_copy = formats[-1].copy() + f_copy.update(f) + f = f_copy + formats.append(f) + self._sort_formats(formats) + + thumbnail = self._og_search_thumbnail(webpage) + uploader = self._html_search_regex( + r'<a[^>]+href=["\']https?://busca\.biobiochile\.cl/author[^>]+>(.+?)</a>', + webpage, 'uploader', fatal=False) + + return { + 'id': video_id, + 'title': title, + 'thumbnail': thumbnail, + 'uploader': uploader, + 'formats': formats, + } diff --git a/youtube_dl/extractor/bokecc.py b/youtube_dl/extractor/bokecc.py index 122a1cbb6..86a7f4d7d 100644 --- a/youtube_dl/extractor/bokecc.py +++ b/youtube_dl/extractor/bokecc.py @@ -33,7 +33,7 @@ class BokeCCBaseIE(InfoExtractor): class BokeCCIE(BokeCCBaseIE): _IE_DESC = 'CC视频' - _VALID_URL = r'http://union\.bokecc\.com/playvideo\.bo\?(?P<query>.*)' + _VALID_URL = r'https?://union\.bokecc\.com/playvideo\.bo\?(?P<query>.*)' _TESTS = [{ 'url': 'http://union.bokecc.com/playvideo.bo?vid=E44D40C15E65EA30&uid=CD0C5D3C8614B28B', diff --git a/youtube_dl/extractor/bpb.py b/youtube_dl/extractor/bpb.py index c28e72927..6ad45a1e6 100644 --- a/youtube_dl/extractor/bpb.py +++ b/youtube_dl/extractor/bpb.py @@ -12,7 +12,7 @@ from ..utils import ( class BpbIE(InfoExtractor): IE_DESC = 'Bundeszentrale für politische Bildung' - _VALID_URL = r'http://www\.bpb\.de/mediathek/(?P<id>[0-9]+)/' + _VALID_URL = r'https?://www\.bpb\.de/mediathek/(?P<id>[0-9]+)/' _TEST = { 'url': 'http://www.bpb.de/mediathek/297/joachim-gauck-zu-1989-und-die-erinnerung-an-die-ddr', diff --git a/youtube_dl/extractor/bravotv.py b/youtube_dl/extractor/bravotv.py new file mode 100644 index 000000000..541c76944 --- /dev/null +++ b/youtube_dl/extractor/bravotv.py @@ -0,0 +1,31 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import smuggle_url + + +class BravoTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?bravotv\.com/(?:[^/]+/)+videos/(?P<id>[^/?]+)' + _TEST = { + 'url': 'http://www.bravotv.com/last-chance-kitchen/season-5/videos/lck-ep-12-fishy-finale', + 'md5': 'd60cdf68904e854fac669bd26cccf801', + 'info_dict': { + 'id': 'LitrBdX64qLn', + 'ext': 'mp4', + 'title': 'Last Chance Kitchen Returns', + 'description': 'S13: Last Chance Kitchen Returns for Top Chef Season 13', + 'timestamp': 1448926740, + 'upload_date': '20151130', + 'uploader': 'NBCU-BRAV', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + account_pid = self._search_regex(r'"account_pid"\s*:\s*"([^"]+)"', webpage, 'account pid') + release_pid = self._search_regex(r'"release_pid"\s*:\s*"([^"]+)"', webpage, 'release pid') + return self.url_result(smuggle_url( + 'http://link.theplatform.com/s/%s/%s?mbr=true&switch=progressive' % (account_pid, release_pid), + {'force_smil_url': True}), 'ThePlatform', release_pid) diff --git a/youtube_dl/extractor/breakcom.py b/youtube_dl/extractor/breakcom.py index aa08051b1..725859b4d 100644 --- a/youtube_dl/extractor/breakcom.py +++ b/youtube_dl/extractor/breakcom.py @@ -11,7 +11,7 @@ from ..utils import ( class BreakIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?break\.com/video/(?:[^/]+/)*.+-(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?break\.com/video/(?:[^/]+/)*.+-(?P<id>\d+)' _TESTS = [{ 'url': 'http://www.break.com/video/when-girls-act-like-guys-2468056', 'info_dict': { diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index f8413d5f2..6128b6762 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -9,7 +9,6 @@ from ..compat import ( compat_etree_fromstring, compat_parse_qs, compat_str, - compat_urllib_parse, compat_urllib_parse_urlparse, compat_urlparse, compat_xml_parse_error, @@ -24,16 +23,16 @@ from ..utils import ( js_to_json, int_or_none, parse_iso8601, - sanitized_Request, unescapeHTML, unsmuggle_url, + update_url_query, ) class BrightcoveLegacyIE(InfoExtractor): IE_NAME = 'brightcove:legacy' _VALID_URL = r'(?:https?://.*brightcove\.com/(services|viewer).*?\?|brightcove:)(?P<query>.*)' - _FEDERATED_URL_TEMPLATE = 'http://c.brightcove.com/services/viewer/htmlFederated?%s' + _FEDERATED_URL = 'http://c.brightcove.com/services/viewer/htmlFederated' _TESTS = [ { @@ -47,6 +46,9 @@ class BrightcoveLegacyIE(InfoExtractor): 'title': 'Xavier Sala i Martín: “Un banc que no presta és un banc zombi que no serveix per a res”', 'uploader': '8TV', 'description': 'md5:a950cc4285c43e44d763d036710cd9cd', + 'timestamp': 1368213670, + 'upload_date': '20130510', + 'uploader_id': 1589608506001, } }, { @@ -58,6 +60,9 @@ class BrightcoveLegacyIE(InfoExtractor): 'title': 'JVMLS 2012: Arrays 2.0 - Opportunities and Challenges', 'description': 'John Rose speaks at the JVM Language Summit, August 1, 2012.', 'uploader': 'Oracle', + 'timestamp': 1344975024, + 'upload_date': '20120814', + 'uploader_id': 1460825906, }, }, { @@ -69,6 +74,9 @@ class BrightcoveLegacyIE(InfoExtractor): 'title': 'This Bracelet Acts as a Personal Thermostat', 'description': 'md5:547b78c64f4112766ccf4e151c20b6a0', 'uploader': 'Mashable', + 'timestamp': 1382041798, + 'upload_date': '20131017', + 'uploader_id': 1130468786001, }, }, { @@ -86,14 +94,17 @@ class BrightcoveLegacyIE(InfoExtractor): { # test flv videos served by akamaihd.net # From http://www.redbull.com/en/bike/stories/1331655643987/replay-uci-dh-world-cup-2014-from-fort-william - 'url': 'http://c.brightcove.com/services/viewer/htmlFederated?%40videoPlayer=ref%3ABC2996102916001&linkBaseURL=http%3A%2F%2Fwww.redbull.com%2Fen%2Fbike%2Fvideos%2F1331655630249%2Freplay-uci-fort-william-2014-dh&playerKey=AQ%7E%7E%2CAAAApYJ7UqE%7E%2Cxqr_zXk0I-zzNndy8NlHogrCb5QdyZRf&playerID=1398061561001#__youtubedl_smuggle=%7B%22Referer%22%3A+%22http%3A%2F%2Fwww.redbull.com%2Fen%2Fbike%2Fstories%2F1331655643987%2Freplay-uci-dh-world-cup-2014-from-fort-william%22%7D', + 'url': 'http://c.brightcove.com/services/viewer/htmlFederated?%40videoPlayer=ref%3Aevent-stream-356&linkBaseURL=http%3A%2F%2Fwww.redbull.com%2Fen%2Fbike%2Fvideos%2F1331655630249%2Freplay-uci-fort-william-2014-dh&playerKey=AQ%7E%7E%2CAAAApYJ7UqE%7E%2Cxqr_zXk0I-zzNndy8NlHogrCb5QdyZRf&playerID=1398061561001#__youtubedl_smuggle=%7B%22Referer%22%3A+%22http%3A%2F%2Fwww.redbull.com%2Fen%2Fbike%2Fstories%2F1331655643987%2Freplay-uci-dh-world-cup-2014-from-fort-william%22%7D', # The md5 checksum changes on each download 'info_dict': { - 'id': '2996102916001', + 'id': '3750436379001', 'ext': 'flv', 'title': 'UCI MTB World Cup 2014: Fort William, UK - Downhill Finals', - 'uploader': 'Red Bull TV', + 'uploader': 'RBTV Old (do not use)', 'description': 'UCI MTB World Cup 2014: Fort William, UK - Downhill Finals', + 'timestamp': 1409122195, + 'upload_date': '20140827', + 'uploader_id': 710858724001, }, }, { @@ -107,6 +118,12 @@ class BrightcoveLegacyIE(InfoExtractor): 'playlist_mincount': 7, }, ] + FLV_VCODECS = { + 1: 'SORENSON', + 2: 'ON2', + 3: 'H264', + 4: 'VP8', + } @classmethod def _build_brighcove_url(cls, object_str): @@ -137,13 +154,16 @@ class BrightcoveLegacyIE(InfoExtractor): else: flashvars = {} + data_url = object_doc.attrib.get('data', '') + data_url_params = compat_parse_qs(compat_urllib_parse_urlparse(data_url).query) + def find_param(name): if name in flashvars: return flashvars[name] node = find_xpath_attr(object_doc, './param', 'name', name) if node is not None: return node.attrib['value'] - return None + return data_url_params.get(name) params = {} @@ -156,8 +176,8 @@ class BrightcoveLegacyIE(InfoExtractor): # Not all pages define this value if playerKey is not None: params['playerKey'] = playerKey - # The three fields hold the id of the video - videoPlayer = find_param('@videoPlayer') or find_param('videoId') or find_param('videoID') + # These fields hold the id of the video + videoPlayer = find_param('@videoPlayer') or find_param('videoId') or find_param('videoID') or find_param('@videoList') if videoPlayer is not None: params['@videoPlayer'] = videoPlayer linkBase = find_param('linkBaseURL') @@ -185,8 +205,7 @@ class BrightcoveLegacyIE(InfoExtractor): @classmethod def _make_brightcove_url(cls, params): - data = compat_urllib_parse.urlencode(params) - return cls._FEDERATED_URL_TEMPLATE % data + return update_url_query(cls._FEDERATED_URL, params) @classmethod def _extract_brightcove_url(cls, webpage): @@ -240,7 +259,7 @@ class BrightcoveLegacyIE(InfoExtractor): # We set the original url as the default 'Referer' header referer = smuggled_data.get('Referer', url) return self._get_video_info( - videoPlayer[0], query_str, query, referer=referer) + videoPlayer[0], query, referer=referer) elif 'playerKey' in query: player_key = query['playerKey'] return self._get_playlist_info(player_key[0]) @@ -249,15 +268,14 @@ class BrightcoveLegacyIE(InfoExtractor): 'Cannot find playerKey= variable. Did you forget quotes in a shell invocation?', expected=True) - def _get_video_info(self, video_id, query_str, query, referer=None): - request_url = self._FEDERATED_URL_TEMPLATE % query_str - req = sanitized_Request(request_url) + def _get_video_info(self, video_id, query, referer=None): + headers = {} linkBase = query.get('linkBaseURL') if linkBase is not None: referer = linkBase[0] if referer is not None: - req.add_header('Referer', referer) - webpage = self._download_webpage(req, video_id) + headers['Referer'] = referer + webpage = self._download_webpage(self._FEDERATED_URL, video_id, headers=headers, query=query) error_msg = self._html_search_regex( r"<h1>We're sorry.</h1>([\s\n]*<p>.*?</p>)+", webpage, @@ -295,9 +313,12 @@ class BrightcoveLegacyIE(InfoExtractor): 'description': video_info.get('shortDescription'), 'thumbnail': video_info.get('videoStillURL') or video_info.get('thumbnailURL'), 'uploader': video_info.get('publisherName'), + 'uploader_id': video_info.get('publisherId'), + 'duration': float_or_none(video_info.get('length'), 1000), + 'timestamp': int_or_none(video_info.get('creationDate'), 1000), } - renditions = video_info.get('renditions') + renditions = video_info.get('renditions', []) + video_info.get('IOSRenditions', []) if renditions: formats = [] for rend in renditions: @@ -318,19 +339,42 @@ class BrightcoveLegacyIE(InfoExtractor): ext = 'flv' if ext is None: ext = determine_ext(url) - size = rend.get('size') - formats.append({ + tbr = int_or_none(rend.get('encodingRate'), 1000), + a_format = { + 'format_id': 'http%s' % ('-%s' % tbr if tbr else ''), 'url': url, 'ext': ext, - 'height': rend.get('frameHeight'), - 'width': rend.get('frameWidth'), - 'filesize': size if size != 0 else None, - }) + 'filesize': int_or_none(rend.get('size')) or None, + 'tbr': tbr, + } + if rend.get('audioOnly'): + a_format.update({ + 'vcodec': 'none', + }) + else: + a_format.update({ + 'height': int_or_none(rend.get('frameHeight')), + 'width': int_or_none(rend.get('frameWidth')), + 'vcodec': rend.get('videoCodec'), + }) + + # m3u8 manifests with remote == false are media playlists + # Not calling _extract_m3u8_formats here to save network traffic + if ext == 'm3u8': + a_format.update({ + 'format_id': 'hls%s' % ('-%s' % tbr if tbr else ''), + 'ext': 'mp4', + 'protocol': 'm3u8', + }) + + formats.append(a_format) self._sort_formats(formats) info['formats'] = formats elif video_info.get('FLVFullLengthURL') is not None: info.update({ 'url': video_info['FLVFullLengthURL'], + 'vcodec': self.FLV_VCODECS.get(video_info.get('FLVFullCodec')), + 'filesize': int_or_none(video_info.get('FLVFullSize')), }) if self._downloader.params.get('include_ads', False): @@ -386,6 +430,7 @@ class BrightcoveNewIE(InfoExtractor): 'formats': 'mincount:41', }, 'params': { + # m3u8 download 'skip_download': True, } }, { @@ -415,8 +460,8 @@ class BrightcoveNewIE(InfoExtractor): # Look for iframe embeds [1] for _, url in re.findall( - r'<iframe[^>]+src=(["\'])((?:https?:)//players\.brightcove\.net/\d+/[^/]+/index\.html.+?)\1', webpage): - entries.append(url) + r'<iframe[^>]+src=(["\'])((?:https?:)?//players\.brightcove\.net/\d+/[^/]+/index\.html.+?)\1', webpage): + entries.append(url if url.startswith('http') else 'http:' + url) # Look for embed_in_page embeds [2] for video_id, account_id, player_id, embed in re.findall( @@ -425,11 +470,11 @@ class BrightcoveNewIE(InfoExtractor): # According to [4] data-video-id may be prefixed with ref: r'''(?sx) <video[^>]+ - data-video-id=["\']((?:ref:)?\d+)["\'][^>]*>.*? + data-video-id=["\'](\d+|ref:[^"\']+)["\'][^>]*>.*? </video>.*? <script[^>]+ src=["\'](?:https?:)?//players\.brightcove\.net/ - (\d+)/([\da-f-]+)_([^/]+)/index(?:\.min)?\.js + (\d+)/([^/]+)_([^/]+)/index(?:\.min)?\.js ''', webpage): entries.append( 'http://players.brightcove.net/%s/%s_%s/index.html?videoId=%s' @@ -459,19 +504,18 @@ class BrightcoveNewIE(InfoExtractor): r'policyKey\s*:\s*(["\'])(?P<pk>.+?)\1', webpage, 'policy key', group='pk') - req = sanitized_Request( - 'https://edge.api.brightcove.com/playback/v1/accounts/%s/videos/%s' - % (account_id, video_id), - headers={'Accept': 'application/json;pk=%s' % policy_key}) + api_url = 'https://edge.api.brightcove.com/playback/v1/accounts/%s/videos/%s' % (account_id, video_id) try: - json_data = self._download_json(req, video_id) + json_data = self._download_json(api_url, video_id, headers={ + 'Accept': 'application/json;pk=%s' % policy_key + }) except ExtractorError as e: if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: json_data = self._parse_json(e.cause.read().decode(), video_id) raise ExtractorError(json_data[0]['message'], expected=True) raise - title = json_data['name'] + title = json_data['name'].strip() formats = [] for source in json_data.get('sources', []): @@ -482,8 +526,11 @@ class BrightcoveNewIE(InfoExtractor): if not src: continue formats.extend(self._extract_m3u8_formats( - src, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False)) + src, video_id, 'mp4', m3u8_id='hls', fatal=False)) + elif source_type == 'application/dash+xml': + if not src: + continue + formats.extend(self._extract_mpd_formats(src, video_id, 'dash', fatal=False)) else: streaming_src = source.get('streaming_src') stream_name, app_name = source.get('stream_name'), source.get('app_name') @@ -491,15 +538,23 @@ class BrightcoveNewIE(InfoExtractor): continue tbr = float_or_none(source.get('avg_bitrate'), 1000) height = int_or_none(source.get('height')) + width = int_or_none(source.get('width')) f = { 'tbr': tbr, - 'width': int_or_none(source.get('width')), - 'height': height, 'filesize': int_or_none(source.get('size')), 'container': container, - 'vcodec': source.get('codec'), - 'ext': source.get('container').lower(), + 'ext': container.lower(), } + if width == 0 and height == 0: + f.update({ + 'vcodec': 'none', + }) + else: + f.update({ + 'width': width, + 'height': height, + 'vcodec': source.get('codec'), + }) def build_format_id(kind): format_id = kind @@ -513,7 +568,7 @@ class BrightcoveNewIE(InfoExtractor): f.update({ 'url': src or streaming_src, 'format_id': build_format_id('http' if src else 'http-streaming'), - 'preference': 2 if src else 1, + 'source_preference': 0 if src else -1, }) else: f.update({ @@ -524,20 +579,22 @@ class BrightcoveNewIE(InfoExtractor): formats.append(f) self._sort_formats(formats) - description = json_data.get('description') - thumbnail = json_data.get('thumbnail') - timestamp = parse_iso8601(json_data.get('published_at')) - duration = float_or_none(json_data.get('duration'), 1000) - tags = json_data.get('tags', []) + subtitles = {} + for text_track in json_data.get('text_tracks', []): + if text_track.get('src'): + subtitles.setdefault(text_track.get('srclang'), []).append({ + 'url': text_track['src'], + }) return { 'id': video_id, 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'duration': duration, - 'timestamp': timestamp, + 'description': json_data.get('description'), + 'thumbnail': json_data.get('thumbnail') or json_data.get('poster'), + 'duration': float_or_none(json_data.get('duration'), 1000), + 'timestamp': parse_iso8601(json_data.get('published_at')), 'uploader_id': account_id, 'formats': formats, - 'tags': tags, + 'subtitles': subtitles, + 'tags': json_data.get('tags', []), } diff --git a/youtube_dl/extractor/camdemy.py b/youtube_dl/extractor/camdemy.py index 897f3a104..6ffbeabd3 100644 --- a/youtube_dl/extractor/camdemy.py +++ b/youtube_dl/extractor/camdemy.py @@ -6,7 +6,7 @@ import re from .common import InfoExtractor from ..compat import ( - compat_urllib_parse, + compat_urllib_parse_urlencode, compat_urlparse, ) from ..utils import ( @@ -16,7 +16,7 @@ from ..utils import ( class CamdemyIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?camdemy\.com/media/(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?camdemy\.com/media/(?P<id>\d+)' _TESTS = [{ # single file 'url': 'http://www.camdemy.com/media/5181/', @@ -104,7 +104,7 @@ class CamdemyIE(InfoExtractor): class CamdemyFolderIE(InfoExtractor): - _VALID_URL = r'http://www.camdemy.com/folder/(?P<id>\d+)' + _VALID_URL = r'https?://www.camdemy.com/folder/(?P<id>\d+)' _TESTS = [{ # links with trailing slash 'url': 'http://www.camdemy.com/folder/450', @@ -139,7 +139,7 @@ class CamdemyFolderIE(InfoExtractor): parsed_url = list(compat_urlparse.urlparse(url)) query = dict(compat_urlparse.parse_qsl(parsed_url[4])) query.update({'displayMode': 'list'}) - parsed_url[4] = compat_urllib_parse.urlencode(query) + parsed_url[4] = compat_urllib_parse_urlencode(query) final_url = compat_urlparse.urlunparse(parsed_url) page = self._download_webpage(final_url, folder_id) diff --git a/youtube_dl/extractor/camwithher.py b/youtube_dl/extractor/camwithher.py new file mode 100644 index 000000000..afbc5ea26 --- /dev/null +++ b/youtube_dl/extractor/camwithher.py @@ -0,0 +1,87 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_duration, + unified_strdate, +) + + +class CamWithHerIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?camwithher\.tv/view_video\.php\?.*\bviewkey=(?P<id>\w+)' + + _TESTS = [{ + 'url': 'http://camwithher.tv/view_video.php?viewkey=6e9a24e2c0e842e1f177&page=&viewtype=&category=', + 'info_dict': { + 'id': '5644', + 'ext': 'flv', + 'title': 'Periscope Tease', + 'description': 'In the clouds teasing on periscope to my favorite song', + 'duration': 240, + 'view_count': int, + 'comment_count': int, + 'uploader': 'MileenaK', + 'upload_date': '20160322', + }, + 'params': { + 'skip_download': True, + } + }, { + 'url': 'http://camwithher.tv/view_video.php?viewkey=6dfd8b7c97531a459937', + 'only_matching': True, + }, { + 'url': 'http://camwithher.tv/view_video.php?page=&viewkey=6e9a24e2c0e842e1f177&viewtype=&category=', + 'only_matching': True, + }, { + 'url': 'http://camwithher.tv/view_video.php?viewkey=b6c3b5bea9515d1a1fc4&page=&viewtype=&category=mv', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + flv_id = self._html_search_regex( + r'<a[^>]+href=["\']/download/\?v=(\d+)', webpage, 'video id') + + # Video URL construction algorithm is reverse-engineered from cwhplayer.swf + rtmp_url = 'rtmp://camwithher.tv/clipshare/%s' % ( + ('mp4:%s.mp4' % flv_id) if int(flv_id) > 2010 else flv_id) + + title = self._html_search_regex( + r'<div[^>]+style="float:left"[^>]*>\s*<h2>(.+?)</h2>', webpage, 'title') + description = self._html_search_regex( + r'>Description:</span>(.+?)</div>', webpage, 'description', default=None) + + runtime = self._search_regex( + r'Runtime\s*:\s*(.+?) \|', webpage, 'duration', default=None) + if runtime: + runtime = re.sub(r'[\s-]', '', runtime) + duration = parse_duration(runtime) + view_count = int_or_none(self._search_regex( + r'Views\s*:\s*(\d+)', webpage, 'view count', default=None)) + comment_count = int_or_none(self._search_regex( + r'Comments\s*:\s*(\d+)', webpage, 'comment count', default=None)) + + uploader = self._search_regex( + r'Added by\s*:\s*<a[^>]+>([^<]+)</a>', webpage, 'uploader', default=None) + upload_date = unified_strdate(self._search_regex( + r'Added on\s*:\s*([\d-]+)', webpage, 'upload date', default=None)) + + return { + 'id': flv_id, + 'url': rtmp_url, + 'ext': 'flv', + 'no_resume': True, + 'title': title, + 'description': description, + 'duration': duration, + 'view_count': view_count, + 'comment_count': comment_count, + 'uploader': uploader, + 'upload_date': upload_date, + } diff --git a/youtube_dl/extractor/cbs.py b/youtube_dl/extractor/cbs.py index 40d07ab18..c621a08d5 100644 --- a/youtube_dl/extractor/cbs.py +++ b/youtube_dl/extractor/cbs.py @@ -1,24 +1,41 @@ from __future__ import unicode_literals -from .common import InfoExtractor +from .theplatform import ThePlatformIE from ..utils import ( - sanitized_Request, - smuggle_url, + xpath_text, + xpath_element, + int_or_none, + ExtractorError, + find_xpath_attr, ) -class CBSIE(InfoExtractor): +class CBSBaseIE(ThePlatformIE): + def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'): + closed_caption_e = find_xpath_attr(smil, self._xpath_ns('.//param', namespace), 'name', 'ClosedCaptionURL') + return { + 'en': [{ + 'ext': 'ttml', + 'url': closed_caption_e.attrib['value'], + }] + } if closed_caption_e is not None and closed_caption_e.attrib.get('value') else [] + + +class CBSIE(CBSBaseIE): _VALID_URL = r'https?://(?:www\.)?(?:cbs\.com/shows/[^/]+/(?:video|artist)|colbertlateshow\.com/(?:video|podcasts))/[^/]+/(?P<id>[^/]+)' _TESTS = [{ 'url': 'http://www.cbs.com/shows/garth-brooks/video/_u7W953k6la293J7EPTd9oHkSPs6Xn6_/connect-chat-feat-garth-brooks/', 'info_dict': { - 'id': '4JUVEwq3wUT7', + 'id': '_u7W953k6la293J7EPTd9oHkSPs6Xn6_', 'display_id': 'connect-chat-feat-garth-brooks', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Connect Chat feat. Garth Brooks', 'description': 'Connect with country music singer Garth Brooks, as he chats with fans on Wednesday November 27, 2013. Be sure to tune in to Garth Brooks: Live from Las Vegas, Friday November 29, at 9/8c on CBS!', 'duration': 1495, + 'timestamp': 1385585425, + 'upload_date': '20131127', + 'uploader': 'CBSI-NEW', }, 'params': { # rtmp download @@ -47,22 +64,46 @@ class CBSIE(InfoExtractor): 'url': 'http://www.colbertlateshow.com/podcasts/dYSwjqPs_X1tvbV_P2FcPWRa_qT6akTC/in-the-bad-room-with-stephen/', 'only_matching': True, }] + TP_RELEASE_URL_TEMPLATE = 'http://link.theplatform.com/s/dJ5BDC/%s?manifest=m3u&mbr=true' def _real_extract(self, url): display_id = self._match_id(url) - request = sanitized_Request(url) - # Android UA is served with higher quality (720p) streams (see - # https://github.com/rg3/youtube-dl/issues/7490) - request.add_header('User-Agent', 'Mozilla/5.0 (Linux; Android 4.4; Nexus 5)') - webpage = self._download_webpage(request, display_id) - real_id = self._search_regex( - [r"video\.settings\.pid\s*=\s*'([^']+)';", r"cbsplayer\.pid\s*=\s*'([^']+)';"], - webpage, 'real video ID') - return { - '_type': 'url_transparent', - 'ie_key': 'ThePlatform', - 'url': smuggle_url( - 'http://link.theplatform.com/s/dJ5BDC/%s?mbr=true&manifest=m3u' % real_id, - {'force_smil_url': True}), + webpage = self._download_webpage(url, display_id) + content_id = self._search_regex( + [r"video\.settings\.content_id\s*=\s*'([^']+)';", r"cbsplayer\.contentId\s*=\s*'([^']+)';"], + webpage, 'content id') + items_data = self._download_xml( + 'http://can.cbs.com/thunder/player/videoPlayerService.php', + content_id, query={'partner': 'cbs', 'contentId': content_id}) + video_data = xpath_element(items_data, './/item') + title = xpath_text(video_data, 'videoTitle', 'title', True) + + subtitles = {} + formats = [] + for item in items_data.findall('.//item'): + pid = xpath_text(item, 'pid') + if not pid: + continue + try: + tp_formats, tp_subtitles = self._extract_theplatform_smil( + self.TP_RELEASE_URL_TEMPLATE % pid, content_id, 'Downloading %s SMIL data' % pid) + except ExtractorError: + continue + formats.extend(tp_formats) + subtitles = self._merge_subtitles(subtitles, tp_subtitles) + self._sort_formats(formats) + + info = self.get_metadata('dJ5BDC/media/guid/2198311517/%s' % content_id, content_id) + info.update({ + 'id': content_id, 'display_id': display_id, - } + 'title': title, + 'series': xpath_text(video_data, 'seriesTitle'), + 'season_number': int_or_none(xpath_text(video_data, 'seasonNumber')), + 'episode_number': int_or_none(xpath_text(video_data, 'episodeNumber')), + 'duration': int_or_none(xpath_text(video_data, 'videoLength'), 1000), + 'thumbnail': xpath_text(video_data, 'previewImageURL'), + 'formats': formats, + 'subtitles': subtitles, + }) + return info diff --git a/youtube_dl/extractor/cnet.py b/youtube_dl/extractor/cbsinteractive.py similarity index 59% rename from youtube_dl/extractor/cnet.py rename to youtube_dl/extractor/cbsinteractive.py index 3cf0bf95b..0011c3029 100644 --- a/youtube_dl/extractor/cnet.py +++ b/youtube_dl/extractor/cbsinteractive.py @@ -1,12 +1,14 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .theplatform import ThePlatformIE from ..utils import int_or_none -class CNETIE(ThePlatformIE): - _VALID_URL = r'https?://(?:www\.)?cnet\.com/videos/(?P<id>[^/]+)/' +class CBSInteractiveIE(ThePlatformIE): + _VALID_URL = r'https?://(?:www\.)?(?P<site>cnet|zdnet)\.com/(?:videos|video/share)/(?P<id>[^/?]+)' _TESTS = [{ 'url': 'http://www.cnet.com/videos/hands-on-with-microsofts-windows-8-1-update/', 'info_dict': { @@ -17,6 +19,8 @@ class CNETIE(ThePlatformIE): 'uploader_id': '6085384d-619e-11e3-b231-14feb5ca9861', 'uploader': 'Sarah Mitroff', 'duration': 70, + 'timestamp': 1396479627, + 'upload_date': '20140402', }, }, { 'url': 'http://www.cnet.com/videos/whiny-pothole-tweets-at-local-government-when-hit-by-cars-tomorrow-daily-187/', @@ -28,15 +32,38 @@ class CNETIE(ThePlatformIE): 'uploader_id': 'b163284d-6b73-44fc-b3e6-3da66c392d40', 'uploader': 'Ashley Esqueda', 'duration': 1482, + 'timestamp': 1433289889, + 'upload_date': '20150603', }, + }, { + 'url': 'http://www.zdnet.com/video/share/video-keeping-android-smartphones-and-tablets-secure/', + 'info_dict': { + 'id': 'bc1af9f0-a2b5-4e54-880d-0d95525781c0', + 'ext': 'mp4', + 'title': 'Video: Keeping Android smartphones and tablets secure', + 'description': 'Here\'s the best way to keep Android devices secure, and what you do when they\'ve come to the end of their lives.', + 'uploader_id': 'f2d97ea2-8175-11e2-9d12-0018fe8a00b0', + 'uploader': 'Adrian Kingsley-Hughes', + 'timestamp': 1448961720, + 'upload_date': '20151201', + }, + 'params': { + # m3u8 download + 'skip_download': True, + } }] + TP_RELEASE_URL_TEMPLATE = 'http://link.theplatform.com/s/kYEXFC/%s?mbr=true' + MPX_ACCOUNTS = { + 'cnet': 2288573011, + 'zdnet': 2387448114, + } def _real_extract(self, url): - display_id = self._match_id(url) + site, display_id = re.match(self._VALID_URL, url).groups() webpage = self._download_webpage(url, display_id) data_json = self._html_search_regex( - r"data-cnet-video(?:-uvp)?-options='([^']+)'", + r"data-(?:cnet|zdnet)-video(?:-uvp)?-options='([^']+)'", webpage, 'data json') data = self._parse_json(data_json, display_id) vdata = data.get('video') or data['videos'][0] @@ -51,16 +78,15 @@ class CNETIE(ThePlatformIE): uploader = None uploader_id = None - metadata = self.get_metadata('kYEXFC/%s' % list(vdata['files'].values())[0], video_id) - description = vdata.get('description') or metadata.get('description') - duration = int_or_none(vdata.get('duration')) or metadata.get('duration') - - formats = [] - subtitles = {} + media_guid_path = 'media/guid/%d/%s' % (self.MPX_ACCOUNTS[site], vdata['mpxRefId']) + formats, subtitles = [], {} + if site == 'cnet': + formats, subtitles = self._extract_theplatform_smil( + self.TP_RELEASE_URL_TEMPLATE % media_guid_path, video_id) for (fkey, vid) in vdata['files'].items(): if fkey == 'hls_phone' and 'hls_tablet' in vdata['files']: continue - release_url = 'http://link.theplatform.com/s/kYEXFC/%s?format=SMIL&mbr=true' % vid + release_url = self.TP_RELEASE_URL_TEMPLATE % vid if fkey == 'hds': release_url += '&manifest=f4m' tp_formats, tp_subtitles = self._extract_theplatform_smil(release_url, video_id, 'Downloading %s SMIL data' % fkey) @@ -68,15 +94,15 @@ class CNETIE(ThePlatformIE): subtitles = self._merge_subtitles(subtitles, tp_subtitles) self._sort_formats(formats) - return { + info = self.get_metadata('kYEXFC/%s' % media_guid_path, video_id) + info.update({ 'id': video_id, 'display_id': display_id, 'title': title, - 'description': description, - 'thumbnail': metadata.get('thumbnail'), - 'duration': duration, + 'duration': int_or_none(vdata.get('duration')), 'uploader': uploader, 'uploader_id': uploader_id, 'subtitles': subtitles, 'formats': formats, - } + }) + return info diff --git a/youtube_dl/extractor/cbsnews.py b/youtube_dl/extractor/cbsnews.py index 7319ee1b7..79ddc20a0 100644 --- a/youtube_dl/extractor/cbsnews.py +++ b/youtube_dl/extractor/cbsnews.py @@ -2,16 +2,15 @@ from __future__ import unicode_literals from .common import InfoExtractor -from .theplatform import ThePlatformIE +from .cbs import CBSBaseIE from ..utils import ( parse_duration, - find_xpath_attr, ) -class CBSNewsIE(ThePlatformIE): +class CBSNewsIE(CBSBaseIE): IE_DESC = 'CBS News' - _VALID_URL = r'http://(?:www\.)?cbsnews\.com/(?:news|videos)/(?P<id>[\da-z_-]+)' + _VALID_URL = r'https?://(?:www\.)?cbsnews\.com/(?:news|videos)/(?P<id>[\da-z_-]+)' _TESTS = [ { @@ -49,15 +48,6 @@ class CBSNewsIE(ThePlatformIE): }, ] - def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'): - closed_caption_e = find_xpath_attr(smil, self._xpath_ns('.//param', namespace), 'name', 'ClosedCaptionURL') - return { - 'en': [{ - 'ext': 'ttml', - 'url': closed_caption_e.attrib['value'], - }] - } if closed_caption_e is not None and closed_caption_e.attrib.get('value') else [] - def _real_extract(self, url): video_id = self._match_id(url) @@ -78,7 +68,7 @@ class CBSNewsIE(ThePlatformIE): pid = item.get('media' + format_id) if not pid: continue - release_url = 'http://link.theplatform.com/s/dJ5BDC/%s?format=SMIL&mbr=true' % pid + release_url = 'http://link.theplatform.com/s/dJ5BDC/%s?mbr=true' % pid tp_formats, tp_subtitles = self._extract_theplatform_smil(release_url, video_id, 'Downloading %s SMIL data' % pid) formats.extend(tp_formats) subtitles = self._merge_subtitles(subtitles, tp_subtitles) @@ -96,7 +86,7 @@ class CBSNewsIE(ThePlatformIE): class CBSNewsLiveVideoIE(InfoExtractor): IE_DESC = 'CBS News Live Videos' - _VALID_URL = r'http://(?:www\.)?cbsnews\.com/live/video/(?P<id>[\da-z_-]+)' + _VALID_URL = r'https?://(?:www\.)?cbsnews\.com/live/video/(?P<id>[\da-z_-]+)' _TEST = { 'url': 'http://www.cbsnews.com/live/video/clinton-sanders-prepare-to-face-off-in-nh/', @@ -122,6 +112,7 @@ class CBSNewsLiveVideoIE(InfoExtractor): for entry in f4m_formats: # URLs without the extra param induce an 404 error entry.update({'extra_param_to_segment_url': hdcore_sign}) + self._sort_formats(f4m_formats) return { 'id': video_id, diff --git a/youtube_dl/extractor/cbssports.py b/youtube_dl/extractor/cbssports.py index ae47e74cc..549ae32f3 100644 --- a/youtube_dl/extractor/cbssports.py +++ b/youtube_dl/extractor/cbssports.py @@ -6,7 +6,7 @@ from .common import InfoExtractor class CBSSportsIE(InfoExtractor): - _VALID_URL = r'http://www\.cbssports\.com/video/player/(?P<section>[^/]+)/(?P<id>[^/]+)' + _VALID_URL = r'https?://www\.cbssports\.com/video/player/(?P<section>[^/]+)/(?P<id>[^/]+)' _TEST = { 'url': 'http://www.cbssports.com/video/player/tennis/318462531970/0/us-open-flashbacks-1990s', diff --git a/youtube_dl/extractor/cda.py b/youtube_dl/extractor/cda.py new file mode 100755 index 000000000..498d2c0d8 --- /dev/null +++ b/youtube_dl/extractor/cda.py @@ -0,0 +1,96 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + decode_packed_codes, + ExtractorError, + parse_duration +) + + +class CDAIE(InfoExtractor): + _VALID_URL = r'https?://(?:(?:www\.)?cda\.pl/video|ebd\.cda\.pl/[0-9]+x[0-9]+)/(?P<id>[0-9a-z]+)' + _TESTS = [{ + 'url': 'http://www.cda.pl/video/5749950c', + 'md5': '6f844bf51b15f31fae165365707ae970', + 'info_dict': { + 'id': '5749950c', + 'ext': 'mp4', + 'height': 720, + 'title': 'Oto dlaczego przed zakrętem należy zwolnić.', + 'duration': 39 + } + }, { + 'url': 'http://www.cda.pl/video/57413289', + 'md5': 'a88828770a8310fc00be6c95faf7f4d5', + 'info_dict': { + 'id': '57413289', + 'ext': 'mp4', + 'title': 'Lądowanie na lotnisku na Maderze', + 'duration': 137 + } + }, { + 'url': 'http://ebd.cda.pl/0x0/5749950c', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage('http://ebd.cda.pl/0x0/' + video_id, video_id) + + if 'Ten film jest dostępny dla użytkowników premium' in webpage: + raise ExtractorError('This video is only available for premium users.', expected=True) + + title = self._html_search_regex(r'<title>(.+?)', webpage, 'title') + + formats = [] + + info_dict = { + 'id': video_id, + 'title': title, + 'formats': formats, + 'duration': None, + } + + def extract_format(page, version): + unpacked = decode_packed_codes(page) + format_url = self._search_regex( + r"url:\\'(.+?)\\'", unpacked, '%s url' % version, fatal=False) + if not format_url: + return + f = { + 'url': format_url, + } + m = re.search( + r']+data-quality="(?P[^"]+)"[^>]+href="[^"]+"[^>]+class="[^"]*quality-btn-active[^"]*">(?P[0-9]+)p', + page) + if m: + f.update({ + 'format_id': m.group('format_id'), + 'height': int(m.group('height')), + }) + info_dict['formats'].append(f) + if not info_dict['duration']: + info_dict['duration'] = parse_duration(self._search_regex( + r"duration:\\'(.+?)\\'", unpacked, 'duration', fatal=False)) + + extract_format(webpage, 'default') + + for href, resolution in re.findall( + r']+data-quality="[^"]+"[^>]+href="([^"]+)"[^>]+class="quality-btn"[^>]*>([0-9]+p)', + webpage): + webpage = self._download_webpage( + href, video_id, 'Downloading %s version information' % resolution, fatal=False) + if not webpage: + # Manually report warning because empty page is returned when + # invalid version is requested. + self.report_warning('Unable to download %s version information' % resolution) + continue + extract_format(webpage, resolution) + + self._sort_formats(formats) + + return info_dict diff --git a/youtube_dl/extractor/ceskatelevize.py b/youtube_dl/extractor/ceskatelevize.py index b27b4e670..6652c8e42 100644 --- a/youtube_dl/extractor/ceskatelevize.py +++ b/youtube_dl/extractor/ceskatelevize.py @@ -5,7 +5,6 @@ import re from .common import InfoExtractor from ..compat import ( - compat_urllib_parse, compat_urllib_parse_unquote, compat_urllib_parse_urlparse, ) @@ -13,6 +12,7 @@ from ..utils import ( ExtractorError, float_or_none, sanitized_Request, + urlencode_postdata, ) @@ -102,7 +102,7 @@ class CeskaTelevizeIE(InfoExtractor): req = sanitized_Request( 'http://www.ceskatelevize.cz/ivysilani/ajax/get-client-playlist', - data=compat_urllib_parse.urlencode(data)) + data=urlencode_postdata(data)) req.add_header('Content-type', 'application/x-www-form-urlencoded') req.add_header('x-addr', '127.0.0.1') @@ -129,7 +129,8 @@ class CeskaTelevizeIE(InfoExtractor): formats = [] for format_id, stream_url in item['streamUrls'].items(): formats.extend(self._extract_m3u8_formats( - stream_url, playlist_id, 'mp4', entry_protocol='m3u8_native')) + stream_url, playlist_id, 'mp4', + entry_protocol='m3u8_native', fatal=False)) self._sort_formats(formats) item_id = item.get('id') or item['assetId'] diff --git a/youtube_dl/extractor/chaturbate.py b/youtube_dl/extractor/chaturbate.py index 242fba311..b2234549e 100644 --- a/youtube_dl/extractor/chaturbate.py +++ b/youtube_dl/extractor/chaturbate.py @@ -48,6 +48,7 @@ class ChaturbateIE(InfoExtractor): raise ExtractorError('Unable to find stream URL') formats = self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4') + self._sort_formats(formats) return { 'id': video_id, diff --git a/youtube_dl/extractor/cliphunter.py b/youtube_dl/extractor/cliphunter.py index 2996b6b09..19f8b397e 100644 --- a/youtube_dl/extractor/cliphunter.py +++ b/youtube_dl/extractor/cliphunter.py @@ -19,7 +19,7 @@ def _decode(s): class CliphunterIE(InfoExtractor): IE_NAME = 'cliphunter' - _VALID_URL = r'''(?x)http://(?:www\.)?cliphunter\.com/w/ + _VALID_URL = r'''(?x)https?://(?:www\.)?cliphunter\.com/w/ (?P[0-9]+)/ (?P.+?)(?:$|[#\?]) ''' diff --git a/youtube_dl/extractor/clipsyndicate.py b/youtube_dl/extractor/clipsyndicate.py index 8306d6fb7..0b6ad895f 100644 --- a/youtube_dl/extractor/clipsyndicate.py +++ b/youtube_dl/extractor/clipsyndicate.py @@ -8,7 +8,7 @@ from ..utils import ( class ClipsyndicateIE(InfoExtractor): - _VALID_URL = r'http://(?:chic|www)\.clipsyndicate\.com/video/play(list/\d+)?/(?P\d+)' + _VALID_URL = r'https?://(?:chic|www)\.clipsyndicate\.com/video/play(list/\d+)?/(?P\d+)' _TESTS = [{ 'url': 'http://www.clipsyndicate.com/video/play/4629301/brick_briscoe', diff --git a/youtube_dl/extractor/cloudy.py b/youtube_dl/extractor/cloudy.py index 0fa720ee8..9e267e6c0 100644 --- a/youtube_dl/extractor/cloudy.py +++ b/youtube_dl/extractor/cloudy.py @@ -6,7 +6,7 @@ import re from .common import InfoExtractor from ..compat import ( compat_parse_qs, - compat_urllib_parse, + compat_urllib_parse_urlencode, compat_HTTPError, ) from ..utils import ( @@ -64,7 +64,7 @@ class CloudyIE(InfoExtractor): 'errorUrl': error_url, }) - data_url = self._API_URL % (video_host, compat_urllib_parse.urlencode(form)) + data_url = self._API_URL % (video_host, compat_urllib_parse_urlencode(form)) player_data = self._download_webpage( data_url, video_id, 'Downloading player data') data = compat_parse_qs(player_data) diff --git a/youtube_dl/extractor/clubic.py b/youtube_dl/extractor/clubic.py index 1dfa7c12e..2fba93543 100644 --- a/youtube_dl/extractor/clubic.py +++ b/youtube_dl/extractor/clubic.py @@ -12,7 +12,7 @@ from ..utils import ( class ClubicIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?clubic\.com/video/(?:[^/]+/)*video.*-(?P[0-9]+)\.html' + _VALID_URL = r'https?://(?:www\.)?clubic\.com/video/(?:[^/]+/)*video.*-(?P[0-9]+)\.html' _TESTS = [{ 'url': 'http://www.clubic.com/video/clubic-week/video-clubic-week-2-0-le-fbi-se-lance-dans-la-photo-d-identite-448474.html', diff --git a/youtube_dl/extractor/cnbc.py b/youtube_dl/extractor/cnbc.py new file mode 100644 index 000000000..d354d9f95 --- /dev/null +++ b/youtube_dl/extractor/cnbc.py @@ -0,0 +1,36 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import smuggle_url + + +class CNBCIE(InfoExtractor): + _VALID_URL = r'https?://video\.cnbc\.com/gallery/\?video=(?P[0-9]+)' + _TEST = { + 'url': 'http://video.cnbc.com/gallery/?video=3000503714', + 'info_dict': { + 'id': '3000503714', + 'ext': 'mp4', + 'title': 'Fighting zombies is big business', + 'description': 'md5:0c100d8e1a7947bd2feec9a5550e519e', + 'timestamp': 1459332000, + 'upload_date': '20160330', + 'uploader': 'NBCU-CNBC', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + return { + '_type': 'url_transparent', + 'ie_key': 'ThePlatform', + 'url': smuggle_url( + 'http://link.theplatform.com/s/gZWlPC/media/guid/2408950221/%s?mbr=true&manifest=m3u' % video_id, + {'force_smil_url': True}), + 'id': video_id, + } diff --git a/youtube_dl/extractor/comcarcoff.py b/youtube_dl/extractor/comcarcoff.py index 7dff68492..747c245c8 100644 --- a/youtube_dl/extractor/comcarcoff.py +++ b/youtube_dl/extractor/comcarcoff.py @@ -11,7 +11,7 @@ from ..utils import ( class ComCarCoffIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?comediansincarsgettingcoffee\.com/(?P[a-z0-9\-]*)' + _VALID_URL = r'https?://(?:www\.)?comediansincarsgettingcoffee\.com/(?P[a-z0-9\-]*)' _TESTS = [{ 'url': 'http://comediansincarsgettingcoffee.com/miranda-sings-happy-thanksgiving-miranda/', 'info_dict': { @@ -41,7 +41,13 @@ class ComCarCoffIE(InfoExtractor): display_id = full_data['activeVideo']['video'] video_data = full_data.get('videos', {}).get(display_id) or full_data['singleshots'][display_id] + video_id = compat_str(video_data['mediaId']) + title = video_data['title'] + formats = self._extract_m3u8_formats( + video_data['mediaUrl'], video_id, 'mp4') + self._sort_formats(formats) + thumbnails = [{ 'url': video_data['images']['thumb'], }, { @@ -54,15 +60,14 @@ class ComCarCoffIE(InfoExtractor): video_data.get('duration')) return { - '_type': 'url_transparent', - 'url': 'crackle:%s' % video_id, 'id': video_id, 'display_id': display_id, - 'title': video_data['title'], + 'title': title, 'description': video_data.get('description'), 'timestamp': timestamp, 'duration': duration, 'thumbnails': thumbnails, + 'formats': formats, 'season_number': int_or_none(video_data.get('season')), 'episode_number': int_or_none(video_data.get('episode')), 'webpage_url': 'http://comediansincarsgettingcoffee.com/%s' % (video_data.get('urlSlug', video_data.get('slug'))), diff --git a/youtube_dl/extractor/comedycentral.py b/youtube_dl/extractor/comedycentral.py index 5b1b99675..0c59102e0 100644 --- a/youtube_dl/extractor/comedycentral.py +++ b/youtube_dl/extractor/comedycentral.py @@ -5,7 +5,7 @@ import re from .mtv import MTVServicesInfoExtractor from ..compat import ( compat_str, - compat_urllib_parse, + compat_urllib_parse_urlencode, ) from ..utils import ( ExtractorError, @@ -201,7 +201,7 @@ class ComedyCentralShowsIE(MTVServicesInfoExtractor): # Correct cc.com in uri uri = re.sub(r'(episode:[^.]+)(\.cc)?\.com', r'\1.com', uri) - index_url = 'http://%s.cc.com/feeds/mrss?%s' % (show_name, compat_urllib_parse.urlencode({'uri': uri})) + index_url = 'http://%s.cc.com/feeds/mrss?%s' % (show_name, compat_urllib_parse_urlencode({'uri': uri})) idoc = self._download_xml( index_url, epTitle, 'Downloading show index', 'Unable to download episode index') diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index ecd7da767..ec6625eea 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -21,9 +21,11 @@ from ..compat import ( compat_os_name, compat_str, compat_urllib_error, - compat_urllib_parse, + compat_urllib_parse_urlencode, + compat_urllib_request, compat_urlparse, ) +from ..downloader.f4m import remove_encrypted_media from ..utils import ( NO_DEFAULT, age_restricted, @@ -48,6 +50,7 @@ from ..utils import ( determine_protocol, parse_duration, mimetype2ext, + update_Request, update_url_query, ) @@ -346,7 +349,7 @@ class InfoExtractor(object): def IE_NAME(self): return compat_str(type(self).__name__[:-2]) - def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers=None, query=None): + def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}): """ Returns the response handle """ if note is None: self.report_download_webpage(video_id) @@ -356,11 +359,14 @@ class InfoExtractor(object): else: self.to_screen('%s: %s' % (video_id, note)) # data, headers and query params will be ignored for `Request` objects - if isinstance(url_or_request, compat_str): + if isinstance(url_or_request, compat_urllib_request.Request): + url_or_request = update_Request( + url_or_request, data=data, headers=headers, query=query) + else: if query: url_or_request = update_url_query(url_or_request, query) if data or headers: - url_or_request = sanitized_Request(url_or_request, data, headers or {}) + url_or_request = sanitized_Request(url_or_request, data, headers) try: return self._downloader.urlopen(url_or_request) except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: @@ -376,7 +382,7 @@ class InfoExtractor(object): self._downloader.report_warning(errmsg) return False - def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers=None, query=None): + def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}): """ Returns a tuple (page content as string, URL handle) """ # Strip hashes from the URL (#1038) if isinstance(url_or_request, (compat_str, str)): @@ -469,7 +475,7 @@ class InfoExtractor(object): return content - def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None, data=None, headers=None, query=None): + def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None, data=None, headers={}, query={}): """ Returns the data of the page as a string """ success = False try_count = 0 @@ -490,7 +496,7 @@ class InfoExtractor(object): def _download_xml(self, url_or_request, video_id, note='Downloading XML', errnote='Unable to download XML', - transform_source=None, fatal=True, encoding=None, data=None, headers=None, query=None): + transform_source=None, fatal=True, encoding=None, data=None, headers={}, query={}): """Return the xml as an xml.etree.ElementTree.Element""" xml_string = self._download_webpage( url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding, data=data, headers=headers, query=query) @@ -504,7 +510,7 @@ class InfoExtractor(object): note='Downloading JSON metadata', errnote='Unable to download JSON metadata', transform_source=None, - fatal=True, encoding=None, data=None, headers=None, query=None): + fatal=True, encoding=None, data=None, headers={}, query={}): json_string = self._download_webpage( url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding, data=data, headers=headers, query=query) @@ -862,6 +868,7 @@ class InfoExtractor(object): proto_preference = 0 if determine_protocol(f) in ['http', 'https'] else -0.1 if f.get('vcodec') == 'none': # audio only + preference -= 50 if self._downloader.params.get('prefer_free_formats'): ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus'] else: @@ -872,6 +879,8 @@ class InfoExtractor(object): except ValueError: audio_ext_preference = -1 else: + if f.get('acodec') == 'none': # video only + preference -= 40 if self._downloader.params.get('prefer_free_formats'): ORDER = ['flv', 'mp4', 'webm'] else: @@ -986,6 +995,11 @@ class InfoExtractor(object): if not media_nodes: manifest_version = '2.0' media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media') + # Remove unsupported DRM protected media from final formats + # rendition (see https://github.com/rg3/youtube-dl/issues/8573). + media_nodes = remove_encrypted_media(media_nodes) + if not media_nodes: + return formats base_url = xpath_text( manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'], 'base URL', default=None) @@ -1018,8 +1032,6 @@ class InfoExtractor(object): 'height': int_or_none(media_el.attrib.get('height')), 'preference': preference, }) - self._sort_formats(formats) - return formats def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None, @@ -1140,7 +1152,6 @@ class InfoExtractor(object): last_media = None formats.append(f) last_info = {} - self._sort_formats(formats) return formats @staticmethod @@ -1297,7 +1308,7 @@ class InfoExtractor(object): 'plugin': 'flowplayer-3.2.0.1', } f4m_url += '&' if '?' in f4m_url else '?' - f4m_url += compat_urllib_parse.urlencode(f4m_params) + f4m_url += compat_urllib_parse_urlencode(f4m_params) formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False)) continue @@ -1314,8 +1325,6 @@ class InfoExtractor(object): }) continue - self._sort_formats(formats) - return formats def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'): @@ -1326,7 +1335,7 @@ class InfoExtractor(object): if not src or src in urls: continue urls.append(src) - ext = textstream.get('ext') or determine_ext(src) or mimetype2ext(textstream.get('type')) + ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src) lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang subtitles.setdefault(lang, []).append({ 'url': src, @@ -1506,9 +1515,16 @@ class InfoExtractor(object): representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration)) media_template = representation_ms_info['media_template'] media_template = media_template.replace('$RepresentationID$', representation_id) - media_template = re.sub(r'\$(Number|Bandwidth)(?:%(0\d+)d)?\$', r'%(\1)\2d', media_template) + media_template = re.sub(r'\$(Number|Bandwidth)\$', r'%(\1)d', media_template) + media_template = re.sub(r'\$(Number|Bandwidth)%(\d+)\$', r'%(\1)\2d', media_template) media_template.replace('$$', '$') - representation_ms_info['segment_urls'] = [media_template % {'Number': segment_number, 'Bandwidth': representation_attrib.get('bandwidth')} for segment_number in range(representation_ms_info['start_number'], representation_ms_info['total_number'] + representation_ms_info['start_number'])] + representation_ms_info['segment_urls'] = [ + media_template % { + 'Number': segment_number, + 'Bandwidth': representation_attrib.get('bandwidth')} + for segment_number in range( + representation_ms_info['start_number'], + representation_ms_info['total_number'] + representation_ms_info['start_number'])] if 'segment_urls' in representation_ms_info: f.update({ 'segment_urls': representation_ms_info['segment_urls'], @@ -1533,7 +1549,6 @@ class InfoExtractor(object): existing_format.update(f) else: self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type) - self._sort_formats(formats) return formats def _live_title(self, name): diff --git a/youtube_dl/extractor/commonprotocols.py b/youtube_dl/extractor/commonprotocols.py new file mode 100644 index 000000000..5d130a170 --- /dev/null +++ b/youtube_dl/extractor/commonprotocols.py @@ -0,0 +1,36 @@ +from __future__ import unicode_literals + +import os + +from .common import InfoExtractor +from ..compat import ( + compat_urllib_parse_unquote, + compat_urlparse, +) +from ..utils import url_basename + + +class RtmpIE(InfoExtractor): + IE_DESC = False # Do not list + _VALID_URL = r'(?i)rtmp[est]?://.+' + + _TESTS = [{ + 'url': 'rtmp://cp44293.edgefcs.net/ondemand?auth=daEcTdydfdqcsb8cZcDbAaCbhamacbbawaS-bw7dBb-bWG-GqpGFqCpNCnGoyL&aifp=v001&slist=public/unsecure/audio/2c97899446428e4301471a8cb72b4b97--audio--pmg-20110908-0900a_flv_aac_med_int.mp4', + 'only_matching': True, + }, { + 'url': 'rtmp://edge.live.hitbox.tv/live/dimak', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0]) + title = compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]) + return { + 'id': video_id, + 'title': title, + 'formats': [{ + 'url': url, + 'ext': 'flv', + 'format_id': compat_urlparse.urlparse(url).scheme, + }], + } diff --git a/youtube_dl/extractor/condenast.py b/youtube_dl/extractor/condenast.py index 6f92ae2ed..e8f2b5a07 100644 --- a/youtube_dl/extractor/condenast.py +++ b/youtube_dl/extractor/condenast.py @@ -5,7 +5,7 @@ import re from .common import InfoExtractor from ..compat import ( - compat_urllib_parse, + compat_urllib_parse_urlencode, compat_urllib_parse_urlparse, compat_urlparse, ) @@ -45,7 +45,7 @@ class CondeNastIE(InfoExtractor): 'wmagazine': 'W Magazine', } - _VALID_URL = r'http://(?:video|www|player)\.(?P%s)\.com/(?Pwatch|series|video|embed(?:js)?)/(?P[^/?#]+)' % '|'.join(_SITES.keys()) + _VALID_URL = r'https?://(?:video|www|player)\.(?P%s)\.com/(?Pwatch|series|video|embed(?:js)?)/(?P[^/?#]+)' % '|'.join(_SITES.keys()) IE_DESC = 'Condé Nast media group: %s' % ', '.join(sorted(_SITES.values())) EMBED_URL = r'(?:https?:)?//player\.(?P%s)\.com/(?Pembed(?:js)?)/.+?' % '|'.join(_SITES.keys()) @@ -97,7 +97,7 @@ class CondeNastIE(InfoExtractor): video_id = self._search_regex(r'videoId: [\'"](.+?)[\'"]', params, 'video id') player_id = self._search_regex(r'playerId: [\'"](.+?)[\'"]', params, 'player id') target = self._search_regex(r'target: [\'"](.+?)[\'"]', params, 'target') - data = compat_urllib_parse.urlencode({'videoId': video_id, + data = compat_urllib_parse_urlencode({'videoId': video_id, 'playerId': player_id, 'target': target, }) diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index c7032ffa2..8ae3f2890 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -11,8 +11,8 @@ from math import pow, sqrt, floor from .common import InfoExtractor from ..compat import ( compat_etree_fromstring, - compat_urllib_parse, compat_urllib_parse_unquote, + compat_urllib_parse_urlencode, compat_urllib_request, compat_urlparse, ) @@ -54,7 +54,7 @@ class CrunchyrollBaseIE(InfoExtractor): def _real_initialize(self): self._login() - def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None): + def _download_webpage(self, url_or_request, *args, **kwargs): request = (url_or_request if isinstance(url_or_request, compat_urllib_request.Request) else sanitized_Request(url_or_request)) # Accept-Language must be set explicitly to accept any language to avoid issues @@ -65,8 +65,7 @@ class CrunchyrollBaseIE(InfoExtractor): # Crunchyroll to not work in georestriction cases in some browsers that don't place # the locale lang first in header. However allowing any language seems to workaround the issue. request.add_header('Accept-Language', '*') - return super(CrunchyrollBaseIE, self)._download_webpage( - request, video_id, note, errnote, fatal, tries, timeout, encoding) + return super(CrunchyrollBaseIE, self)._download_webpage(request, *args, **kwargs) @staticmethod def _add_skip_wall(url): @@ -79,7 +78,7 @@ class CrunchyrollBaseIE(InfoExtractor): # See https://github.com/rg3/youtube-dl/issues/7202. qs['skip_wall'] = ['1'] return compat_urlparse.urlunparse( - parsed_url._replace(query=compat_urllib_parse.urlencode(qs, True))) + parsed_url._replace(query=compat_urllib_parse_urlencode(qs, True))) class CrunchyrollIE(CrunchyrollBaseIE): @@ -309,7 +308,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text playerdata_url = compat_urllib_parse_unquote(self._html_search_regex(r'"config_url":"([^"]+)', webpage, 'playerdata_url')) playerdata_req = sanitized_Request(playerdata_url) - playerdata_req.data = compat_urllib_parse.urlencode({'current_page': webpage_url}) + playerdata_req.data = urlencode_postdata({'current_page': webpage_url}) playerdata_req.add_header('Content-Type', 'application/x-www-form-urlencoded') playerdata = self._download_webpage(playerdata_req, video_id, note='Downloading media info') @@ -323,7 +322,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text streamdata_req = sanitized_Request( 'http://www.crunchyroll.com/xml/?req=RpcApiVideoPlayer_GetStandardConfig&media_id=%s&video_format=%s&video_quality=%s' % (stream_id, stream_format, stream_quality), - compat_urllib_parse.urlencode({'current_page': url}).encode('utf-8')) + compat_urllib_parse_urlencode({'current_page': url}).encode('utf-8')) streamdata_req.add_header('Content-Type', 'application/x-www-form-urlencoded') streamdata = self._download_xml( streamdata_req, video_id, diff --git a/youtube_dl/extractor/cspan.py b/youtube_dl/extractor/cspan.py index b8b9d058d..84b36f44c 100644 --- a/youtube_dl/extractor/cspan.py +++ b/youtube_dl/extractor/cspan.py @@ -15,7 +15,7 @@ from .senateisvp import SenateISVPIE class CSpanIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?c-span\.org/video/\?(?P[0-9a-f]+)' + _VALID_URL = r'https?://(?:www\.)?c-span\.org/video/\?(?P[0-9a-f]+)' IE_DESC = 'C-SPAN' _TESTS = [{ 'url': 'http://www.c-span.org/video/?313572-1/HolderonV', diff --git a/youtube_dl/extractor/ctsnews.py b/youtube_dl/extractor/ctsnews.py index 45049bf37..1622fc844 100644 --- a/youtube_dl/extractor/ctsnews.py +++ b/youtube_dl/extractor/ctsnews.py @@ -8,7 +8,7 @@ from ..utils import parse_iso8601, ExtractorError class CtsNewsIE(InfoExtractor): IE_DESC = '華視新聞' # https connection failed (Connection reset) - _VALID_URL = r'http://news\.cts\.com\.tw/[a-z]+/[a-z]+/\d+/(?P\d+)\.html' + _VALID_URL = r'https?://news\.cts\.com\.tw/[a-z]+/[a-z]+/\d+/(?P\d+)\.html' _TESTS = [{ 'url': 'http://news.cts.com.tw/cts/international/201501/201501291578109.html', 'md5': 'a9875cb790252b08431186d741beaabe', diff --git a/youtube_dl/extractor/cwtv.py b/youtube_dl/extractor/cwtv.py index 36af67013..f5cefd966 100644 --- a/youtube_dl/extractor/cwtv.py +++ b/youtube_dl/extractor/cwtv.py @@ -57,6 +57,7 @@ class CWTVIE(InfoExtractor): formats = self._extract_m3u8_formats( video_data['videos']['variantplaylist']['uri'], video_id, 'mp4') + self._sort_formats(formats) thumbnails = [{ 'url': image['uri'], diff --git a/youtube_dl/extractor/daum.py b/youtube_dl/extractor/daum.py index c84c51058..86024a745 100644 --- a/youtube_dl/extractor/daum.py +++ b/youtube_dl/extractor/daum.py @@ -8,8 +8,8 @@ import itertools from .common import InfoExtractor from ..compat import ( compat_parse_qs, - compat_urllib_parse, compat_urllib_parse_unquote, + compat_urllib_parse_urlencode, compat_urlparse, ) from ..utils import ( @@ -70,7 +70,7 @@ class DaumIE(InfoExtractor): def _real_extract(self, url): video_id = compat_urllib_parse_unquote(self._match_id(url)) - query = compat_urllib_parse.urlencode({'vid': video_id}) + query = compat_urllib_parse_urlencode({'vid': video_id}) movie_data = self._download_json( 'http://videofarm.daum.net/controller/api/closed/v1_2/IntegratedMovieData.json?' + query, video_id, 'Downloading video formats info') @@ -86,7 +86,7 @@ class DaumIE(InfoExtractor): formats = [] for format_el in movie_data['output_list']['output_list']: profile = format_el['profile'] - format_query = compat_urllib_parse.urlencode({ + format_query = compat_urllib_parse_urlencode({ 'vid': video_id, 'profile': profile, }) diff --git a/youtube_dl/extractor/dcn.py b/youtube_dl/extractor/dcn.py index 15a1c40f7..5deff5f30 100644 --- a/youtube_dl/extractor/dcn.py +++ b/youtube_dl/extractor/dcn.py @@ -6,7 +6,7 @@ import base64 from .common import InfoExtractor from ..compat import ( - compat_urllib_parse, + compat_urllib_parse_urlencode, compat_str, ) from ..utils import ( @@ -15,6 +15,7 @@ from ..utils import ( sanitized_Request, smuggle_url, unsmuggle_url, + urlencode_postdata, ) @@ -106,7 +107,7 @@ class DCNVideoIE(DCNBaseIE): webpage = self._download_webpage( 'http://admin.mangomolo.com/analytics/index.php/customers/embed/video?' + - compat_urllib_parse.urlencode({ + compat_urllib_parse_urlencode({ 'id': video_data['id'], 'user_id': video_data['user_id'], 'signature': video_data['signature'], @@ -133,7 +134,7 @@ class DCNLiveIE(DCNBaseIE): webpage = self._download_webpage( 'http://admin.mangomolo.com/analytics/index.php/customers/embed/index?' + - compat_urllib_parse.urlencode({ + compat_urllib_parse_urlencode({ 'id': base64.b64encode(channel_data['user_id'].encode()).decode(), 'channelid': base64.b64encode(channel_data['id'].encode()).decode(), 'signature': channel_data['signature'], @@ -174,7 +175,7 @@ class DCNSeasonIE(InfoExtractor): data['show_id'] = show_id request = sanitized_Request( 'http://admin.mangomolo.com/analytics/index.php/plus/show', - compat_urllib_parse.urlencode(data), + urlencode_postdata(data), { 'Origin': 'http://www.dcndigital.ae', 'Content-Type': 'application/x-www-form-urlencoded' diff --git a/youtube_dl/extractor/dctp.py b/youtube_dl/extractor/dctp.py index aa2c09eb6..9099f5046 100644 --- a/youtube_dl/extractor/dctp.py +++ b/youtube_dl/extractor/dctp.py @@ -6,7 +6,7 @@ from ..compat import compat_str class DctpTvIE(InfoExtractor): - _VALID_URL = r'http://www.dctp.tv/(#/)?filme/(?P.+?)/$' + _VALID_URL = r'https?://www.dctp.tv/(#/)?filme/(?P.+?)/$' _TEST = { 'url': 'http://www.dctp.tv/filme/videoinstallation-fuer-eine-kaufhausfassade/', 'info_dict': { diff --git a/youtube_dl/extractor/defense.py b/youtube_dl/extractor/defense.py index 98e3aedfd..9fe144e14 100644 --- a/youtube_dl/extractor/defense.py +++ b/youtube_dl/extractor/defense.py @@ -5,7 +5,7 @@ from .common import InfoExtractor class DefenseGouvFrIE(InfoExtractor): IE_NAME = 'defense.gouv.fr' - _VALID_URL = r'http://.*?\.defense\.gouv\.fr/layout/set/ligthboxvideo/base-de-medias/webtv/(?P[^/?#]*)' + _VALID_URL = r'https?://.*?\.defense\.gouv\.fr/layout/set/ligthboxvideo/base-de-medias/webtv/(?P[^/?#]*)' _TEST = { 'url': 'http://www.defense.gouv.fr/layout/set/ligthboxvideo/base-de-medias/webtv/attaque-chimique-syrienne-du-21-aout-2013-1', diff --git a/youtube_dl/extractor/dfb.py b/youtube_dl/extractor/dfb.py index 263532cc6..cdfeccacb 100644 --- a/youtube_dl/extractor/dfb.py +++ b/youtube_dl/extractor/dfb.py @@ -38,6 +38,7 @@ class DFBIE(InfoExtractor): token_el = f4m_info.find('token') manifest_url = token_el.attrib['url'] + '?' + 'hdnea=' + token_el.attrib['auth'] + '&hdcore=3.2.0' formats = self._extract_f4m_formats(manifest_url, display_id) + self._sort_formats(formats) return { 'id': video_id, diff --git a/youtube_dl/extractor/discovery.py b/youtube_dl/extractor/discovery.py index ce680a9f3..5f1275b39 100644 --- a/youtube_dl/extractor/discovery.py +++ b/youtube_dl/extractor/discovery.py @@ -9,7 +9,7 @@ from ..compat import compat_str class DiscoveryIE(InfoExtractor): - _VALID_URL = r'''(?x)http://(?:www\.)?(?: + _VALID_URL = r'''(?x)https?://(?:www\.)?(?: discovery| investigationdiscovery| discoverylife| @@ -63,18 +63,23 @@ class DiscoveryIE(InfoExtractor): video_title = info.get('playlist_title') or info.get('video_title') - entries = [{ - 'id': compat_str(video_info['id']), - 'formats': self._extract_m3u8_formats( + entries = [] + + for idx, video_info in enumerate(info['playlist']): + formats = self._extract_m3u8_formats( video_info['src'], display_id, 'mp4', 'm3u8_native', m3u8_id='hls', - note='Download m3u8 information for video %d' % (idx + 1)), - 'title': video_info['title'], - 'description': video_info.get('description'), - 'duration': parse_duration(video_info.get('video_length')), - 'webpage_url': video_info.get('href') or video_info.get('url'), - 'thumbnail': video_info.get('thumbnailURL'), - 'alt_title': video_info.get('secondary_title'), - 'timestamp': parse_iso8601(video_info.get('publishedDate')), - } for idx, video_info in enumerate(info['playlist'])] + note='Download m3u8 information for video %d' % (idx + 1)) + self._sort_formats(formats) + entries.append({ + 'id': compat_str(video_info['id']), + 'formats': formats, + 'title': video_info['title'], + 'description': video_info.get('description'), + 'duration': parse_duration(video_info.get('video_length')), + 'webpage_url': video_info.get('href') or video_info.get('url'), + 'thumbnail': video_info.get('thumbnailURL'), + 'alt_title': video_info.get('secondary_title'), + 'timestamp': parse_iso8601(video_info.get('publishedDate')), + }) return self.playlist_result(entries, display_id, video_title) diff --git a/youtube_dl/extractor/douyutv.py b/youtube_dl/extractor/douyutv.py index bdc768c78..3915cb182 100644 --- a/youtube_dl/extractor/douyutv.py +++ b/youtube_dl/extractor/douyutv.py @@ -10,7 +10,7 @@ from ..compat import (compat_str, compat_basestring) class DouyuTVIE(InfoExtractor): IE_DESC = '斗鱼' - _VALID_URL = r'http://(?:www\.)?douyutv\.com/(?P[A-Za-z0-9]+)' + _VALID_URL = r'https?://(?:www\.)?douyu(?:tv)?\.com/(?P[A-Za-z0-9]+)' _TESTS = [{ 'url': 'http://www.douyutv.com/iseven', 'info_dict': { @@ -60,6 +60,9 @@ class DouyuTVIE(InfoExtractor): 'params': { 'skip_download': True, }, + }, { + 'url': 'http://www.douyu.com/xiaocang', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/dplay.py b/youtube_dl/extractor/dplay.py index a638c827c..66bbfc6ca 100644 --- a/youtube_dl/extractor/dplay.py +++ b/youtube_dl/extractor/dplay.py @@ -10,7 +10,7 @@ from ..utils import int_or_none class DPlayIE(InfoExtractor): - _VALID_URL = r'http://(?Pit\.dplay\.com|www\.dplay\.(?:dk|se|no))/[^/]+/(?P[^/?#]+)' + _VALID_URL = r'https?://(?Pit\.dplay\.com|www\.dplay\.(?:dk|se|no))/[^/]+/(?P[^/?#]+)' _TESTS = [{ 'url': 'http://it.dplay.com/take-me-out/stagione-1-episodio-25/', @@ -118,6 +118,8 @@ class DPlayIE(InfoExtractor): if info.get(protocol): extract_formats(protocol, info[protocol]) + self._sort_formats(formats) + return { 'id': video_id, 'display_id': display_id, diff --git a/youtube_dl/extractor/dramafever.py b/youtube_dl/extractor/dramafever.py index d35e88881..3b6529f4b 100644 --- a/youtube_dl/extractor/dramafever.py +++ b/youtube_dl/extractor/dramafever.py @@ -6,7 +6,6 @@ import itertools from .amp import AMPIE from ..compat import ( compat_HTTPError, - compat_urllib_parse, compat_urlparse, ) from ..utils import ( @@ -14,6 +13,7 @@ from ..utils import ( clean_html, int_or_none, sanitized_Request, + urlencode_postdata ) @@ -50,7 +50,7 @@ class DramaFeverBaseIE(AMPIE): } request = sanitized_Request( - self._LOGIN_URL, compat_urllib_parse.urlencode(login_form).encode('utf-8')) + self._LOGIN_URL, urlencode_postdata(login_form)) response = self._download_webpage( request, None, 'Logging in as %s' % username) diff --git a/youtube_dl/extractor/dreisat.py b/youtube_dl/extractor/dreisat.py index 028144f20..0040e70d4 100644 --- a/youtube_dl/extractor/dreisat.py +++ b/youtube_dl/extractor/dreisat.py @@ -7,7 +7,7 @@ from .zdf import ZDFIE class DreiSatIE(ZDFIE): IE_NAME = '3sat' - _VALID_URL = r'(?:http://)?(?:www\.)?3sat\.de/mediathek/(?:index\.php|mediathek\.php)?\?(?:(?:mode|display)=[^&]+&)*obj=(?P[0-9]+)$' + _VALID_URL = r'(?:https?://)?(?:www\.)?3sat\.de/mediathek/(?:index\.php|mediathek\.php)?\?(?:(?:mode|display)=[^&]+&)*obj=(?P[0-9]+)$' _TESTS = [ { 'url': 'http://www.3sat.de/mediathek/index.php?mode=play&obj=45918', diff --git a/youtube_dl/extractor/dvtv.py b/youtube_dl/extractor/dvtv.py index c1a4bc757..974c69dbc 100644 --- a/youtube_dl/extractor/dvtv.py +++ b/youtube_dl/extractor/dvtv.py @@ -15,7 +15,7 @@ class DVTVIE(InfoExtractor): IE_NAME = 'dvtv' IE_DESC = 'http://video.aktualne.cz/' - _VALID_URL = r'http://video\.aktualne\.cz/(?:[^/]+/)+r~(?P[0-9a-f]{32})' + _VALID_URL = r'https?://video\.aktualne\.cz/(?:[^/]+/)+r~(?P[0-9a-f]{32})' _TESTS = [{ 'url': 'http://video.aktualne.cz/dvtv/vondra-o-ceskem-stoleti-pri-pohledu-na-havla-mi-bylo-trapne/r~e5efe9ca855511e4833a0025900fea04/', diff --git a/youtube_dl/extractor/dw.py b/youtube_dl/extractor/dw.py index b6c985547..ae7c571bd 100644 --- a/youtube_dl/extractor/dw.py +++ b/youtube_dl/extractor/dw.py @@ -39,13 +39,13 @@ class DWIE(InfoExtractor): hidden_inputs = self._hidden_inputs(webpage) title = hidden_inputs['media_title'] - formats = [] if hidden_inputs.get('player_type') == 'video' and hidden_inputs.get('stream_file') == '1': formats = self._extract_smil_formats( 'http://www.dw.com/smil/v-%s' % media_id, media_id, transform_source=lambda s: s.replace( 'rtmp://tv-od.dw.de/flash/', 'http://tv-download.dw.de/dwtv_video/flv/')) + self._sort_formats(formats) else: formats = [{'url': hidden_inputs['file_name']}] diff --git a/youtube_dl/extractor/echomsk.py b/youtube_dl/extractor/echomsk.py index d2d94049d..6b7cc652f 100644 --- a/youtube_dl/extractor/echomsk.py +++ b/youtube_dl/extractor/echomsk.py @@ -7,7 +7,7 @@ from .common import InfoExtractor class EchoMskIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?echo\.msk\.ru/sounds/(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?echo\.msk\.ru/sounds/(?P\d+)' _TEST = { 'url': 'http://www.echo.msk.ru/sounds/1464134.html', 'md5': '2e44b3b78daff5b458e4dbc37f191f7c', diff --git a/youtube_dl/extractor/eroprofile.py b/youtube_dl/extractor/eroprofile.py index 7fcd0151d..297f8a6f5 100644 --- a/youtube_dl/extractor/eroprofile.py +++ b/youtube_dl/extractor/eroprofile.py @@ -3,7 +3,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_urllib_parse +from ..compat import compat_urllib_parse_urlencode from ..utils import ( ExtractorError, unescapeHTML @@ -43,7 +43,7 @@ class EroProfileIE(InfoExtractor): if username is None: return - query = compat_urllib_parse.urlencode({ + query = compat_urllib_parse_urlencode({ 'username': username, 'password': password, 'url': 'http://www.eroprofile.com/', diff --git a/youtube_dl/extractor/exfm.py b/youtube_dl/extractor/exfm.py index 0c0fe6d65..09ed4f2b5 100644 --- a/youtube_dl/extractor/exfm.py +++ b/youtube_dl/extractor/exfm.py @@ -8,7 +8,7 @@ from .common import InfoExtractor class ExfmIE(InfoExtractor): IE_NAME = 'exfm' IE_DESC = 'ex.fm' - _VALID_URL = r'http://(?:www\.)?ex\.fm/song/(?P[^/]+)' + _VALID_URL = r'https?://(?:www\.)?ex\.fm/song/(?P[^/]+)' _SOUNDCLOUD_URL = r'http://(?:www\.)?api\.soundcloud\.com/tracks/([^/]+)/stream' _TESTS = [ { diff --git a/youtube_dl/extractor/fc2.py b/youtube_dl/extractor/fc2.py index 9580f5c0c..c7d69ff1f 100644 --- a/youtube_dl/extractor/fc2.py +++ b/youtube_dl/extractor/fc2.py @@ -5,19 +5,18 @@ import hashlib from .common import InfoExtractor from ..compat import ( - compat_urllib_parse, compat_urllib_request, compat_urlparse, ) from ..utils import ( - encode_dict, ExtractorError, sanitized_Request, + urlencode_postdata, ) class FC2IE(InfoExtractor): - _VALID_URL = r'^http://video\.fc2\.com/(?:[^/]+/)*content/(?P[^/]+)' + _VALID_URL = r'^https?://video\.fc2\.com/(?:[^/]+/)*content/(?P[^/]+)' IE_NAME = 'fc2' _NETRC_MACHINE = 'fc2' _TESTS = [{ @@ -57,7 +56,7 @@ class FC2IE(InfoExtractor): 'Submit': ' Login ', } - login_data = compat_urllib_parse.urlencode(encode_dict(login_form_strs)).encode('utf-8') + login_data = urlencode_postdata(login_form_strs) request = sanitized_Request( 'https://secure.id.fc2.com/index.php?mode=login&switch_language=en', login_data) diff --git a/youtube_dl/extractor/firstpost.py b/youtube_dl/extractor/firstpost.py index 298227d57..e8936cb24 100644 --- a/youtube_dl/extractor/firstpost.py +++ b/youtube_dl/extractor/firstpost.py @@ -4,7 +4,7 @@ from .common import InfoExtractor class FirstpostIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?firstpost\.com/[^/]+/.*-(?P[0-9]+)\.html' + _VALID_URL = r'https?://(?:www\.)?firstpost\.com/[^/]+/.*-(?P[0-9]+)\.html' _TEST = { 'url': 'http://www.firstpost.com/india/india-to-launch-indigenous-aircraft-carrier-monday-1025403.html', diff --git a/youtube_dl/extractor/firsttv.py b/youtube_dl/extractor/firsttv.py index 510d4b108..98b165143 100644 --- a/youtube_dl/extractor/firsttv.py +++ b/youtube_dl/extractor/firsttv.py @@ -8,7 +8,7 @@ from ..utils import int_or_none class FirstTVIE(InfoExtractor): IE_NAME = '1tv' IE_DESC = 'Первый канал' - _VALID_URL = r'http://(?:www\.)?1tv\.ru/(?:[^/]+/)+(?P.+)' + _VALID_URL = r'https?://(?:www\.)?1tv\.ru/(?:[^/]+/)+(?P.+)' _TESTS = [{ 'url': 'http://www.1tv.ru/videoarchive/73390', diff --git a/youtube_dl/extractor/fivemin.py b/youtube_dl/extractor/fivemin.py index 67d50a386..6b8345416 100644 --- a/youtube_dl/extractor/fivemin.py +++ b/youtube_dl/extractor/fivemin.py @@ -4,8 +4,8 @@ import re from .common import InfoExtractor from ..compat import ( - compat_urllib_parse, compat_parse_qs, + compat_urllib_parse_urlencode, compat_urllib_parse_urlparse, compat_urlparse, ) @@ -109,7 +109,7 @@ class FiveMinIE(InfoExtractor): response = self._download_json( 'https://syn.5min.com/handlers/SenseHandler.ashx?' + - compat_urllib_parse.urlencode({ + compat_urllib_parse_urlencode({ 'func': 'GetResults', 'playlist': video_id, 'sid': sid, diff --git a/youtube_dl/extractor/fktv.py b/youtube_dl/extractor/fktv.py index 5f6e65dae..a3a291599 100644 --- a/youtube_dl/extractor/fktv.py +++ b/youtube_dl/extractor/fktv.py @@ -10,7 +10,7 @@ from ..utils import ( class FKTVIE(InfoExtractor): IE_NAME = 'fernsehkritik.tv' - _VALID_URL = r'http://(?:www\.)?fernsehkritik\.tv/folge-(?P[0-9]+)(?:/.*)?' + _VALID_URL = r'https?://(?:www\.)?fernsehkritik\.tv/folge-(?P[0-9]+)(?:/.*)?' _TEST = { 'url': 'http://fernsehkritik.tv/folge-1', diff --git a/youtube_dl/extractor/flickr.py b/youtube_dl/extractor/flickr.py index 18f439df9..0a3de1498 100644 --- a/youtube_dl/extractor/flickr.py +++ b/youtube_dl/extractor/flickr.py @@ -1,7 +1,7 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import compat_urllib_parse +from ..compat import compat_urllib_parse_urlencode from ..utils import ( ExtractorError, int_or_none, @@ -42,7 +42,7 @@ class FlickrIE(InfoExtractor): } if secret: query['secret'] = secret - data = self._download_json(self._API_BASE_URL + compat_urllib_parse.urlencode(query), video_id, note) + data = self._download_json(self._API_BASE_URL + compat_urllib_parse_urlencode(query), video_id, note) if data['stat'] != 'ok': raise ExtractorError(data['message']) return data diff --git a/youtube_dl/extractor/footyroom.py b/youtube_dl/extractor/footyroom.py index 370fd006f..d2503ae2e 100644 --- a/youtube_dl/extractor/footyroom.py +++ b/youtube_dl/extractor/footyroom.py @@ -5,7 +5,7 @@ from .common import InfoExtractor class FootyRoomIE(InfoExtractor): - _VALID_URL = r'http://footyroom\.com/(?P[^/]+)' + _VALID_URL = r'https?://footyroom\.com/(?P[^/]+)' _TESTS = [{ 'url': 'http://footyroom.com/schalke-04-0-2-real-madrid-2015-02/', 'info_dict': { diff --git a/youtube_dl/extractor/fox.py b/youtube_dl/extractor/fox.py index fa05af50d..95c1abf94 100644 --- a/youtube_dl/extractor/fox.py +++ b/youtube_dl/extractor/fox.py @@ -16,6 +16,9 @@ class FOXIE(InfoExtractor): 'title': 'Official Trailer: Gotham', 'description': 'Tracing the rise of the great DC Comics Super-Villains and vigilantes, Gotham reveals an entirely new chapter that has never been told.', 'duration': 129, + 'timestamp': 1400020798, + 'upload_date': '20140513', + 'uploader': 'NEWA-FNG-FOXCOM', }, 'add_ie': ['ThePlatform'], } diff --git a/youtube_dl/extractor/foxgay.py b/youtube_dl/extractor/foxgay.py index 08b8ea362..70c1a815d 100644 --- a/youtube_dl/extractor/foxgay.py +++ b/youtube_dl/extractor/foxgay.py @@ -4,7 +4,7 @@ from .common import InfoExtractor class FoxgayIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?foxgay\.com/videos/(?:\S+-)?(?P\d+)\.shtml' + _VALID_URL = r'https?://(?:www\.)?foxgay\.com/videos/(?:\S+-)?(?P\d+)\.shtml' _TEST = { 'url': 'http://foxgay.com/videos/fuck-turkish-style-2582.shtml', 'md5': '80d72beab5d04e1655a56ad37afe6841', diff --git a/youtube_dl/extractor/foxnews.py b/youtube_dl/extractor/foxnews.py index 1dc50318c..b04da2415 100644 --- a/youtube_dl/extractor/foxnews.py +++ b/youtube_dl/extractor/foxnews.py @@ -18,8 +18,8 @@ class FoxNewsIE(AMPIE): 'title': 'Frozen in Time', 'description': '16-year-old girl is size of toddler', 'duration': 265, - # 'timestamp': 1304411491, - # 'upload_date': '20110503', + 'timestamp': 1304411491, + 'upload_date': '20110503', 'thumbnail': 're:^https?://.*\.jpg$', }, }, @@ -32,8 +32,8 @@ class FoxNewsIE(AMPIE): 'title': "Rep. Luis Gutierrez on if Obama's immigration plan is legal", 'description': "Congressman discusses president's plan", 'duration': 292, - # 'timestamp': 1417662047, - # 'upload_date': '20141204', + 'timestamp': 1417662047, + 'upload_date': '20141204', 'thumbnail': 're:^https?://.*\.jpg$', }, 'params': { diff --git a/youtube_dl/extractor/franceinter.py b/youtube_dl/extractor/franceinter.py index 0388ba00c..2369f868d 100644 --- a/youtube_dl/extractor/franceinter.py +++ b/youtube_dl/extractor/franceinter.py @@ -6,7 +6,7 @@ from ..utils import int_or_none class FranceInterIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?franceinter\.fr/player/reecouter\?play=(?P[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?franceinter\.fr/player/reecouter\?play=(?P[0-9]+)' _TEST = { 'url': 'http://www.franceinter.fr/player/reecouter?play=793962', 'md5': '4764932e466e6f6c79c317d2e74f6884', diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index 3f4ac3093..ad94e31f3 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -60,28 +60,31 @@ class FranceTVBaseInfoExtractor(InfoExtractor): video_id, 'Downloading f4m manifest token', fatal=False) if f4m_url: formats.extend(self._extract_f4m_formats( - f4m_url + '&hdcore=3.7.0&plugin=aasp-3.7.0.39.44', video_id, 1, format_id)) + f4m_url + '&hdcore=3.7.0&plugin=aasp-3.7.0.39.44', + video_id, f4m_id=format_id, fatal=False)) elif ext == 'm3u8': - formats.extend(self._extract_m3u8_formats(video_url, video_id, 'mp4', m3u8_id=format_id)) + formats.extend(self._extract_m3u8_formats( + video_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id=format_id, fatal=False)) elif video_url.startswith('rtmp'): formats.append({ 'url': video_url, 'format_id': 'rtmp-%s' % format_id, 'ext': 'flv', - 'preference': 1, }) else: - formats.append({ - 'url': video_url, - 'format_id': format_id, - 'preference': -1, - }) + if self._is_valid_url(video_url, video_id, format_id): + formats.append({ + 'url': video_url, + 'format_id': format_id, + }) self._sort_formats(formats) title = info['titre'] subtitle = info.get('sous_titre') if subtitle: title += ' - %s' % subtitle + title = title.strip() subtitles = {} subtitles_list = [{ @@ -125,13 +128,13 @@ class PluzzIE(FranceTVBaseInfoExtractor): class FranceTvInfoIE(FranceTVBaseInfoExtractor): IE_NAME = 'francetvinfo.fr' - _VALID_URL = r'https?://(?:www|mobile)\.francetvinfo\.fr/.*/(?P.+)\.html' + _VALID_URL = r'https?://(?:www|mobile|france3-regions)\.francetvinfo\.fr/.*/(?P<title>.+)\.html' _TESTS = [{ 'url': 'http://www.francetvinfo.fr/replay-jt/france-3/soir-3/jt-grand-soir-3-lundi-26-aout-2013_393427.html', 'info_dict': { 'id': '84981923', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Soir 3', 'upload_date': '20130826', 'timestamp': 1377548400, @@ -139,6 +142,10 @@ class FranceTvInfoIE(FranceTVBaseInfoExtractor): 'fr': 'mincount:2', }, }, + 'params': { + # m3u8 downloads + 'skip_download': True, + }, }, { 'url': 'http://www.francetvinfo.fr/elections/europeennes/direct-europeennes-regardez-le-debat-entre-les-candidats-a-la-presidence-de-la-commission_600639.html', 'info_dict': { @@ -155,11 +162,32 @@ class FranceTvInfoIE(FranceTVBaseInfoExtractor): 'url': 'http://www.francetvinfo.fr/economie/entreprises/les-entreprises-familiales-le-secret-de-la-reussite_933271.html', 'md5': 'f485bda6e185e7d15dbc69b72bae993e', 'info_dict': { - 'id': '556e03339473995ee145930c', + 'id': 'NI_173343', 'ext': 'mp4', 'title': 'Les entreprises familiales : le secret de la réussite', 'thumbnail': 're:^https?://.*\.jpe?g$', - } + 'timestamp': 1433273139, + 'upload_date': '20150602', + }, + 'params': { + # m3u8 downloads + 'skip_download': True, + }, + }, { + 'url': 'http://france3-regions.francetvinfo.fr/bretagne/cotes-d-armor/thalassa-echappee-breizh-ce-venredi-dans-les-cotes-d-armor-954961.html', + 'md5': 'f485bda6e185e7d15dbc69b72bae993e', + 'info_dict': { + 'id': 'NI_657393', + 'ext': 'mp4', + 'title': 'Olivier Monthus, réalisateur de "Bretagne, le choix de l’Armor"', + 'description': 'md5:a3264114c9d29aeca11ced113c37b16c', + 'thumbnail': 're:^https?://.*\.jpe?g$', + 'timestamp': 1458300695, + 'upload_date': '20160318', + }, + 'params': { + 'skip_download': True, + }, }] def _real_extract(self, url): @@ -172,7 +200,9 @@ class FranceTvInfoIE(FranceTVBaseInfoExtractor): return self.url_result(dmcloud_url, 'DailymotionCloud') video_id, catalogue = self._search_regex( - r'id-video=([^@]+@[^"]+)', webpage, 'video id').split('@') + (r'id-video=([^@]+@[^"]+)', + r'<a[^>]+href="(?:https?:)?//videos\.francetv\.fr/video/([^@]+@[^"]+)"'), + webpage, 'video id').split('@') return self._extract_video(video_id, catalogue) diff --git a/youtube_dl/extractor/freevideo.py b/youtube_dl/extractor/freevideo.py index c7bec027b..cd8423a6f 100644 --- a/youtube_dl/extractor/freevideo.py +++ b/youtube_dl/extractor/freevideo.py @@ -5,7 +5,7 @@ from ..utils import ExtractorError class FreeVideoIE(InfoExtractor): - _VALID_URL = r'^http://www.freevideo.cz/vase-videa/(?P<id>[^.]+)\.html(?:$|[?#])' + _VALID_URL = r'^https?://www.freevideo.cz/vase-videa/(?P<id>[^.]+)\.html(?:$|[?#])' _TEST = { 'url': 'http://www.freevideo.cz/vase-videa/vysukany-zadecek-22033.html', diff --git a/youtube_dl/extractor/funimation.py b/youtube_dl/extractor/funimation.py index 0f37ed786..1eb528f31 100644 --- a/youtube_dl/extractor/funimation.py +++ b/youtube_dl/extractor/funimation.py @@ -5,7 +5,6 @@ from .common import InfoExtractor from ..utils import ( clean_html, determine_ext, - encode_dict, int_or_none, sanitized_Request, ExtractorError, @@ -54,10 +53,10 @@ class FunimationIE(InfoExtractor): (username, password) = self._get_login_info() if username is None: return - data = urlencode_postdata(encode_dict({ + data = urlencode_postdata({ 'email_field': username, 'password_field': password, - })) + }) login_request = sanitized_Request('http://www.funimation.com/login', data, headers={ 'User-Agent': 'Mozilla/5.0 (Windows NT 5.2; WOW64; rv:42.0) Gecko/20100101 Firefox/42.0', 'Content-Type': 'application/x-www-form-urlencoded' diff --git a/youtube_dl/extractor/gamekings.py b/youtube_dl/extractor/gamekings.py index f6b9046f9..cbcddcb7c 100644 --- a/youtube_dl/extractor/gamekings.py +++ b/youtube_dl/extractor/gamekings.py @@ -10,7 +10,7 @@ from .youtube import YoutubeIE class GamekingsIE(InfoExtractor): - _VALID_URL = r'http://www\.gamekings\.nl/(?:videos|nieuws)/(?P<id>[^/]+)' + _VALID_URL = r'https?://www\.gamekings\.nl/(?:videos|nieuws)/(?P<id>[^/]+)' _TESTS = [{ # YouTube embed video 'url': 'http://www.gamekings.nl/videos/phoenix-wright-ace-attorney-dual-destinies-review/', diff --git a/youtube_dl/extractor/gamespot.py b/youtube_dl/extractor/gamespot.py index b3f1bafcc..4ffdd7515 100644 --- a/youtube_dl/extractor/gamespot.py +++ b/youtube_dl/extractor/gamespot.py @@ -14,7 +14,7 @@ from ..utils import ( class GameSpotIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?gamespot\.com/.*-(?P<id>\d+)/?' + _VALID_URL = r'https?://(?:www\.)?gamespot\.com/.*-(?P<id>\d+)/?' _TESTS = [{ 'url': 'http://www.gamespot.com/videos/arma-3-community-guide-sitrep-i/2300-6410818/', 'md5': 'b2a30deaa8654fcccd43713a6b6a4825', diff --git a/youtube_dl/extractor/gamestar.py b/youtube_dl/extractor/gamestar.py index 590ccf526..69058a583 100644 --- a/youtube_dl/extractor/gamestar.py +++ b/youtube_dl/extractor/gamestar.py @@ -13,7 +13,7 @@ from ..utils import ( class GameStarIE(InfoExtractor): - _VALID_URL = r'http://www\.gamestar\.de/videos/.*,(?P<id>[0-9]+)\.html' + _VALID_URL = r'https?://www\.gamestar\.de/videos/.*,(?P<id>[0-9]+)\.html' _TEST = { 'url': 'http://www.gamestar.de/videos/trailer,3/hobbit-3-die-schlacht-der-fuenf-heere,76110.html', 'md5': '96974ecbb7fd8d0d20fca5a00810cea7', diff --git a/youtube_dl/extractor/gametrailers.py b/youtube_dl/extractor/gametrailers.py index c3f031d9c..1e7948ab8 100644 --- a/youtube_dl/extractor/gametrailers.py +++ b/youtube_dl/extractor/gametrailers.py @@ -9,7 +9,7 @@ from ..utils import ( class GametrailersIE(InfoExtractor): - _VALID_URL = r'http://www\.gametrailers\.com/videos/view/[^/]+/(?P<id>.+)' + _VALID_URL = r'https?://www\.gametrailers\.com/videos/view/[^/]+/(?P<id>.+)' _TEST = { 'url': 'http://www.gametrailers.com/videos/view/gametrailers-com/116437-Just-Cause-3-Review', diff --git a/youtube_dl/extractor/gdcvault.py b/youtube_dl/extractor/gdcvault.py index 3befd3e7b..59ed4c38f 100644 --- a/youtube_dl/extractor/gdcvault.py +++ b/youtube_dl/extractor/gdcvault.py @@ -3,11 +3,11 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_urllib_parse from ..utils import ( remove_end, HEADRequest, sanitized_Request, + urlencode_postdata, ) @@ -123,7 +123,7 @@ class GDCVaultIE(InfoExtractor): 'password': password, } - request = sanitized_Request(login_url, compat_urllib_parse.urlencode(login_form)) + request = sanitized_Request(login_url, urlencode_postdata(login_form)) request.add_header('Content-Type', 'application/x-www-form-urlencoded') self._download_webpage(request, display_id, 'Logging in') start_page = self._download_webpage(webpage_url, display_id, 'Getting authenticated video page') diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 8121f04a5..589d1e152 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -59,6 +59,7 @@ from .videomore import VideomoreIE from .googledrive import GoogleDriveIE from .jwplatform import JWPlatformIE from .digiteka import DigitekaIE +from .instagram import InstagramIE class GenericIE(InfoExtractor): @@ -239,6 +240,35 @@ class GenericIE(InfoExtractor): 'format': 'bestvideo', }, }, + # m3u8 served with Content-Type: audio/x-mpegURL; charset=utf-8 + { + 'url': 'http://once.unicornmedia.com/now/master/playlist/bb0b18ba-64f5-4b1b-a29f-0ac252f06b68/77a785f3-5188-4806-b788-0893a61634ed/93677179-2d99-4ef4-9e17-fe70d49abfbf/content.m3u8', + 'info_dict': { + 'id': 'content', + 'ext': 'mp4', + 'title': 'content', + 'formats': 'mincount:8', + }, + 'params': { + # m3u8 downloads + 'skip_download': True, + } + }, + # m3u8 served with Content-Type: text/plain + { + 'url': 'http://www.nacentapps.com/m3u8/index.m3u8', + 'info_dict': { + 'id': 'index', + 'ext': 'mp4', + 'title': 'index', + 'upload_date': '20140720', + 'formats': 'mincount:11', + }, + 'params': { + # m3u8 downloads + 'skip_download': True, + } + }, # google redirect { 'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE', @@ -376,19 +406,6 @@ class GenericIE(InfoExtractor): 'skip_download': True, }, }, - # multiple ooyala embeds on SBN network websites - { - 'url': 'http://www.sbnation.com/college-football-recruiting/2015/2/3/7970291/national-signing-day-rationalizations-itll-be-ok-itll-be-ok', - 'info_dict': { - 'id': 'national-signing-day-rationalizations-itll-be-ok-itll-be-ok', - 'title': '25 lies you will tell yourself on National Signing Day - SBNation.com', - }, - 'playlist_mincount': 3, - 'params': { - 'skip_download': True, - }, - 'add_ie': ['Ooyala'], - }, # embed.ly video { 'url': 'http://www.tested.com/science/weird/460206-tested-grinding-coffee-2000-frames-second/', @@ -1094,7 +1111,23 @@ class GenericIE(InfoExtractor): # m3u8 downloads 'skip_download': True, } - } + }, + # Brightcove embed, with no valid 'renditions' but valid 'IOSRenditions' + # This video can't be played in browsers if Flash disabled and UA set to iPhone, which is actually a false alarm + { + 'url': 'https://dl.dropboxusercontent.com/u/29092637/interview.html', + 'info_dict': { + 'id': '4785848093001', + 'ext': 'mp4', + 'title': 'The Cardinal Pell Interview', + 'description': 'Sky News Contributor Andrew Bolt interviews George Pell in Rome, following the Cardinal\'s evidence before the Royal Commission into Child Abuse. ', + 'uploader': 'GlobeCast Australia - GlobeStream', + }, + 'params': { + # m3u8 downloads + 'skip_download': True, + }, + }, ] def report_following_redirect(self, new_url): @@ -1245,14 +1278,13 @@ class GenericIE(InfoExtractor): info_dict = { 'id': video_id, 'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]), + 'upload_date': unified_strdate(head_response.headers.get('Last-Modified')) } # Check for direct link to a video - content_type = head_response.headers.get('Content-Type', '') - m = re.match(r'^(?P<type>audio|video|application(?=/(?:ogg$|(?:vnd\.apple\.|x-)?mpegurl)))/(?P<format_id>.+)$', content_type) + content_type = head_response.headers.get('Content-Type', '').lower() + m = re.match(r'^(?P<type>audio|video|application(?=/(?:ogg$|(?:vnd\.apple\.|x-)?mpegurl)))/(?P<format_id>[^;\s]+)', content_type) if m: - upload_date = unified_strdate( - head_response.headers.get('Last-Modified')) format_id = m.group('format_id') if format_id.endswith('mpegurl'): formats = self._extract_m3u8_formats(url, video_id, 'mp4') @@ -1264,11 +1296,9 @@ class GenericIE(InfoExtractor): 'url': url, 'vcodec': 'none' if m.group('type') == 'audio' else None }] - info_dict.update({ - 'direct': True, - 'formats': formats, - 'upload_date': upload_date, - }) + info_dict['direct'] = True + self._sort_formats(formats) + info_dict['formats'] = formats return info_dict if not self._downloader.params.get('test', False) and not is_intentional: @@ -1289,18 +1319,22 @@ class GenericIE(InfoExtractor): request.add_header('Accept-Encoding', '*') full_response = self._request_webpage(request, video_id) + first_bytes = full_response.read(512) + + # Is it an M3U playlist? + if first_bytes.startswith(b'#EXTM3U'): + info_dict['formats'] = self._extract_m3u8_formats(url, video_id, 'mp4') + self._sort_formats(info_dict['formats']) + return info_dict + # Maybe it's a direct link to a video? # Be careful not to download the whole thing! - first_bytes = full_response.read(512) if not is_html(first_bytes): self._downloader.report_warning( 'URL could be a direct video link, returning it as such.') - upload_date = unified_strdate( - head_response.headers.get('Last-Modified')) info_dict.update({ 'direct': True, 'url': url, - 'upload_date': upload_date, }) return info_dict @@ -1315,15 +1349,19 @@ class GenericIE(InfoExtractor): if doc.tag == 'rss': return self._extract_rss(url, video_id, doc) elif re.match(r'^(?:{[^}]+})?smil$', doc.tag): - return self._parse_smil(doc, url, video_id) + smil = self._parse_smil(doc, url, video_id) + self._sort_formats(smil['formats']) + return smil elif doc.tag == '{http://xspf.org/ns/0/}playlist': return self.playlist_result(self._parse_xspf(doc, video_id), video_id) elif re.match(r'(?i)^(?:{[^}]+})?MPD$', doc.tag): info_dict['formats'] = self._parse_mpd_formats( doc, video_id, mpd_base_url=url.rpartition('/')[0]) + self._sort_formats(info_dict['formats']) return info_dict elif re.match(r'^{http://ns\.adobe\.com/f4m/[12]\.0}manifest$', doc.tag): info_dict['formats'] = self._parse_f4m_formats(doc, url, video_id) + self._sort_formats(info_dict['formats']) return info_dict except compat_xml_parse_error: pass @@ -1881,6 +1919,19 @@ class GenericIE(InfoExtractor): self._proto_relative_url(unescapeHTML(mobj.group(1))), 'AdobeTVVideo') + # Look for Vine embeds + mobj = re.search( + r'<iframe[^>]+src=[\'"]((?:https?:)?//(?:www\.)?vine\.co/v/[^/]+/embed/(?:simple|postcard))', + webpage) + if mobj is not None: + return self.url_result( + self._proto_relative_url(unescapeHTML(mobj.group(1))), 'Vine') + + # Look for Instagram embeds + instagram_embed_url = InstagramIE._extract_embed_url(webpage) + if instagram_embed_url is not None: + return self.url_result(instagram_embed_url, InstagramIE.ie_key()) + def check_video(vurl): if YoutubeIE.suitable(vurl): return True @@ -1995,6 +2046,9 @@ class GenericIE(InfoExtractor): else: entry_info_dict['url'] = video_url + if entry_info_dict.get('formats'): + self._sort_formats(entry_info_dict['formats']) + entries.append(entry_info_dict) if len(entries) == 1: diff --git a/youtube_dl/extractor/hbo.py b/youtube_dl/extractor/hbo.py new file mode 100644 index 000000000..dad0f3994 --- /dev/null +++ b/youtube_dl/extractor/hbo.py @@ -0,0 +1,122 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + xpath_text, + xpath_element, + int_or_none, + parse_duration, +) + + +class HBOIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?hbo\.com/video/video\.html\?.*vid=(?P<id>[0-9]+)' + _TEST = { + 'url': 'http://www.hbo.com/video/video.html?autoplay=true&g=u&vid=1437839', + 'md5': '1c33253f0c7782142c993c0ba62a8753', + 'info_dict': { + 'id': '1437839', + 'ext': 'mp4', + 'title': 'Ep. 64 Clip: Encryption', + } + } + _FORMATS_INFO = { + '1920': { + 'width': 1280, + 'height': 720, + }, + '640': { + 'width': 768, + 'height': 432, + }, + 'highwifi': { + 'width': 640, + 'height': 360, + }, + 'high3g': { + 'width': 640, + 'height': 360, + }, + 'medwifi': { + 'width': 400, + 'height': 224, + }, + 'med3g': { + 'width': 400, + 'height': 224, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + video_data = self._download_xml( + 'http://render.lv3.hbo.com/data/content/global/videos/data/%s.xml' % video_id, video_id) + title = xpath_text(video_data, 'title', 'title', True) + + formats = [] + for source in xpath_element(video_data, 'videos', 'sources', True): + if source.tag == 'size': + path = xpath_text(source, './/path') + if not path: + continue + width = source.attrib.get('width') + format_info = self._FORMATS_INFO.get(width, {}) + height = format_info.get('height') + fmt = { + 'url': path, + 'format_id': 'http%s' % ('-%dp' % height if height else ''), + 'width': format_info.get('width'), + 'height': height, + } + rtmp = re.search(r'^(?P<url>rtmpe?://[^/]+/(?P<app>.+))/(?P<playpath>mp4:.+)$', path) + if rtmp: + fmt.update({ + 'url': rtmp.group('url'), + 'play_path': rtmp.group('playpath'), + 'app': rtmp.group('app'), + 'ext': 'flv', + 'format_id': fmt['format_id'].replace('http', 'rtmp'), + }) + formats.append(fmt) + else: + video_url = source.text + if not video_url: + continue + if source.tag == 'tarball': + formats.extend(self._extract_m3u8_formats( + video_url.replace('.tar', '/base_index_w8.m3u8'), + video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) + else: + format_info = self._FORMATS_INFO.get(source.tag, {}) + formats.append({ + 'format_id': 'http-%s' % source.tag, + 'url': video_url, + 'width': format_info.get('width'), + 'height': format_info.get('height'), + }) + self._sort_formats(formats, ('width', 'height', 'tbr', 'format_id')) + + thumbnails = [] + card_sizes = xpath_element(video_data, 'titleCardSizes') + if card_sizes is not None: + for size in card_sizes: + path = xpath_text(size, 'path') + if not path: + continue + width = int_or_none(size.get('width')) + thumbnails.append({ + 'id': width, + 'url': path, + 'width': width, + }) + + return { + 'id': video_id, + 'title': title, + 'duration': parse_duration(xpath_element(video_data, 'duration/tv14')), + 'formats': formats, + 'thumbnails': thumbnails, + } diff --git a/youtube_dl/extractor/hotnewhiphop.py b/youtube_dl/extractor/hotnewhiphop.py index 31e219945..9db565209 100644 --- a/youtube_dl/extractor/hotnewhiphop.py +++ b/youtube_dl/extractor/hotnewhiphop.py @@ -3,16 +3,16 @@ from __future__ import unicode_literals import base64 from .common import InfoExtractor -from ..compat import compat_urllib_parse from ..utils import ( ExtractorError, HEADRequest, sanitized_Request, + urlencode_postdata, ) class HotNewHipHopIE(InfoExtractor): - _VALID_URL = r'http://www\.hotnewhiphop\.com/.*\.(?P<id>.*)\.html' + _VALID_URL = r'https?://www\.hotnewhiphop\.com/.*\.(?P<id>.*)\.html' _TEST = { 'url': 'http://www.hotnewhiphop.com/freddie-gibbs-lay-it-down-song.1435540.html', 'md5': '2c2cd2f76ef11a9b3b581e8b232f3d96', @@ -35,7 +35,7 @@ class HotNewHipHopIE(InfoExtractor): r'"contentUrl" content="(.*?)"', webpage, 'content URL') return self.url_result(video_url, ie='Youtube') - reqdata = compat_urllib_parse.urlencode([ + reqdata = urlencode_postdata([ ('mediaType', 's'), ('mediaId', video_id), ]) diff --git a/youtube_dl/extractor/howstuffworks.py b/youtube_dl/extractor/howstuffworks.py index 663e6632a..76b74c51d 100644 --- a/youtube_dl/extractor/howstuffworks.py +++ b/youtube_dl/extractor/howstuffworks.py @@ -6,6 +6,7 @@ from ..utils import ( int_or_none, js_to_json, unescapeHTML, + determine_ext, ) @@ -39,7 +40,7 @@ class HowStuffWorksIE(InfoExtractor): 'url': 'http://entertainment.howstuffworks.com/arts/2706-sword-swallowing-1-by-dan-meyer-video.htm', 'info_dict': { 'id': '440011', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Sword Swallowing #1 by Dan Meyer', 'description': 'Video footage (1 of 3) used by permission of the owner Dan Meyer through Sword Swallowers Association International <www.swordswallow.org>', 'display_id': 'sword-swallowing-1-by-dan-meyer', @@ -63,13 +64,19 @@ class HowStuffWorksIE(InfoExtractor): video_id = clip_info['content_id'] formats = [] m3u8_url = clip_info.get('m3u8') - if m3u8_url: - formats += self._extract_m3u8_formats(m3u8_url, video_id, 'mp4') + if m3u8_url and determine_ext(m3u8_url) == 'm3u8': + formats.extend(self._extract_m3u8_formats(m3u8_url, video_id, 'mp4', format_id='hls', fatal=True)) + flv_url = clip_info.get('flv_url') + if flv_url: + formats.append({ + 'url': flv_url, + 'format_id': 'flv', + }) for video in clip_info.get('mp4', []): formats.append({ 'url': video['src'], - 'format_id': video['bitrate'], - 'vbr': int(video['bitrate'].rstrip('k')), + 'format_id': 'mp4-%s' % video['bitrate'], + 'vbr': int_or_none(video['bitrate'].rstrip('k')), }) if not formats: @@ -102,6 +109,6 @@ class HowStuffWorksIE(InfoExtractor): 'title': unescapeHTML(clip_info['clip_title']), 'description': unescapeHTML(clip_info.get('caption')), 'thumbnail': clip_info.get('video_still_url'), - 'duration': clip_info.get('duration'), + 'duration': int_or_none(clip_info.get('duration')), 'formats': formats, } diff --git a/youtube_dl/extractor/hypem.py b/youtube_dl/extractor/hypem.py index b3706fe6d..f7c913054 100644 --- a/youtube_dl/extractor/hypem.py +++ b/youtube_dl/extractor/hypem.py @@ -4,7 +4,7 @@ import json import time from .common import InfoExtractor -from ..compat import compat_urllib_parse +from ..compat import compat_urllib_parse_urlencode from ..utils import ( ExtractorError, sanitized_Request, @@ -12,7 +12,7 @@ from ..utils import ( class HypemIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?hypem\.com/track/(?P<id>[^/]+)/' + _VALID_URL = r'https?://(?:www\.)?hypem\.com/track/(?P<id>[^/]+)/' _TEST = { 'url': 'http://hypem.com/track/1v6ga/BODYWORK+-+TAME', 'md5': 'b9cc91b5af8995e9f0c1cee04c575828', @@ -28,7 +28,7 @@ class HypemIE(InfoExtractor): track_id = self._match_id(url) data = {'ax': 1, 'ts': time.time()} - request = sanitized_Request(url + '?' + compat_urllib_parse.urlencode(data)) + request = sanitized_Request(url + '?' + compat_urllib_parse_urlencode(data)) response, urlh = self._download_webpage_handle( request, track_id, 'Downloading webpage with the url') diff --git a/youtube_dl/extractor/imdb.py b/youtube_dl/extractor/imdb.py index b61b2dc4e..8bed8ccd0 100644 --- a/youtube_dl/extractor/imdb.py +++ b/youtube_dl/extractor/imdb.py @@ -12,7 +12,7 @@ from ..utils import ( class ImdbIE(InfoExtractor): IE_NAME = 'imdb' IE_DESC = 'Internet Movie Database trailers' - _VALID_URL = r'http://(?:www|m)\.imdb\.com/video/imdb/vi(?P<id>\d+)' + _VALID_URL = r'https?://(?:www|m)\.imdb\.com/video/imdb/vi(?P<id>\d+)' _TEST = { 'url': 'http://www.imdb.com/video/imdb/vi2524815897', @@ -70,7 +70,7 @@ class ImdbIE(InfoExtractor): class ImdbListIE(InfoExtractor): IE_NAME = 'imdb:list' IE_DESC = 'Internet Movie Database lists' - _VALID_URL = r'http://www\.imdb\.com/list/(?P<id>[\da-zA-Z_-]{11})' + _VALID_URL = r'https?://www\.imdb\.com/list/(?P<id>[\da-zA-Z_-]{11})' _TEST = { 'url': 'http://www.imdb.com/list/JFs9NWw6XI0', 'info_dict': { diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py index ed3e07118..11bb58d8a 100644 --- a/youtube_dl/extractor/instagram.py +++ b/youtube_dl/extractor/instagram.py @@ -4,8 +4,10 @@ import re from .common import InfoExtractor from ..utils import ( + get_element_by_attribute, int_or_none, limit_length, + lowercase_escape, ) @@ -38,6 +40,18 @@ class InstagramIE(InfoExtractor): 'only_matching': True, }] + @staticmethod + def _extract_embed_url(webpage): + blockquote_el = get_element_by_attribute( + 'class', 'instagram-media', webpage) + if blockquote_el is None: + return + + mobj = re.search( + r'<a[^>]+href=([\'"])(?P<link>[^\'"]+)\1', blockquote_el) + if mobj: + return mobj.group('link') + def _real_extract(self, url): video_id = self._match_id(url) @@ -46,6 +60,8 @@ class InstagramIE(InfoExtractor): webpage, 'uploader id', fatal=False) desc = self._search_regex( r'"caption":"(.+?)"', webpage, 'description', default=None) + if desc is not None: + desc = lowercase_escape(desc) return { 'id': video_id, @@ -136,7 +152,7 @@ class InstagramUserIE(InfoExtractor): if not page['items']: break - max_id = page['items'][-1]['id'] + max_id = page['items'][-1]['id'].split('_')[0] media_url = ( 'http://instagram.com/%s/media?max_id=%s' % ( uploader_id, max_id)) diff --git a/youtube_dl/extractor/internetvideoarchive.py b/youtube_dl/extractor/internetvideoarchive.py index 483cc6f9e..e60145b3d 100644 --- a/youtube_dl/extractor/internetvideoarchive.py +++ b/youtube_dl/extractor/internetvideoarchive.py @@ -5,7 +5,7 @@ import re from .common import InfoExtractor from ..compat import ( compat_urlparse, - compat_urllib_parse, + compat_urllib_parse_urlencode, ) from ..utils import ( xpath_with_ns, @@ -38,7 +38,7 @@ class InternetVideoArchiveIE(InfoExtractor): # Other player ids return m3u8 urls cleaned_dic['playerid'] = '247' cleaned_dic['videokbrate'] = '100000' - return compat_urllib_parse.urlencode(cleaned_dic) + return compat_urllib_parse_urlencode(cleaned_dic) def _real_extract(self, url): query = compat_urlparse.urlparse(url).query diff --git a/youtube_dl/extractor/iprima.py b/youtube_dl/extractor/iprima.py index 61a0de472..788bbe0d5 100644 --- a/youtube_dl/extractor/iprima.py +++ b/youtube_dl/extractor/iprima.py @@ -1,4 +1,4 @@ -# -*- coding: utf-8 -*- +# coding: utf-8 from __future__ import unicode_literals import re @@ -6,6 +6,8 @@ import time from .common import InfoExtractor from ..utils import ( + determine_ext, + js_to_json, sanitized_Request, ) @@ -30,8 +32,7 @@ class IPrimaIE(InfoExtractor): }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) @@ -43,9 +44,42 @@ class IPrimaIE(InfoExtractor): req.add_header('Referer', url) playerpage = self._download_webpage(req, video_id, note='Downloading player') - m3u8_url = self._search_regex(r"'src': '([^']+\.m3u8)'", playerpage, 'm3u8 url') + formats = [] - formats = self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4') + def extract_formats(format_url, format_key=None, lang=None): + ext = determine_ext(format_url) + new_formats = [] + if format_key == 'hls' or ext == 'm3u8': + new_formats = self._extract_m3u8_formats( + format_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False) + elif format_key == 'dash' or ext == 'mpd': + return + new_formats = self._extract_mpd_formats( + format_url, video_id, mpd_id='dash', fatal=False) + if lang: + for f in new_formats: + if not f.get('language'): + f['language'] = lang + formats.extend(new_formats) + + options = self._parse_json( + self._search_regex( + r'(?s)var\s+playerOptions\s*=\s*({.+?});', + playerpage, 'player options', default='{}'), + video_id, transform_source=js_to_json, fatal=False) + if options: + for key, tracks in options.get('tracks', {}).items(): + if not isinstance(tracks, list): + continue + for track in tracks: + src = track.get('src') + if src: + extract_formats(src, key.lower(), track.get('lang')) + + if not formats: + for _, src in re.findall(r'src["\']\s*:\s*(["\'])(.+?)\1', playerpage): + extract_formats(src) self._sort_formats(formats) diff --git a/youtube_dl/extractor/iqiyi.py b/youtube_dl/extractor/iqiyi.py index e7c0cb3f6..9e8c9432a 100644 --- a/youtube_dl/extractor/iqiyi.py +++ b/youtube_dl/extractor/iqiyi.py @@ -14,7 +14,7 @@ from .common import InfoExtractor from ..compat import ( compat_parse_qs, compat_str, - compat_urllib_parse, + compat_urllib_parse_urlencode, compat_urllib_parse_urlparse, ) from ..utils import ( @@ -165,7 +165,7 @@ class IqiyiIE(InfoExtractor): IE_NAME = 'iqiyi' IE_DESC = '爱奇艺' - _VALID_URL = r'http://(?:[^.]+\.)?iqiyi\.com/.+\.html' + _VALID_URL = r'https?://(?:[^.]+\.)?iqiyi\.com/.+\.html' _NETRC_MACHINE = 'iqiyi' @@ -322,7 +322,7 @@ class IqiyiIE(InfoExtractor): 'bird_t': timestamp, } validation_result = self._download_json( - 'http://kylin.iqiyi.com/validate?' + compat_urllib_parse.urlencode(validation_params), None, + 'http://kylin.iqiyi.com/validate?' + compat_urllib_parse_urlencode(validation_params), None, note='Validate credentials', errnote='Unable to validate credentials') MSG_MAP = { @@ -456,7 +456,7 @@ class IqiyiIE(InfoExtractor): 'QY00001': auth_result['data']['u'], }) api_video_url += '?' if '?' not in api_video_url else '&' - api_video_url += compat_urllib_parse.urlencode(param) + api_video_url += compat_urllib_parse_urlencode(param) js = self._download_json( api_video_url, video_id, note='Download video info of segment %d for format %s' % (segment_index + 1, format_id)) @@ -494,14 +494,14 @@ class IqiyiIE(InfoExtractor): } api_url = 'http://cache.video.qiyi.com/vms' + '?' + \ - compat_urllib_parse.urlencode(param) + compat_urllib_parse_urlencode(param) raw_data = self._download_json(api_url, video_id) return raw_data def get_enc_key(self, video_id): # TODO: automatic key extraction # last update at 2016-01-22 for Zombie::bite - enc_key = '8ed797d224d043e7ac23d95b70227d32' + enc_key = '4a1caba4b4465345366f28da7c117d20' return enc_key def _extract_playlist(self, webpage): diff --git a/youtube_dl/extractor/ivideon.py b/youtube_dl/extractor/ivideon.py index 617dc8c07..3ca824f79 100644 --- a/youtube_dl/extractor/ivideon.py +++ b/youtube_dl/extractor/ivideon.py @@ -5,7 +5,7 @@ import re from .common import InfoExtractor from ..compat import ( - compat_urllib_parse, + compat_urllib_parse_urlencode, compat_urlparse, ) from ..utils import qualities @@ -62,7 +62,7 @@ class IvideonIE(InfoExtractor): quality = qualities(self._QUALITIES) formats = [{ - 'url': 'https://streaming.ivideon.com/flv/live?%s' % compat_urllib_parse.urlencode({ + 'url': 'https://streaming.ivideon.com/flv/live?%s' % compat_urllib_parse_urlencode({ 'server': server_id, 'camera': camera_id, 'sessionId': 'demo', diff --git a/youtube_dl/extractor/jadorecettepub.py b/youtube_dl/extractor/jadorecettepub.py index 063e86de4..158c09a33 100644 --- a/youtube_dl/extractor/jadorecettepub.py +++ b/youtube_dl/extractor/jadorecettepub.py @@ -9,7 +9,7 @@ from .youtube import YoutubeIE class JadoreCettePubIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?jadorecettepub\.com/[0-9]{4}/[0-9]{2}/(?P<id>.*?)\.html' + _VALID_URL = r'https?://(?:www\.)?jadorecettepub\.com/[0-9]{4}/[0-9]{2}/(?P<id>.*?)\.html' _TEST = { 'url': 'http://www.jadorecettepub.com/2010/12/star-wars-massacre-par-les-japonais.html', diff --git a/youtube_dl/extractor/jeuxvideo.py b/youtube_dl/extractor/jeuxvideo.py index 137db873c..1a4227f6b 100644 --- a/youtube_dl/extractor/jeuxvideo.py +++ b/youtube_dl/extractor/jeuxvideo.py @@ -8,7 +8,7 @@ from .common import InfoExtractor class JeuxVideoIE(InfoExtractor): - _VALID_URL = r'http://.*?\.jeuxvideo\.com/.*/(.*?)\.htm' + _VALID_URL = r'https?://.*?\.jeuxvideo\.com/.*/(.*?)\.htm' _TESTS = [{ 'url': 'http://www.jeuxvideo.com/reportages-videos-jeux/0004/00046170/tearaway-playstation-vita-gc-2013-tearaway-nous-presente-ses-papiers-d-identite-00115182.htm', diff --git a/youtube_dl/extractor/kaltura.py b/youtube_dl/extractor/kaltura.py index 44d7c84a1..a65697ff5 100644 --- a/youtube_dl/extractor/kaltura.py +++ b/youtube_dl/extractor/kaltura.py @@ -6,7 +6,7 @@ import base64 from .common import InfoExtractor from ..compat import ( - compat_urllib_parse, + compat_urllib_parse_urlencode, compat_urlparse, compat_parse_qs, ) @@ -71,7 +71,7 @@ class KalturaIE(InfoExtractor): for k, v in a.items(): params['%d:%s' % (i, k)] = v - query = compat_urllib_parse.urlencode(params) + query = compat_urllib_parse_urlencode(params) url = self._API_BASE + query data = self._download_json(url, video_id, *args, **kwargs) diff --git a/youtube_dl/extractor/karaoketv.py b/youtube_dl/extractor/karaoketv.py index 06daf5a89..b4c30b7f3 100644 --- a/youtube_dl/extractor/karaoketv.py +++ b/youtube_dl/extractor/karaoketv.py @@ -9,7 +9,7 @@ from ..utils import ( class KaraoketvIE(InfoExtractor): - _VALID_URL = r'http://karaoketv\.co\.il/\?container=songs&id=(?P<id>[0-9]+)' + _VALID_URL = r'https?://karaoketv\.co\.il/\?container=songs&id=(?P<id>[0-9]+)' _TEST = { 'url': 'http://karaoketv.co.il/?container=songs&id=171568', 'info_dict': { diff --git a/youtube_dl/extractor/karrierevideos.py b/youtube_dl/extractor/karrierevideos.py index bed94bc93..2cb04e533 100644 --- a/youtube_dl/extractor/karrierevideos.py +++ b/youtube_dl/extractor/karrierevideos.py @@ -12,7 +12,7 @@ from ..utils import ( class KarriereVideosIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?karrierevideos\.at(?:/[^/]+)+/(?P<id>[^/]+)' + _VALID_URL = r'https?://(?:www\.)?karrierevideos\.at(?:/[^/]+)+/(?P<id>[^/]+)' _TESTS = [{ 'url': 'http://www.karrierevideos.at/berufsvideos/mittlere-hoehere-schulen/altenpflegerin', 'info_dict': { diff --git a/youtube_dl/extractor/kontrtube.py b/youtube_dl/extractor/kontrtube.py index a59c529f4..704bd7b34 100644 --- a/youtube_dl/extractor/kontrtube.py +++ b/youtube_dl/extractor/kontrtube.py @@ -13,7 +13,7 @@ from ..utils import ( class KontrTubeIE(InfoExtractor): IE_NAME = 'kontrtube' IE_DESC = 'KontrTube.ru - Труба зовёт' - _VALID_URL = r'http://(?:www\.)?kontrtube\.ru/videos/(?P<id>\d+)/(?P<display_id>[^/]+)/' + _VALID_URL = r'https?://(?:www\.)?kontrtube\.ru/videos/(?P<id>\d+)/(?P<display_id>[^/]+)/' _TEST = { 'url': 'http://www.kontrtube.ru/videos/2678/nad-olimpiyskoy-derevney-v-sochi-podnyat-rossiyskiy-flag/', diff --git a/youtube_dl/extractor/ku6.py b/youtube_dl/extractor/ku6.py index a602980a1..a574408e5 100644 --- a/youtube_dl/extractor/ku6.py +++ b/youtube_dl/extractor/ku6.py @@ -4,7 +4,7 @@ from .common import InfoExtractor class Ku6IE(InfoExtractor): - _VALID_URL = r'http://v\.ku6\.com/show/(?P<id>[a-zA-Z0-9\-\_]+)(?:\.)*html' + _VALID_URL = r'https?://v\.ku6\.com/show/(?P<id>[a-zA-Z0-9\-\_]+)(?:\.)*html' _TEST = { 'url': 'http://v.ku6.com/show/JG-8yS14xzBr4bCn1pu0xw...html', 'md5': '01203549b9efbb45f4b87d55bdea1ed1', diff --git a/youtube_dl/extractor/kusi.py b/youtube_dl/extractor/kusi.py index 931f34c9b..12cc56e44 100644 --- a/youtube_dl/extractor/kusi.py +++ b/youtube_dl/extractor/kusi.py @@ -16,7 +16,7 @@ from ..utils import ( class KUSIIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?kusi\.com/(?P<path>story/.+|video\?clipId=(?P<clipId>\d+))' + _VALID_URL = r'https?://(?:www\.)?kusi\.com/(?P<path>story/.+|video\?clipId=(?P<clipId>\d+))' _TESTS = [{ 'url': 'http://www.kusi.com/story/31183873/turko-files-case-closed-put-on-hold', 'md5': 'f926e7684294cf8cb7bdf8858e1b3988', diff --git a/youtube_dl/extractor/kuwo.py b/youtube_dl/extractor/kuwo.py index 700e44b63..86c17c931 100644 --- a/youtube_dl/extractor/kuwo.py +++ b/youtube_dl/extractor/kuwo.py @@ -2,13 +2,13 @@ from __future__ import unicode_literals import re -import itertools from .common import InfoExtractor from ..utils import ( get_element_by_id, clean_html, ExtractorError, + InAdvancePagedList, remove_start, ) @@ -23,16 +23,29 @@ class KuwoBaseIE(InfoExtractor): {'format': 'aac', 'ext': 'aac', 'abr': 48, 'preference': 10} ] - def _get_formats(self, song_id): + def _get_formats(self, song_id, tolerate_ip_deny=False): formats = [] for file_format in self._FORMATS: + headers = {} + cn_verification_proxy = self._downloader.params.get('cn_verification_proxy') + if cn_verification_proxy: + headers['Ytdl-request-proxy'] = cn_verification_proxy + + query = { + 'format': file_format['ext'], + 'br': file_format.get('br', ''), + 'rid': 'MUSIC_%s' % song_id, + 'type': 'convert_url', + 'response': 'url' + } + song_url = self._download_webpage( - 'http://antiserver.kuwo.cn/anti.s?format=%s&br=%s&rid=MUSIC_%s&type=convert_url&response=url' % - (file_format['ext'], file_format.get('br', ''), song_id), + 'http://antiserver.kuwo.cn/anti.s', song_id, note='Download %s url info' % file_format['format'], + query=query, headers=headers, ) - if song_url == 'IPDeny': + if song_url == 'IPDeny' and not tolerate_ip_deny: raise ExtractorError('This song is blocked in this region', expected=True) if song_url.startswith('http://') or song_url.startswith('https://'): @@ -43,14 +56,14 @@ class KuwoBaseIE(InfoExtractor): 'preference': file_format['preference'], 'abr': file_format.get('abr'), }) - self._sort_formats(formats) + return formats class KuwoIE(KuwoBaseIE): IE_NAME = 'kuwo:song' IE_DESC = '酷我音乐' - _VALID_URL = r'http://www\.kuwo\.cn/yinyue/(?P<id>\d+?)/' + _VALID_URL = r'https?://www\.kuwo\.cn/yinyue/(?P<id>\d+)' _TESTS = [{ 'url': 'http://www.kuwo.cn/yinyue/635632/', 'info_dict': { @@ -75,6 +88,9 @@ class KuwoIE(KuwoBaseIE): 'params': { 'format': 'mp3-320' }, + }, { + 'url': 'http://www.kuwo.cn/yinyue/3197154?catalog=yueku2016', + 'only_matching': True, }] def _real_extract(self, url): @@ -95,6 +111,7 @@ class KuwoIE(KuwoBaseIE): lrc_content = None formats = self._get_formats(song_id) + self._sort_formats(formats) album_id = self._html_search_regex( r'<p[^>]+class="album"[^<]+<a[^>]+href="http://www\.kuwo\.cn/album/(\d+)/"', @@ -126,7 +143,7 @@ class KuwoIE(KuwoBaseIE): class KuwoAlbumIE(InfoExtractor): IE_NAME = 'kuwo:album' IE_DESC = '酷我音乐 - 专辑' - _VALID_URL = r'http://www\.kuwo\.cn/album/(?P<id>\d+?)/' + _VALID_URL = r'https?://www\.kuwo\.cn/album/(?P<id>\d+?)/' _TEST = { 'url': 'http://www.kuwo.cn/album/502294/', 'info_dict': { @@ -162,13 +179,11 @@ class KuwoAlbumIE(InfoExtractor): class KuwoChartIE(InfoExtractor): IE_NAME = 'kuwo:chart' IE_DESC = '酷我音乐 - 排行榜' - _VALID_URL = r'http://yinyue\.kuwo\.cn/billboard_(?P<id>[^.]+).htm' + _VALID_URL = r'https?://yinyue\.kuwo\.cn/billboard_(?P<id>[^.]+).htm' _TEST = { 'url': 'http://yinyue.kuwo.cn/billboard_香港中文龙虎榜.htm', 'info_dict': { 'id': '香港中文龙虎榜', - 'title': '香港中文龙虎榜', - 'description': 're:\d{4}第\d{2}期', }, 'playlist_mincount': 10, } @@ -179,30 +194,24 @@ class KuwoChartIE(InfoExtractor): url, chart_id, note='Download chart info', errnote='Unable to get chart info') - chart_name = self._html_search_regex( - r'<h1[^>]+class="unDis">([^<]+)</h1>', webpage, 'chart name') - - chart_desc = self._html_search_regex( - r'<p[^>]+class="tabDef">(\d{4}第\d{2}期)</p>', webpage, 'chart desc') - entries = [ self.url_result(song_url, 'Kuwo') for song_url in re.findall( - r'<a[^>]+href="(http://www\.kuwo\.cn/yinyue/\d+)/"', webpage) + r'<a[^>]+href="(http://www\.kuwo\.cn/yinyue/\d+)', webpage) ] - return self.playlist_result(entries, chart_id, chart_name, chart_desc) + return self.playlist_result(entries, chart_id) class KuwoSingerIE(InfoExtractor): IE_NAME = 'kuwo:singer' IE_DESC = '酷我音乐 - 歌手' - _VALID_URL = r'http://www\.kuwo\.cn/mingxing/(?P<id>[^/]+)' + _VALID_URL = r'https?://www\.kuwo\.cn/mingxing/(?P<id>[^/]+)' _TESTS = [{ 'url': 'http://www.kuwo.cn/mingxing/bruno+mars/', 'info_dict': { 'id': 'bruno+mars', 'title': 'Bruno Mars', }, - 'playlist_count': 10, + 'playlist_mincount': 329, }, { 'url': 'http://www.kuwo.cn/mingxing/Ali/music.htm', 'info_dict': { @@ -213,6 +222,8 @@ class KuwoSingerIE(InfoExtractor): 'skip': 'Regularly stalls travis build', # See https://travis-ci.org/rg3/youtube-dl/jobs/78878540 }] + PAGE_SIZE = 15 + def _real_extract(self, url): singer_id = self._match_id(url) webpage = self._download_webpage( @@ -220,25 +231,28 @@ class KuwoSingerIE(InfoExtractor): errnote='Unable to get singer info') singer_name = self._html_search_regex( - r'<div class="title clearfix">\s*<h1>([^<]+)<span', webpage, 'singer name' - ) + r'<h1>([^<]+)</h1>', webpage, 'singer name') - entries = [] - first_page_only = False if re.search(r'/music(?:_\d+)?\.htm', url) else True - for page_num in itertools.count(1): + artist_id = self._html_search_regex( + r'data-artistid="(\d+)"', webpage, 'artist id') + + page_count = int(self._html_search_regex( + r'data-page="(\d+)"', webpage, 'page count')) + + def page_func(page_num): webpage = self._download_webpage( - 'http://www.kuwo.cn/mingxing/%s/music_%d.htm' % (singer_id, page_num), - singer_id, note='Download song list page #%d' % page_num, - errnote='Unable to get song list page #%d' % page_num) + 'http://www.kuwo.cn/artist/contentMusicsAjax', + singer_id, note='Download song list page #%d' % (page_num + 1), + errnote='Unable to get song list page #%d' % (page_num + 1), + query={'artistId': artist_id, 'pn': page_num, 'rn': self.PAGE_SIZE}) - entries.extend([ + return [ self.url_result(song_url, 'Kuwo') for song_url in re.findall( - r'<p[^>]+class="m_name"><a[^>]+href="(http://www\.kuwo\.cn/yinyue/\d+)/', + r'<div[^>]+class="name"><a[^>]+href="(http://www\.kuwo\.cn/yinyue/\d+)', webpage) - ][:10 if first_page_only else None]) + ] - if first_page_only or not re.search(r'<a[^>]+href="[^"]+">下一页</a>', webpage): - break + entries = InAdvancePagedList(page_func, page_count, self.PAGE_SIZE) return self.playlist_result(entries, singer_id, singer_name) @@ -246,7 +260,7 @@ class KuwoSingerIE(InfoExtractor): class KuwoCategoryIE(InfoExtractor): IE_NAME = 'kuwo:category' IE_DESC = '酷我音乐 - 分类' - _VALID_URL = r'http://yinyue\.kuwo\.cn/yy/cinfo_(?P<id>\d+?).htm' + _VALID_URL = r'https?://yinyue\.kuwo\.cn/yy/cinfo_(?P<id>\d+?).htm' _TEST = { 'url': 'http://yinyue.kuwo.cn/yy/cinfo_86375.htm', 'info_dict': { @@ -283,15 +297,21 @@ class KuwoCategoryIE(InfoExtractor): class KuwoMvIE(KuwoBaseIE): IE_NAME = 'kuwo:mv' IE_DESC = '酷我音乐 - MV' - _VALID_URL = r'http://www\.kuwo\.cn/mv/(?P<id>\d+?)/' + _VALID_URL = r'https?://www\.kuwo\.cn/mv/(?P<id>\d+?)/' _TEST = { 'url': 'http://www.kuwo.cn/mv/6480076/', 'info_dict': { 'id': '6480076', - 'ext': 'mkv', - 'title': '我们家MV', + 'ext': 'mp4', + 'title': 'My HouseMV', 'creator': '2PM', }, + # In this video, music URLs (anti.s) are blocked outside China and + # USA, while the MV URL (mvurl) is available globally, so force the MV + # URL for consistent results in different countries + 'params': { + 'format': 'mv', + }, } _FORMATS = KuwoBaseIE._FORMATS + [ {'format': 'mkv', 'ext': 'mkv', 'preference': 250}, @@ -313,7 +333,17 @@ class KuwoMvIE(KuwoBaseIE): else: raise ExtractorError('Unable to find song or singer names') - formats = self._get_formats(song_id) + formats = self._get_formats(song_id, tolerate_ip_deny=True) + + mv_url = self._download_webpage( + 'http://www.kuwo.cn/yy/st/mvurl?rid=MUSIC_%s' % song_id, + song_id, note='Download %s MV URL' % song_id) + formats.append({ + 'url': mv_url, + 'format_id': 'mv', + }) + + self._sort_formats(formats) return { 'id': song_id, diff --git a/youtube_dl/extractor/laola1tv.py b/youtube_dl/extractor/laola1tv.py index 5d8ebbeb3..d4fbafece 100644 --- a/youtube_dl/extractor/laola1tv.py +++ b/youtube_dl/extractor/laola1tv.py @@ -5,7 +5,7 @@ import re from .common import InfoExtractor from ..compat import ( - compat_urllib_parse, + compat_urllib_parse_urlencode, compat_urlparse, ) from ..utils import ( @@ -19,7 +19,7 @@ from ..utils import ( class Laola1TvIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?laola1\.tv/(?P<lang>[a-z]+)-(?P<portal>[a-z]+)/[^/]+/(?P<slug>[^/?#&]+)' + _VALID_URL = r'https?://(?:www\.)?laola1\.tv/(?P<lang>[a-z]+)-(?P<portal>[a-z]+)/(?P<kind>[^/]+)/(?P<slug>[^/?#&]+)' _TESTS = [{ 'url': 'http://www.laola1.tv/de-de/video/straubing-tigers-koelner-haie/227883.html', 'info_dict': { @@ -33,7 +33,7 @@ class Laola1TvIE(InfoExtractor): }, 'params': { 'skip_download': True, - } + }, }, { 'url': 'http://www.laola1.tv/de-de/video/straubing-tigers-koelner-haie', 'info_dict': { @@ -47,12 +47,28 @@ class Laola1TvIE(InfoExtractor): }, 'params': { 'skip_download': True, - } + }, + }, { + 'url': 'http://www.laola1.tv/de-de/livestream/2016-03-22-belogorie-belgorod-trentino-diatec-lde', + 'info_dict': { + 'id': '487850', + 'display_id': '2016-03-22-belogorie-belgorod-trentino-diatec-lde', + 'ext': 'flv', + 'title': 'Belogorie BELGOROD - TRENTINO Diatec', + 'upload_date': '20160322', + 'uploader': 'CEV - Europäischer Volleyball Verband', + 'is_live': True, + 'categories': ['Volleyball'], + }, + 'params': { + 'skip_download': True, + }, }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) display_id = mobj.group('slug') + kind = mobj.group('kind') lang = mobj.group('lang') portal = mobj.group('portal') @@ -74,7 +90,7 @@ class Laola1TvIE(InfoExtractor): hd_doc = self._download_xml( 'http://www.laola1.tv/server/hd_video.php?%s' - % compat_urllib_parse.urlencode({ + % compat_urllib_parse_urlencode({ 'play': video_id, 'partner': partner_id, 'portal': portal, @@ -85,12 +101,17 @@ class Laola1TvIE(InfoExtractor): _v = lambda x, **k: xpath_text(hd_doc, './/video/' + x, **k) title = _v('title', fatal=True) + VS_TARGETS = { + 'video': '2', + 'livestream': '17', + } + req = sanitized_Request( 'https://club.laola1.tv/sp/laola1/api/v3/user/session/premium/player/stream-access?%s' % - compat_urllib_parse.urlencode({ + compat_urllib_parse_urlencode({ 'videoId': video_id, - 'target': '2', - 'label': 'laola1tv', + 'target': VS_TARGETS.get(kind, '2'), + 'label': _v('label'), 'area': _v('area'), }), urlencode_postdata( @@ -109,6 +130,7 @@ class Laola1TvIE(InfoExtractor): formats = self._extract_f4m_formats( '%s?hdnea=%s&hdcore=3.2.0' % (token_attrib['url'], token_auth), video_id, f4m_id='hds') + self._sort_formats(formats) categories_str = _v('meta_sports') categories = categories_str.split(',') if categories_str else [] diff --git a/youtube_dl/extractor/leeco.py b/youtube_dl/extractor/leeco.py index df47e88ba..375fdaed1 100644 --- a/youtube_dl/extractor/leeco.py +++ b/youtube_dl/extractor/leeco.py @@ -11,7 +11,7 @@ from .common import InfoExtractor from ..compat import ( compat_ord, compat_str, - compat_urllib_parse, + compat_urllib_parse_urlencode, ) from ..utils import ( determine_ext, @@ -28,7 +28,7 @@ from ..utils import ( class LeIE(InfoExtractor): IE_DESC = '乐视网' - _VALID_URL = r'http://www\.le\.com/ptv/vplay/(?P<id>\d+)\.html' + _VALID_URL = r'https?://www\.le\.com/ptv/vplay/(?P<id>\d+)\.html' _URL_TEMPLATE = 'http://www.le.com/ptv/vplay/%s.html' @@ -122,7 +122,7 @@ class LeIE(InfoExtractor): 'domain': 'www.le.com' } play_json_req = sanitized_Request( - 'http://api.le.com/mms/out/video/playJson?' + compat_urllib_parse.urlencode(params) + 'http://api.le.com/mms/out/video/playJson?' + compat_urllib_parse_urlencode(params) ) cn_verification_proxy = self._downloader.params.get('cn_verification_proxy') if cn_verification_proxy: @@ -151,7 +151,7 @@ class LeIE(InfoExtractor): for format_id in formats: if format_id in dispatch: media_url = playurl['domain'][0] + dispatch[format_id][0] - media_url += '&' + compat_urllib_parse.urlencode({ + media_url += '&' + compat_urllib_parse_urlencode({ 'm3v': 1, 'format': 1, 'expect': 3, @@ -196,7 +196,7 @@ class LeIE(InfoExtractor): class LePlaylistIE(InfoExtractor): - _VALID_URL = r'http://[a-z]+\.le\.com/[a-z]+/(?P<id>[a-z0-9_]+)' + _VALID_URL = r'https?://[a-z]+\.le\.com/[a-z]+/(?P<id>[a-z0-9_]+)' _TESTS = [{ 'url': 'http://www.le.com/tv/46177.html', @@ -305,7 +305,7 @@ class LetvCloudIE(InfoExtractor): } self.sign_data(data) return self._download_json( - 'http://api.letvcloud.com/gpc.php?' + compat_urllib_parse.urlencode(data), + 'http://api.letvcloud.com/gpc.php?' + compat_urllib_parse_urlencode(data), media_id, 'Downloading playJson data for type %s' % cf) play_json = get_play_json(cf, time.time()) diff --git a/youtube_dl/extractor/lifenews.py b/youtube_dl/extractor/lifenews.py index a8fd639cc..ba2f80a75 100644 --- a/youtube_dl/extractor/lifenews.py +++ b/youtube_dl/extractor/lifenews.py @@ -17,7 +17,7 @@ from ..utils import ( class LifeNewsIE(InfoExtractor): IE_NAME = 'lifenews' IE_DESC = 'LIFE | NEWS' - _VALID_URL = r'http://lifenews\.ru/(?:mobile/)?(?P<section>news|video)/(?P<id>\d+)' + _VALID_URL = r'https?://lifenews\.ru/(?:mobile/)?(?P<section>news|video)/(?P<id>\d+)' _TESTS = [{ # single video embedded via video/source @@ -159,7 +159,7 @@ class LifeNewsIE(InfoExtractor): class LifeEmbedIE(InfoExtractor): IE_NAME = 'life:embed' - _VALID_URL = r'http://embed\.life\.ru/embed/(?P<id>[\da-f]{32})' + _VALID_URL = r'https?://embed\.life\.ru/embed/(?P<id>[\da-f]{32})' _TEST = { 'url': 'http://embed.life.ru/embed/e50c2dec2867350528e2574c899b8291', diff --git a/youtube_dl/extractor/limelight.py b/youtube_dl/extractor/limelight.py index 1a0625ac3..2599d45c3 100644 --- a/youtube_dl/extractor/limelight.py +++ b/youtube_dl/extractor/limelight.py @@ -123,7 +123,7 @@ class LimelightBaseIE(InfoExtractor): class LimelightMediaIE(LimelightBaseIE): IE_NAME = 'limelight' - _VALID_URL = r'(?:limelight:media:|http://link\.videoplatform\.limelight\.com/media/\??\bmediaId=)(?P<id>[a-z0-9]{32})' + _VALID_URL = r'(?:limelight:media:|https?://link\.videoplatform\.limelight\.com/media/\??\bmediaId=)(?P<id>[a-z0-9]{32})' _TESTS = [{ 'url': 'http://link.videoplatform.limelight.com/media/?mediaId=3ffd040b522b4485b6d84effc750cd86', 'info_dict': { @@ -176,7 +176,7 @@ class LimelightMediaIE(LimelightBaseIE): class LimelightChannelIE(LimelightBaseIE): IE_NAME = 'limelight:channel' - _VALID_URL = r'(?:limelight:channel:|http://link\.videoplatform\.limelight\.com/media/\??\bchannelId=)(?P<id>[a-z0-9]{32})' + _VALID_URL = r'(?:limelight:channel:|https?://link\.videoplatform\.limelight\.com/media/\??\bchannelId=)(?P<id>[a-z0-9]{32})' _TEST = { 'url': 'http://link.videoplatform.limelight.com/media/?channelId=ab6a524c379342f9b23642917020c082', 'info_dict': { @@ -207,7 +207,7 @@ class LimelightChannelIE(LimelightBaseIE): class LimelightChannelListIE(LimelightBaseIE): IE_NAME = 'limelight:channel_list' - _VALID_URL = r'(?:limelight:channel_list:|http://link\.videoplatform\.limelight\.com/media/\?.*?\bchannelListId=)(?P<id>[a-z0-9]{32})' + _VALID_URL = r'(?:limelight:channel_list:|https?://link\.videoplatform\.limelight\.com/media/\?.*?\bchannelListId=)(?P<id>[a-z0-9]{32})' _TEST = { 'url': 'http://link.videoplatform.limelight.com/media/?channelListId=301b117890c4465c8179ede21fd92e2b', 'info_dict': { diff --git a/youtube_dl/extractor/lrt.py b/youtube_dl/extractor/lrt.py index 863efd896..1072405b3 100644 --- a/youtube_dl/extractor/lrt.py +++ b/youtube_dl/extractor/lrt.py @@ -37,6 +37,7 @@ class LRTIE(InfoExtractor): r'file\s*:\s*(["\'])(?P<url>.+?)\1\s*\+\s*location\.hash\.substring\(1\)', webpage, 'm3u8 url', group='url') formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4') + self._sort_formats(formats) thumbnail = self._og_search_thumbnail(webpage) description = self._og_search_description(webpage) diff --git a/youtube_dl/extractor/lynda.py b/youtube_dl/extractor/lynda.py index d4e1ae99d..86d47266f 100644 --- a/youtube_dl/extractor/lynda.py +++ b/youtube_dl/extractor/lynda.py @@ -4,15 +4,13 @@ import re import json from .common import InfoExtractor -from ..compat import ( - compat_str, - compat_urllib_parse, -) +from ..compat import compat_str from ..utils import ( ExtractorError, clean_html, int_or_none, sanitized_Request, + urlencode_postdata, ) @@ -30,13 +28,13 @@ class LyndaBaseIE(InfoExtractor): return login_form = { - 'username': username.encode('utf-8'), - 'password': password.encode('utf-8'), + 'username': username, + 'password': password, 'remember': 'false', 'stayPut': 'false' } request = sanitized_Request( - self._LOGIN_URL, compat_urllib_parse.urlencode(login_form).encode('utf-8')) + self._LOGIN_URL, urlencode_postdata(login_form)) login_page = self._download_webpage( request, None, 'Logging in as %s' % username) @@ -65,7 +63,7 @@ class LyndaBaseIE(InfoExtractor): 'stayPut': 'false', } request = sanitized_Request( - self._LOGIN_URL, compat_urllib_parse.urlencode(confirm_form).encode('utf-8')) + self._LOGIN_URL, urlencode_postdata(confirm_form)) login_page = self._download_webpage( request, None, 'Confirming log in and log out from another device') @@ -221,7 +219,7 @@ class LyndaCourseIE(LyndaBaseIE): 'Course %s does not exist' % course_id, expected=True) unaccessible_videos = 0 - videos = [] + entries = [] # Might want to extract videos right here from video['Formats'] as it seems 'Formats' is not provided # by single video API anymore @@ -231,20 +229,22 @@ class LyndaCourseIE(LyndaBaseIE): if video.get('HasAccess') is False: unaccessible_videos += 1 continue - if video.get('ID'): - videos.append(video['ID']) + video_id = video.get('ID') + if video_id: + entries.append({ + '_type': 'url_transparent', + 'url': 'http://www.lynda.com/%s/%s-4.html' % (course_path, video_id), + 'ie_key': LyndaIE.ie_key(), + 'chapter': chapter.get('Title'), + 'chapter_number': int_or_none(chapter.get('ChapterIndex')), + 'chapter_id': compat_str(chapter.get('ID')), + }) if unaccessible_videos > 0: self._downloader.report_warning( '%s videos are only available for members (or paid members) and will not be downloaded. ' % unaccessible_videos + self._ACCOUNT_CREDENTIALS_HINT) - entries = [ - self.url_result( - 'http://www.lynda.com/%s/%s-4.html' % (course_path, video_id), - 'Lynda') - for video_id in videos] - course_title = course.get('Title') return self.playlist_result(entries, course_id, course_title) diff --git a/youtube_dl/extractor/m6.py b/youtube_dl/extractor/m6.py index 7e025831b..d5945ad66 100644 --- a/youtube_dl/extractor/m6.py +++ b/youtube_dl/extractor/m6.py @@ -8,7 +8,7 @@ from .common import InfoExtractor class M6IE(InfoExtractor): IE_NAME = 'm6' - _VALID_URL = r'http://(?:www\.)?m6\.fr/[^/]+/videos/(?P<id>\d+)-[^\.]+\.html' + _VALID_URL = r'https?://(?:www\.)?m6\.fr/[^/]+/videos/(?P<id>\d+)-[^\.]+\.html' _TEST = { 'url': 'http://www.m6.fr/emission-les_reines_du_shopping/videos/11323908-emeline_est_la_reine_du_shopping_sur_le_theme_ma_fete_d_8217_anniversaire.html', diff --git a/youtube_dl/extractor/mailru.py b/youtube_dl/extractor/mailru.py index 71085f279..9a7098c43 100644 --- a/youtube_dl/extractor/mailru.py +++ b/youtube_dl/extractor/mailru.py @@ -13,7 +13,7 @@ from ..utils import ( class MailRuIE(InfoExtractor): IE_NAME = 'mailru' IE_DESC = 'Видео@Mail.Ru' - _VALID_URL = r'http://(?:www\.)?my\.mail\.ru/(?:video/.*#video=/?(?P<idv1>(?:[^/]+/){3}\d+)|(?:(?P<idv2prefix>(?:[^/]+/){2})video/(?P<idv2suffix>[^/]+/\d+))\.html)' + _VALID_URL = r'https?://(?:(?:www|m)\.)?my\.mail\.ru/(?:video/.*#video=/?(?P<idv1>(?:[^/]+/){3}\d+)|(?:(?P<idv2prefix>(?:[^/]+/){2})video/(?P<idv2suffix>[^/]+/\d+))\.html)' _TESTS = [ { @@ -61,6 +61,10 @@ class MailRuIE(InfoExtractor): 'duration': 6001, }, 'skip': 'Not accessible from Travis CI server', + }, + { + 'url': 'http://m.my.mail.ru/mail/3sktvtr/video/_myvideo/138.html', + 'only_matching': True, } ] diff --git a/youtube_dl/extractor/matchtv.py b/youtube_dl/extractor/matchtv.py index 28e0dfe63..80a0d7013 100644 --- a/youtube_dl/extractor/matchtv.py +++ b/youtube_dl/extractor/matchtv.py @@ -4,7 +4,7 @@ from __future__ import unicode_literals import random from .common import InfoExtractor -from ..compat import compat_urllib_parse +from ..compat import compat_urllib_parse_urlencode from ..utils import ( sanitized_Request, xpath_text, @@ -29,7 +29,7 @@ class MatchTVIE(InfoExtractor): def _real_extract(self, url): video_id = 'matchtv-live' request = sanitized_Request( - 'http://player.matchtv.ntvplus.tv/player/smil?%s' % compat_urllib_parse.urlencode({ + 'http://player.matchtv.ntvplus.tv/player/smil?%s' % compat_urllib_parse_urlencode({ 'ts': '', 'quality': 'SD', 'contentId': '561d2c0df7159b37178b4567', @@ -47,6 +47,7 @@ class MatchTVIE(InfoExtractor): video_url = self._download_json(request, video_id)['data']['videoUrl'] f4m_url = xpath_text(self._download_xml(video_url, video_id), './to') formats = self._extract_f4m_formats(f4m_url, video_id) + self._sort_formats(formats) return { 'id': video_id, 'title': self._live_title('Матч ТВ - Прямой эфир'), diff --git a/youtube_dl/extractor/metacafe.py b/youtube_dl/extractor/metacafe.py index 67d6271e1..61dadb7a7 100644 --- a/youtube_dl/extractor/metacafe.py +++ b/youtube_dl/extractor/metacafe.py @@ -5,7 +5,6 @@ import re from .common import InfoExtractor from ..compat import ( compat_parse_qs, - compat_urllib_parse, compat_urllib_parse_unquote, ) from ..utils import ( @@ -13,11 +12,12 @@ from ..utils import ( ExtractorError, int_or_none, sanitized_Request, + urlencode_postdata, ) class MetacafeIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*' + _VALID_URL = r'https?://(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*' _DISCLAIMER = 'http://www.metacafe.com/family_filter/' _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user' IE_NAME = 'metacafe' @@ -117,7 +117,7 @@ class MetacafeIE(InfoExtractor): 'filters': '0', 'submit': "Continue - I'm over 18", } - request = sanitized_Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form)) + request = sanitized_Request(self._FILTER_POST, urlencode_postdata(disclaimer_form)) request.add_header('Content-Type', 'application/x-www-form-urlencoded') self.report_age_confirmation() self._download_webpage(request, None, False, 'Unable to confirm age') diff --git a/youtube_dl/extractor/minhateca.py b/youtube_dl/extractor/minhateca.py index e46b23a6f..e6730b75a 100644 --- a/youtube_dl/extractor/minhateca.py +++ b/youtube_dl/extractor/minhateca.py @@ -2,12 +2,12 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import compat_urllib_parse from ..utils import ( int_or_none, parse_duration, parse_filesize, sanitized_Request, + urlencode_postdata, ) @@ -39,7 +39,7 @@ class MinhatecaIE(InfoExtractor): ] req = sanitized_Request( 'http://minhateca.com.br/action/License/Download', - data=compat_urllib_parse.urlencode(token_data)) + data=urlencode_postdata(token_data)) req.add_header('Content-Type', 'application/x-www-form-urlencoded') data = self._download_json( req, video_id, note='Downloading metadata') diff --git a/youtube_dl/extractor/mit.py b/youtube_dl/extractor/mit.py index 819c1b90b..1aea78d11 100644 --- a/youtube_dl/extractor/mit.py +++ b/youtube_dl/extractor/mit.py @@ -91,7 +91,7 @@ class MITIE(TechTVMITIE): class OCWMITIE(InfoExtractor): IE_NAME = 'ocw.mit.edu' - _VALID_URL = r'^http://ocw\.mit\.edu/courses/(?P<topic>[a-z0-9\-]+)' + _VALID_URL = r'^https?://ocw\.mit\.edu/courses/(?P<topic>[a-z0-9\-]+)' _BASE_URL = 'http://ocw.mit.edu/' _TESTS = [ diff --git a/youtube_dl/extractor/mitele.py b/youtube_dl/extractor/mitele.py index c595f2077..7b4581dc5 100644 --- a/youtube_dl/extractor/mitele.py +++ b/youtube_dl/extractor/mitele.py @@ -2,11 +2,10 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..compat import ( - compat_urllib_parse, + compat_urllib_parse_urlencode, compat_urlparse, ) from ..utils import ( - encode_dict, get_element_by_attribute, int_or_none, ) @@ -14,7 +13,7 @@ from ..utils import ( class MiTeleIE(InfoExtractor): IE_DESC = 'mitele.es' - _VALID_URL = r'http://www\.mitele\.es/[^/]+/[^/]+/[^/]+/(?P<id>[^/]+)/' + _VALID_URL = r'https?://www\.mitele\.es/[^/]+/[^/]+/[^/]+/(?P<id>[^/]+)/' _TESTS = [{ 'url': 'http://www.mitele.es/programas-tv/diario-de/la-redaccion/programa-144/', @@ -60,7 +59,7 @@ class MiTeleIE(InfoExtractor): 'sta': '0', } media = self._download_json( - '%s/?%s' % (gat, compat_urllib_parse.urlencode(encode_dict(token_data))), + '%s/?%s' % (gat, compat_urllib_parse_urlencode(token_data)), display_id, 'Downloading %s JSON' % location['loc']) file_ = media.get('file') if not file_: @@ -68,6 +67,7 @@ class MiTeleIE(InfoExtractor): formats.extend(self._extract_f4m_formats( file_ + '&hdcore=3.2.0&plugin=aasp-3.2.0.77.18', display_id, f4m_id=loc)) + self._sort_formats(formats) title = self._search_regex( r'class="Destacado-text"[^>]*>\s*<strong>([^<]+)</strong>', webpage, 'title') diff --git a/youtube_dl/extractor/mnet.py b/youtube_dl/extractor/mnet.py new file mode 100644 index 000000000..e3f42e7bd --- /dev/null +++ b/youtube_dl/extractor/mnet.py @@ -0,0 +1,81 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_duration, + parse_iso8601, +) + + +class MnetIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?mnet\.(?:com|interest\.me)/tv/vod/(?:.*?\bclip_id=)?(?P<id>[0-9]+)' + _TESTS = [{ + 'url': 'http://www.mnet.com/tv/vod/171008', + 'info_dict': { + 'id': '171008', + 'title': 'SS_이해인@히든박스', + 'description': 'md5:b9efa592c3918b615ba69fe9f8a05c55', + 'duration': 88, + 'upload_date': '20151231', + 'timestamp': 1451564040, + 'age_limit': 0, + 'thumbnails': 'mincount:5', + 'thumbnail': 're:^https?://.*\.jpg$', + 'ext': 'flv', + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + }, { + 'url': 'http://mnet.interest.me/tv/vod/172790', + 'only_matching': True, + }, { + 'url': 'http://www.mnet.com/tv/vod/vod_view.asp?clip_id=172790&tabMenu=', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + info = self._download_json( + 'http://content.api.mnet.com/player/vodConfig?id=%s&ctype=CLIP' % video_id, + video_id, 'Downloading vod config JSON')['data']['info'] + + title = info['title'] + + rtmp_info = self._download_json( + info['cdn'], video_id, 'Downloading vod cdn JSON') + + formats = [{ + 'url': rtmp_info['serverurl'] + rtmp_info['fileurl'], + 'ext': 'flv', + 'page_url': url, + 'player_url': 'http://flvfile.mnet.com/service/player/201602/cjem_player_tv.swf?v=201602191318', + }] + + description = info.get('ment') + duration = parse_duration(info.get('time')) + timestamp = parse_iso8601(info.get('date'), delimiter=' ') + age_limit = info.get('adult') + if age_limit is not None: + age_limit = 0 if age_limit == 'N' else 18 + thumbnails = [{ + 'id': thumb_format, + 'url': thumb['url'], + 'width': int_or_none(thumb.get('width')), + 'height': int_or_none(thumb.get('height')), + } for thumb_format, thumb in info.get('cover', {}).items() if thumb.get('url')] + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'duration': duration, + 'timestamp': timestamp, + 'age_limit': age_limit, + 'thumbnails': thumbnails, + 'formats': formats, + } diff --git a/youtube_dl/extractor/moevideo.py b/youtube_dl/extractor/moevideo.py index d930b9634..978d5d5bf 100644 --- a/youtube_dl/extractor/moevideo.py +++ b/youtube_dl/extractor/moevideo.py @@ -5,11 +5,11 @@ import json import re from .common import InfoExtractor -from ..compat import compat_urllib_parse from ..utils import ( ExtractorError, int_or_none, sanitized_Request, + urlencode_postdata, ) @@ -77,7 +77,7 @@ class MoeVideoIE(InfoExtractor): ], ] r_json = json.dumps(r) - post = compat_urllib_parse.urlencode({'r': r_json}) + post = urlencode_postdata({'r': r_json}) req = sanitized_Request(self._API_URL, post) req.add_header('Content-type', 'application/x-www-form-urlencoded') diff --git a/youtube_dl/extractor/moniker.py b/youtube_dl/extractor/moniker.py index f6bf94f2f..b208820fe 100644 --- a/youtube_dl/extractor/moniker.py +++ b/youtube_dl/extractor/moniker.py @@ -5,11 +5,11 @@ import os.path import re from .common import InfoExtractor -from ..compat import compat_urllib_parse from ..utils import ( ExtractorError, remove_start, sanitized_Request, + urlencode_postdata, ) @@ -88,7 +88,7 @@ class MonikerIE(InfoExtractor): fields = re.findall(r'type="hidden" name="(.+?)"\s* value="?(.+?)">', orig_webpage) data = dict(fields) - post = compat_urllib_parse.urlencode(data) + post = urlencode_postdata(data) headers = { b'Content-Type': b'application/x-www-form-urlencoded', } diff --git a/youtube_dl/extractor/mooshare.py b/youtube_dl/extractor/mooshare.py index 7cc7f054f..a85109a89 100644 --- a/youtube_dl/extractor/mooshare.py +++ b/youtube_dl/extractor/mooshare.py @@ -3,17 +3,17 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_urllib_parse from ..utils import ( ExtractorError, sanitized_Request, + urlencode_postdata, ) class MooshareIE(InfoExtractor): IE_NAME = 'mooshare' IE_DESC = 'Mooshare.biz' - _VALID_URL = r'http://(?:www\.)?mooshare\.biz/(?P<id>[\da-z]{12})' + _VALID_URL = r'https?://(?:www\.)?mooshare\.biz/(?P<id>[\da-z]{12})' _TESTS = [ { @@ -58,7 +58,7 @@ class MooshareIE(InfoExtractor): } request = sanitized_Request( - 'http://mooshare.biz/%s' % video_id, compat_urllib_parse.urlencode(download_form)) + 'http://mooshare.biz/%s' % video_id, urlencode_postdata(download_form)) request.add_header('Content-Type', 'application/x-www-form-urlencoded') self._sleep(5, video_id) diff --git a/youtube_dl/extractor/motherless.py b/youtube_dl/extractor/motherless.py index 0b4787c1d..5e1a8a71a 100644 --- a/youtube_dl/extractor/motherless.py +++ b/youtube_dl/extractor/motherless.py @@ -12,7 +12,7 @@ from ..utils import ( class MotherlessIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?motherless\.com/(?:g/[a-z0-9_]+/)?(?P<id>[A-Z0-9]+)' + _VALID_URL = r'https?://(?:www\.)?motherless\.com/(?:g/[a-z0-9_]+/)?(?P<id>[A-Z0-9]+)' _TESTS = [{ 'url': 'http://motherless.com/AC3FFE1', 'md5': '310f62e325a9fafe64f68c0bccb6e75f', @@ -69,6 +69,9 @@ class MotherlessIE(InfoExtractor): ">The page you're looking for cannot be found.<")): raise ExtractorError('Video %s does not exist' % video_id, expected=True) + if '>The content you are trying to view is for friends only.' in webpage: + raise ExtractorError('Video %s is for friends only' % video_id, expected=True) + title = self._html_search_regex( r'id="view-upload-title">\s+([^<]+)<', webpage, 'title') video_url = self._html_search_regex( diff --git a/youtube_dl/extractor/motorsport.py b/youtube_dl/extractor/motorsport.py index c1a482dba..370328b36 100644 --- a/youtube_dl/extractor/motorsport.py +++ b/youtube_dl/extractor/motorsport.py @@ -9,7 +9,7 @@ from ..compat import ( class MotorsportIE(InfoExtractor): IE_DESC = 'motorsport.com' - _VALID_URL = r'http://www\.motorsport\.com/[^/?#]+/video/(?:[^/?#]+/)(?P<id>[^/]+)/?(?:$|[?#])' + _VALID_URL = r'https?://www\.motorsport\.com/[^/?#]+/video/(?:[^/?#]+/)(?P<id>[^/]+)/?(?:$|[?#])' _TEST = { 'url': 'http://www.motorsport.com/f1/video/main-gallery/red-bull-racing-2014-rules-explained/', 'info_dict': { diff --git a/youtube_dl/extractor/movieclips.py b/youtube_dl/extractor/movieclips.py index 1564cb71f..d0cb8278e 100644 --- a/youtube_dl/extractor/movieclips.py +++ b/youtube_dl/extractor/movieclips.py @@ -2,39 +2,48 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import sanitized_Request +from ..utils import ( + smuggle_url, + float_or_none, + parse_iso8601, + update_url_query, +) class MovieClipsIE(InfoExtractor): - _VALID_URL = r'https?://(?:www.)?movieclips\.com/videos/(?P<id>[^/?#]+)' + _VALID_URL = r'https?://(?:www.)?movieclips\.com/videos/.+-(?P<id>\d+)(?:\?|$)' _TEST = { - 'url': 'http://www.movieclips.com/videos/warcraft-trailer-1-561180739597?autoPlay=true&playlistId=5', + 'url': 'http://www.movieclips.com/videos/warcraft-trailer-1-561180739597', + 'md5': '42b5a0352d4933a7bd54f2104f481244', 'info_dict': { 'id': 'pKIGmG83AqD9', - 'display_id': 'warcraft-trailer-1-561180739597', 'ext': 'mp4', 'title': 'Warcraft Trailer 1', 'description': 'Watch Trailer 1 from Warcraft (2016). Legendary’s WARCRAFT is a 3D epic adventure of world-colliding conflict based.', 'thumbnail': 're:^https?://.*\.jpg$', + 'timestamp': 1446843055, + 'upload_date': '20151106', + 'uploader': 'Movieclips', }, 'add_ie': ['ThePlatform'], } def _real_extract(self, url): - display_id = self._match_id(url) - - req = sanitized_Request(url) - # it doesn't work if it thinks the browser it's too old - req.add_header('User-Agent', 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/43.0 (Chrome)') - webpage = self._download_webpage(req, display_id) - theplatform_link = self._html_search_regex(r'src="(http://player.theplatform.com/p/.*?)"', webpage, 'theplatform link') - title = self._html_search_regex(r'<title[^>]*>([^>]+)-\s*\d+\s*|\s*Movieclips.com', webpage, 'title') - description = self._html_search_meta('description', webpage) + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + video = next(v for v in self._parse_json(self._search_regex( + r'var\s+__REACT_ENGINE__\s*=\s*({.+});', + webpage, 'react engine'), video_id)['playlist']['videos'] if v['id'] == video_id) return { '_type': 'url_transparent', - 'url': theplatform_link, - 'title': title, - 'display_id': display_id, - 'description': description, + 'ie_key': 'ThePlatform', + 'url': smuggle_url(update_url_query( + video['contentUrl'], {'mbr': 'true'}), {'force_smil_url': True}), + 'title': self._og_search_title(webpage), + 'description': self._html_search_meta('description', webpage), + 'duration': float_or_none(video.get('duration')), + 'timestamp': parse_iso8601(video.get('dateCreated')), + 'thumbnail': video.get('defaultImage'), + 'uploader': video.get('provider'), } diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index ed068365d..640ee3d93 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -4,7 +4,7 @@ import re from .common import InfoExtractor from ..compat import ( - compat_urllib_parse, + compat_urllib_parse_urlencode, compat_str, ) from ..utils import ( @@ -17,6 +17,7 @@ from ..utils import ( unescapeHTML, url_basename, RegexNotFoundError, + xpath_text, ) @@ -130,11 +131,7 @@ class MTVServicesInfoExtractor(InfoExtractor): message += item.text raise ExtractorError(message, expected=True) - description_node = itemdoc.find('description') - if description_node is not None: - description = description_node.text.strip() - else: - description = None + description = xpath_text(itemdoc, 'description') title_el = None if title_el is None: @@ -174,7 +171,7 @@ class MTVServicesInfoExtractor(InfoExtractor): data = {'uri': uri} if self._LANG: data['lang'] = self._LANG - return compat_urllib_parse.urlencode(data) + return compat_urllib_parse_urlencode(data) def _get_videos_info(self, uri): video_id = self._id_from_uri(uri) diff --git a/youtube_dl/extractor/muzu.py b/youtube_dl/extractor/muzu.py index 1e9cf8de9..cbc800481 100644 --- a/youtube_dl/extractor/muzu.py +++ b/youtube_dl/extractor/muzu.py @@ -1,9 +1,7 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import ( - compat_urllib_parse, -) +from ..compat import compat_urllib_parse_urlencode class MuzuTVIE(InfoExtractor): @@ -25,7 +23,7 @@ class MuzuTVIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - info_data = compat_urllib_parse.urlencode({ + info_data = compat_urllib_parse_urlencode({ 'format': 'json', 'url': url, }) @@ -41,7 +39,7 @@ class MuzuTVIE(InfoExtractor): if video_info.get('v%s' % quality): break - data = compat_urllib_parse.urlencode({ + data = compat_urllib_parse_urlencode({ 'ai': video_id, # Even if each time you watch a video the hash changes, # it seems to work for different videos, and it will work diff --git a/youtube_dl/extractor/myspace.py b/youtube_dl/extractor/myspace.py index 83414a232..0d5238d77 100644 --- a/youtube_dl/extractor/myspace.py +++ b/youtube_dl/extractor/myspace.py @@ -2,13 +2,13 @@ from __future__ import unicode_literals import re -import json from .common import InfoExtractor -from ..compat import ( - compat_str, +from ..utils import ( + ExtractorError, + int_or_none, + parse_iso8601, ) -from ..utils import ExtractorError class MySpaceIE(InfoExtractor): @@ -24,6 +24,8 @@ class MySpaceIE(InfoExtractor): 'description': 'This country quartet was all smiles while playing a sold out show at the Pacific Amphitheatre in Orange County, California.', 'uploader': 'Five Minutes to the Stage', 'uploader_id': 'fiveminutestothestage', + 'timestamp': 1414108751, + 'upload_date': '20141023', }, 'params': { # rtmp download @@ -64,7 +66,7 @@ class MySpaceIE(InfoExtractor): 'ext': 'mp4', 'title': 'Starset - First Light', 'description': 'md5:2d5db6c9d11d527683bcda818d332414', - 'uploader': 'Jacob Soren', + 'uploader': 'Yumi K', 'uploader_id': 'SorenPromotions', 'upload_date': '20140725', } @@ -78,6 +80,19 @@ class MySpaceIE(InfoExtractor): player_url = self._search_regex( r'playerSwf":"([^"?]*)', webpage, 'player URL') + def rtmp_format_from_stream_url(stream_url, width=None, height=None): + rtmp_url, play_path = stream_url.split(';', 1) + return { + 'format_id': 'rtmp', + 'url': rtmp_url, + 'play_path': play_path, + 'player_url': player_url, + 'protocol': 'rtmp', + 'ext': 'flv', + 'width': width, + 'height': height, + } + if mobj.group('mediatype').startswith('music/song'): # songs don't store any useful info in the 'context' variable song_data = self._search_regex( @@ -93,8 +108,8 @@ class MySpaceIE(InfoExtractor): return self._search_regex( r'''data-%s=([\'"])(?P.*?)\1''' % name, song_data, name, default='', group='data') - streamUrl = search_data('stream-url') - if not streamUrl: + stream_url = search_data('stream-url') + if not stream_url: vevo_id = search_data('vevo-id') youtube_id = search_data('youtube-id') if vevo_id: @@ -106,36 +121,47 @@ class MySpaceIE(InfoExtractor): else: raise ExtractorError( 'Found song but don\'t know how to download it') - info = { + return { 'id': video_id, 'title': self._og_search_title(webpage), 'uploader': search_data('artist-name'), 'uploader_id': search_data('artist-username'), 'thumbnail': self._og_search_thumbnail(webpage), + 'duration': int_or_none(search_data('duration')), + 'formats': [rtmp_format_from_stream_url(stream_url)] } else: - context = json.loads(self._search_regex( - r'context = ({.*?});', webpage, 'context')) - video = context['video'] - streamUrl = video['streamUrl'] - info = { - 'id': compat_str(video['mediaId']), + video = self._parse_json(self._search_regex( + r'context = ({.*?});', webpage, 'context'), + video_id)['video'] + formats = [] + hls_stream_url = video.get('hlsStreamUrl') + if hls_stream_url: + formats.append({ + 'format_id': 'hls', + 'url': hls_stream_url, + 'protocol': 'm3u8_native', + 'ext': 'mp4', + }) + stream_url = video.get('streamUrl') + if stream_url: + formats.append(rtmp_format_from_stream_url( + stream_url, + int_or_none(video.get('width')), + int_or_none(video.get('height')))) + self._sort_formats(formats) + return { + 'id': video_id, 'title': video['title'], - 'description': video['description'], - 'thumbnail': video['imageUrl'], - 'uploader': video['artistName'], - 'uploader_id': video['artistUsername'], + 'description': video.get('description'), + 'thumbnail': video.get('imageUrl'), + 'uploader': video.get('artistName'), + 'uploader_id': video.get('artistUsername'), + 'duration': int_or_none(video.get('duration')), + 'timestamp': parse_iso8601(video.get('dateAdded')), + 'formats': formats, } - rtmp_url, play_path = streamUrl.split(';', 1) - info.update({ - 'url': rtmp_url, - 'play_path': play_path, - 'player_url': player_url, - 'ext': 'flv', - }) - return info - class MySpaceAlbumIE(InfoExtractor): IE_NAME = 'MySpace:album' diff --git a/youtube_dl/extractor/myspass.py b/youtube_dl/extractor/myspass.py index f936b92bb..1ca7b1a9e 100644 --- a/youtube_dl/extractor/myspass.py +++ b/youtube_dl/extractor/myspass.py @@ -11,7 +11,7 @@ from ..utils import ( class MySpassIE(InfoExtractor): - _VALID_URL = r'http://www\.myspass\.de/.*' + _VALID_URL = r'https?://www\.myspass\.de/.*' _TEST = { 'url': 'http://www.myspass.de/myspass/shows/tvshows/absolute-mehrheit/Absolute-Mehrheit-vom-17022013-Die-Highlights-Teil-2--/11741/', 'md5': '0b49f4844a068f8b33f4b7c88405862b', diff --git a/youtube_dl/extractor/myvideo.py b/youtube_dl/extractor/myvideo.py index 1e21cf98a..6d447a493 100644 --- a/youtube_dl/extractor/myvideo.py +++ b/youtube_dl/extractor/myvideo.py @@ -9,8 +9,8 @@ import json from .common import InfoExtractor from ..compat import ( compat_ord, - compat_urllib_parse, compat_urllib_parse_unquote, + compat_urllib_parse_urlencode, ) from ..utils import ( ExtractorError, @@ -20,7 +20,7 @@ from ..utils import ( class MyVideoIE(InfoExtractor): _WORKING = False - _VALID_URL = r'http://(?:www\.)?myvideo\.de/(?:[^/]+/)?watch/(?P[0-9]+)/[^?/]+.*' + _VALID_URL = r'https?://(?:www\.)?myvideo\.de/(?:[^/]+/)?watch/(?P[0-9]+)/[^?/]+.*' IE_NAME = 'myvideo' _TEST = { 'url': 'http://www.myvideo.de/watch/8229274/bowling_fail_or_win', @@ -112,7 +112,7 @@ class MyVideoIE(InfoExtractor): encxml = compat_urllib_parse_unquote(b) if not params.get('domain'): params['domain'] = 'www.myvideo.de' - xmldata_url = '%s?%s' % (encxml, compat_urllib_parse.urlencode(params)) + xmldata_url = '%s?%s' % (encxml, compat_urllib_parse_urlencode(params)) if 'flash_playertype=MTV' in xmldata_url: self._downloader.report_warning('avoiding MTV player') xmldata_url = ( diff --git a/youtube_dl/extractor/myvidster.py b/youtube_dl/extractor/myvidster.py index a94ab8358..731c24542 100644 --- a/youtube_dl/extractor/myvidster.py +++ b/youtube_dl/extractor/myvidster.py @@ -4,7 +4,7 @@ from .common import InfoExtractor class MyVidsterIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?myvidster\.com/video/(?P\d+)/' + _VALID_URL = r'https?://(?:www\.)?myvidster\.com/video/(?P\d+)/' _TEST = { 'url': 'http://www.myvidster.com/video/32059805/Hot_chemistry_with_raw_love_making', diff --git a/youtube_dl/extractor/nationalgeographic.py b/youtube_dl/extractor/nationalgeographic.py index 6fc9e7b05..722518663 100644 --- a/youtube_dl/extractor/nationalgeographic.py +++ b/youtube_dl/extractor/nationalgeographic.py @@ -4,30 +4,40 @@ from .common import InfoExtractor from ..utils import ( smuggle_url, url_basename, + update_url_query, ) class NationalGeographicIE(InfoExtractor): - _VALID_URL = r'http://video\.nationalgeographic\.com/.*?' + IE_NAME = 'natgeo' + _VALID_URL = r'https?://video\.nationalgeographic\.com/.*?' _TESTS = [ { 'url': 'http://video.nationalgeographic.com/video/news/150210-news-crab-mating-vin?source=featuredvideo', + 'md5': '730855d559abbad6b42c2be1fa584917', 'info_dict': { - 'id': '4DmDACA6Qtk_', - 'ext': 'flv', + 'id': '0000014b-70a1-dd8c-af7f-f7b559330001', + 'ext': 'mp4', 'title': 'Mating Crabs Busted by Sharks', 'description': 'md5:16f25aeffdeba55aaa8ec37e093ad8b3', + 'timestamp': 1423523799, + 'upload_date': '20150209', + 'uploader': 'NAGS', }, 'add_ie': ['ThePlatform'], }, { 'url': 'http://video.nationalgeographic.com/wild/when-sharks-attack/the-real-jaws', + 'md5': '6a3105eb448c070503b3105fb9b320b5', 'info_dict': { - 'id': '_JeBD_D7PlS5', - 'ext': 'flv', + 'id': 'ngc-I0IauNSWznb_UV008GxSbwY35BZvgi2e', + 'ext': 'mp4', 'title': 'The Real Jaws', 'description': 'md5:8d3e09d9d53a85cd397b4b21b2c77be6', + 'timestamp': 1433772632, + 'upload_date': '20150608', + 'uploader': 'NAGS', }, 'add_ie': ['ThePlatform'], }, @@ -37,18 +47,67 @@ class NationalGeographicIE(InfoExtractor): name = url_basename(url) webpage = self._download_webpage(url, name) - feed_url = self._search_regex( - r'data-feed-url="([^"]+)"', webpage, 'feed url') guid = self._search_regex( r'id="(?:videoPlayer|player-container)"[^>]+data-guid="([^"]+)"', webpage, 'guid') - feed = self._download_xml('%s?byGuid=%s' % (feed_url, guid), name) - content = feed.find('.//{http://search.yahoo.com/mrss/}content') - theplatform_id = url_basename(content.attrib.get('url')) + return { + '_type': 'url_transparent', + 'ie_key': 'ThePlatform', + 'url': smuggle_url( + 'http://link.theplatform.com/s/ngs/media/guid/2423130747/%s?mbr=true' % guid, + {'force_smil_url': True}), + 'id': guid, + } - return self.url_result(smuggle_url( - 'http://link.theplatform.com/s/ngs/%s?format=SMIL&formats=MPEG4&manifest=f4m' % theplatform_id, - # For some reason, the normal links don't work and we must force - # the use of f4m - {'force_smil_url': True})) + +class NationalGeographicChannelIE(InfoExtractor): + IE_NAME = 'natgeo:channel' + _VALID_URL = r'https?://channel\.nationalgeographic\.com/(?:wild/)?[^/]+/videos/(?P[^/?]+)' + + _TESTS = [ + { + 'url': 'http://channel.nationalgeographic.com/the-story-of-god-with-morgan-freeman/videos/uncovering-a-universal-knowledge/', + 'md5': '518c9aa655686cf81493af5cc21e2a04', + 'info_dict': { + 'id': 'nB5vIAfmyllm', + 'ext': 'mp4', + 'title': 'Uncovering a Universal Knowledge', + 'description': 'md5:1a89148475bf931b3661fcd6ddb2ae3a', + 'timestamp': 1458680907, + 'upload_date': '20160322', + 'uploader': 'NEWA-FNG-NGTV', + }, + 'add_ie': ['ThePlatform'], + }, + { + 'url': 'http://channel.nationalgeographic.com/wild/destination-wild/videos/the-stunning-red-bird-of-paradise/', + 'md5': 'c4912f656b4cbe58f3e000c489360989', + 'info_dict': { + 'id': '3TmMv9OvGwIR', + 'ext': 'mp4', + 'title': 'The Stunning Red Bird of Paradise', + 'description': 'md5:7bc8cd1da29686be4d17ad1230f0140c', + 'timestamp': 1459362152, + 'upload_date': '20160330', + 'uploader': 'NEWA-FNG-NGTV', + }, + 'add_ie': ['ThePlatform'], + }, + ] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + release_url = self._search_regex( + r'video_auth_playlist_url\s*=\s*"([^"]+)"', + webpage, 'release url') + + return { + '_type': 'url_transparent', + 'ie_key': 'ThePlatform', + 'url': smuggle_url( + update_url_query(release_url, {'mbr': 'true', 'switch': 'http'}), + {'force_smil_url': True}), + 'display_id': display_id, + } diff --git a/youtube_dl/extractor/naver.py b/youtube_dl/extractor/naver.py index 1f5fc2145..6d6f69b44 100644 --- a/youtube_dl/extractor/naver.py +++ b/youtube_dl/extractor/naver.py @@ -5,7 +5,7 @@ import re from .common import InfoExtractor from ..compat import ( - compat_urllib_parse, + compat_urllib_parse_urlencode, compat_urlparse, ) from ..utils import ( @@ -53,8 +53,8 @@ class NaverIE(InfoExtractor): raise ExtractorError('couldn\'t extract vid and key') vid = m_id.group(1) key = m_id.group(2) - query = compat_urllib_parse.urlencode({'vid': vid, 'inKey': key, }) - query_urls = compat_urllib_parse.urlencode({ + query = compat_urllib_parse_urlencode({'vid': vid, 'inKey': key, }) + query_urls = compat_urllib_parse_urlencode({ 'masterVid': vid, 'protocol': 'p2p', 'inKey': key, diff --git a/youtube_dl/extractor/nba.py b/youtube_dl/extractor/nba.py index 3e2b3e599..d896b0d04 100644 --- a/youtube_dl/extractor/nba.py +++ b/youtube_dl/extractor/nba.py @@ -6,7 +6,7 @@ import re from .common import InfoExtractor from ..compat import ( - compat_urllib_parse, + compat_urllib_parse_urlencode, compat_urlparse, ) from ..utils import ( @@ -97,7 +97,7 @@ class NBAIE(InfoExtractor): _PAGE_SIZE = 30 def _fetch_page(self, team, video_id, page): - search_url = 'http://searchapp2.nba.com/nba-search/query.jsp?' + compat_urllib_parse.urlencode({ + search_url = 'http://searchapp2.nba.com/nba-search/query.jsp?' + compat_urllib_parse_urlencode({ 'type': 'teamvideo', 'start': page * self._PAGE_SIZE + 1, 'npp': (page + 1) * self._PAGE_SIZE + 1, diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py index 2202cfa33..e67025ff6 100644 --- a/youtube_dl/extractor/nbc.py +++ b/youtube_dl/extractor/nbc.py @@ -3,13 +3,16 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_HTTPError +from .theplatform import ThePlatformIE from ..utils import ( - ExtractorError, find_xpath_attr, lowercase_escape, smuggle_url, unescapeHTML, + update_url_query, + int_or_none, + HEADRequest, + parse_iso8601, ) @@ -24,6 +27,9 @@ class NBCIE(InfoExtractor): 'ext': 'mp4', 'title': 'Jimmy Fallon Surprises Fans at Ben & Jerry\'s', 'description': 'Jimmy gives out free scoops of his new "Tonight Dough" ice cream flavor by surprising customers at the Ben & Jerry\'s scoop shop.', + 'timestamp': 1424246400, + 'upload_date': '20150218', + 'uploader': 'NBCU-COM', }, 'params': { # m3u8 download @@ -47,6 +53,9 @@ class NBCIE(InfoExtractor): 'ext': 'mp4', 'title': 'Star Wars Teaser', 'description': 'md5:0b40f9cbde5b671a7ff62fceccc4f442', + 'timestamp': 1417852800, + 'upload_date': '20141206', + 'uploader': 'NBCU-COM', }, 'params': { # m3u8 download @@ -75,6 +84,7 @@ class NBCIE(InfoExtractor): theplatform_url = 'http:' + theplatform_url return { '_type': 'url_transparent', + 'ie_key': 'ThePlatform', 'url': smuggle_url(theplatform_url, {'source_url': url}), 'id': video_id, } @@ -90,6 +100,9 @@ class NBCSportsVPlayerIE(InfoExtractor): 'ext': 'flv', 'description': 'md5:df390f70a9ba7c95ff1daace988f0d8d', 'title': 'Tyler Kalinoski hits buzzer-beater to lift Davidson', + 'timestamp': 1426270238, + 'upload_date': '20150313', + 'uploader': 'NBCU-SPORTS', } }, { 'url': 'http://vplayer.nbcsports.com/p/BxmELC/nbc_embedshare/select/_hqLjQ95yx8Z', @@ -112,7 +125,7 @@ class NBCSportsVPlayerIE(InfoExtractor): class NBCSportsIE(InfoExtractor): # Does not include https because its certificate is invalid - _VALID_URL = r'http://www\.nbcsports\.com//?(?:[^/]+/)+(?P[0-9a-z-]+)' + _VALID_URL = r'https?://www\.nbcsports\.com//?(?:[^/]+/)+(?P[0-9a-z-]+)' _TEST = { 'url': 'http://www.nbcsports.com//college-basketball/ncaab/tom-izzo-michigan-st-has-so-much-respect-duke', @@ -131,10 +144,37 @@ class NBCSportsIE(InfoExtractor): NBCSportsVPlayerIE._extract_url(webpage), 'NBCSportsVPlayer') -class NBCNewsIE(InfoExtractor): +class CSNNEIE(InfoExtractor): + _VALID_URL = r'https?://www\.csnne\.com/video/(?P[0-9a-z-]+)' + + _TEST = { + 'url': 'http://www.csnne.com/video/snc-evening-update-wright-named-red-sox-no-5-starter', + 'info_dict': { + 'id': 'yvBLLUgQ8WU0', + 'ext': 'mp4', + 'title': 'SNC evening update: Wright named Red Sox\' No. 5 starter.', + 'description': 'md5:1753cfee40d9352b19b4c9b3e589b9e3', + 'timestamp': 1459369979, + 'upload_date': '20160330', + 'uploader': 'NBCU-SPORTS', + } + } + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + return { + '_type': 'url_transparent', + 'ie_key': 'ThePlatform', + 'url': self._html_search_meta('twitter:player:stream', webpage), + 'display_id': display_id, + } + + +class NBCNewsIE(ThePlatformIE): _VALID_URL = r'''(?x)https?://(?:www\.)?nbcnews\.com/ (?:video/.+?/(?P\d+)| - (?:watch|feature|nightly-news)/[^/]+/(?P.+)) + ([^/]+/)*(?P<display_id>[^/?]+)) ''' _TESTS = [ @@ -149,15 +189,14 @@ class NBCNewsIE(InfoExtractor): }, }, { - 'url': 'http://www.nbcnews.com/feature/edward-snowden-interview/how-twitter-reacted-snowden-interview-n117236', - 'md5': 'b2421750c9f260783721d898f4c42063', + 'url': 'http://www.nbcnews.com/watch/nbcnews-com/how-twitter-reacted-to-the-snowden-interview-269389891880', + 'md5': 'af1adfa51312291a017720403826bb64', 'info_dict': { - 'id': 'I1wpAI_zmhsQ', + 'id': '269389891880', 'ext': 'mp4', 'title': 'How Twitter Reacted To The Snowden Interview', 'description': 'md5:65a0bd5d76fe114f3c2727aa3a81fe64', }, - 'add_ie': ['ThePlatform'], }, { 'url': 'http://www.nbcnews.com/feature/dateline-full-episodes/full-episode-family-business-n285156', @@ -168,17 +207,29 @@ class NBCNewsIE(InfoExtractor): 'title': 'FULL EPISODE: Family Business', 'description': 'md5:757988edbaae9d7be1d585eb5d55cc04', }, + 'skip': 'This page is unavailable.', }, { 'url': 'http://www.nbcnews.com/nightly-news/video/nightly-news-with-brian-williams-full-broadcast-february-4-394064451844', - 'md5': 'b5dda8cddd8650baa0dcb616dd2cf60d', + 'md5': '73135a2e0ef819107bbb55a5a9b2a802', 'info_dict': { - 'id': 'sekXqyTVnmN3', + 'id': '394064451844', 'ext': 'mp4', 'title': 'Nightly News with Brian Williams Full Broadcast (February 4)', 'description': 'md5:1c10c1eccbe84a26e5debb4381e2d3c5', }, }, + { + 'url': 'http://www.nbcnews.com/business/autos/volkswagen-11-million-vehicles-could-have-suspect-software-emissions-scandal-n431456', + 'md5': 'a49e173825e5fcd15c13fc297fced39d', + 'info_dict': { + 'id': '529953347624', + 'ext': 'mp4', + 'title': 'Volkswagen U.S. Chief: We \'Totally Screwed Up\'', + 'description': 'md5:d22d1281a24f22ea0880741bb4dd6301', + }, + 'expected_warnings': ['http-6000 is not available'] + }, { 'url': 'http://www.nbcnews.com/watch/dateline/full-episode--deadly-betrayal-386250819952', 'only_matching': True, @@ -202,55 +253,86 @@ class NBCNewsIE(InfoExtractor): } else: # "feature" and "nightly-news" pages use theplatform.com - title = mobj.group('title') - webpage = self._download_webpage(url, title) + display_id = mobj.group('display_id') + webpage = self._download_webpage(url, display_id) + info = None bootstrap_json = self._search_regex( - r'var\s+(?:bootstrapJson|playlistData)\s*=\s*({.+});?\s*$', - webpage, 'bootstrap json', flags=re.MULTILINE) - bootstrap = self._parse_json(bootstrap_json, video_id) - info = bootstrap['results'][0]['video'] - mpxid = info['mpxId'] + r'(?m)var\s+(?:bootstrapJson|playlistData)\s*=\s*({.+});?\s*$', + webpage, 'bootstrap json', default=None) + if bootstrap_json: + bootstrap = self._parse_json(bootstrap_json, display_id) + info = bootstrap['results'][0]['video'] + else: + player_instance_json = self._search_regex( + r'videoObj\s*:\s*({.+})', webpage, 'player instance') + info = self._parse_json(player_instance_json, display_id) + video_id = info['mpxId'] + title = info['title'] - base_urls = [ - info['fallbackPlaylistUrl'], - info['associatedPlaylistUrl'], - ] + subtitles = {} + caption_links = info.get('captionLinks') + if caption_links: + for (sub_key, sub_ext) in (('smpte-tt', 'ttml'), ('web-vtt', 'vtt'), ('srt', 'srt')): + sub_url = caption_links.get(sub_key) + if sub_url: + subtitles.setdefault('en', []).append({ + 'url': sub_url, + 'ext': sub_ext, + }) - for base_url in base_urls: - if not base_url: + formats = [] + for video_asset in info['videoAssets']: + video_url = video_asset.get('publicUrl') + if not video_url: continue - playlist_url = base_url + '?form=MPXNBCNewsAPI' - - try: - all_videos = self._download_json(playlist_url, title) - except ExtractorError as ee: - if isinstance(ee.cause, compat_HTTPError): - continue - raise - - if not all_videos or 'videos' not in all_videos: + container = video_asset.get('format') + asset_type = video_asset.get('assetType') or '' + if container == 'ISM' or asset_type == 'FireTV-Once': continue - - try: - info = next(v for v in all_videos['videos'] if v['mpxId'] == mpxid) - break - except StopIteration: - continue - - if info is None: - raise ExtractorError('Could not find video in playlists') + elif asset_type == 'OnceURL': + tp_formats, tp_subtitles = self._extract_theplatform_smil( + video_url, video_id) + formats.extend(tp_formats) + subtitles = self._merge_subtitles(subtitles, tp_subtitles) + else: + tbr = int_or_none(video_asset.get('bitRate'), 1000) + format_id = 'http%s' % ('-%d' % tbr if tbr else '') + video_url = update_url_query( + video_url, {'format': 'redirect'}) + # resolve the url so that we can check availability and detect the correct extension + head = self._request_webpage( + HEADRequest(video_url), video_id, + 'Checking %s url' % format_id, + '%s is not available' % format_id, + fatal=False) + if head: + video_url = head.geturl() + formats.append({ + 'format_id': format_id, + 'url': video_url, + 'width': int_or_none(video_asset.get('width')), + 'height': int_or_none(video_asset.get('height')), + 'tbr': tbr, + 'container': video_asset.get('format'), + }) + self._sort_formats(formats) return { - '_type': 'url', - # We get the best quality video - 'url': info['videoAssets'][-1]['publicUrl'], - 'ie_key': 'ThePlatform', + 'id': video_id, + 'title': title, + 'description': info.get('description'), + 'thumbnail': info.get('description'), + 'thumbnail': info.get('thumbnail'), + 'duration': int_or_none(info.get('duration')), + 'timestamp': parse_iso8601(info.get('pubDate')), + 'formats': formats, + 'subtitles': subtitles, } class MSNBCIE(InfoExtractor): # https URLs redirect to corresponding http ones - _VALID_URL = r'http://www\.msnbc\.com/[^/]+/watch/(?P<id>[^/]+)' + _VALID_URL = r'https?://www\.msnbc\.com/[^/]+/watch/(?P<id>[^/]+)' _TEST = { 'url': 'http://www.msnbc.com/all-in-with-chris-hayes/watch/the-chaotic-gop-immigration-vote-314487875924', 'md5': '6d236bf4f3dddc226633ce6e2c3f814d', @@ -262,6 +344,7 @@ class MSNBCIE(InfoExtractor): 'thumbnail': 're:^https?://.*\.jpg$', 'timestamp': 1406937606, 'upload_date': '20140802', + 'uploader': 'NBCU-NEWS', 'categories': ['MSNBC/Topics/Franchise/Best of last night', 'MSNBC/Topics/General/Congress'], }, } diff --git a/youtube_dl/extractor/neteasemusic.py b/youtube_dl/extractor/neteasemusic.py index 7830616f8..0d36474fa 100644 --- a/youtube_dl/extractor/neteasemusic.py +++ b/youtube_dl/extractor/neteasemusic.py @@ -8,7 +8,7 @@ import re from .common import InfoExtractor from ..compat import ( - compat_urllib_parse, + compat_urllib_parse_urlencode, compat_str, compat_itertools_count, ) @@ -153,7 +153,7 @@ class NetEaseMusicIE(NetEaseMusicBaseIE): 'ids': '[%s]' % song_id } info = self.query_api( - 'song/detail?' + compat_urllib_parse.urlencode(params), + 'song/detail?' + compat_urllib_parse_urlencode(params), song_id, 'Downloading song info')['songs'][0] formats = self.extract_formats(info) diff --git a/youtube_dl/extractor/nextmedia.py b/youtube_dl/extractor/nextmedia.py index d1688457f..aae7aeeeb 100644 --- a/youtube_dl/extractor/nextmedia.py +++ b/youtube_dl/extractor/nextmedia.py @@ -7,7 +7,7 @@ from ..utils import parse_iso8601 class NextMediaIE(InfoExtractor): IE_DESC = '蘋果日報' - _VALID_URL = r'http://hk.apple.nextmedia.com/[^/]+/[^/]+/(?P<date>\d+)/(?P<id>\d+)' + _VALID_URL = r'https?://hk.apple.nextmedia.com/[^/]+/[^/]+/(?P<date>\d+)/(?P<id>\d+)' _TESTS = [{ 'url': 'http://hk.apple.nextmedia.com/realtime/news/20141108/53109199', 'md5': 'dff9fad7009311c421176d1ac90bfe4f', @@ -68,7 +68,7 @@ class NextMediaIE(InfoExtractor): class NextMediaActionNewsIE(NextMediaIE): IE_DESC = '蘋果日報 - 動新聞' - _VALID_URL = r'http://hk.dv.nextmedia.com/actionnews/[^/]+/(?P<date>\d+)/(?P<id>\d+)/\d+' + _VALID_URL = r'https?://hk.dv.nextmedia.com/actionnews/[^/]+/(?P<date>\d+)/(?P<id>\d+)/\d+' _TESTS = [{ 'url': 'http://hk.dv.nextmedia.com/actionnews/hit/20150121/19009428/20061460', 'md5': '05fce8ffeed7a5e00665d4b7cf0f9201', @@ -93,7 +93,7 @@ class NextMediaActionNewsIE(NextMediaIE): class AppleDailyIE(NextMediaIE): IE_DESC = '臺灣蘋果日報' - _VALID_URL = r'http://(www|ent).appledaily.com.tw/(?:animation|appledaily|enews|realtimenews)/[^/]+/[^/]+/(?P<date>\d+)/(?P<id>\d+)(/.*)?' + _VALID_URL = r'https?://(www|ent).appledaily.com.tw/(?:animation|appledaily|enews|realtimenews)/[^/]+/[^/]+/(?P<date>\d+)/(?P<id>\d+)(/.*)?' _TESTS = [{ 'url': 'http://ent.appledaily.com.tw/enews/article/entertainment/20150128/36354694', 'md5': 'a843ab23d150977cc55ef94f1e2c1e4d', diff --git a/youtube_dl/extractor/nextmovie.py b/youtube_dl/extractor/nextmovie.py index 657ae77a0..9ccd7d774 100644 --- a/youtube_dl/extractor/nextmovie.py +++ b/youtube_dl/extractor/nextmovie.py @@ -2,7 +2,7 @@ from __future__ import unicode_literals from .mtv import MTVServicesInfoExtractor -from ..compat import compat_urllib_parse +from ..compat import compat_urllib_parse_urlencode class NextMovieIE(MTVServicesInfoExtractor): @@ -20,7 +20,7 @@ class NextMovieIE(MTVServicesInfoExtractor): }] def _get_feed_query(self, uri): - return compat_urllib_parse.urlencode({ + return compat_urllib_parse_urlencode({ 'feed': '1505', 'mgid': uri, }) diff --git a/youtube_dl/extractor/nfb.py b/youtube_dl/extractor/nfb.py index 5bd15f7a7..51e4a34f7 100644 --- a/youtube_dl/extractor/nfb.py +++ b/youtube_dl/extractor/nfb.py @@ -1,8 +1,10 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import compat_urllib_parse -from ..utils import sanitized_Request +from ..utils import ( + sanitized_Request, + urlencode_postdata, +) class NFBIE(InfoExtractor): @@ -40,7 +42,7 @@ class NFBIE(InfoExtractor): request = sanitized_Request( 'https://www.nfb.ca/film/%s/player_config' % video_id, - compat_urllib_parse.urlencode({'getConfig': 'true'}).encode('ascii')) + urlencode_postdata({'getConfig': 'true'})) request.add_header('Content-Type', 'application/x-www-form-urlencoded') request.add_header('X-NFB-Referer', 'http://www.nfb.ca/medias/flash/NFBVideoPlayer.swf') diff --git a/youtube_dl/extractor/nhl.py b/youtube_dl/extractor/nhl.py index 8d5ce46ad..c1dea8b6c 100644 --- a/youtube_dl/extractor/nhl.py +++ b/youtube_dl/extractor/nhl.py @@ -7,7 +7,7 @@ import os from .common import InfoExtractor from ..compat import ( compat_urlparse, - compat_urllib_parse, + compat_urllib_parse_urlencode, compat_urllib_parse_urlparse ) from ..utils import ( @@ -38,7 +38,7 @@ class NHLBaseInfoExtractor(InfoExtractor): parsed_url = compat_urllib_parse_urlparse(initial_video_url) filename, ext = os.path.splitext(parsed_url.path) path = '%s_sd%s' % (filename, ext) - data = compat_urllib_parse.urlencode({ + data = compat_urllib_parse_urlencode({ 'type': 'fvod', 'path': compat_urlparse.urlunparse(parsed_url[:2] + (path,) + parsed_url[3:]) }) @@ -211,7 +211,7 @@ class NHLVideocenterIE(NHLBaseInfoExtractor): r'tab0"[^>]*?>(.*?)</td>', webpage, 'playlist title', flags=re.DOTALL).lower().capitalize() - data = compat_urllib_parse.urlencode({ + data = compat_urllib_parse_urlencode({ 'cid': cat_id, # This is the default value 'count': 12, diff --git a/youtube_dl/extractor/nick.py b/youtube_dl/extractor/nick.py index b62819ae5..ce065f2b0 100644 --- a/youtube_dl/extractor/nick.py +++ b/youtube_dl/extractor/nick.py @@ -2,7 +2,7 @@ from __future__ import unicode_literals from .mtv import MTVServicesInfoExtractor -from ..compat import compat_urllib_parse +from ..compat import compat_urllib_parse_urlencode class NickIE(MTVServicesInfoExtractor): @@ -54,7 +54,7 @@ class NickIE(MTVServicesInfoExtractor): }] def _get_feed_query(self, uri): - return compat_urllib_parse.urlencode({ + return compat_urllib_parse_urlencode({ 'feed': 'nick_arc_player_prime', 'mgid': uri, }) diff --git a/youtube_dl/extractor/niconico.py b/youtube_dl/extractor/niconico.py index 586e52a4a..dd75a48af 100644 --- a/youtube_dl/extractor/niconico.py +++ b/youtube_dl/extractor/niconico.py @@ -7,11 +7,10 @@ import datetime from .common import InfoExtractor from ..compat import ( - compat_urllib_parse, + compat_urllib_parse_urlencode, compat_urlparse, ) from ..utils import ( - encode_dict, ExtractorError, int_or_none, parse_duration, @@ -19,6 +18,7 @@ from ..utils import ( sanitized_Request, xpath_text, determine_ext, + urlencode_postdata, ) @@ -101,7 +101,7 @@ class NiconicoIE(InfoExtractor): 'mail': username, 'password': password, } - login_data = compat_urllib_parse.urlencode(encode_dict(login_form_strs)).encode('utf-8') + login_data = urlencode_postdata(login_form_strs) request = sanitized_Request( 'https://secure.nicovideo.jp/secure/login', login_data) login_results = self._download_webpage( @@ -141,7 +141,7 @@ class NiconicoIE(InfoExtractor): r'\'thumbPlayKey\'\s*:\s*\'(.*?)\'', ext_player_info, 'thumbPlayKey') # Get flv info - flv_info_data = compat_urllib_parse.urlencode({ + flv_info_data = compat_urllib_parse_urlencode({ 'k': thumb_play_key, 'v': video_id }) diff --git a/youtube_dl/extractor/noco.py b/youtube_dl/extractor/noco.py index d440313d5..06f2bda07 100644 --- a/youtube_dl/extractor/noco.py +++ b/youtube_dl/extractor/noco.py @@ -8,7 +8,6 @@ import hashlib from .common import InfoExtractor from ..compat import ( compat_str, - compat_urllib_parse, compat_urlparse, ) from ..utils import ( @@ -18,11 +17,12 @@ from ..utils import ( float_or_none, parse_iso8601, sanitized_Request, + urlencode_postdata, ) class NocoIE(InfoExtractor): - _VALID_URL = r'http://(?:(?:www\.)?noco\.tv/emission/|player\.noco\.tv/\?idvideo=)(?P<id>\d+)' + _VALID_URL = r'https?://(?:(?:www\.)?noco\.tv/emission/|player\.noco\.tv/\?idvideo=)(?P<id>\d+)' _LOGIN_URL = 'http://noco.tv/do.php' _API_URL_TEMPLATE = 'https://api.noco.tv/1.1/%s?ts=%s&tk=%s' _SUB_LANG_TEMPLATE = '&sub_lang=%s' @@ -75,7 +75,7 @@ class NocoIE(InfoExtractor): 'username': username, 'password': password, } - request = sanitized_Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form)) + request = sanitized_Request(self._LOGIN_URL, urlencode_postdata(login_form)) request.add_header('Content-Type', 'application/x-www-form-urlencoded; charset=UTF-8') login = self._download_json(request, None, 'Logging in as %s' % username) diff --git a/youtube_dl/extractor/normalboots.py b/youtube_dl/extractor/normalboots.py index 5952d136f..77e091072 100644 --- a/youtube_dl/extractor/normalboots.py +++ b/youtube_dl/extractor/normalboots.py @@ -9,7 +9,7 @@ from ..utils import ( class NormalbootsIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?normalboots\.com/video/(?P<id>[0-9a-z-]*)/?$' + _VALID_URL = r'https?://(?:www\.)?normalboots\.com/video/(?P<id>[0-9a-z-]*)/?$' _TEST = { 'url': 'http://normalboots.com/video/home-alone-games-jontron/', 'md5': '8bf6de238915dd501105b44ef5f1e0f6', diff --git a/youtube_dl/extractor/nova.py b/youtube_dl/extractor/nova.py index 3f9c776ef..17671ad39 100644 --- a/youtube_dl/extractor/nova.py +++ b/youtube_dl/extractor/nova.py @@ -12,7 +12,7 @@ from ..utils import ( class NovaIE(InfoExtractor): IE_DESC = 'TN.cz, Prásk.tv, Nova.cz, Novaplus.cz, FANDA.tv, Krásná.cz and Doma.cz' - _VALID_URL = 'http://(?:[^.]+\.)?(?P<site>tv(?:noviny)?|tn|novaplus|vymena|fanda|krasna|doma|prask)\.nova\.cz/(?:[^/]+/)+(?P<id>[^/]+?)(?:\.html|/|$)' + _VALID_URL = r'https?://(?:[^.]+\.)?(?P<site>tv(?:noviny)?|tn|novaplus|vymena|fanda|krasna|doma|prask)\.nova\.cz/(?:[^/]+/)+(?P<id>[^/]+?)(?:\.html|/|$)' _TESTS = [{ 'url': 'http://tvnoviny.nova.cz/clanek/novinky/co-na-sebe-sportaci-praskli-vime-jestli-pujde-hrdlicka-na-materskou.html?utm_source=tvnoviny&utm_medium=cpfooter&utm_campaign=novaplus', 'info_dict': { diff --git a/youtube_dl/extractor/novamov.py b/youtube_dl/extractor/novamov.py index d68c1ad79..a131f7dbd 100644 --- a/youtube_dl/extractor/novamov.py +++ b/youtube_dl/extractor/novamov.py @@ -7,7 +7,6 @@ from ..compat import compat_urlparse from ..utils import ( ExtractorError, NO_DEFAULT, - encode_dict, sanitized_Request, urlencode_postdata, ) @@ -73,7 +72,7 @@ class NovaMovIE(InfoExtractor): if not post_url.startswith('http'): post_url = compat_urlparse.urljoin(url, post_url) request = sanitized_Request( - post_url, urlencode_postdata(encode_dict(fields))) + post_url, urlencode_postdata(fields)) request.add_header('Content-Type', 'application/x-www-form-urlencoded') request.add_header('Referer', post_url) webpage = self._download_webpage( diff --git a/youtube_dl/extractor/nowness.py b/youtube_dl/extractor/nowness.py index 446f5901c..74860eb20 100644 --- a/youtube_dl/extractor/nowness.py +++ b/youtube_dl/extractor/nowness.py @@ -63,8 +63,11 @@ class NownessIE(NownessBaseIE): 'title': 'Candor: The Art of Gesticulation', 'description': 'Candor: The Art of Gesticulation', 'thumbnail': 're:^https?://.*\.jpg', - 'uploader': 'Nowness', + 'timestamp': 1446745676, + 'upload_date': '20151105', + 'uploader_id': '2385340575001', }, + 'add_ie': ['BrightcoveNew'], }, { 'url': 'https://cn.nowness.com/story/kasper-bjorke-ft-jaakko-eino-kalevi-tnr', 'md5': 'e79cf125e387216f86b2e0a5b5c63aa3', @@ -74,8 +77,11 @@ class NownessIE(NownessBaseIE): 'title': 'Kasper Bjørke ft. Jaakko Eino Kalevi: TNR', 'description': 'Kasper Bjørke ft. Jaakko Eino Kalevi: TNR', 'thumbnail': 're:^https?://.*\.jpg', - 'uploader': 'Nowness', + 'timestamp': 1407315371, + 'upload_date': '20140806', + 'uploader_id': '2385340575001', }, + 'add_ie': ['BrightcoveNew'], }, { # vimeo 'url': 'https://www.nowness.com/series/nowness-picks/jean-luc-godard-supercut', @@ -90,6 +96,7 @@ class NownessIE(NownessBaseIE): 'uploader': 'Cinema Sem Lei', 'uploader_id': 'cinemasemlei', }, + 'add_ie': ['Vimeo'], }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/noz.py b/youtube_dl/extractor/noz.py index 4e60b13a5..c47a33d15 100644 --- a/youtube_dl/extractor/noz.py +++ b/youtube_dl/extractor/noz.py @@ -2,9 +2,13 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import compat_urllib_parse_unquote +from ..compat import ( + compat_urllib_parse_unquote, + compat_xpath, +) from ..utils import ( int_or_none, + find_xpath_attr, xpath_text, update_url_query, ) @@ -46,9 +50,10 @@ class NozIE(InfoExtractor): duration = int_or_none(xpath_text( doc, './/article/movie/file/duration')) formats = [] - for qnode in doc.findall('.//article/movie/file/qualities/qual'): - http_url = xpath_text( - qnode, './html_urls/video_url[@format="video/mp4"]') + for qnode in doc.findall(compat_xpath('.//article/movie/file/qualities/qual')): + http_url_ele = find_xpath_attr( + qnode, './html_urls/video_url', 'format', 'video/mp4') + http_url = http_url_ele.text if http_url_ele is not None else None if http_url: formats.append({ 'url': http_url, @@ -64,8 +69,10 @@ class NozIE(InfoExtractor): formats.extend(self._extract_f4m_formats( update_url_query(f4m_url, {'hdcore': '3.4.0'}), video_id, f4m_id='hds', fatal=False)) - m3u8_url = xpath_text( - qnode, './html_urls/video_url[@format="application/vnd.apple.mpegurl"]') + m3u8_url_ele = find_xpath_attr( + qnode, './html_urls/video_url', + 'format', 'application/vnd.apple.mpegurl') + m3u8_url = m3u8_url_ele.text if m3u8_url_ele is not None else None if m3u8_url: formats.extend(self._extract_m3u8_formats( m3u8_url, video_id, 'mp4', 'm3u8_native', diff --git a/youtube_dl/extractor/npr.py b/youtube_dl/extractor/npr.py index 125c7010b..1777aa10b 100644 --- a/youtube_dl/extractor/npr.py +++ b/youtube_dl/extractor/npr.py @@ -1,7 +1,7 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import compat_urllib_parse +from ..compat import compat_urllib_parse_urlencode from ..utils import ( int_or_none, qualities, @@ -9,7 +9,7 @@ from ..utils import ( class NprIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?npr\.org/player/v2/mediaPlayer\.html\?.*\bid=(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?npr\.org/player/v2/mediaPlayer\.html\?.*\bid=(?P<id>\d+)' _TESTS = [{ 'url': 'http://www.npr.org/player/v2/mediaPlayer.html?id=449974205', 'info_dict': { @@ -38,7 +38,7 @@ class NprIE(InfoExtractor): playlist_id = self._match_id(url) config = self._download_json( - 'http://api.npr.org/query?%s' % compat_urllib_parse.urlencode({ + 'http://api.npr.org/query?%s' % compat_urllib_parse_urlencode({ 'id': playlist_id, 'fields': 'titles,audio,show', 'format': 'json', diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index 3b21fbd4d..9df200822 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -63,6 +63,7 @@ class NRKIE(InfoExtractor): if determine_ext(media_url) == 'f4m': formats = self._extract_f4m_formats( media_url + '?hdcore=3.5.0&plugin=aasp-3.5.0.151.81', video_id, f4m_id='hds') + self._sort_formats(formats) else: formats = [{ 'url': media_url, diff --git a/youtube_dl/extractor/ntvru.py b/youtube_dl/extractor/ntvru.py index 2cd924d05..0895d7ea4 100644 --- a/youtube_dl/extractor/ntvru.py +++ b/youtube_dl/extractor/ntvru.py @@ -11,7 +11,7 @@ from ..utils import ( class NTVRuIE(InfoExtractor): IE_NAME = 'ntv.ru' - _VALID_URL = r'http://(?:www\.)?ntv\.ru/(?P<id>.+)' + _VALID_URL = r'https?://(?:www\.)?ntv\.ru/(?P<id>.+)' _TESTS = [ { diff --git a/youtube_dl/extractor/nytimes.py b/youtube_dl/extractor/nytimes.py index 7f254b867..681683e86 100644 --- a/youtube_dl/extractor/nytimes.py +++ b/youtube_dl/extractor/nytimes.py @@ -18,8 +18,9 @@ class NYTimesBaseIE(InfoExtractor): description = video_data.get('summary') duration = float_or_none(video_data.get('duration'), 1000) - uploader = video_data['byline'] - timestamp = parse_iso8601(video_data['publication_date'][:-8]) + uploader = video_data.get('byline') + publication_date = video_data.get('publication_date') + timestamp = parse_iso8601(publication_date[:-8]) if publication_date else None def get_file_size(file_size): if isinstance(file_size, int): @@ -37,7 +38,7 @@ class NYTimesBaseIE(InfoExtractor): 'width': int_or_none(video.get('width')), 'height': int_or_none(video.get('height')), 'filesize': get_file_size(video.get('fileSize')), - } for video in video_data['renditions'] + } for video in video_data['renditions'] if video.get('url') ] self._sort_formats(formats) @@ -46,7 +47,7 @@ class NYTimesBaseIE(InfoExtractor): 'url': 'http://www.nytimes.com/%s' % image['url'], 'width': int_or_none(image.get('width')), 'height': int_or_none(image.get('height')), - } for image in video_data['images'] + } for image in video_data.get('images', []) if image.get('url') ] return { diff --git a/youtube_dl/extractor/once.py b/youtube_dl/extractor/once.py new file mode 100644 index 000000000..1bf96ea56 --- /dev/null +++ b/youtube_dl/extractor/once.py @@ -0,0 +1,42 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class OnceIE(InfoExtractor): + _VALID_URL = r'https?://.+?\.unicornmedia\.com/now/[^/]+/[^/]+/(?P<domain_id>[^/]+)/(?P<application_id>[^/]+)/(?:[^/]+/)?(?P<media_item_id>[^/]+)/content\.(?:once|m3u8|mp4)' + ADAPTIVE_URL_TEMPLATE = 'http://once.unicornmedia.com/now/master/playlist/%s/%s/%s/content.m3u8' + PROGRESSIVE_URL_TEMPLATE = 'http://once.unicornmedia.com/now/media/progressive/%s/%s/%s/%s/content.mp4' + + def _extract_once_formats(self, url): + domain_id, application_id, media_item_id = re.match( + OnceIE._VALID_URL, url).groups() + formats = self._extract_m3u8_formats( + self.ADAPTIVE_URL_TEMPLATE % ( + domain_id, application_id, media_item_id), + media_item_id, 'mp4', m3u8_id='hls', fatal=False) + progressive_formats = [] + for adaptive_format in formats: + # Prevent advertisement from embedding into m3u8 playlist (see + # https://github.com/rg3/youtube-dl/issues/8893#issuecomment-199912684) + adaptive_format['url'] = re.sub( + r'\badsegmentlength=\d+', r'adsegmentlength=0', adaptive_format['url']) + rendition_id = self._search_regex( + r'/now/media/playlist/[^/]+/[^/]+/([^/]+)', + adaptive_format['url'], 'redition id', default=None) + if rendition_id: + progressive_format = adaptive_format.copy() + progressive_format.update({ + 'url': self.PROGRESSIVE_URL_TEMPLATE % ( + domain_id, application_id, rendition_id, media_item_id), + 'format_id': adaptive_format['format_id'].replace( + 'hls', 'http'), + 'protocol': 'http', + }) + progressive_formats.append(progressive_format) + self._check_formats(progressive_formats, media_item_id) + formats.extend(progressive_formats) + return formats diff --git a/youtube_dl/extractor/ooyala.py b/youtube_dl/extractor/ooyala.py index 20b984288..16f040191 100644 --- a/youtube_dl/extractor/ooyala.py +++ b/youtube_dl/extractor/ooyala.py @@ -9,7 +9,7 @@ from ..utils import ( ExtractorError, unsmuggle_url, ) -from ..compat import compat_urllib_parse +from ..compat import compat_urllib_parse_urlencode class OoyalaBaseIE(InfoExtractor): @@ -35,7 +35,7 @@ class OoyalaBaseIE(InfoExtractor): for supported_format in ('mp4', 'm3u8', 'hds', 'rtmp'): auth_data = self._download_json( self._AUTHORIZATION_URL_TEMPLATE % (pcode, embed_code) + - compat_urllib_parse.urlencode({ + compat_urllib_parse_urlencode({ 'domain': domain, 'supportedFormats': supported_format }), diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py new file mode 100644 index 000000000..4468f31fc --- /dev/null +++ b/youtube_dl/extractor/openload.py @@ -0,0 +1,107 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_chr +from ..utils import ( + encode_base_n, + ExtractorError, +) + + +class OpenloadIE(InfoExtractor): + _VALID_URL = r'https://openload.(?:co|io)/(?:f|embed)/(?P<id>[a-zA-Z0-9-]+)' + + _TESTS = [{ + 'url': 'https://openload.co/f/kUEfGclsU9o', + 'md5': 'bf1c059b004ebc7a256f89408e65c36e', + 'info_dict': { + 'id': 'kUEfGclsU9o', + 'ext': 'mp4', + 'title': 'skyrim_no-audio_1080.mp4', + 'thumbnail': 're:^https?://.*\.jpg$', + }, + }, { + 'url': 'https://openload.co/embed/kUEfGclsU9o/skyrim_no-audio_1080.mp4', + 'only_matching': True, + }, { + 'url': 'https://openload.io/f/ZAn6oz-VZGE/', + 'only_matching': True, + }] + + @staticmethod + def openload_level2_debase(m): + radix, num = int(m.group(1)) + 27, int(m.group(2)) + return '"' + encode_base_n(num, radix) + '"' + + @classmethod + def openload_level2(cls, txt): + # The function name is ǃ \u01c3 + # Using escaped unicode literals does not work in Python 3.2 + return re.sub(r'ǃ\((\d+),(\d+)\)', cls.openload_level2_debase, txt, re.UNICODE).replace('"+"', '') + + # Openload uses a variant of aadecode + # openload_decode and related functions are originally written by + # vitas@matfyz.cz and released with public domain + # See https://github.com/rg3/youtube-dl/issues/8489 + @classmethod + def openload_decode(cls, txt): + symbol_table = [ + ('_', '(゚Д゚) [゚Θ゚]'), + ('a', '(゚Д゚) [゚ω゚ノ]'), + ('b', '(゚Д゚) [゚Θ゚ノ]'), + ('c', '(゚Д゚) [\'c\']'), + ('d', '(゚Д゚) [゚ー゚ノ]'), + ('e', '(゚Д゚) [゚Д゚ノ]'), + ('f', '(゚Д゚) [1]'), + + ('o', '(゚Д゚) [\'o\']'), + ('u', '(o゚ー゚o)'), + ('c', '(゚Д゚) [\'c\']'), + + ('7', '((゚ー゚) + (o^_^o))'), + ('6', '((o^_^o) +(o^_^o) +(c^_^o))'), + ('5', '((゚ー゚) + (゚Θ゚))'), + ('4', '(-~3)'), + ('3', '(-~-~1)'), + ('2', '(-~1)'), + ('1', '(-~0)'), + ('0', '((c^_^o)-(c^_^o))'), + ] + delim = '(゚Д゚)[゚ε゚]+' + ret = '' + for aachar in txt.split(delim): + for val, pat in symbol_table: + aachar = aachar.replace(pat, val) + aachar = aachar.replace('+ ', '') + m = re.match(r'^\d+', aachar) + if m: + ret += compat_chr(int(m.group(0), 8)) + else: + m = re.match(r'^u([\da-f]+)', aachar) + if m: + ret += compat_chr(int(m.group(1), 16)) + return cls.openload_level2(ret) + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + if 'File not found' in webpage: + raise ExtractorError('File not found', expected=True) + + code = self._search_regex( + r'<video[^>]+>\s*<script[^>]+>([^<]+)</script>', + webpage, 'JS code') + + video_url = self._search_regex( + r'return\s+"(https?://[^"]+)"', self.openload_decode(code), 'video URL') + + return { + 'id': video_id, + 'title': self._og_search_title(webpage), + 'thumbnail': self._og_search_thumbnail(webpage), + 'url': video_url, + } diff --git a/youtube_dl/extractor/orf.py b/youtube_dl/extractor/orf.py index 958eb398b..66c75f8b3 100644 --- a/youtube_dl/extractor/orf.py +++ b/youtube_dl/extractor/orf.py @@ -137,7 +137,7 @@ class ORFTVthekIE(InfoExtractor): class ORFOE1IE(InfoExtractor): IE_NAME = 'orf:oe1' IE_DESC = 'Radio Österreich 1' - _VALID_URL = r'http://oe1\.orf\.at/(?:programm/|konsole.*?#\?track_id=)(?P<id>[0-9]+)' + _VALID_URL = r'https?://oe1\.orf\.at/(?:programm/|konsole.*?#\?track_id=)(?P<id>[0-9]+)' # Audios on ORF radio are only available for 7 days, so we can't add tests. _TEST = { @@ -171,7 +171,7 @@ class ORFOE1IE(InfoExtractor): class ORFFM4IE(InfoExtractor): IE_NAME = 'orf:fm4' IE_DESC = 'radio FM4' - _VALID_URL = r'http://fm4\.orf\.at/(?:7tage/?#|player/)(?P<date>[0-9]+)/(?P<show>\w+)' + _VALID_URL = r'https?://fm4\.orf\.at/(?:7tage/?#|player/)(?P<date>[0-9]+)/(?P<show>\w+)' _TEST = { 'url': 'http://fm4.orf.at/player/20160110/IS/', @@ -222,7 +222,7 @@ class ORFFM4IE(InfoExtractor): class ORFIPTVIE(InfoExtractor): IE_NAME = 'orf:iptv' IE_DESC = 'iptv.ORF.at' - _VALID_URL = r'http://iptv\.orf\.at/(?:#/)?stories/(?P<id>\d+)' + _VALID_URL = r'https?://iptv\.orf\.at/(?:#/)?stories/(?P<id>\d+)' _TEST = { 'url': 'http://iptv.orf.at/stories/2275236/', diff --git a/youtube_dl/extractor/patreon.py b/youtube_dl/extractor/patreon.py index ec8876c28..229750665 100644 --- a/youtube_dl/extractor/patreon.py +++ b/youtube_dl/extractor/patreon.py @@ -65,7 +65,7 @@ class PatreonIE(InfoExtractor): request = sanitized_Request( 'https://www.patreon.com/processLogin', - compat_urllib_parse.urlencode(login_form).encode('utf-8') + compat_urllib_parse_urlencode(login_form).encode('utf-8') ) login_page = self._download_webpage(request, None, note='Logging in as %s' % username) diff --git a/youtube_dl/extractor/philharmoniedeparis.py b/youtube_dl/extractor/philharmoniedeparis.py index 6e60e5fe9..f1008ae51 100644 --- a/youtube_dl/extractor/philharmoniedeparis.py +++ b/youtube_dl/extractor/philharmoniedeparis.py @@ -12,7 +12,7 @@ from ..utils import ( class PhilharmonieDeParisIE(InfoExtractor): IE_DESC = 'Philharmonie de Paris' - _VALID_URL = r'http://live\.philharmoniedeparis\.fr/(?:[Cc]oncert/|misc/Playlist\.ashx\?id=)(?P<id>\d+)' + _VALID_URL = r'https?://live\.philharmoniedeparis\.fr/(?:[Cc]oncert/|misc/Playlist\.ashx\?id=)(?P<id>\d+)' _TESTS = [{ 'url': 'http://live.philharmoniedeparis.fr/concert/1032066.html', 'info_dict': { diff --git a/youtube_dl/extractor/photobucket.py b/youtube_dl/extractor/photobucket.py index 788411ccc..6c8bbe1d9 100644 --- a/youtube_dl/extractor/photobucket.py +++ b/youtube_dl/extractor/photobucket.py @@ -8,7 +8,7 @@ from ..compat import compat_urllib_parse_unquote class PhotobucketIE(InfoExtractor): - _VALID_URL = r'http://(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))' + _VALID_URL = r'https?://(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))' _TEST = { 'url': 'http://media.photobucket.com/user/rachaneronas/media/TiredofLinkBuildingTryBacklinkMyDomaincom_zpsc0c3b9fa.mp4.html?filters[term]=search&filters[primary]=videos&filters[secondary]=images&sort=1&o=0', 'md5': '7dabfb92b0a31f6c16cebc0f8e60ff99', diff --git a/youtube_dl/extractor/played.py b/youtube_dl/extractor/played.py index 2856af96f..57c875ef0 100644 --- a/youtube_dl/extractor/played.py +++ b/youtube_dl/extractor/played.py @@ -5,10 +5,10 @@ import re import os.path from .common import InfoExtractor -from ..compat import compat_urllib_parse from ..utils import ( ExtractorError, sanitized_Request, + urlencode_postdata, ) @@ -40,7 +40,7 @@ class PlayedIE(InfoExtractor): self._sleep(2, video_id) - post = compat_urllib_parse.urlencode(data) + post = urlencode_postdata(data) headers = { b'Content-Type': b'application/x-www-form-urlencoded', } diff --git a/youtube_dl/extractor/playtvak.py b/youtube_dl/extractor/playtvak.py index e360404f7..1e8096a25 100644 --- a/youtube_dl/extractor/playtvak.py +++ b/youtube_dl/extractor/playtvak.py @@ -4,7 +4,7 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..compat import ( compat_urlparse, - compat_urllib_parse, + compat_urllib_parse_urlencode, ) from ..utils import ( ExtractorError, @@ -106,7 +106,7 @@ class PlaytvakIE(InfoExtractor): }) info_url = compat_urlparse.urlunparse( - parsed_url._replace(query=compat_urllib_parse.urlencode(qs, True))) + parsed_url._replace(query=compat_urllib_parse_urlencode(qs, True))) json_info = self._download_json( info_url, video_id, diff --git a/youtube_dl/extractor/pluralsight.py b/youtube_dl/extractor/pluralsight.py index 12e1c2862..9aab77645 100644 --- a/youtube_dl/extractor/pluralsight.py +++ b/youtube_dl/extractor/pluralsight.py @@ -8,7 +8,6 @@ import collections from .common import InfoExtractor from ..compat import ( compat_str, - compat_urllib_parse, compat_urlparse, ) from ..utils import ( @@ -17,6 +16,7 @@ from ..utils import ( parse_duration, qualities, sanitized_Request, + urlencode_postdata, ) @@ -64,8 +64,8 @@ class PluralsightIE(PluralsightBaseIE): login_form = self._hidden_inputs(login_page) login_form.update({ - 'Username': username.encode('utf-8'), - 'Password': password.encode('utf-8'), + 'Username': username, + 'Password': password, }) post_url = self._search_regex( @@ -76,7 +76,7 @@ class PluralsightIE(PluralsightBaseIE): post_url = compat_urlparse.urljoin(self._LOGIN_URL, post_url) request = sanitized_Request( - post_url, compat_urllib_parse.urlencode(login_form).encode('utf-8')) + post_url, urlencode_postdata(login_form)) request.add_header('Content-Type', 'application/x-www-form-urlencoded') response = self._download_webpage( @@ -279,13 +279,18 @@ class PluralsightCourseIE(PluralsightBaseIE): course_id, 'Downloading course data JSON') entries = [] - for module in course_data: + for num, module in enumerate(course_data, 1): for clip in module.get('clips', []): player_parameters = clip.get('playerParameters') if not player_parameters: continue - entries.append(self.url_result( - '%s/training/player?%s' % (self._API_BASE, player_parameters), - 'Pluralsight')) + entries.append({ + '_type': 'url_transparent', + 'url': '%s/training/player?%s' % (self._API_BASE, player_parameters), + 'ie_key': PluralsightIE.ie_key(), + 'chapter': module.get('title'), + 'chapter_number': num, + 'chapter_id': module.get('moduleRef'), + }) return self.playlist_result(entries, course_id, title, description) diff --git a/youtube_dl/extractor/porn91.py b/youtube_dl/extractor/porn91.py index 3e15533e9..9894f3262 100644 --- a/youtube_dl/extractor/porn91.py +++ b/youtube_dl/extractor/porn91.py @@ -1,7 +1,10 @@ # encoding: utf-8 from __future__ import unicode_literals -from ..compat import compat_urllib_parse +from ..compat import ( + compat_urllib_parse_unquote, + compat_urllib_parse_urlencode, +) from .common import InfoExtractor from ..utils import ( parse_duration, @@ -28,9 +31,10 @@ class Porn91IE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - url = 'http://91porn.com/view_video.php?viewkey=%s' % video_id self._set_cookie('91porn.com', 'language', 'cn_CN') - webpage = self._download_webpage(url, video_id, 'get HTML content') + + webpage = self._download_webpage( + 'http://91porn.com/view_video.php?viewkey=%s' % video_id, video_id) if '作为游客,你每天只可观看10个视频' in webpage: raise ExtractorError('91 Porn says: Daily limit 10 videos exceeded', expected=True) @@ -46,7 +50,7 @@ class Porn91IE(InfoExtractor): r'so.addVariable\(\'seccode\',\'([^\']+)\'', webpage, 'sec code') max_vid = self._search_regex( r'so.addVariable\(\'max_vid\',\'(\d+)\'', webpage, 'max vid') - url_params = compat_urllib_parse.urlencode({ + url_params = compat_urllib_parse_urlencode({ 'VID': file_id, 'mp4': '1', 'seccode': sec_code, @@ -54,8 +58,9 @@ class Porn91IE(InfoExtractor): }) info_cn = self._download_webpage( 'http://91porn.com/getfile.php?' + url_params, video_id, - 'get real video url') - video_url = self._search_regex(r'file=([^&]+)&', info_cn, 'url') + 'Downloading real video url') + video_url = compat_urllib_parse_unquote(self._search_regex( + r'file=([^&]+)&', info_cn, 'url')) duration = parse_duration(self._search_regex( r'时长:\s*</span>\s*(\d+:\d+)', webpage, 'duration', fatal=False)) diff --git a/youtube_dl/extractor/pornhd.py b/youtube_dl/extractor/pornhd.py index 57c78ba52..39b53ecf6 100644 --- a/youtube_dl/extractor/pornhd.py +++ b/youtube_dl/extractor/pornhd.py @@ -12,7 +12,7 @@ from ..utils import ( class PornHdIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?pornhd\.com/(?:[a-z]{2,4}/)?videos/(?P<id>\d+)(?:/(?P<display_id>.+))?' + _VALID_URL = r'https?://(?:www\.)?pornhd\.com/(?:[a-z]{2,4}/)?videos/(?P<id>\d+)(?:/(?P<display_id>.+))?' _TEST = { 'url': 'http://www.pornhd.com/videos/1962/sierra-day-gets-his-cum-all-over-herself-hd-porn-video', 'md5': '956b8ca569f7f4d8ec563e2c41598441', diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 5a55c25e7..407ea08d4 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -1,10 +1,12 @@ from __future__ import unicode_literals +import itertools import os import re from .common import InfoExtractor from ..compat import ( + compat_HTTPError, compat_urllib_parse_unquote, compat_urllib_parse_unquote_plus, compat_urllib_parse_urlparse, @@ -12,6 +14,7 @@ from ..compat import ( from ..utils import ( ExtractorError, int_or_none, + orderedSet, sanitized_Request, str_to_int, ) @@ -75,7 +78,7 @@ class PornHubIE(InfoExtractor): flashvars = self._parse_json( self._search_regex( - r'var\s+flashv1ars_\d+\s*=\s*({.+?});', webpage, 'flashvars', default='{}'), + r'var\s+flashvars_\d+\s*=\s*({.+?});', webpage, 'flashvars', default='{}'), video_id) if flashvars: video_title = flashvars.get('video_title') @@ -149,9 +152,12 @@ class PornHubIE(InfoExtractor): class PornHubPlaylistBaseIE(InfoExtractor): def _extract_entries(self, webpage): return [ - self.url_result('http://www.pornhub.com/%s' % video_url, PornHubIE.ie_key()) - for video_url in set(re.findall( - r'href="/?(view_video\.php\?.*\bviewkey=[\da-z]+[^"]*)"', webpage)) + self.url_result( + 'http://www.pornhub.com/%s' % video_url, + PornHubIE.ie_key(), video_title=title) + for video_url, title in orderedSet(re.findall( + r'href="/?(view_video\.php\?.*\bviewkey=[\da-z]+[^"]*)"[^>]*\s+title="([^"]+)"', + webpage)) ] def _real_extract(self, url): @@ -185,16 +191,31 @@ class PornHubPlaylistIE(PornHubPlaylistBaseIE): class PornHubUserVideosIE(PornHubPlaylistBaseIE): _VALID_URL = r'https?://(?:www\.)?pornhub\.com/users/(?P<id>[^/]+)/videos' _TESTS = [{ - 'url': 'http://www.pornhub.com/users/rushandlia/videos', + 'url': 'http://www.pornhub.com/users/zoe_ph/videos/public', 'info_dict': { - 'id': 'rushandlia', + 'id': 'zoe_ph', }, - 'playlist_mincount': 13, + 'playlist_mincount': 171, + }, { + 'url': 'http://www.pornhub.com/users/rushandlia/videos', + 'only_matching': True, }] def _real_extract(self, url): user_id = self._match_id(url) - webpage = self._download_webpage(url, user_id) + entries = [] + for page_num in itertools.count(1): + try: + webpage = self._download_webpage( + url, user_id, 'Downloading page %d' % page_num, + query={'page': page_num}) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404: + break + page_entries = self._extract_entries(webpage) + if not page_entries: + break + entries.extend(page_entries) - return self.playlist_result(self._extract_entries(webpage), user_id) + return self.playlist_result(entries, user_id) diff --git a/youtube_dl/extractor/pornovoisines.py b/youtube_dl/extractor/pornovoisines.py index 1a53fd71c..6b51e5c54 100644 --- a/youtube_dl/extractor/pornovoisines.py +++ b/youtube_dl/extractor/pornovoisines.py @@ -13,7 +13,7 @@ from ..utils import ( class PornoVoisinesIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?pornovoisines\.com/showvideo/(?P<id>\d+)/(?P<display_id>[^/]+)' + _VALID_URL = r'https?://(?:www\.)?pornovoisines\.com/showvideo/(?P<id>\d+)/(?P<display_id>[^/]+)' _VIDEO_URL_TEMPLATE = 'http://stream%d.pornovoisines.com' \ '/static/media/video/transcoded/%s-640x360-1000-trscded.mp4' diff --git a/youtube_dl/extractor/primesharetv.py b/youtube_dl/extractor/primesharetv.py index 85aae9576..0c1024772 100644 --- a/youtube_dl/extractor/primesharetv.py +++ b/youtube_dl/extractor/primesharetv.py @@ -1,10 +1,10 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import compat_urllib_parse from ..utils import ( ExtractorError, sanitized_Request, + urlencode_postdata, ) @@ -42,7 +42,7 @@ class PrimeShareTVIE(InfoExtractor): self._sleep(wait_time, video_id) req = sanitized_Request( - url, compat_urllib_parse.urlencode(fields), headers) + url, urlencode_postdata(fields), headers) video_page = self._download_webpage( req, video_id, 'Downloading video page') diff --git a/youtube_dl/extractor/promptfile.py b/youtube_dl/extractor/promptfile.py index d5357283a..f93bd19ff 100644 --- a/youtube_dl/extractor/promptfile.py +++ b/youtube_dl/extractor/promptfile.py @@ -4,11 +4,11 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_urllib_parse from ..utils import ( determine_ext, ExtractorError, sanitized_Request, + urlencode_postdata, ) @@ -34,7 +34,7 @@ class PromptFileIE(InfoExtractor): expected=True) fields = self._hidden_inputs(webpage) - post = compat_urllib_parse.urlencode(fields) + post = urlencode_postdata(fields) req = sanitized_Request(url, post) req.add_header('Content-type', 'application/x-www-form-urlencoded') webpage = self._download_webpage( diff --git a/youtube_dl/extractor/prosiebensat1.py b/youtube_dl/extractor/prosiebensat1.py index 670e6950f..07d49d489 100644 --- a/youtube_dl/extractor/prosiebensat1.py +++ b/youtube_dl/extractor/prosiebensat1.py @@ -5,9 +5,7 @@ import re from hashlib import sha1 from .common import InfoExtractor -from ..compat import ( - compat_urllib_parse, -) +from ..compat import compat_urllib_parse_urlencode from ..utils import ( ExtractorError, determine_ext, @@ -235,7 +233,7 @@ class ProSiebenSat1IE(InfoExtractor): client_name = 'kolibri-2.0.19-splec4' client_location = url - videos_api_url = 'http://vas.sim-technik.de/vas/live/v2/videos?%s' % compat_urllib_parse.urlencode({ + videos_api_url = 'http://vas.sim-technik.de/vas/live/v2/videos?%s' % compat_urllib_parse_urlencode({ 'access_token': access_token, 'client_location': client_location, 'client_name': client_name, @@ -256,7 +254,7 @@ class ProSiebenSat1IE(InfoExtractor): client_id = g[:2] + sha1(''.join([clip_id, g, access_token, client_location, g, client_name]) .encode('utf-8')).hexdigest() - sources_api_url = 'http://vas.sim-technik.de/vas/live/v2/videos/%s/sources?%s' % (clip_id, compat_urllib_parse.urlencode({ + sources_api_url = 'http://vas.sim-technik.de/vas/live/v2/videos/%s/sources?%s' % (clip_id, compat_urllib_parse_urlencode({ 'access_token': access_token, 'client_id': client_id, 'client_location': client_location, @@ -270,7 +268,7 @@ class ProSiebenSat1IE(InfoExtractor): client_location, source_ids_str, g, client_name]) .encode('utf-8')).hexdigest() - url_api_url = 'http://vas.sim-technik.de/vas/live/v2/videos/%s/sources/url?%s' % (clip_id, compat_urllib_parse.urlencode({ + url_api_url = 'http://vas.sim-technik.de/vas/live/v2/videos/%s/sources/url?%s' % (clip_id, compat_urllib_parse_urlencode({ 'access_token': access_token, 'client_id': client_id, 'client_location': client_location, diff --git a/youtube_dl/extractor/pyvideo.py b/youtube_dl/extractor/pyvideo.py index 30a5f2de4..cc0416cb8 100644 --- a/youtube_dl/extractor/pyvideo.py +++ b/youtube_dl/extractor/pyvideo.py @@ -7,7 +7,7 @@ from .common import InfoExtractor class PyvideoIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?pyvideo\.org/video/(?P<id>\d+)/(.*)' + _VALID_URL = r'https?://(?:www\.)?pyvideo\.org/video/(?P<id>\d+)/(.*)' _TESTS = [ { diff --git a/youtube_dl/extractor/qqmusic.py b/youtube_dl/extractor/qqmusic.py index 45a3c41c5..ff0af9543 100644 --- a/youtube_dl/extractor/qqmusic.py +++ b/youtube_dl/extractor/qqmusic.py @@ -18,7 +18,7 @@ from ..utils import ( class QQMusicIE(InfoExtractor): IE_NAME = 'qqmusic' IE_DESC = 'QQ音乐' - _VALID_URL = r'http://y.qq.com/#type=song&mid=(?P<id>[0-9A-Za-z]+)' + _VALID_URL = r'https?://y.qq.com/#type=song&mid=(?P<id>[0-9A-Za-z]+)' _TESTS = [{ 'url': 'http://y.qq.com/#type=song&mid=004295Et37taLD', 'md5': '9ce1c1c8445f561506d2e3cfb0255705', @@ -172,7 +172,7 @@ class QQPlaylistBaseIE(InfoExtractor): class QQMusicSingerIE(QQPlaylistBaseIE): IE_NAME = 'qqmusic:singer' IE_DESC = 'QQ音乐 - 歌手' - _VALID_URL = r'http://y.qq.com/#type=singer&mid=(?P<id>[0-9A-Za-z]+)' + _VALID_URL = r'https?://y.qq.com/#type=singer&mid=(?P<id>[0-9A-Za-z]+)' _TEST = { 'url': 'http://y.qq.com/#type=singer&mid=001BLpXF2DyJe2', 'info_dict': { @@ -217,7 +217,7 @@ class QQMusicSingerIE(QQPlaylistBaseIE): class QQMusicAlbumIE(QQPlaylistBaseIE): IE_NAME = 'qqmusic:album' IE_DESC = 'QQ音乐 - 专辑' - _VALID_URL = r'http://y.qq.com/#type=album&mid=(?P<id>[0-9A-Za-z]+)' + _VALID_URL = r'https?://y.qq.com/#type=album&mid=(?P<id>[0-9A-Za-z]+)' _TESTS = [{ 'url': 'http://y.qq.com/#type=album&mid=000gXCTb2AhRR1', @@ -260,7 +260,7 @@ class QQMusicAlbumIE(QQPlaylistBaseIE): class QQMusicToplistIE(QQPlaylistBaseIE): IE_NAME = 'qqmusic:toplist' IE_DESC = 'QQ音乐 - 排行榜' - _VALID_URL = r'http://y\.qq\.com/#type=toplist&p=(?P<id>(top|global)_[0-9]+)' + _VALID_URL = r'https?://y\.qq\.com/#type=toplist&p=(?P<id>(top|global)_[0-9]+)' _TESTS = [{ 'url': 'http://y.qq.com/#type=toplist&p=global_123', @@ -314,7 +314,7 @@ class QQMusicToplistIE(QQPlaylistBaseIE): class QQMusicPlaylistIE(QQPlaylistBaseIE): IE_NAME = 'qqmusic:playlist' IE_DESC = 'QQ音乐 - 歌单' - _VALID_URL = r'http://y\.qq\.com/#type=taoge&id=(?P<id>[0-9]+)' + _VALID_URL = r'https?://y\.qq\.com/#type=taoge&id=(?P<id>[0-9]+)' _TESTS = [{ 'url': 'http://y.qq.com/#type=taoge&id=3462654915', diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py index a4dc5c335..e36ce1aa1 100644 --- a/youtube_dl/extractor/rai.py +++ b/youtube_dl/extractor/rai.py @@ -18,7 +18,7 @@ from ..utils import ( class RaiTVIE(InfoExtractor): - _VALID_URL = r'http://(?:.+?\.)?(?:rai\.it|rai\.tv|rainews\.it)/dl/(?:[^/]+/)+media/.+?-(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html' + _VALID_URL = r'https?://(?:.+?\.)?(?:rai\.it|rai\.tv|rainews\.it)/dl/(?:[^/]+/)+media/.+?-(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html' _TESTS = [ { 'url': 'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-cb27157f-9dd0-4aee-b788-b1f67643a391.html', @@ -175,7 +175,7 @@ class RaiTVIE(InfoExtractor): class RaiIE(InfoExtractor): - _VALID_URL = r'http://(?:.+?\.)?(?:rai\.it|rai\.tv|rainews\.it)/dl/.+?-(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html' + _VALID_URL = r'https?://(?:.+?\.)?(?:rai\.it|rai\.tv|rainews\.it)/dl/.+?-(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html' _TESTS = [ { 'url': 'http://www.report.rai.it/dl/Report/puntata/ContentItem-0c7a664b-d0f4-4b2c-8835-3f82e46f433e.html', diff --git a/youtube_dl/extractor/redtube.py b/youtube_dl/extractor/redtube.py index d6054d717..7ba41ba59 100644 --- a/youtube_dl/extractor/redtube.py +++ b/youtube_dl/extractor/redtube.py @@ -5,7 +5,7 @@ from ..utils import ExtractorError class RedTubeIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?redtube\.com/(?P<id>[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?redtube\.com/(?P<id>[0-9]+)' _TEST = { 'url': 'http://www.redtube.com/66418', 'md5': '7b8c22b5e7098a3e1c09709df1126d2d', diff --git a/youtube_dl/extractor/restudy.py b/youtube_dl/extractor/restudy.py index b17c2bfc0..fd50065d4 100644 --- a/youtube_dl/extractor/restudy.py +++ b/youtube_dl/extractor/restudy.py @@ -31,6 +31,7 @@ class RestudyIE(InfoExtractor): formats = self._extract_smil_formats( 'https://www.restudy.dk/awsmedia/SmilDirectory/video_%s.xml' % video_id, video_id) + self._sort_formats(formats) return { 'id': video_id, diff --git a/youtube_dl/extractor/ringtv.py b/youtube_dl/extractor/ringtv.py index 508758075..2c2c707bd 100644 --- a/youtube_dl/extractor/ringtv.py +++ b/youtube_dl/extractor/ringtv.py @@ -6,7 +6,7 @@ from .common import InfoExtractor class RingTVIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?ringtv\.craveonline\.com/(?P<type>news|videos/video)/(?P<id>[^/?#]+)' + _VALID_URL = r'https?://(?:www\.)?ringtv\.craveonline\.com/(?P<type>news|videos/video)/(?P<id>[^/?#]+)' _TEST = { 'url': 'http://ringtv.craveonline.com/news/310833-luis-collazo-says-victor-ortiz-better-not-quit-on-jan-30', 'md5': 'd25945f5df41cdca2d2587165ac28720', diff --git a/youtube_dl/extractor/rte.py b/youtube_dl/extractor/rte.py index 042bc8dab..9c89974e7 100644 --- a/youtube_dl/extractor/rte.py +++ b/youtube_dl/extractor/rte.py @@ -49,6 +49,7 @@ class RteIE(InfoExtractor): # f4m_url = server + relative_url f4m_url = json_string['shows'][0]['media:group'][0]['rte:server'] + json_string['shows'][0]['media:group'][0]['url'] f4m_formats = self._extract_f4m_formats(f4m_url, video_id) + self._sort_formats(f4m_formats) return { 'id': video_id, diff --git a/youtube_dl/extractor/rtve.py b/youtube_dl/extractor/rtve.py index 8a8c5d2a0..79af47715 100644 --- a/youtube_dl/extractor/rtve.py +++ b/youtube_dl/extractor/rtve.py @@ -62,7 +62,7 @@ def _decrypt_url(png): class RTVEALaCartaIE(InfoExtractor): IE_NAME = 'rtve.es:alacarta' IE_DESC = 'RTVE a la carta' - _VALID_URL = r'http://www\.rtve\.es/(m/)?alacarta/videos/[^/]+/[^/]+/(?P<id>\d+)' + _VALID_URL = r'https?://www\.rtve\.es/(m/)?alacarta/videos/[^/]+/[^/]+/(?P<id>\d+)' _TESTS = [{ 'url': 'http://www.rtve.es/alacarta/videos/balonmano/o-swiss-cup-masculina-final-espana-suecia/2491869/', @@ -179,7 +179,7 @@ class RTVEInfantilIE(InfoExtractor): class RTVELiveIE(InfoExtractor): IE_NAME = 'rtve.es:live' IE_DESC = 'RTVE.es live streams' - _VALID_URL = r'http://www\.rtve\.es/directo/(?P<id>[a-zA-Z0-9-]+)' + _VALID_URL = r'https?://www\.rtve\.es/directo/(?P<id>[a-zA-Z0-9-]+)' _TESTS = [{ 'url': 'http://www.rtve.es/directo/la-1/', @@ -209,6 +209,7 @@ class RTVELiveIE(InfoExtractor): png = self._download_webpage(png_url, video_id, 'Downloading url information') m3u8_url = _decrypt_url(png) formats = self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4') + self._sort_formats(formats) return { 'id': video_id, diff --git a/youtube_dl/extractor/rtvnh.py b/youtube_dl/extractor/rtvnh.py index 7c9d4b0cd..4896d09d6 100644 --- a/youtube_dl/extractor/rtvnh.py +++ b/youtube_dl/extractor/rtvnh.py @@ -38,6 +38,7 @@ class RTVNHIE(InfoExtractor): item['file'], video_id, ext='mp4', entry_protocol='m3u8_native')) elif item.get('type') == '': formats.append({'url': item['file']}) + self._sort_formats(formats) return { 'id': video_id, diff --git a/youtube_dl/extractor/ruhd.py b/youtube_dl/extractor/ruhd.py index 0e470e73f..1f7c26299 100644 --- a/youtube_dl/extractor/ruhd.py +++ b/youtube_dl/extractor/ruhd.py @@ -5,7 +5,7 @@ from .common import InfoExtractor class RUHDIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?ruhd\.ru/play\.php\?vid=(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?ruhd\.ru/play\.php\?vid=(?P<id>\d+)' _TEST = { 'url': 'http://www.ruhd.ru/play.php?vid=207', 'md5': 'd1a9ec4edf8598e3fbd92bb16072ba83', diff --git a/youtube_dl/extractor/rutube.py b/youtube_dl/extractor/rutube.py index c5c47d01e..9ca4ae147 100644 --- a/youtube_dl/extractor/rutube.py +++ b/youtube_dl/extractor/rutube.py @@ -122,7 +122,7 @@ class RutubeEmbedIE(InfoExtractor): class RutubeChannelIE(InfoExtractor): IE_NAME = 'rutube:channel' IE_DESC = 'Rutube channels' - _VALID_URL = r'http://rutube\.ru/tags/video/(?P<id>\d+)' + _VALID_URL = r'https?://rutube\.ru/tags/video/(?P<id>\d+)' _TESTS = [{ 'url': 'http://rutube.ru/tags/video/1800/', 'info_dict': { @@ -156,7 +156,7 @@ class RutubeChannelIE(InfoExtractor): class RutubeMovieIE(RutubeChannelIE): IE_NAME = 'rutube:movie' IE_DESC = 'Rutube movies' - _VALID_URL = r'http://rutube\.ru/metainfo/tv/(?P<id>\d+)' + _VALID_URL = r'https?://rutube\.ru/metainfo/tv/(?P<id>\d+)' _TESTS = [] _MOVIE_TEMPLATE = 'http://rutube.ru/api/metainfo/tv/%s/?format=json' @@ -174,7 +174,7 @@ class RutubeMovieIE(RutubeChannelIE): class RutubePersonIE(RutubeChannelIE): IE_NAME = 'rutube:person' IE_DESC = 'Rutube person videos' - _VALID_URL = r'http://rutube\.ru/video/person/(?P<id>\d+)' + _VALID_URL = r'https?://rutube\.ru/video/person/(?P<id>\d+)' _TESTS = [{ 'url': 'http://rutube.ru/video/person/313878/', 'info_dict': { diff --git a/youtube_dl/extractor/rutv.py b/youtube_dl/extractor/rutv.py index f7fe1fece..a2379eb04 100644 --- a/youtube_dl/extractor/rutv.py +++ b/youtube_dl/extractor/rutv.py @@ -14,7 +14,7 @@ class RUTVIE(InfoExtractor): IE_DESC = 'RUTV.RU' _VALID_URL = r'''(?x) https?://player\.(?:rutv\.ru|vgtrk\.com)/ - (?P<path>flash2v/container\.swf\?id= + (?P<path>flash\d+v/container\.swf\?id= |iframe/(?P<type>swf|video|live)/id/ |index/iframe/cast_id/) (?P<id>\d+)''' @@ -109,7 +109,7 @@ class RUTVIE(InfoExtractor): return mobj.group('url') mobj = re.search( - r'<meta[^>]+?property=(["\'])og:video\1[^>]+?content=(["\'])(?P<url>https?://player\.(?:rutv\.ru|vgtrk\.com)/flash2v/container\.swf\?id=.+?\2)', + r'<meta[^>]+?property=(["\'])og:video\1[^>]+?content=(["\'])(?P<url>https?://player\.(?:rutv\.ru|vgtrk\.com)/flash\d+v/container\.swf\?id=.+?\2)', webpage) if mobj: return mobj.group('url') @@ -119,7 +119,7 @@ class RUTVIE(InfoExtractor): video_id = mobj.group('id') video_path = mobj.group('path') - if video_path.startswith('flash2v'): + if re.match(r'flash\d+v', video_path): video_type = 'video' elif video_path.startswith('iframe'): video_type = mobj.group('type') @@ -168,7 +168,7 @@ class RUTVIE(InfoExtractor): 'play_path': mobj.group('playpath'), 'app': mobj.group('app'), 'page_url': 'http://player.rutv.ru', - 'player_url': 'http://player.rutv.ru/flash2v/osmf.swf?i=22', + 'player_url': 'http://player.rutv.ru/flash3v/osmf.swf?i=22', 'rtmp_live': True, 'ext': 'flv', 'vbr': int(quality), diff --git a/youtube_dl/extractor/safari.py b/youtube_dl/extractor/safari.py index 256396bb8..6ba91f202 100644 --- a/youtube_dl/extractor/safari.py +++ b/youtube_dl/extractor/safari.py @@ -75,16 +75,7 @@ class SafariBaseIE(InfoExtractor): class SafariIE(SafariBaseIE): IE_NAME = 'safari' IE_DESC = 'safaribooksonline.com online video' - _VALID_URL = r'''(?x)https?:// - (?:www\.)?safaribooksonline\.com/ - (?: - library/view/[^/]+| - api/v1/book - )/ - (?P<course_id>[^/]+)/ - (?:chapter(?:-content)?/)? - (?P<part>part\d+)\.html - ''' + _VALID_URL = r'https?://(?:www\.)?safaribooksonline\.com/library/view/[^/]+/(?P<course_id>[^/]+)/(?P<part>part\d+)\.html' _TESTS = [{ 'url': 'https://www.safaribooksonline.com/library/view/hadoop-fundamentals-livelessons/9780133392838/part00.html', @@ -97,9 +88,6 @@ class SafariIE(SafariBaseIE): 'upload_date': '20150724', 'uploader_id': 'stork', }, - }, { - 'url': 'https://www.safaribooksonline.com/api/v1/book/9780133392838/chapter/part00.html', - 'only_matching': True, }, { # non-digits in course id 'url': 'https://www.safaribooksonline.com/library/view/create-a-nodejs/100000006A0210/part00.html', @@ -108,13 +96,18 @@ class SafariIE(SafariBaseIE): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - course_id = mobj.group('course_id') - part = mobj.group('part') + video_id = '%s/%s' % (mobj.group('course_id'), mobj.group('part')) - webpage = self._download_webpage(url, '%s/%s' % (course_id, part)) - reference_id = self._search_regex(r'data-reference-id="([^"]+)"', webpage, 'kaltura reference id') - partner_id = self._search_regex(r'data-partner-id="([^"]+)"', webpage, 'kaltura widget id') - ui_id = self._search_regex(r'data-ui-id="([^"]+)"', webpage, 'kaltura uiconf id') + webpage = self._download_webpage(url, video_id) + reference_id = self._search_regex( + r'data-reference-id=(["\'])(?P<id>.+?)\1', + webpage, 'kaltura reference id', group='id') + partner_id = self._search_regex( + r'data-partner-id=(["\'])(?P<id>.+?)\1', + webpage, 'kaltura widget id', group='id') + ui_id = self._search_regex( + r'data-ui-id=(["\'])(?P<id>.+?)\1', + webpage, 'kaltura uiconf id', group='id') query = { 'wid': '_%s' % partner_id, @@ -125,7 +118,7 @@ class SafariIE(SafariBaseIE): if self.LOGGED_IN: kaltura_session = self._download_json( '%s/player/kaltura_session/?reference_id=%s' % (self._API_BASE, reference_id), - course_id, 'Downloading kaltura session JSON', + video_id, 'Downloading kaltura session JSON', 'Unable to download kaltura session JSON', fatal=False) if kaltura_session: session = kaltura_session.get('session') @@ -137,6 +130,23 @@ class SafariIE(SafariBaseIE): 'Kaltura') +class SafariApiIE(SafariBaseIE): + IE_NAME = 'safari:api' + _VALID_URL = r'https?://(?:www\.)?safaribooksonline\.com/api/v1/book/(?P<course_id>[^/]+)/chapter(?:-content)?/(?P<part>part\d+)\.html' + + _TEST = { + 'url': 'https://www.safaribooksonline.com/api/v1/book/9780133392838/chapter/part00.html', + 'only_matching': True, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + part = self._download_json( + url, '%s/%s' % (mobj.group('course_id'), mobj.group('part')), + 'Downloading part JSON') + return self.url_result(part['web_url'], SafariIE.ie_key()) + + class SafariCourseIE(SafariBaseIE): IE_NAME = 'safari:course' IE_DESC = 'safaribooksonline.com online courses' @@ -168,7 +178,7 @@ class SafariCourseIE(SafariBaseIE): 'No chapters found for course %s' % course_id, expected=True) entries = [ - self.url_result(chapter, 'Safari') + self.url_result(chapter, SafariApiIE.ie_key()) for chapter in course_json['chapters']] course_title = course_json['title'] diff --git a/youtube_dl/extractor/sbs.py b/youtube_dl/extractor/sbs.py index d6ee2d9e2..96472fbc4 100644 --- a/youtube_dl/extractor/sbs.py +++ b/youtube_dl/extractor/sbs.py @@ -2,6 +2,10 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..utils import ( + smuggle_url, + ExtractorError, +) class SBSIE(InfoExtractor): @@ -20,6 +24,9 @@ class SBSIE(InfoExtractor): 'description': 'md5:f250a9856fca50d22dec0b5b8015f8a5', 'thumbnail': 're:http://.*\.jpg', 'duration': 308, + 'timestamp': 1408613220, + 'upload_date': '20140821', + 'uploader': 'SBSC', }, }, { 'url': 'http://www.sbs.com.au/ondemand/video/320403011771/Dingo-Conservation-The-Feed', @@ -31,21 +38,29 @@ class SBSIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) + player_params = self._download_json( + 'http://www.sbs.com.au/api/video_pdkvars/id/%s?form=json' % video_id, video_id) - webpage = self._download_webpage( - 'http://www.sbs.com.au/ondemand/video/single/%s?context=web' % video_id, video_id) - - player_params = self._parse_json( - self._search_regex( - r'(?s)var\s+playerParams\s*=\s*({.+?});', webpage, 'playerParams'), - video_id) + error = player_params.get('error') + if error: + error_message = 'Sorry, The video you are looking for does not exist.' + video_data = error.get('results') or {} + error_code = error.get('errorCode') + if error_code == 'ComingSoon': + error_message = '%s is not yet available.' % video_data.get('title', '') + elif error_code in ('Forbidden', 'intranetAccessOnly'): + error_message = 'Sorry, This video cannot be accessed via this website' + elif error_code == 'Expired': + error_message = 'Sorry, %s is no longer available.' % video_data.get('title', '') + raise ExtractorError('%s said: %s' % (self.IE_NAME, error_message), expected=True) urls = player_params['releaseUrls'] - theplatform_url = (urls.get('progressive') or urls.get('standard') or - urls.get('html') or player_params['relatedItemsURL']) + theplatform_url = (urls.get('progressive') or urls.get('html') or + urls.get('standard') or player_params['relatedItemsURL']) return { '_type': 'url_transparent', + 'ie_key': 'ThePlatform', 'id': video_id, - 'url': theplatform_url, + 'url': smuggle_url(self._proto_relative_url(theplatform_url), {'force_smil_url': True}), } diff --git a/youtube_dl/extractor/screencast.py b/youtube_dl/extractor/screencast.py index dfd897ba3..356631700 100644 --- a/youtube_dl/extractor/screencast.py +++ b/youtube_dl/extractor/screencast.py @@ -12,7 +12,7 @@ from ..utils import ( class ScreencastIE(InfoExtractor): - _VALID_URL = r'https?://www\.screencast\.com/t/(?P<id>[a-zA-Z0-9]+)' + _VALID_URL = r'https?://(?:www\.)?screencast\.com/t/(?P<id>[a-zA-Z0-9]+)' _TESTS = [{ 'url': 'http://www.screencast.com/t/3ZEjQXlT', 'md5': '917df1c13798a3e96211dd1561fded83', @@ -53,8 +53,10 @@ class ScreencastIE(InfoExtractor): 'description': 'md5:7b9f393bc92af02326a5c5889639eab0', 'thumbnail': 're:^https?://.*\.(?:gif|jpg)$', } - }, - ] + }, { + 'url': 'http://screencast.com/t/aAB3iowa', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) @@ -94,8 +96,9 @@ class ScreencastIE(InfoExtractor): title = self._og_search_title(webpage, default=None) if title is None: title = self._html_search_regex( - [r'<b>Title:</b> ([^<]*)</div>', - r'class="tabSeperator">></span><span class="tabText">(.*?)<'], + [r'<b>Title:</b> ([^<]+)</div>', + r'class="tabSeperator">></span><span class="tabText">(.+?)<', + r'<title>([^<]+)'], webpage, 'title') thumbnail = self._og_search_thumbnail(webpage) description = self._og_search_description(webpage, default=None) diff --git a/youtube_dl/extractor/screenjunkies.py b/youtube_dl/extractor/screenjunkies.py index f2af15f6b..dd0a6ba19 100644 --- a/youtube_dl/extractor/screenjunkies.py +++ b/youtube_dl/extractor/screenjunkies.py @@ -11,7 +11,7 @@ from ..utils import ( class ScreenJunkiesIE(InfoExtractor): - _VALID_URL = r'http://www.screenjunkies.com/video/(?P[^/]+?)(?:-(?P\d+))?(?:[/?#&]|$)' + _VALID_URL = r'https?://www.screenjunkies.com/video/(?P[^/]+?)(?:-(?P\d+))?(?:[/?#&]|$)' _TESTS = [{ 'url': 'http://www.screenjunkies.com/video/best-quentin-tarantino-movie-2841915', 'md5': '5c2b686bec3d43de42bde9ec047536b0', diff --git a/youtube_dl/extractor/senateisvp.py b/youtube_dl/extractor/senateisvp.py index 4d3b58522..c5f474dd1 100644 --- a/youtube_dl/extractor/senateisvp.py +++ b/youtube_dl/extractor/senateisvp.py @@ -48,7 +48,7 @@ class SenateISVPIE(InfoExtractor): ['arch', '', 'http://ussenate-f.akamaihd.net/'] ] _IE_NAME = 'senate.gov' - _VALID_URL = r'http://www\.senate\.gov/isvp/?\?(?P.+)' + _VALID_URL = r'https?://www\.senate\.gov/isvp/?\?(?P.+)' _TESTS = [{ 'url': 'http://www.senate.gov/isvp/?comm=judiciary&type=live&stt=&filename=judiciary031715&auto_play=false&wmode=transparent&poster=http%3A%2F%2Fwww.judiciary.senate.gov%2Fthemes%2Fjudiciary%2Fimages%2Fvideo-poster-flash-fit.png', 'info_dict': { diff --git a/youtube_dl/extractor/shahid.py b/youtube_dl/extractor/shahid.py index 1178b7a27..d95ea06be 100644 --- a/youtube_dl/extractor/shahid.py +++ b/youtube_dl/extractor/shahid.py @@ -2,7 +2,7 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import compat_urllib_parse +from ..compat import compat_urllib_parse_urlencode from ..utils import ( ExtractorError, int_or_none, @@ -77,11 +77,12 @@ class ShahidIE(InfoExtractor): raise ExtractorError('This video is DRM protected.', expected=True) formats = self._extract_m3u8_formats(player['url'], video_id, 'mp4') + self._sort_formats(formats) video = self._download_json( '%s/%s/%s?%s' % ( api_vars['url'], api_vars['playerType'], api_vars['id'], - compat_urllib_parse.urlencode({ + compat_urllib_parse_urlencode({ 'apiKey': 'sh@hid0nlin3', 'hash': 'b2wMCTHpSmyxGqQjJFOycRmLSex+BpTK/ooxy6vHaqs=', })), diff --git a/youtube_dl/extractor/shared.py b/youtube_dl/extractor/shared.py index 8eda3c864..e7e5f653e 100644 --- a/youtube_dl/extractor/shared.py +++ b/youtube_dl/extractor/shared.py @@ -3,17 +3,17 @@ from __future__ import unicode_literals import base64 from .common import InfoExtractor -from ..compat import compat_urllib_parse from ..utils import ( ExtractorError, int_or_none, sanitized_Request, + urlencode_postdata, ) class SharedIE(InfoExtractor): IE_DESC = 'shared.sx and vivo.sx' - _VALID_URL = r'http://(?:shared|vivo)\.sx/(?P[\da-z]{10})' + _VALID_URL = r'https?://(?:shared|vivo)\.sx/(?P[\da-z]{10})' _TESTS = [{ 'url': 'http://shared.sx/0060718775', @@ -45,7 +45,7 @@ class SharedIE(InfoExtractor): download_form = self._hidden_inputs(webpage) request = sanitized_Request( - url, compat_urllib_parse.urlencode(download_form)) + url, urlencode_postdata(download_form)) request.add_header('Content-Type', 'application/x-www-form-urlencoded') video_page = self._download_webpage( diff --git a/youtube_dl/extractor/sharesix.py b/youtube_dl/extractor/sharesix.py index f1ea9bdb2..9cce5ceb4 100644 --- a/youtube_dl/extractor/sharesix.py +++ b/youtube_dl/extractor/sharesix.py @@ -4,10 +4,10 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_urllib_parse from ..utils import ( parse_duration, sanitized_Request, + urlencode_postdata, ) @@ -47,7 +47,7 @@ class ShareSixIE(InfoExtractor): fields = { 'method_free': 'Free' } - post = compat_urllib_parse.urlencode(fields) + post = urlencode_postdata(fields) req = sanitized_Request(url, post) req.add_header('Content-type', 'application/x-www-form-urlencoded') diff --git a/youtube_dl/extractor/sina.py b/youtube_dl/extractor/sina.py index b2258a0f6..d03f1b1d4 100644 --- a/youtube_dl/extractor/sina.py +++ b/youtube_dl/extractor/sina.py @@ -4,7 +4,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_urllib_parse +from ..compat import compat_urllib_parse_urlencode from ..utils import sanitized_Request @@ -39,7 +39,7 @@ class SinaIE(InfoExtractor): ] def _extract_video(self, video_id): - data = compat_urllib_parse.urlencode({'vid': video_id}) + data = compat_urllib_parse_urlencode({'vid': video_id}) url_doc = self._download_xml('http://v.iask.com/v_play.php?%s' % data, video_id, 'Downloading video url') image_page = self._download_webpage( diff --git a/youtube_dl/extractor/smotri.py b/youtube_dl/extractor/smotri.py index 015ef75f3..5c3fd0fec 100644 --- a/youtube_dl/extractor/smotri.py +++ b/youtube_dl/extractor/smotri.py @@ -7,12 +7,12 @@ import hashlib import uuid from .common import InfoExtractor -from ..compat import compat_urllib_parse from ..utils import ( ExtractorError, int_or_none, sanitized_Request, unified_strdate, + urlencode_postdata, ) @@ -175,7 +175,7 @@ class SmotriIE(InfoExtractor): video_form['pass'] = hashlib.md5(video_password.encode('utf-8')).hexdigest() request = sanitized_Request( - 'http://smotri.com/video/view/url/bot/', compat_urllib_parse.urlencode(video_form)) + 'http://smotri.com/video/view/url/bot/', urlencode_postdata(video_form)) request.add_header('Content-Type', 'application/x-www-form-urlencoded') video = self._download_json(request, video_id, 'Downloading video JSON') @@ -338,7 +338,7 @@ class SmotriBroadcastIE(InfoExtractor): } request = sanitized_Request( - broadcast_url + '/?no_redirect=1', compat_urllib_parse.urlencode(login_form)) + broadcast_url + '/?no_redirect=1', urlencode_postdata(login_form)) request.add_header('Content-Type', 'application/x-www-form-urlencoded') broadcast_page = self._download_webpage( request, broadcast_id, 'Logging in and confirming age') diff --git a/youtube_dl/extractor/sohu.py b/youtube_dl/extractor/sohu.py index ea8fc258d..49e5d09ae 100644 --- a/youtube_dl/extractor/sohu.py +++ b/youtube_dl/extractor/sohu.py @@ -6,7 +6,7 @@ import re from .common import InfoExtractor from ..compat import ( compat_str, - compat_urllib_parse, + compat_urllib_parse_urlencode, ) from ..utils import ( ExtractorError, @@ -170,7 +170,7 @@ class SohuIE(InfoExtractor): if retries > 0: download_note += ' (retry #%d)' % retries part_info = self._parse_json(self._download_webpage( - 'http://%s/?%s' % (allot, compat_urllib_parse.urlencode(params)), + 'http://%s/?%s' % (allot, compat_urllib_parse_urlencode(params)), video_id, download_note), video_id) video_url = part_info['url'] diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 1efb2b980..194dabc71 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -11,10 +11,9 @@ from .common import ( from ..compat import ( compat_str, compat_urlparse, - compat_urllib_parse, + compat_urllib_parse_urlencode, ) from ..utils import ( - encode_dict, ExtractorError, int_or_none, unified_strdate, @@ -393,7 +392,7 @@ class SoundcloudUserIE(SoundcloudIE): query = COMMON_QUERY.copy() query['offset'] = 0 - next_href = base_url + '?' + compat_urllib_parse.urlencode(query) + next_href = base_url + '?' + compat_urllib_parse_urlencode(query) entries = [] for i in itertools.count(): @@ -424,7 +423,7 @@ class SoundcloudUserIE(SoundcloudIE): qs = compat_urlparse.parse_qs(parsed_next_href.query) qs.update(COMMON_QUERY) next_href = compat_urlparse.urlunparse( - parsed_next_href._replace(query=compat_urllib_parse.urlencode(qs, True))) + parsed_next_href._replace(query=compat_urllib_parse_urlencode(qs, True))) return { '_type': 'playlist', @@ -460,7 +459,7 @@ class SoundcloudPlaylistIE(SoundcloudIE): if token: data_dict['secret_token'] = token - data = compat_urllib_parse.urlencode(data_dict) + data = compat_urllib_parse_urlencode(data_dict) data = self._download_json( base_url + data, playlist_id, 'Downloading playlist') @@ -500,7 +499,7 @@ class SoundcloudSearchIE(SearchInfoExtractor, SoundcloudIE): query['client_id'] = self._CLIENT_ID query['linked_partitioning'] = '1' query['offset'] = 0 - data = compat_urllib_parse.urlencode(encode_dict(query)) + data = compat_urllib_parse_urlencode(query) next_url = '{0}{1}?{2}'.format(self._API_V2_BASE, endpoint, data) collected_results = 0 diff --git a/youtube_dl/extractor/sport5.py b/youtube_dl/extractor/sport5.py index dfe50ed45..7e6783306 100644 --- a/youtube_dl/extractor/sport5.py +++ b/youtube_dl/extractor/sport5.py @@ -8,7 +8,7 @@ from ..utils import ExtractorError class Sport5IE(InfoExtractor): - _VALID_URL = r'http://(?:www|vod)?\.sport5\.co\.il/.*\b(?:Vi|docID)=(?P\d+)' + _VALID_URL = r'https?://(?:www|vod)?\.sport5\.co\.il/.*\b(?:Vi|docID)=(?P\d+)' _TESTS = [ { 'url': 'http://vod.sport5.co.il/?Vc=147&Vi=176331&Page=1', diff --git a/youtube_dl/extractor/sportbox.py b/youtube_dl/extractor/sportbox.py index 86d509ae5..4f0c66213 100644 --- a/youtube_dl/extractor/sportbox.py +++ b/youtube_dl/extractor/sportbox.py @@ -99,6 +99,7 @@ class SportBoxEmbedIE(InfoExtractor): webpage, 'hls file') formats = self._extract_m3u8_formats(hls, video_id, 'mp4') + self._sort_formats(formats) title = self._search_regex( r'sportboxPlayer\.node_title\s*=\s*"([^"]+)"', webpage, 'title') diff --git a/youtube_dl/extractor/ssa.py b/youtube_dl/extractor/ssa.py index 13101c714..54d1843f2 100644 --- a/youtube_dl/extractor/ssa.py +++ b/youtube_dl/extractor/ssa.py @@ -8,7 +8,7 @@ from ..utils import ( class SSAIE(InfoExtractor): - _VALID_URL = r'http://ssa\.nls\.uk/film/(?P\d+)' + _VALID_URL = r'https?://ssa\.nls\.uk/film/(?P\d+)' _TEST = { 'url': 'http://ssa.nls.uk/film/3561', 'info_dict': { diff --git a/youtube_dl/extractor/streamcloud.py b/youtube_dl/extractor/streamcloud.py index 77841b946..712359885 100644 --- a/youtube_dl/extractor/streamcloud.py +++ b/youtube_dl/extractor/streamcloud.py @@ -4,8 +4,10 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_urllib_parse -from ..utils import sanitized_Request +from ..utils import ( + sanitized_Request, + urlencode_postdata, +) class StreamcloudIE(InfoExtractor): @@ -35,7 +37,7 @@ class StreamcloudIE(InfoExtractor): (?:id="[^"]+"\s+)? value="([^"]*)" ''', orig_webpage) - post = compat_urllib_parse.urlencode(fields) + post = urlencode_postdata(fields) self._sleep(12, video_id) headers = { diff --git a/youtube_dl/extractor/svt.py b/youtube_dl/extractor/svt.py index 399c3b8ee..2ab30e45f 100644 --- a/youtube_dl/extractor/svt.py +++ b/youtube_dl/extractor/svt.py @@ -19,20 +19,25 @@ class SVTBaseIE(InfoExtractor): video_info = info['video'] formats = [] for vr in video_info['videoReferences']: + player_type = vr.get('playerType') vurl = vr['url'] ext = determine_ext(vurl) if ext == 'm3u8': formats.extend(self._extract_m3u8_formats( vurl, video_id, ext='mp4', entry_protocol='m3u8_native', - m3u8_id=vr.get('playerType'))) + m3u8_id=player_type, fatal=False)) elif ext == 'f4m': formats.extend(self._extract_f4m_formats( vurl + '?hdcore=3.3.0', video_id, - f4m_id=vr.get('playerType'))) + f4m_id=player_type, fatal=False)) + elif ext == 'mpd': + if player_type == 'dashhbbtv': + formats.extend(self._extract_mpd_formats( + vurl, video_id, mpd_id=player_type, fatal=False)) else: formats.append({ - 'format_id': vr.get('playerType'), + 'format_id': player_type, 'url': vurl, }) self._sort_formats(formats) diff --git a/youtube_dl/extractor/sztvhu.py b/youtube_dl/extractor/sztvhu.py index aa5964acb..f562aa6d3 100644 --- a/youtube_dl/extractor/sztvhu.py +++ b/youtube_dl/extractor/sztvhu.py @@ -5,7 +5,7 @@ from .common import InfoExtractor class SztvHuIE(InfoExtractor): - _VALID_URL = r'http://(?:(?:www\.)?sztv\.hu|www\.tvszombathely\.hu)/(?:[^/]+)/.+-(?P[0-9]+)' + _VALID_URL = r'https?://(?:(?:www\.)?sztv\.hu|www\.tvszombathely\.hu)/(?:[^/]+)/.+-(?P[0-9]+)' _TEST = { 'url': 'http://sztv.hu/hirek/cserkeszek-nepszerusitettek-a-kornyezettudatos-eletmodot-a-savaria-teren-20130909', 'md5': 'a6df607b11fb07d0e9f2ad94613375cb', diff --git a/youtube_dl/extractor/teamcoco.py b/youtube_dl/extractor/teamcoco.py index d1b7264b4..b49ab5f5b 100644 --- a/youtube_dl/extractor/teamcoco.py +++ b/youtube_dl/extractor/teamcoco.py @@ -16,7 +16,7 @@ from ..compat import compat_ord class TeamcocoIE(InfoExtractor): - _VALID_URL = r'http://teamcoco\.com/video/(?P[0-9]+)?/?(?P.*)' + _VALID_URL = r'https?://teamcoco\.com/video/(?P[0-9]+)?/?(?P.*)' _TESTS = [ { 'url': 'http://teamcoco.com/video/80187/conan-becomes-a-mary-kay-beauty-consultant', diff --git a/youtube_dl/extractor/tele13.py b/youtube_dl/extractor/tele13.py index 4e860db0a..a29a64b6d 100644 --- a/youtube_dl/extractor/tele13.py +++ b/youtube_dl/extractor/tele13.py @@ -11,7 +11,7 @@ from ..utils import ( class Tele13IE(InfoExtractor): - _VALID_URL = r'^http://(?:www\.)?t13\.cl/videos(?:/[^/]+)+/(?P[\w-]+)' + _VALID_URL = r'^https?://(?:www\.)?t13\.cl/videos(?:/[^/]+)+/(?P[\w-]+)' _TESTS = [ { 'url': 'http://www.t13.cl/videos/actualidad/el-circulo-de-hierro-de-michelle-bachelet-en-su-regreso-a-la-moneda', diff --git a/youtube_dl/extractor/telecinco.py b/youtube_dl/extractor/telecinco.py index 2c8e9b941..4b4b740b4 100644 --- a/youtube_dl/extractor/telecinco.py +++ b/youtube_dl/extractor/telecinco.py @@ -5,8 +5,8 @@ import json from .common import InfoExtractor from ..compat import ( - compat_urllib_parse, compat_urllib_parse_unquote, + compat_urllib_parse_urlencode, compat_urlparse, ) from ..utils import ( @@ -74,7 +74,7 @@ class TelecincoIE(InfoExtractor): info_el = self._download_xml(info_url, episode).find('./video/info') video_link = info_el.find('videoUrl/link').text - token_query = compat_urllib_parse.urlencode({'id': video_link}) + token_query = compat_urllib_parse_urlencode({'id': video_link}) token_info = self._download_json( embed_data['flashvars']['ov_tk'] + '?' + token_query, episode, @@ -82,6 +82,7 @@ class TelecincoIE(InfoExtractor): ) formats = self._extract_m3u8_formats( token_info['tokenizedUrl'], episode, ext='mp4', entry_protocol='m3u8_native') + self._sort_formats(formats) return { 'id': embed_data['videoId'], diff --git a/youtube_dl/extractor/tenplay.py b/youtube_dl/extractor/tenplay.py deleted file mode 100644 index 02a31a609..000000000 --- a/youtube_dl/extractor/tenplay.py +++ /dev/null @@ -1,90 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..utils import ( - int_or_none, - float_or_none, -) - - -class TenPlayIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?ten(play)?\.com\.au/.+' - _TEST = { - 'url': 'http://tenplay.com.au/ten-insider/extra/season-2013/tenplay-tv-your-way', - 'info_dict': { - 'id': '2695695426001', - 'ext': 'flv', - 'title': 'TENplay: TV your way', - 'description': 'Welcome to a new TV experience. Enjoy a taste of the TENplay benefits.', - 'timestamp': 1380150606.889, - 'upload_date': '20130925', - 'uploader': 'TENplay', - }, - 'params': { - 'skip_download': True, # Requires rtmpdump - } - } - - _video_fields = [ - 'id', 'name', 'shortDescription', 'longDescription', 'creationDate', - 'publishedDate', 'lastModifiedDate', 'customFields', 'videoStillURL', - 'thumbnailURL', 'referenceId', 'length', 'playsTotal', - 'playsTrailingWeek', 'renditions', 'captioning', 'startDate', 'endDate'] - - def _real_extract(self, url): - webpage = self._download_webpage(url, url) - video_id = self._html_search_regex( - r'videoID: "(\d+?)"', webpage, 'video_id') - api_token = self._html_search_regex( - r'apiToken: "([a-zA-Z0-9-_\.]+?)"', webpage, 'api_token') - title = self._html_search_regex( - r'', - webpage, 'title') - - json = self._download_json('https://api.brightcove.com/services/library?command=find_video_by_id&video_id=%s&token=%s&video_fields=%s' % (video_id, api_token, ','.join(self._video_fields)), title) - - formats = [] - for rendition in json['renditions']: - url = rendition['remoteUrl'] or rendition['url'] - protocol = 'rtmp' if url.startswith('rtmp') else 'http' - ext = 'flv' if protocol == 'rtmp' else rendition['videoContainer'].lower() - - if protocol == 'rtmp': - url = url.replace('&mp4:', '') - - tbr = int_or_none(rendition.get('encodingRate'), 1000) - - formats.append({ - 'format_id': '_'.join( - ['rtmp', rendition['videoContainer'].lower(), - rendition['videoCodec'].lower(), '%sk' % tbr]), - 'width': int_or_none(rendition['frameWidth']), - 'height': int_or_none(rendition['frameHeight']), - 'tbr': tbr, - 'filesize': int_or_none(rendition['size']), - 'protocol': protocol, - 'ext': ext, - 'vcodec': rendition['videoCodec'].lower(), - 'container': rendition['videoContainer'].lower(), - 'url': url, - }) - self._sort_formats(formats) - - return { - 'id': video_id, - 'display_id': json['referenceId'], - 'title': json['name'], - 'description': json['shortDescription'] or json['longDescription'], - 'formats': formats, - 'thumbnails': [{ - 'url': json['videoStillURL'] - }, { - 'url': json['thumbnailURL'] - }], - 'thumbnail': json['videoStillURL'], - 'duration': float_or_none(json.get('length'), 1000), - 'timestamp': float_or_none(json.get('creationDate'), 1000), - 'uploader': json.get('customFields', {}).get('production_company_distributor') or 'TENplay', - 'view_count': int_or_none(json.get('playsTotal')), - } diff --git a/youtube_dl/extractor/tf1.py b/youtube_dl/extractor/tf1.py index 9ee844684..3f54b2744 100644 --- a/youtube_dl/extractor/tf1.py +++ b/youtube_dl/extractor/tf1.py @@ -6,7 +6,7 @@ from .common import InfoExtractor class TF1IE(InfoExtractor): """TF1 uses the wat.tv player.""" - _VALID_URL = r'http://(?:(?:videos|www|lci)\.tf1|www\.tfou)\.fr/(?:[^/]+/)*(?P.+?)\.html' + _VALID_URL = r'https?://(?:(?:videos|www|lci)\.tf1|www\.tfou)\.fr/(?:[^/]+/)*(?P.+?)\.html' _TESTS = [{ 'url': 'http://videos.tf1.fr/auto-moto/citroen-grand-c4-picasso-2013-presentation-officielle-8062060.html', 'info_dict': { diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index 9a57b49df..6da701a39 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -8,13 +8,12 @@ import binascii import hashlib -from .common import InfoExtractor +from .once import OnceIE from ..compat import ( compat_parse_qs, compat_urllib_parse_urlparse, ) from ..utils import ( - determine_ext, ExtractorError, float_or_none, int_or_none, @@ -29,26 +28,27 @@ default_ns = 'http://www.w3.org/2005/SMIL21/Language' _x = lambda p: xpath_with_ns(p, {'smil': default_ns}) -class ThePlatformBaseIE(InfoExtractor): +class ThePlatformBaseIE(OnceIE): def _extract_theplatform_smil(self, smil_url, video_id, note='Downloading SMIL data'): - meta = self._download_xml(smil_url, video_id, note=note) - error_element = find_xpath_attr( - meta, _x('.//smil:ref'), 'src', - 'http://link.theplatform.com/s/errorFiles/Unavailable.mp4') - if error_element is not None: + meta = self._download_xml(smil_url, video_id, note=note, query={'format': 'SMIL'}) + error_element = find_xpath_attr(meta, _x('.//smil:ref'), 'src') + if error_element is not None and error_element.attrib['src'].startswith( + 'http://link.theplatform.com/s/errorFiles/Unavailable.'): raise ExtractorError(error_element.attrib['abstract'], expected=True) - formats = self._parse_smil_formats( + smil_formats = self._parse_smil_formats( meta, smil_url, video_id, namespace=default_ns, # the parameters are from syfy.com, other sites may use others, # they also work for nbc.com f4m_params={'g': 'UXWGVKRWHFSP', 'hdcore': '3.0.3'}, transform_rtmp_url=lambda streamer, src: (streamer, 'mp4:' + src)) - for _format in formats: - ext = determine_ext(_format['url']) - if ext == 'once': - _format['ext'] = 'mp4' + formats = [] + for _format in smil_formats: + if OnceIE.suitable(_format['url']): + formats.extend(self._extract_once_formats(_format['url'])) + else: + formats.append(_format) self._sort_formats(formats) @@ -76,13 +76,15 @@ class ThePlatformBaseIE(InfoExtractor): 'description': info['description'], 'thumbnail': info['defaultThumbnailUrl'], 'duration': int_or_none(info.get('duration'), 1000), + 'timestamp': int_or_none(info.get('pubDate'), 1000) or None, + 'uploader': info.get('billingCode'), } class ThePlatformIE(ThePlatformBaseIE): _VALID_URL = r'''(?x) (?:https?://(?:link|player)\.theplatform\.com/[sp]/(?P[^/]+)/ - (?:(?P(?:(?:[^/]+/)+select/)?media/)|(?P(?:[^/\?]+/(?:swf|config)|onsite)/select/))? + (?:(?:(?:[^/]+/)+select/)?(?Pmedia/(?:guid/\d+/)?)|(?P(?:[^/\?]+/(?:swf|config)|onsite)/select/))? |theplatform:)(?P[^/\?&]+)''' _TESTS = [{ @@ -94,6 +96,9 @@ class ThePlatformIE(ThePlatformBaseIE): 'title': 'Blackberry\'s big, bold Z30', 'description': 'The Z30 is Blackberry\'s biggest, baddest mobile messaging device yet.', 'duration': 247, + 'timestamp': 1383239700, + 'upload_date': '20131031', + 'uploader': 'CBSI-NEW', }, 'params': { # rtmp download @@ -107,6 +112,9 @@ class ThePlatformIE(ThePlatformBaseIE): 'ext': 'flv', 'description': 'md5:ac330c9258c04f9d7512cf26b9595409', 'title': 'Tesla Model S: A second step towards a cleaner motoring future', + 'timestamp': 1426176191, + 'upload_date': '20150312', + 'uploader': 'CBSI-NEW', }, 'params': { # rtmp download @@ -119,13 +127,14 @@ class ThePlatformIE(ThePlatformBaseIE): 'ext': 'mp4', 'description': 'md5:644ad9188d655b742f942bf2e06b002d', 'title': 'HIGHLIGHTS: USA bag first ever series Cup win', + 'uploader': 'EGSM', } }, { 'url': 'http://player.theplatform.com/p/NnzsPC/widget/select/media/4Y0TlYUr_ZT7', 'only_matching': True, }, { 'url': 'http://player.theplatform.com/p/2E2eJC/nbcNewsOffsite?guid=tdy_or_siri_150701', - 'md5': '734f3790fb5fc4903da391beeebc4836', + 'md5': 'fb96bb3d85118930a5b055783a3bd992', 'info_dict': { 'id': 'tdy_or_siri_150701', 'ext': 'mp4', @@ -135,7 +144,7 @@ class ThePlatformIE(ThePlatformBaseIE): 'thumbnail': 're:^https?://.*\.jpg$', 'timestamp': 1435752600, 'upload_date': '20150701', - 'categories': ['Today/Shows/Orange Room', 'Today/Sections/Money', 'Today/Topics/Tech', "Today/Topics/Editor's picks"], + 'uploader': 'NBCU-NEWS', }, }, { # From http://www.nbc.com/the-blacklist/video/sir-crispin-crandall/2928790?onid=137781#vc137781=1 @@ -155,7 +164,7 @@ class ThePlatformIE(ThePlatformBaseIE): def hex_to_str(hex): return binascii.a2b_hex(hex) - relative_path = url.split('http://link.theplatform.com/s/')[1].split('?')[0] + relative_path = re.match(r'https?://link.theplatform.com/s/([^?]+)', url).group(1) clear_text = hex_to_str(flags + expiration_date + str_to_hex(relative_path)) checksum = hmac.new(sig_key.encode('ascii'), clear_text, hashlib.sha1).hexdigest() sig = flags + expiration_date + checksum + str_to_hex(sig_secret) @@ -171,10 +180,10 @@ class ThePlatformIE(ThePlatformBaseIE): if not provider_id: provider_id = 'dJ5BDC' - path = provider_id + path = provider_id + '/' if mobj.group('media'): - path += '/media' - path += '/' + video_id + path += mobj.group('media') + path += video_id qs_dict = compat_parse_qs(compat_urllib_parse_urlparse(url).query) if 'guid' in qs_dict: @@ -213,7 +222,7 @@ class ThePlatformIE(ThePlatformBaseIE): webpage, 'smil url', group='url') path = self._search_regex( r'link\.theplatform\.com/s/((?:[^/?#&]+/)+[^/?#&]+)', smil_url, 'path') - smil_url += '?' if '?' not in smil_url else '&' + 'formats=m3u,mpeg4&format=SMIL' + smil_url += '?' if '?' not in smil_url else '&' + 'formats=m3u,mpeg4' elif mobj.group('config'): config_url = url + '&form=json' config_url = config_url.replace('swf/', 'config/') @@ -223,9 +232,9 @@ class ThePlatformIE(ThePlatformBaseIE): release_url = config['releaseUrl'] else: release_url = 'http://link.theplatform.com/s/%s?mbr=true' % path - smil_url = release_url + '&format=SMIL&formats=MPEG4&manifest=f4m' + smil_url = release_url + '&formats=MPEG4&manifest=f4m' else: - smil_url = 'http://link.theplatform.com/s/%s/meta.smil?format=smil&mbr=true' % path + smil_url = 'http://link.theplatform.com/s/%s?mbr=true' % path sig = smuggled_data.get('sig') if sig: @@ -250,7 +259,7 @@ class ThePlatformFeedIE(ThePlatformBaseIE): _TEST = { # From http://player.theplatform.com/p/7wvmTC/MSNBCEmbeddedOffSite?guid=n_hardball_5biden_140207 'url': 'http://feed.theplatform.com/f/7wvmTC/msnbc_video-p-test?form=json&pretty=true&range=-40&byGuid=n_hardball_5biden_140207', - 'md5': '22d2b84f058d3586efcd99e57d59d314', + 'md5': '6e32495b5073ab414471b615c5ded394', 'info_dict': { 'id': 'n_hardball_5biden_140207', 'ext': 'mp4', @@ -280,7 +289,7 @@ class ThePlatformFeedIE(ThePlatformBaseIE): first_video_id = None duration = None for item in entry['media$content']: - smil_url = item['plfile$url'] + '&format=SMIL&mbr=true' + smil_url = item['plfile$url'] + '&mbr=true' cur_video_id = ThePlatformIE._match_id(smil_url) if first_video_id is None: first_video_id = cur_video_id diff --git a/youtube_dl/extractor/thescene.py b/youtube_dl/extractor/thescene.py new file mode 100644 index 000000000..3e4e14031 --- /dev/null +++ b/youtube_dl/extractor/thescene.py @@ -0,0 +1,52 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor + +from ..compat import compat_urlparse +from ..utils import qualities + + +class TheSceneIE(InfoExtractor): + _VALID_URL = r'https://thescene\.com/watch/[^/]+/(?P[^/#?]+)' + + _TEST = { + 'url': 'https://thescene.com/watch/vogue/narciso-rodriguez-spring-2013-ready-to-wear', + 'info_dict': { + 'id': '520e8faac2b4c00e3c6e5f43', + 'ext': 'mp4', + 'title': 'Narciso Rodriguez: Spring 2013 Ready-to-Wear', + 'display_id': 'narciso-rodriguez-spring-2013-ready-to-wear', + }, + } + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + player_url = compat_urlparse.urljoin( + url, + self._html_search_regex( + r'id=\'js-player-script\'[^>]+src=\'(.+?)\'', webpage, 'player url')) + + player = self._download_webpage(player_url, display_id) + info = self._parse_json( + self._search_regex( + r'(?m)var\s+video\s+=\s+({.+?});$', player, 'info json'), + display_id) + + qualities_order = qualities(('low', 'high')) + formats = [{ + 'format_id': '{0}-{1}'.format(f['type'].split('/')[0], f['quality']), + 'url': f['src'], + 'quality': qualities_order(f['quality']), + } for f in info['sources'][0]] + self._sort_formats(formats) + + return { + 'id': info['id'], + 'display_id': display_id, + 'title': info['title'], + 'formats': formats, + 'thumbnail': info.get('poster_frame'), + } diff --git a/youtube_dl/extractor/thestar.py b/youtube_dl/extractor/thestar.py new file mode 100644 index 000000000..ba1380abc --- /dev/null +++ b/youtube_dl/extractor/thestar.py @@ -0,0 +1,35 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from .brightcove import BrightcoveLegacyIE +from ..compat import compat_parse_qs + + +class TheStarIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?thestar\.com/(?:[^/]+/)*(?P.+)\.html' + _TEST = { + 'url': 'http://www.thestar.com/life/2016/02/01/mankind-why-this-woman-started-a-men-s-skincare-line.html', + 'md5': '2c62dd4db2027e35579fefb97a8b6554', + 'info_dict': { + 'id': '4732393888001', + 'ext': 'mp4', + 'title': 'Mankind: Why this woman started a men\'s skin care line', + 'description': 'Robert Cribb talks to Young Lee, the founder of Uncle Peter\'s MAN.', + 'uploader_id': '794267642001', + 'timestamp': 1454353482, + 'upload_date': '20160201', + }, + 'params': { + # m3u8 download + 'skip_download': True, + } + } + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/794267642001/default_default/index.html?videoId=%s' + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + brightcove_legacy_url = BrightcoveLegacyIE._extract_brightcove_url(webpage) + brightcove_id = compat_parse_qs(brightcove_legacy_url)['@videoPlayer'][0] + return self.url_result(self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, 'BrightcoveNew', brightcove_id) diff --git a/youtube_dl/extractor/thvideo.py b/youtube_dl/extractor/thvideo.py index 496f15d80..406f4a826 100644 --- a/youtube_dl/extractor/thvideo.py +++ b/youtube_dl/extractor/thvideo.py @@ -10,7 +10,7 @@ from ..utils import ( class THVideoIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?thvideo\.tv/(?:v/th|mobile\.php\?cid=)(?P[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?thvideo\.tv/(?:v/th|mobile\.php\?cid=)(?P[0-9]+)' _TEST = { 'url': 'http://thvideo.tv/v/th1987/', 'md5': 'fa107b1f73817e325e9433505a70db50', diff --git a/youtube_dl/extractor/tinypic.py b/youtube_dl/extractor/tinypic.py index e036b8cdf..c43cace24 100644 --- a/youtube_dl/extractor/tinypic.py +++ b/youtube_dl/extractor/tinypic.py @@ -9,7 +9,7 @@ from ..utils import ExtractorError class TinyPicIE(InfoExtractor): IE_NAME = 'tinypic' IE_DESC = 'tinypic.com videos' - _VALID_URL = r'http://(?:.+?\.)?tinypic\.com/player\.php\?v=(?P[^&]+)&s=\d+' + _VALID_URL = r'https?://(?:.+?\.)?tinypic\.com/player\.php\?v=(?P[^&]+)&s=\d+' _TESTS = [ { diff --git a/youtube_dl/extractor/tlc.py b/youtube_dl/extractor/tlc.py index 17add9543..abad3ff64 100644 --- a/youtube_dl/extractor/tlc.py +++ b/youtube_dl/extractor/tlc.py @@ -9,7 +9,7 @@ from ..compat import compat_parse_qs class TlcDeIE(InfoExtractor): IE_NAME = 'tlc.de' - _VALID_URL = r'http://www\.tlc\.de/(?:[^/]+/)*videos/(?P[^/?#]+)?(?:.*#(?P<id>\d+))?' + _VALID_URL = r'https?://www\.tlc\.de/(?:[^/]+/)*videos/(?P<title>[^/?#]+)?(?:.*#(?P<id>\d+))?' _TEST = { 'url': 'http://www.tlc.de/sendungen/breaking-amish/videos/#3235167922001', diff --git a/youtube_dl/extractor/toypics.py b/youtube_dl/extractor/toypics.py index 2756f56d3..2579ba8c6 100644 --- a/youtube_dl/extractor/toypics.py +++ b/youtube_dl/extractor/toypics.py @@ -41,7 +41,7 @@ class ToypicsIE(InfoExtractor): class ToypicsUserIE(InfoExtractor): IE_DESC = 'Toypics user profile' - _VALID_URL = r'http://videos\.toypics\.net/(?P<username>[^/?]+)(?:$|[?#])' + _VALID_URL = r'https?://videos\.toypics\.net/(?P<username>[^/?]+)(?:$|[?#])' _TEST = { 'url': 'http://videos.toypics.net/Mikey', 'info_dict': { diff --git a/youtube_dl/extractor/traileraddict.py b/youtube_dl/extractor/traileraddict.py index 0e01b15fc..747370d12 100644 --- a/youtube_dl/extractor/traileraddict.py +++ b/youtube_dl/extractor/traileraddict.py @@ -7,7 +7,7 @@ from .common import InfoExtractor class TrailerAddictIE(InfoExtractor): _WORKING = False - _VALID_URL = r'(?:http://)?(?:www\.)?traileraddict\.com/(?:trailer|clip)/(?P<movie>.+?)/(?P<trailer_name>.+)' + _VALID_URL = r'(?:https?://)?(?:www\.)?traileraddict\.com/(?:trailer|clip)/(?P<movie>.+?)/(?P<trailer_name>.+)' _TEST = { 'url': 'http://www.traileraddict.com/trailer/prince-avalanche/trailer', 'md5': '41365557f3c8c397d091da510e73ceb4', diff --git a/youtube_dl/extractor/trollvids.py b/youtube_dl/extractor/trollvids.py index d239949a6..657705623 100644 --- a/youtube_dl/extractor/trollvids.py +++ b/youtube_dl/extractor/trollvids.py @@ -7,7 +7,7 @@ from .nuevo import NuevoBaseIE class TrollvidsIE(NuevoBaseIE): - _VALID_URL = r'http://(?:www\.)?trollvids\.com/video/(?P<id>\d+)/(?P<display_id>[^/?#&]+)' + _VALID_URL = r'https?://(?:www\.)?trollvids\.com/video/(?P<id>\d+)/(?P<display_id>[^/?#&]+)' IE_NAME = 'trollvids' _TEST = { 'url': 'http://trollvids.com/video/2349002/%E3%80%90MMD-R-18%E3%80%91%E3%82%AC%E3%83%BC%E3%83%AB%E3%83%95%E3%83%AC%E3%83%B3%E3%83%89-carrymeoff', diff --git a/youtube_dl/extractor/tubitv.py b/youtube_dl/extractor/tubitv.py index 6d78b5dfe..7af233cd6 100644 --- a/youtube_dl/extractor/tubitv.py +++ b/youtube_dl/extractor/tubitv.py @@ -5,11 +5,11 @@ import codecs import re from .common import InfoExtractor -from ..compat import compat_urllib_parse from ..utils import ( ExtractorError, int_or_none, sanitized_Request, + urlencode_postdata, ) @@ -41,7 +41,7 @@ class TubiTvIE(InfoExtractor): 'username': username, 'password': password, } - payload = compat_urllib_parse.urlencode(form_data).encode('utf-8') + payload = urlencode_postdata(form_data) request = sanitized_Request(self._LOGIN_URL, payload) request.add_header('Content-Type', 'application/x-www-form-urlencoded') login_page = self._download_webpage( @@ -69,6 +69,7 @@ class TubiTvIE(InfoExtractor): apu = self._search_regex(r"apu='([^']+)'", webpage, 'apu') m3u8_url = codecs.decode(apu, 'rot_13')[::-1] formats = self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4') + self._sort_formats(formats) return { 'id': video_id, diff --git a/youtube_dl/extractor/tudou.py b/youtube_dl/extractor/tudou.py index f56b66d06..63b5d5924 100644 --- a/youtube_dl/extractor/tudou.py +++ b/youtube_dl/extractor/tudou.py @@ -5,7 +5,9 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..compat import compat_str from ..utils import ( + ExtractorError, int_or_none, + InAdvancePagedList, float_or_none, unescapeHTML, ) @@ -45,6 +47,19 @@ class TudouIE(InfoExtractor): _PLAYER_URL = 'http://js.tudouui.com/bin/lingtong/PortalPlayer_177.swf' + # Translated from tudou/tools/TVCHelper.as in PortalPlayer_193.swf + # 0001, 0002 and 4001 are not included as they indicate temporary issues + TVC_ERRORS = { + '0003': 'The video is deleted or does not exist', + '1001': 'This video is unavailable due to licensing issues', + '1002': 'This video is unavailable as it\'s under review', + '1003': 'This video is unavailable as it\'s under review', + '3001': 'Password required', + '5001': 'This video is available in Mainland China only due to licensing issues', + '7001': 'This video is unavailable', + '8001': 'This video is unavailable due to licensing issues', + } + def _url_for_id(self, video_id, quality=None): info_url = 'http://v2.tudou.com/f?id=' + compat_str(video_id) if quality: @@ -62,6 +77,15 @@ class TudouIE(InfoExtractor): if youku_vcode: return self.url_result('youku:' + youku_vcode, ie='Youku') + if not item_data.get('itemSegs'): + tvc_code = item_data.get('tvcCode') + if tvc_code: + err_msg = self.TVC_ERRORS.get(tvc_code) + if err_msg: + raise ExtractorError('Tudou said: %s' % err_msg, expected=True) + raise ExtractorError('Unexpected error %s returned from Tudou' % tvc_code) + raise ExtractorError('Unxpected error returned from Tudou') + title = unescapeHTML(item_data['kw']) description = item_data.get('desc') thumbnail_url = item_data.get('pic') @@ -75,15 +99,16 @@ class TudouIE(InfoExtractor): quality = sorted(filter(lambda k: k.isdigit(), segments.keys()), key=lambda k: int(k))[-1] parts = segments[quality] - result = [] len_parts = len(parts) if len_parts > 1: self.to_screen('%s: found %s parts' % (video_id, len_parts)) - for part in parts: + + def part_func(partnum): + part = parts[partnum] part_id = part['k'] final_url = self._url_for_id(part_id, quality) ext = (final_url.split('?')[0]).split('.')[-1] - part_info = { + return [{ 'id': '%s' % part_id, 'url': final_url, 'ext': ext, @@ -97,12 +122,13 @@ class TudouIE(InfoExtractor): 'http_headers': { 'Referer': self._PLAYER_URL, }, - } - result.append(part_info) + }] + + entries = InAdvancePagedList(part_func, len_parts, 1) return { '_type': 'multi_video', - 'entries': result, + 'entries': entries, 'id': video_id, 'title': title, } diff --git a/youtube_dl/extractor/tumblr.py b/youtube_dl/extractor/tumblr.py index 4f844706d..4d8b57111 100644 --- a/youtube_dl/extractor/tumblr.py +++ b/youtube_dl/extractor/tumblr.py @@ -8,7 +8,7 @@ from ..utils import int_or_none class TumblrIE(InfoExtractor): - _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/(?:post|video)/(?P<id>[0-9]+)(?:$|[/?#])' + _VALID_URL = r'https?://(?P<blog_name>[^/?#&]+)\.tumblr\.com/(?:post|video)/(?P<id>[0-9]+)(?:$|[/?#])' _TESTS = [{ 'url': 'http://tatianamaslanydaily.tumblr.com/post/54196191430/orphan-black-dvd-extra-behind-the-scenes', 'md5': '479bb068e5b16462f5176a6828829767', @@ -67,6 +67,34 @@ class TumblrIE(InfoExtractor): 'uploader_id': 'user32021558', }, 'add_ie': ['Vimeo'], + }, { + 'url': 'http://sutiblr.tumblr.com/post/139638707273', + 'md5': '2dd184b3669e049ba40563a7d423f95c', + 'info_dict': { + 'id': 'ir7qBEIKqvq', + 'ext': 'mp4', + 'title': 'Vine by sutiblr', + 'alt_title': 'Vine by sutiblr', + 'uploader': 'sutiblr', + 'uploader_id': '1198993975374495744', + 'upload_date': '20160220', + 'like_count': int, + 'comment_count': int, + 'repost_count': int, + }, + 'add_ie': ['Vine'], + }, { + 'url': 'http://vitasidorkina.tumblr.com/post/134652425014/joskriver-victoriassecret-invisibility-or', + 'md5': '01c12ceb82cbf6b2fe0703aa56b3ad72', + 'info_dict': { + 'id': '-7LnUPGlSo', + 'ext': 'mp4', + 'title': 'Video by victoriassecret', + 'description': 'Invisibility or flight…which superpower would YOU choose? #VSFashionShow #ThisOrThat', + 'uploader_id': 'victoriassecret', + 'thumbnail': 're:^https?://.*\.jpg' + }, + 'add_ie': ['Instagram'], }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/tunein.py b/youtube_dl/extractor/tunein.py index 8322cc14d..ae4cfaec2 100644 --- a/youtube_dl/extractor/tunein.py +++ b/youtube_dl/extractor/tunein.py @@ -1,7 +1,7 @@ # coding: utf-8 from __future__ import unicode_literals -import json +import re from .common import InfoExtractor from ..utils import ExtractorError @@ -27,10 +27,9 @@ class TuneInBaseIE(InfoExtractor): if not streams_url.startswith('http://'): streams_url = compat_urlparse.urljoin(url, streams_url) - stream_data = self._download_webpage( - streams_url, content_id, note='Downloading stream data') - streams = json.loads(self._search_regex( - r'\((.*)\);', stream_data, 'stream info'))['Streams'] + streams = self._download_json( + streams_url, content_id, note='Downloading stream data', + transform_source=lambda s: re.sub(r'^\s*\((.*)\);\s*$', r'\1', s))['Streams'] is_live = None formats = [] diff --git a/youtube_dl/extractor/tv2.py b/youtube_dl/extractor/tv2.py index 1457e524e..86bb7915d 100644 --- a/youtube_dl/extractor/tv2.py +++ b/youtube_dl/extractor/tv2.py @@ -14,7 +14,7 @@ from ..utils import ( class TV2IE(InfoExtractor): - _VALID_URL = 'http://(?:www\.)?tv2\.no/v/(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?tv2\.no/v/(?P<id>\d+)' _TEST = { 'url': 'http://www.tv2.no/v/916509/', 'info_dict': { @@ -100,7 +100,7 @@ class TV2IE(InfoExtractor): class TV2ArticleIE(InfoExtractor): - _VALID_URL = 'http://(?:www\.)?tv2\.no/(?:a|\d{4}/\d{2}/\d{2}(/[^/]+)+)/(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?tv2\.no/(?:a|\d{4}/\d{2}/\d{2}(/[^/]+)+)/(?P<id>\d+)' _TESTS = [{ 'url': 'http://www.tv2.no/2015/05/16/nyheter/alesund/krim/pingvin/6930542', 'info_dict': { diff --git a/youtube_dl/extractor/tv3.py b/youtube_dl/extractor/tv3.py new file mode 100644 index 000000000..3867ec90d --- /dev/null +++ b/youtube_dl/extractor/tv3.py @@ -0,0 +1,34 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class TV3IE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?tv3\.co\.nz/(?P<id>[^/]+)/tabid/\d+/articleID/\d+/MCat/\d+/Default\.aspx' + _TEST = { + 'url': 'http://www.tv3.co.nz/MOTORSPORT-SRS-SsangYong-Hampton-Downs-Round-3/tabid/3692/articleID/121615/MCat/2915/Default.aspx', + 'info_dict': { + 'id': '4659127992001', + 'ext': 'mp4', + 'title': 'CRC Motorsport: SRS SsangYong Hampton Downs Round 3 - S2015 Ep3', + 'description': 'SsangYong Racing Series returns for Round 3 with drivers from New Zealand and Australia taking to the grid at Hampton Downs raceway.', + 'uploader_id': '3812193411001', + 'upload_date': '20151213', + 'timestamp': 1449975272, + }, + 'expected_warnings': [ + 'Failed to download MPD manifest' + ], + 'params': { + # m3u8 download + 'skip_download': True, + }, + } + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/3812193411001/default_default/index.html?videoId=%s' + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + brightcove_id = self._search_regex(r'<param\s*name="@videoPlayer"\s*value="(\d+)"', webpage, 'brightcove id') + return self.url_result(self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, 'BrightcoveNew', brightcove_id) diff --git a/youtube_dl/extractor/tvc.py b/youtube_dl/extractor/tvc.py index 3a4f393fc..4065354dd 100644 --- a/youtube_dl/extractor/tvc.py +++ b/youtube_dl/extractor/tvc.py @@ -11,7 +11,7 @@ from ..utils import ( class TVCIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?tvc\.ru/video/iframe/id/(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?tvc\.ru/video/iframe/id/(?P<id>\d+)' _TEST = { 'url': 'http://www.tvc.ru/video/iframe/id/74622/isPlay/false/id_stat/channel/?acc_video_id=/channel/brand/id/17/show/episodes/episode_id/39702', 'md5': 'bbc5ff531d1e90e856f60fc4b3afd708', @@ -64,7 +64,7 @@ class TVCIE(InfoExtractor): class TVCArticleIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?tvc\.ru/(?!video/iframe/id/)(?P<id>[^?#]+)' + _VALID_URL = r'https?://(?:www\.)?tvc\.ru/(?!video/iframe/id/)(?P<id>[^?#]+)' _TESTS = [{ 'url': 'http://www.tvc.ru/channel/brand/id/29/show/episodes/episode_id/39702/', 'info_dict': { diff --git a/youtube_dl/extractor/tvplay.py b/youtube_dl/extractor/tvplay.py index b4683de54..df70a6b23 100644 --- a/youtube_dl/extractor/tvplay.py +++ b/youtube_dl/extractor/tvplay.py @@ -13,7 +13,7 @@ from ..utils import ( class TVPlayIE(InfoExtractor): IE_DESC = 'TV3Play and related services' - _VALID_URL = r'''(?x)http://(?:www\.)? + _VALID_URL = r'''(?x)https?://(?:www\.)? (?:tvplay\.lv/parraides| tv3play\.lt/programos| play\.tv3\.lt/programos| diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index 958bf8fff..36ee1adff 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -9,18 +9,18 @@ from .common import InfoExtractor from ..compat import ( compat_parse_qs, compat_str, - compat_urllib_parse, + compat_urllib_parse_urlencode, compat_urllib_parse_urlparse, compat_urlparse, ) from ..utils import ( - encode_dict, ExtractorError, int_or_none, orderedSet, parse_duration, parse_iso8601, sanitized_Request, + urlencode_postdata, ) @@ -82,7 +82,7 @@ class TwitchBaseIE(InfoExtractor): post_url = compat_urlparse.urljoin(redirect_url, post_url) request = sanitized_Request( - post_url, compat_urllib_parse.urlencode(encode_dict(login_form)).encode('utf-8')) + post_url, urlencode_postdata(login_form)) request.add_header('Referer', redirect_url) response = self._download_webpage( request, None, 'Logging in as %s' % username) @@ -250,7 +250,7 @@ class TwitchVodIE(TwitchItemBaseIE): formats = self._extract_m3u8_formats( '%s/vod/%s?%s' % ( self._USHER_BASE, item_id, - compat_urllib_parse.urlencode({ + compat_urllib_parse_urlencode({ 'allow_source': 'true', 'allow_audio_only': 'true', 'allow_spectre': 'true', @@ -299,9 +299,10 @@ class TwitchPlaylistBaseIE(TwitchBaseIE): # is completely broken on the twitch side. It simply ignores # a limit and returns the whole offset number of videos. # Working around by just requesting all videos at once. + # Upd: pagination bug was fixed by twitch on 15.03.2016. if not broken_paging_detected and total and len(page_entries) > limit: self.report_warning( - 'Twitch paging is broken on twitch side, requesting all videos at once', + 'Twitch pagination is broken on twitch side, requesting all videos at once', channel_id) broken_paging_detected = True offset = total @@ -441,7 +442,7 @@ class TwitchStreamIE(TwitchBaseIE): } formats = self._extract_m3u8_formats( '%s/api/channel/hls/%s.m3u8?%s' - % (self._USHER_BASE, channel_id, compat_urllib_parse.urlencode(query)), + % (self._USHER_BASE, channel_id, compat_urllib_parse_urlencode(query)), channel_id, 'mp4') self._prefer_source(formats) diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py index e70b2ab3c..1f32ea2eb 100644 --- a/youtube_dl/extractor/twitter.py +++ b/youtube_dl/extractor/twitter.py @@ -102,6 +102,9 @@ class TwitterCardIE(TwitterBaseIE): r'data-(?:player-)?config="([^"]+)"', webpage, 'data player config'), video_id) + if config.get('source_type') == 'vine': + return self.url_result(config['player_url'], 'Vine') + def _search_dimensions_in_video_url(a_format, video_url): m = re.search(r'/(?P<width>\d+)x(?P<height>\d+)/', video_url) if m: @@ -110,10 +113,9 @@ class TwitterCardIE(TwitterBaseIE): 'height': int(m.group('height')), }) - playlist = config.get('playlist') - if playlist: - video_url = playlist[0]['source'] + video_url = config.get('video_url') or config.get('playlist', [{}])[0].get('source') + if video_url: f = { 'url': video_url, } @@ -185,7 +187,6 @@ class TwitterIE(InfoExtractor): 'ext': 'mp4', 'title': 'FREE THE NIPPLE - FTN supporters on Hollywood Blvd today!', 'thumbnail': 're:^https?://.*\.jpg', - 'duration': 12.922, 'description': 'FREE THE NIPPLE on Twitter: "FTN supporters on Hollywood Blvd today! http://t.co/c7jHH749xJ"', 'uploader': 'FREE THE NIPPLE', 'uploader_id': 'freethenipple', @@ -247,6 +248,18 @@ class TwitterIE(InfoExtractor): 'params': { 'skip_download': True, # requires ffmpeg }, + }, { + 'url': 'https://twitter.com/Filmdrunk/status/713801302971588609', + 'md5': '89a15ed345d13b86e9a5a5e051fa308a', + 'info_dict': { + 'id': 'MIOxnrUteUd', + 'ext': 'mp4', + 'title': 'Dr.Pepperの飲み方 #japanese #バカ #ドクペ #電動ガン', + 'uploader': 'TAKUMA', + 'uploader_id': '1004126642786242560', + 'upload_date': '20140615', + }, + 'add_ie': ['Vine'], }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/ubu.py b/youtube_dl/extractor/ubu.py index d50237758..1d52cbc98 100644 --- a/youtube_dl/extractor/ubu.py +++ b/youtube_dl/extractor/ubu.py @@ -10,7 +10,7 @@ from ..utils import ( class UbuIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?ubu\.com/film/(?P<id>[\da-z_-]+)\.html' + _VALID_URL = r'https?://(?:www\.)?ubu\.com/film/(?P<id>[\da-z_-]+)\.html' _TEST = { 'url': 'http://ubu.com/film/her_noise.html', 'md5': '138d5652618bf0f03878978db9bef1ee', diff --git a/youtube_dl/extractor/udemy.py b/youtube_dl/extractor/udemy.py index f5b5e7fd6..d1e6f2703 100644 --- a/youtube_dl/extractor/udemy.py +++ b/youtube_dl/extractor/udemy.py @@ -1,23 +1,38 @@ from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..compat import ( compat_HTTPError, - compat_urllib_parse, + compat_urllib_parse_urlencode, compat_urllib_request, + compat_urlparse, ) from ..utils import ( + determine_ext, + extract_attributes, ExtractorError, float_or_none, int_or_none, sanitized_Request, unescapeHTML, + urlencode_postdata, ) class UdemyIE(InfoExtractor): IE_NAME = 'udemy' - _VALID_URL = r'https?://www\.udemy\.com/(?:[^#]+#/lecture/|lecture/view/?\?lectureId=)(?P<id>\d+)' + _VALID_URL = r'''(?x) + https?:// + www\.udemy\.com/ + (?: + [^#]+\#/lecture/| + lecture/view/?\?lectureId=| + [^/]+/learn/v4/t/lecture/ + ) + (?P<id>\d+) + ''' _LOGIN_URL = 'https://www.udemy.com/join/login-popup/?displayType=ajax&showSkipButton=1' _ORIGIN_URL = 'https://www.udemy.com' _NETRC_MACHINE = 'udemy' @@ -33,34 +48,52 @@ class UdemyIE(InfoExtractor): 'duration': 579.29, }, 'skip': 'Requires udemy account credentials', + }, { + # new URL schema + 'url': 'https://www.udemy.com/electric-bass-right-from-the-start/learn/v4/t/lecture/4580906', + 'only_matching': True, }] - def _enroll_course(self, webpage, course_id): + def _extract_course_info(self, webpage, video_id): + course = self._parse_json( + unescapeHTML(self._search_regex( + r'ng-init=["\'].*\bcourse=({.+?});', webpage, 'course', default='{}')), + video_id, fatal=False) or {} + course_id = course.get('id') or self._search_regex( + (r'"id"\s*:\s*(\d+)', r'data-course-id=["\'](\d+)'), + webpage, 'course id') + return course_id, course.get('title') + + def _enroll_course(self, base_url, webpage, course_id): + def combine_url(base_url, url): + return compat_urlparse.urljoin(base_url, url) if not url.startswith('http') else url + checkout_url = unescapeHTML(self._search_regex( - r'href=(["\'])(?P<url>https?://(?:www\.)?udemy\.com/payment/checkout/.+?)\1', + r'href=(["\'])(?P<url>(?:https?://(?:www\.)?udemy\.com)?/payment/checkout/.+?)\1', webpage, 'checkout url', group='url', default=None)) if checkout_url: raise ExtractorError( 'Course %s is not free. You have to pay for it before you can download. ' - 'Use this URL to confirm purchase: %s' % (course_id, checkout_url), expected=True) + 'Use this URL to confirm purchase: %s' + % (course_id, combine_url(base_url, checkout_url)), + expected=True) enroll_url = unescapeHTML(self._search_regex( - r'href=(["\'])(?P<url>https?://(?:www\.)?udemy\.com/course/subscribe/.+?)\1', + r'href=(["\'])(?P<url>(?:https?://(?:www\.)?udemy\.com)?/course/subscribe/.+?)\1', webpage, 'enroll url', group='url', default=None)) if enroll_url: - webpage = self._download_webpage(enroll_url, course_id, 'Enrolling in the course') + webpage = self._download_webpage( + combine_url(base_url, enroll_url), + course_id, 'Enrolling in the course') if '>You have enrolled in' in webpage: self.to_screen('%s: Successfully enrolled in the course' % course_id) def _download_lecture(self, course_id, lecture_id): return self._download_json( 'https://www.udemy.com/api-2.0/users/me/subscribed-courses/%s/lectures/%s?%s' % ( - course_id, lecture_id, compat_urllib_parse.urlencode({ - 'video_only': '', - 'auto_play': '', - 'fields[lecture]': 'title,description,asset', + course_id, lecture_id, compat_urllib_parse_urlencode({ + 'fields[lecture]': 'title,description,view_html,asset', 'fields[asset]': 'asset_type,stream_url,thumbnail_url,download_urls,data', - 'instructorPreviewMode': 'False', })), lecture_id, 'Downloading lecture JSON') @@ -75,7 +108,7 @@ class UdemyIE(InfoExtractor): error_str += ' - %s' % error_data.get('formErrors') raise ExtractorError(error_str, expected=True) - def _download_json(self, url_or_request, video_id, note='Downloading JSON metadata'): + def _download_json(self, url_or_request, *args, **kwargs): headers = { 'X-Udemy-Snail-Case': 'true', 'X-Requested-With': 'XMLHttpRequest', @@ -93,7 +126,7 @@ class UdemyIE(InfoExtractor): else: url_or_request = sanitized_Request(url_or_request, headers=headers) - response = super(UdemyIE, self)._download_json(url_or_request, video_id, note) + response = super(UdemyIE, self)._download_json(url_or_request, *args, **kwargs) self._handle_error(response) return response @@ -118,12 +151,12 @@ class UdemyIE(InfoExtractor): login_form = self._form_hidden_inputs('login-form', login_popup) login_form.update({ - 'email': username.encode('utf-8'), - 'password': password.encode('utf-8'), + 'email': username, + 'password': password, }) request = sanitized_Request( - self._LOGIN_URL, compat_urllib_parse.urlencode(login_form).encode('utf-8')) + self._LOGIN_URL, urlencode_postdata(login_form)) request.add_header('Referer', self._ORIGIN_URL) request.add_header('Origin', self._ORIGIN_URL) @@ -143,15 +176,14 @@ class UdemyIE(InfoExtractor): webpage = self._download_webpage(url, lecture_id) - course_id = self._search_regex( - r'data-course-id=["\'](\d+)', webpage, 'course id') + course_id, _ = self._extract_course_info(webpage, lecture_id) try: lecture = self._download_lecture(course_id, lecture_id) except ExtractorError as e: # Error could possibly mean we are not enrolled in the course if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: - self._enroll_course(webpage, course_id) + self._enroll_course(url, webpage, course_id) lecture = self._download_lecture(course_id, lecture_id) else: raise @@ -161,12 +193,12 @@ class UdemyIE(InfoExtractor): asset = lecture['asset'] - asset_type = asset.get('assetType') or asset.get('asset_type') + asset_type = asset.get('asset_type') or asset.get('assetType') if asset_type != 'Video': raise ExtractorError( 'Lecture %s is not a video' % lecture_id, expected=True) - stream_url = asset.get('streamUrl') or asset.get('stream_url') + stream_url = asset.get('stream_url') or asset.get('streamUrl') if stream_url: youtube_url = self._search_regex( r'(https?://www\.youtube\.com/watch\?v=.*)', stream_url, 'youtube URL', default=None) @@ -174,43 +206,92 @@ class UdemyIE(InfoExtractor): return self.url_result(youtube_url, 'Youtube') video_id = asset['id'] - thumbnail = asset.get('thumbnailUrl') or asset.get('thumbnail_url') + thumbnail = asset.get('thumbnail_url') or asset.get('thumbnailUrl') duration = float_or_none(asset.get('data', {}).get('duration')) - outputs = asset.get('data', {}).get('outputs', {}) formats = [] - for format_ in asset.get('download_urls', {}).get('Video', []): - video_url = format_.get('file') - if not video_url: - continue - format_id = format_.get('label') - f = { - 'url': format_['file'], - 'height': int_or_none(format_id), - } - if format_id: - # Some videos contain additional metadata (e.g. - # https://www.udemy.com/ios9-swift/learn/#/lecture/3383208) - output = outputs.get(format_id) - if isinstance(output, dict): - f.update({ - 'format_id': '%sp' % (output.get('label') or format_id), - 'width': int_or_none(output.get('width')), - 'height': int_or_none(output.get('height')), - 'vbr': int_or_none(output.get('video_bitrate_in_kbps')), - 'vcodec': output.get('video_codec'), - 'fps': int_or_none(output.get('frame_rate')), - 'abr': int_or_none(output.get('audio_bitrate_in_kbps')), - 'acodec': output.get('audio_codec'), - 'asr': int_or_none(output.get('audio_sample_rate')), - 'tbr': int_or_none(output.get('total_bitrate_in_kbps')), - 'filesize': int_or_none(output.get('file_size_in_bytes')), - }) - else: - f['format_id'] = '%sp' % format_id - formats.append(f) - self._sort_formats(formats) + def extract_output_format(src): + return { + 'url': src['url'], + 'format_id': '%sp' % (src.get('height') or format_id), + 'width': int_or_none(src.get('width')), + 'height': int_or_none(src.get('height')), + 'vbr': int_or_none(src.get('video_bitrate_in_kbps')), + 'vcodec': src.get('video_codec'), + 'fps': int_or_none(src.get('frame_rate')), + 'abr': int_or_none(src.get('audio_bitrate_in_kbps')), + 'acodec': src.get('audio_codec'), + 'asr': int_or_none(src.get('audio_sample_rate')), + 'tbr': int_or_none(src.get('total_bitrate_in_kbps')), + 'filesize': int_or_none(src.get('file_size_in_bytes')), + } + + outputs = asset.get('data', {}).get('outputs') + if not isinstance(outputs, dict): + outputs = {} + + def add_output_format_meta(f, key): + output = outputs.get(key) + if isinstance(output, dict): + output_format = extract_output_format(output) + output_format.update(f) + return output_format + return f + + download_urls = asset.get('download_urls') + if isinstance(download_urls, dict): + video = download_urls.get('Video') + if isinstance(video, list): + for format_ in video: + video_url = format_.get('file') + if not video_url: + continue + format_id = format_.get('label') + f = { + 'url': format_['file'], + 'format_id': '%sp' % format_id, + 'height': int_or_none(format_id), + } + if format_id: + # Some videos contain additional metadata (e.g. + # https://www.udemy.com/ios9-swift/learn/#/lecture/3383208) + f = add_output_format_meta(f, format_id) + formats.append(f) + + view_html = lecture.get('view_html') + if view_html: + view_html_urls = set() + for source in re.findall(r'<source[^>]+>', view_html): + attributes = extract_attributes(source) + src = attributes.get('src') + if not src: + continue + res = attributes.get('data-res') + height = int_or_none(res) + if src in view_html_urls: + continue + view_html_urls.add(src) + if attributes.get('type') == 'application/x-mpegURL' or determine_ext(src) == 'm3u8': + m3u8_formats = self._extract_m3u8_formats( + src, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False) + for f in m3u8_formats: + m = re.search(r'/hls_(?P<height>\d{3,4})_(?P<tbr>\d{2,})/', f['url']) + if m: + if not f.get('height'): + f['height'] = int(m.group('height')) + if not f.get('tbr'): + f['tbr'] = int(m.group('tbr')) + formats.extend(m3u8_formats) + else: + formats.append(add_output_format_meta({ + 'url': src, + 'format_id': '%dp' % height if height else None, + 'height': height, + }, res)) + + self._sort_formats(formats, field_preference=('height', 'width', 'tbr', 'format_id')) return { 'id': video_id, @@ -224,7 +305,7 @@ class UdemyIE(InfoExtractor): class UdemyCourseIE(UdemyIE): IE_NAME = 'udemy:course' - _VALID_URL = r'https?://www\.udemy\.com/(?P<id>[\da-z-]+)' + _VALID_URL = r'https?://www\.udemy\.com/(?P<id>[^/?#&]+)' _TESTS = [] @classmethod @@ -236,29 +317,34 @@ class UdemyCourseIE(UdemyIE): webpage = self._download_webpage(url, course_path) - response = self._download_json( - 'https://www.udemy.com/api-1.1/courses/%s' % course_path, - course_path, 'Downloading course JSON') + course_id, title = self._extract_course_info(webpage, course_path) - course_id = response['id'] - course_title = response.get('title') - - self._enroll_course(webpage, course_id) + self._enroll_course(url, webpage, course_id) response = self._download_json( - 'https://www.udemy.com/api-1.1/courses/%s/curriculum' % course_id, - course_id, 'Downloading course curriculum') + 'https://www.udemy.com/api-2.0/courses/%s/cached-subscriber-curriculum-items' % course_id, + course_id, 'Downloading course curriculum', query={ + 'fields[chapter]': 'title,object_index', + 'fields[lecture]': 'title,asset', + 'page_size': '1000', + }) entries = [] - chapter, chapter_number = None, None - for asset in response: - asset_type = asset.get('assetType') or asset.get('asset_type') - if asset_type == 'Video': - asset_id = asset.get('id') - if asset_id: + chapter, chapter_number = [None] * 2 + for entry in response['results']: + clazz = entry.get('_class') + if clazz == 'lecture': + asset = entry.get('asset') + if isinstance(asset, dict): + asset_type = asset.get('asset_type') or asset.get('assetType') + if asset_type != 'Video': + continue + lecture_id = entry.get('id') + if lecture_id: entry = { '_type': 'url_transparent', - 'url': 'https://www.udemy.com/%s/#/lecture/%s' % (course_path, asset['id']), + 'url': 'https://www.udemy.com/%s/learn/v4/t/lecture/%s' % (course_path, entry['id']), + 'title': entry.get('title'), 'ie_key': UdemyIE.ie_key(), } if chapter_number: @@ -266,8 +352,8 @@ class UdemyCourseIE(UdemyIE): if chapter: entry['chapter'] = chapter entries.append(entry) - elif asset.get('type') == 'chapter': - chapter_number = asset.get('index') or asset.get('object_index') - chapter = asset.get('title') + elif clazz == 'chapter': + chapter_number = entry.get('object_index') + chapter = entry.get('title') - return self.playlist_result(entries, course_id, course_title) + return self.playlist_result(entries, course_id, title) diff --git a/youtube_dl/extractor/unistra.py b/youtube_dl/extractor/unistra.py index 594bee4f9..66d9f1bf3 100644 --- a/youtube_dl/extractor/unistra.py +++ b/youtube_dl/extractor/unistra.py @@ -7,7 +7,7 @@ from ..utils import qualities class UnistraIE(InfoExtractor): - _VALID_URL = r'http://utv\.unistra\.fr/(?:index|video)\.php\?id_video\=(?P<id>\d+)' + _VALID_URL = r'https?://utv\.unistra\.fr/(?:index|video)\.php\?id_video\=(?P<id>\d+)' _TESTS = [ { diff --git a/youtube_dl/extractor/vbox7.py b/youtube_dl/extractor/vbox7.py index 3794bcded..dff1bb702 100644 --- a/youtube_dl/extractor/vbox7.py +++ b/youtube_dl/extractor/vbox7.py @@ -2,18 +2,16 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import ( - compat_urllib_parse, - compat_urlparse, -) +from ..compat import compat_urlparse from ..utils import ( ExtractorError, sanitized_Request, + urlencode_postdata, ) class Vbox7IE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?vbox7\.com/play:(?P<id>[^/]+)' + _VALID_URL = r'https?://(?:www\.)?vbox7\.com/play:(?P<id>[^/]+)' _TEST = { 'url': 'http://vbox7.com/play:249bb972c2', 'md5': '99f65c0c9ef9b682b97313e052734c3f', @@ -48,7 +46,7 @@ class Vbox7IE(InfoExtractor): webpage, 'title').split('/')[0].strip() info_url = 'http://vbox7.com/play/magare.do' - data = compat_urllib_parse.urlencode({'as3': '1', 'vid': video_id}) + data = urlencode_postdata({'as3': '1', 'vid': video_id}) info_request = sanitized_Request(info_url, data) info_request.add_header('Content-Type', 'application/x-www-form-urlencoded') info_response = self._download_webpage(info_request, video_id, 'Downloading info webpage') diff --git a/youtube_dl/extractor/veoh.py b/youtube_dl/extractor/veoh.py index 9633f7ffe..23ce0a0d1 100644 --- a/youtube_dl/extractor/veoh.py +++ b/youtube_dl/extractor/veoh.py @@ -12,7 +12,7 @@ from ..utils import ( class VeohIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?veoh\.com/(?:watch|iphone/#_Watch)/(?P<id>(?:v|yapi-)[\da-zA-Z]+)' + _VALID_URL = r'https?://(?:www\.)?veoh\.com/(?:watch|iphone/#_Watch)/(?P<id>(?:v|yapi-)[\da-zA-Z]+)' _TESTS = [ { diff --git a/youtube_dl/extractor/vesti.py b/youtube_dl/extractor/vesti.py index a0c59a2e0..cb64ae0bd 100644 --- a/youtube_dl/extractor/vesti.py +++ b/youtube_dl/extractor/vesti.py @@ -10,7 +10,7 @@ from .rutv import RUTVIE class VestiIE(InfoExtractor): IE_DESC = 'Вести.Ru' - _VALID_URL = r'http://(?:.+?\.)?vesti\.ru/(?P<id>.+)' + _VALID_URL = r'https?://(?:.+?\.)?vesti\.ru/(?P<id>.+)' _TESTS = [ { diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py index 152fef42e..147480f64 100644 --- a/youtube_dl/extractor/vevo.py +++ b/youtube_dl/extractor/vevo.py @@ -152,7 +152,7 @@ class VevoIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - json_url = 'http://videoplayer.vevo.com/VideoService/AuthenticateVideo?isrc=%s' % video_id + json_url = 'http://api.vevo.com/VideoService/AuthenticateVideo?isrc=%s' % video_id response = self._download_json( json_url, video_id, 'Downloading video info', 'Unable to download info') video_info = response.get('video') or {} diff --git a/youtube_dl/extractor/vgtv.py b/youtube_dl/extractor/vgtv.py index e148b1ef5..b11cd254c 100644 --- a/youtube_dl/extractor/vgtv.py +++ b/youtube_dl/extractor/vgtv.py @@ -214,7 +214,7 @@ class VGTVIE(XstreamIE): class BTArticleIE(InfoExtractor): IE_NAME = 'bt:article' IE_DESC = 'Bergens Tidende Articles' - _VALID_URL = 'http://(?:www\.)?bt\.no/(?:[^/]+/)+(?P<id>[^/]+)-\d+\.html' + _VALID_URL = r'https?://(?:www\.)?bt\.no/(?:[^/]+/)+(?P<id>[^/]+)-\d+\.html' _TEST = { 'url': 'http://www.bt.no/nyheter/lokalt/Kjemper-for-internatet-1788214.html', 'md5': '2acbe8ad129b3469d5ae51b1158878df', @@ -241,7 +241,7 @@ class BTArticleIE(InfoExtractor): class BTVestlendingenIE(InfoExtractor): IE_NAME = 'bt:vestlendingen' IE_DESC = 'Bergens Tidende - Vestlendingen' - _VALID_URL = 'http://(?:www\.)?bt\.no/spesial/vestlendingen/#!/(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?bt\.no/spesial/vestlendingen/#!/(?P<id>\d+)' _TESTS = [{ 'url': 'http://www.bt.no/spesial/vestlendingen/#!/86588', 'md5': 'd7d17e3337dc80de6d3a540aefbe441b', diff --git a/youtube_dl/extractor/viddler.py b/youtube_dl/extractor/viddler.py index 6bfbd4d85..8d92aee87 100644 --- a/youtube_dl/extractor/viddler.py +++ b/youtube_dl/extractor/viddler.py @@ -2,7 +2,7 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..compat import ( - compat_urllib_parse, + compat_urllib_parse_urlencode, compat_urlparse, ) from ..utils import ( @@ -93,7 +93,7 @@ class ViddlerIE(InfoExtractor): headers = {'Referer': 'http://static.cdn-ec.viddler.com/js/arpeggio/v2/embed.html'} request = sanitized_Request( 'http://api.viddler.com/api/v2/viddler.videos.getPlaybackDetails.json?%s' - % compat_urllib_parse.urlencode(query), None, headers) + % compat_urllib_parse_urlencode(query), None, headers) data = self._download_json(request, video_id)['video'] formats = [] diff --git a/youtube_dl/extractor/videomore.py b/youtube_dl/extractor/videomore.py index 0bd1e1eec..04e95c66e 100644 --- a/youtube_dl/extractor/videomore.py +++ b/youtube_dl/extractor/videomore.py @@ -111,6 +111,7 @@ class VideomoreIE(InfoExtractor): video_url = xpath_text(video, './/video_url', 'video url', fatal=True) formats = self._extract_f4m_formats(video_url, video_id, f4m_id='hds') + self._sort_formats(formats) data = self._download_json( 'http://videomore.ru/video/tracks/%s.json' % video_id, diff --git a/youtube_dl/extractor/videott.py b/youtube_dl/extractor/videott.py index 2cd36508a..0f798711b 100644 --- a/youtube_dl/extractor/videott.py +++ b/youtube_dl/extractor/videott.py @@ -14,7 +14,7 @@ class VideoTtIE(InfoExtractor): _WORKING = False ID_NAME = 'video.tt' IE_DESC = 'video.tt - Your True Tube' - _VALID_URL = r'http://(?:www\.)?video\.tt/(?:(?:video|embed)/|watch_video\.php\?v=)(?P<id>[\da-zA-Z]{9})' + _VALID_URL = r'https?://(?:www\.)?video\.tt/(?:(?:video|embed)/|watch_video\.php\?v=)(?P<id>[\da-zA-Z]{9})' _TESTS = [{ 'url': 'http://www.video.tt/watch_video.php?v=amd5YujV8', diff --git a/youtube_dl/extractor/vier.py b/youtube_dl/extractor/vier.py index c76c20614..6645c6186 100644 --- a/youtube_dl/extractor/vier.py +++ b/youtube_dl/extractor/vier.py @@ -50,6 +50,7 @@ class VierIE(InfoExtractor): playlist_url = 'http://vod.streamcloud.be/%s/mp4:_definst_/%s.mp4/playlist.m3u8' % (application, filename) formats = self._extract_m3u8_formats(playlist_url, display_id, 'mp4') + self._sort_formats(formats) title = self._og_search_title(webpage, default=display_id) description = self._og_search_description(webpage, default=None) diff --git a/youtube_dl/extractor/viidea.py b/youtube_dl/extractor/viidea.py index 315984bf9..a4f914d14 100644 --- a/youtube_dl/extractor/viidea.py +++ b/youtube_dl/extractor/viidea.py @@ -15,7 +15,7 @@ from ..utils import ( class ViideaIE(InfoExtractor): - _VALID_URL = r'''(?x)http://(?:www\.)?(?: + _VALID_URL = r'''(?x)https?://(?:www\.)?(?: videolectures\.net| flexilearn\.viidea\.net| presentations\.ocwconsortium\.org| @@ -151,6 +151,7 @@ class ViideaIE(InfoExtractor): smil_url = '%s/%s/video/%s/smil.xml' % (base_url, lecture_slug, part_id) smil = self._download_smil(smil_url, lecture_id) info = self._parse_smil(smil, smil_url, lecture_id) + self._sort_formats(info['formats']) info['id'] = lecture_id if not multipart else '%s_part%s' % (lecture_id, part_id) info['display_id'] = lecture_slug if not multipart else '%s_part%s' % (lecture_slug, part_id) if multipart: diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 71c30d2cd..707a5735a 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -12,7 +12,6 @@ from ..compat import ( ) from ..utils import ( determine_ext, - encode_dict, ExtractorError, InAdvancePagedList, int_or_none, @@ -42,13 +41,13 @@ class VimeoBaseInfoExtractor(InfoExtractor): self.report_login() webpage = self._download_webpage(self._LOGIN_URL, None, False) token, vuid = self._extract_xsrft_and_vuid(webpage) - data = urlencode_postdata(encode_dict({ + data = urlencode_postdata({ 'action': 'login', 'email': username, 'password': password, 'service': 'vimeo', 'token': token, - })) + }) login_request = sanitized_Request(self._LOGIN_URL, data) login_request.add_header('Content-Type', 'application/x-www-form-urlencoded') login_request.add_header('Referer', self._LOGIN_URL) @@ -255,10 +254,10 @@ class VimeoIE(VimeoBaseInfoExtractor): if password is None: raise ExtractorError('This video is protected by a password, use the --video-password option', expected=True) token, vuid = self._extract_xsrft_and_vuid(webpage) - data = urlencode_postdata(encode_dict({ + data = urlencode_postdata({ 'password': password, 'token': token, - })) + }) if url.startswith('http://'): # vimeo only supports https now, but the user can give an http url url = url.replace('http://', 'https://') @@ -274,7 +273,7 @@ class VimeoIE(VimeoBaseInfoExtractor): password = self._downloader.params.get('videopassword') if password is None: raise ExtractorError('This video is protected by a password, use the --video-password option') - data = urlencode_postdata(encode_dict({'password': password})) + data = urlencode_postdata({'password': password}) pass_url = url + '/check-password' password_request = sanitized_Request(pass_url, data) password_request.add_header('Content-Type', 'application/x-www-form-urlencoded') @@ -575,7 +574,7 @@ class VimeoChannelIE(VimeoBaseInfoExtractor): token, vuid = self._extract_xsrft_and_vuid(webpage) fields['token'] = token fields['password'] = password - post = urlencode_postdata(encode_dict(fields)) + post = urlencode_postdata(fields) password_path = self._search_regex( r'action="([^"]+)"', login_form, 'password URL') password_url = compat_urlparse.urljoin(page_url, password_path) diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index d560a4b5e..67220f1b7 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -5,10 +5,7 @@ import re import json from .common import InfoExtractor -from ..compat import ( - compat_str, - compat_urllib_parse, -) +from ..compat import compat_str from ..utils import ( ExtractorError, int_or_none, @@ -17,6 +14,7 @@ from ..utils import ( str_to_int, unescapeHTML, unified_strdate, + urlencode_postdata, ) from .vimeo import VimeoIE from .pladform import PladformIE @@ -204,7 +202,7 @@ class VKIE(InfoExtractor): request = sanitized_Request( 'https://login.vk.com/?act=login', - compat_urllib_parse.urlencode(login_form).encode('utf-8')) + urlencode_postdata(login_form)) login_page = self._download_webpage( request, None, note='Logging in as %s' % username) diff --git a/youtube_dl/extractor/vlive.py b/youtube_dl/extractor/vlive.py index 9e2aa58bd..baf39bb2c 100644 --- a/youtube_dl/extractor/vlive.py +++ b/youtube_dl/extractor/vlive.py @@ -7,7 +7,7 @@ from ..utils import ( float_or_none, int_or_none, ) -from ..compat import compat_urllib_parse +from ..compat import compat_urllib_parse_urlencode class VLiveIE(InfoExtractor): @@ -43,7 +43,7 @@ class VLiveIE(InfoExtractor): playinfo = self._download_json( 'http://global.apis.naver.com/rmcnmv/rmcnmv/vod_play_videoInfo.json?%s' - % compat_urllib_parse.urlencode({ + % compat_urllib_parse_urlencode({ 'videoId': long_video_id, 'key': key, 'ptc': 'http', @@ -64,7 +64,7 @@ class VLiveIE(InfoExtractor): thumbnail = self._og_search_thumbnail(webpage) creator = self._html_search_regex( - r'<div[^>]+class="info_area"[^>]*>\s*<strong[^>]+class="name"[^>]*>([^<]+)</strong>', + r'<div[^>]+class="info_area"[^>]*>\s*<a\s+[^>]*>([^<]+)', webpage, 'creator', fatal=False) view_count = int_or_none(playinfo.get('meta', {}).get('count')) diff --git a/youtube_dl/extractor/vodlocker.py b/youtube_dl/extractor/vodlocker.py index a97995a6d..a938a4007 100644 --- a/youtube_dl/extractor/vodlocker.py +++ b/youtube_dl/extractor/vodlocker.py @@ -2,11 +2,11 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import compat_urllib_parse from ..utils import ( ExtractorError, NO_DEFAULT, sanitized_Request, + urlencode_postdata, ) @@ -38,7 +38,7 @@ class VodlockerIE(InfoExtractor): if fields['op'] == 'download1': self._sleep(3, video_id) # they do detect when requests happen too fast! - post = compat_urllib_parse.urlencode(fields) + post = urlencode_postdata(fields) req = sanitized_Request(url, post) req.add_header('Content-type', 'application/x-www-form-urlencoded') webpage = self._download_webpage( diff --git a/youtube_dl/extractor/voxmedia.py b/youtube_dl/extractor/voxmedia.py new file mode 100644 index 000000000..0c6b1f030 --- /dev/null +++ b/youtube_dl/extractor/voxmedia.py @@ -0,0 +1,132 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_urllib_parse_unquote + + +class VoxMediaIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?(?:theverge|vox|sbnation|eater|polygon|curbed|racked)\.com/(?:[^/]+/)*(?P<id>[^/?]+)' + _TESTS = [{ + 'url': 'http://www.theverge.com/2014/6/27/5849272/material-world-how-google-discovered-what-software-is-made-of', + 'md5': '73856edf3e89a711e70d5cf7cb280b37', + 'info_dict': { + 'id': '11eXZobjrG8DCSTgrNjVinU-YmmdYjhe', + 'ext': 'mp4', + 'title': 'Google\'s new material design direction', + 'description': 'md5:2f44f74c4d14a1f800ea73e1c6832ad2', + } + }, { + # data-ooyala-id + 'url': 'http://www.theverge.com/2014/10/21/7025853/google-nexus-6-hands-on-photos-video-android-phablet', + 'md5': 'd744484ff127884cd2ba09e3fa604e4b', + 'info_dict': { + 'id': 'RkZXU4cTphOCPDMZg5oEounJyoFI0g-B', + 'ext': 'mp4', + 'title': 'The Nexus 6: hands-on with Google\'s phablet', + 'description': 'md5:87a51fe95ff8cea8b5bdb9ac7ae6a6af', + } + }, { + # volume embed + 'url': 'http://www.vox.com/2016/3/31/11336640/mississippi-lgbt-religious-freedom-bill', + 'md5': '375c483c5080ab8cd85c9c84cfc2d1e4', + 'info_dict': { + 'id': 'wydzk3dDpmRz7PQoXRsTIX6XTkPjYL0b', + 'ext': 'mp4', + 'title': 'The new frontier of LGBTQ civil rights, explained', + 'description': 'md5:0dc58e94a465cbe91d02950f770eb93f', + } + }, { + # youtube embed + 'url': 'http://www.vox.com/2016/3/24/11291692/robot-dance', + 'md5': '83b3080489fb103941e549352d3e0977', + 'info_dict': { + 'id': 'FcNHTJU1ufM', + 'ext': 'mp4', + 'title': 'How "the robot" became the greatest novelty dance of all time', + 'description': 'md5:b081c0d588b8b2085870cda55e6da176', + 'upload_date': '20160324', + 'uploader_id': 'voxdotcom', + 'uploader': 'Vox', + } + }, { + # SBN.VideoLinkset.entryGroup multiple ooyala embeds + 'url': 'http://www.sbnation.com/college-football-recruiting/2015/2/3/7970291/national-signing-day-rationalizations-itll-be-ok-itll-be-ok', + 'info_dict': { + 'id': 'national-signing-day-rationalizations-itll-be-ok-itll-be-ok', + 'title': '25 lies you will tell yourself on National Signing Day', + 'description': 'It\'s the most self-delusional time of the year, and everyone\'s gonna tell the same lies together!', + }, + 'playlist': [{ + 'md5': '721fededf2ab74ae4176c8c8cbfe092e', + 'info_dict': { + 'id': 'p3cThlMjE61VDi_SD9JlIteSNPWVDBB9', + 'ext': 'mp4', + 'title': 'Buddy Hield vs Steph Curry (and the world)', + 'description': 'Let’s dissect only the most important Final Four storylines.', + }, + }, { + 'md5': 'bf0c5cc115636af028be1bab79217ea9', + 'info_dict': { + 'id': 'BmbmVjMjE6esPHxdALGubTrouQ0jYLHj', + 'ext': 'mp4', + 'title': 'Chasing Cinderella 2016: Syracuse basketball', + 'description': 'md5:e02d56b026d51aa32c010676765a690d', + }, + }], + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = compat_urllib_parse_unquote(self._download_webpage(url, display_id)) + + def create_entry(provider_video_id, provider_video_type, title=None, description=None): + return { + '_type': 'url_transparent', + 'url': provider_video_id if provider_video_type == 'youtube' else '%s:%s' % (provider_video_type, provider_video_id), + 'title': title or self._og_search_title(webpage), + 'description': description or self._og_search_description(webpage), + } + + entries = [] + entries_data = self._search_regex([ + r'Chorus\.VideoContext\.addVideo\((\[{.+}\])\);', + r'var\s+entry\s*=\s*({.+});', + r'SBN\.VideoLinkset\.entryGroup\(\s*(\[.+\])', + ], webpage, 'video data', default=None) + if entries_data: + entries_data = self._parse_json(entries_data, display_id) + if isinstance(entries_data, dict): + entries_data = [entries_data] + for video_data in entries_data: + provider_video_id = video_data.get('provider_video_id') + provider_video_type = video_data.get('provider_video_type') + if provider_video_id and provider_video_type: + entries.append(create_entry( + provider_video_id, provider_video_type, + video_data.get('title'), video_data.get('description'))) + + provider_video_id = self._search_regex( + r'data-ooyala-id="([^"]+)"', webpage, 'ooyala id', default=None) + if provider_video_id: + entries.append(create_entry(provider_video_id, 'ooyala')) + + volume_uuid = self._search_regex( + r'data-volume-uuid="([^"]+)"', webpage, 'volume uuid', default=None) + if volume_uuid: + volume_webpage = self._download_webpage( + 'http://volume.vox-cdn.com/embed/%s' % volume_uuid, volume_uuid) + video_data = self._parse_json(self._search_regex( + r'Volume\.createVideo\(({.+})\s*,\s*{.*}\);', volume_webpage, 'video data'), volume_uuid) + for provider_video_type in ('ooyala', 'youtube'): + provider_video_id = video_data.get('%s_id' % provider_video_type) + if provider_video_id: + description = video_data.get('description_long') or video_data.get('description_short') + entries.append(create_entry( + provider_video_id, provider_video_type, video_data.get('title_short'), description)) + break + + if len(entries) == 1: + return entries[0] + else: + return self.playlist_result(entries, display_id, self._og_search_title(webpage), self._og_search_description(webpage)) diff --git a/youtube_dl/extractor/vube.py b/youtube_dl/extractor/vube.py index 149e36467..10ca6acb1 100644 --- a/youtube_dl/extractor/vube.py +++ b/youtube_dl/extractor/vube.py @@ -15,7 +15,7 @@ from ..utils import ( class VubeIE(InfoExtractor): IE_NAME = 'vube' IE_DESC = 'Vube.com' - _VALID_URL = r'http://vube\.com/(?:[^/]+/)+(?P<id>[\da-zA-Z]{10})\b' + _VALID_URL = r'https?://vube\.com/(?:[^/]+/)+(?P<id>[\da-zA-Z]{10})\b' _TESTS = [ { diff --git a/youtube_dl/extractor/vuclip.py b/youtube_dl/extractor/vuclip.py index a6d9b5fee..eaa888f00 100644 --- a/youtube_dl/extractor/vuclip.py +++ b/youtube_dl/extractor/vuclip.py @@ -14,7 +14,7 @@ from ..utils import ( class VuClipIE(InfoExtractor): - _VALID_URL = r'http://(?:m\.)?vuclip\.com/w\?.*?cid=(?P<id>[0-9]+)' + _VALID_URL = r'https?://(?:m\.)?vuclip\.com/w\?.*?cid=(?P<id>[0-9]+)' _TEST = { 'url': 'http://m.vuclip.com/w?cid=922692425&fid=70295&z=1010&nvar&frm=index.html', diff --git a/youtube_dl/extractor/walla.py b/youtube_dl/extractor/walla.py index 24efbd6e6..8b9488340 100644 --- a/youtube_dl/extractor/walla.py +++ b/youtube_dl/extractor/walla.py @@ -11,7 +11,7 @@ from ..utils import ( class WallaIE(InfoExtractor): - _VALID_URL = r'http://vod\.walla\.co\.il/[^/]+/(?P<id>\d+)/(?P<display_id>.+)' + _VALID_URL = r'https?://vod\.walla\.co\.il/[^/]+/(?P<id>\d+)/(?P<display_id>.+)' _TEST = { 'url': 'http://vod.walla.co.il/movie/2642630/one-direction-all-for-one', 'info_dict': { diff --git a/youtube_dl/extractor/wat.py b/youtube_dl/extractor/wat.py index 37cf3d309..5227bb5ad 100644 --- a/youtube_dl/extractor/wat.py +++ b/youtube_dl/extractor/wat.py @@ -12,7 +12,7 @@ from ..utils import ( class WatIE(InfoExtractor): - _VALID_URL = r'(?:wat:(?P<real_id>\d{8})|http://www\.wat\.tv/video/(?P<display_id>.*)-(?P<short_id>.*?)_.*?\.html)' + _VALID_URL = r'(?:wat:(?P<real_id>\d{8})|https?://www\.wat\.tv/video/(?P<display_id>.*)-(?P<short_id>.*?)_.*?\.html)' IE_NAME = 'wat.tv' _TESTS = [ { diff --git a/youtube_dl/extractor/wdr.py b/youtube_dl/extractor/wdr.py index a851578e0..31c904303 100644 --- a/youtube_dl/extractor/wdr.py +++ b/youtube_dl/extractor/wdr.py @@ -244,7 +244,7 @@ class WDRMobileIE(InfoExtractor): class WDRMausIE(InfoExtractor): - _VALID_URL = 'http://(?:www\.)?wdrmaus\.de/(?:[^/]+/){,2}(?P<id>[^/?#]+)(?:/index\.php5|(?<!index)\.php5|/(?:$|[?#]))' + _VALID_URL = r'https?://(?:www\.)?wdrmaus\.de/(?:[^/]+/){,2}(?P<id>[^/?#]+)(?:/index\.php5|(?<!index)\.php5|/(?:$|[?#]))' IE_DESC = 'Sendung mit der Maus' _TESTS = [{ 'url': 'http://www.wdrmaus.de/aktuelle-sendung/index.php5', diff --git a/youtube_dl/extractor/weiqitv.py b/youtube_dl/extractor/weiqitv.py index e333ae345..3dafbeec2 100644 --- a/youtube_dl/extractor/weiqitv.py +++ b/youtube_dl/extractor/weiqitv.py @@ -6,7 +6,7 @@ from .common import InfoExtractor class WeiqiTVIE(InfoExtractor): IE_DESC = 'WQTV' - _VALID_URL = r'http://www\.weiqitv\.com/index/video_play\?videoId=(?P<id>[A-Za-z0-9]+)' + _VALID_URL = r'https?://www\.weiqitv\.com/index/video_play\?videoId=(?P<id>[A-Za-z0-9]+)' _TESTS = [{ 'url': 'http://www.weiqitv.com/index/video_play?videoId=53c744f09874f0e76a8b46f3', diff --git a/youtube_dl/extractor/wimp.py b/youtube_dl/extractor/wimp.py index fb0accac7..828c03dc3 100644 --- a/youtube_dl/extractor/wimp.py +++ b/youtube_dl/extractor/wimp.py @@ -5,7 +5,7 @@ from .youtube import YoutubeIE class WimpIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?wimp\.com/(?P<id>[^/]+)' + _VALID_URL = r'https?://(?:www\.)?wimp\.com/(?P<id>[^/]+)' _TESTS = [{ 'url': 'http://www.wimp.com/maruexhausted/', 'md5': 'ee21217ffd66d058e8b16be340b74883', diff --git a/youtube_dl/extractor/wistia.py b/youtube_dl/extractor/wistia.py index 41061dd31..8b14840a2 100644 --- a/youtube_dl/extractor/wistia.py +++ b/youtube_dl/extractor/wistia.py @@ -4,6 +4,7 @@ from .common import InfoExtractor from ..utils import ( ExtractorError, sanitized_Request, + int_or_none, ) @@ -18,6 +19,9 @@ class WistiaIE(InfoExtractor): 'id': 'sh7fpupwlt', 'ext': 'mov', 'title': 'Being Resourceful', + 'description': 'a Clients From Hell Video Series video from worldwidewebhosting', + 'upload_date': '20131204', + 'timestamp': 1386185018, 'duration': 117, }, } @@ -32,35 +36,43 @@ class WistiaIE(InfoExtractor): raise ExtractorError('Error while getting the playlist', expected=True) data = data_json['media'] + title = data['name'] formats = [] thumbnails = [] for a in data['assets']: + astatus = a.get('status') atype = a.get('type') - if atype == 'still': + if (astatus is not None and astatus != 2) or atype == 'preview': + continue + elif atype in ('still', 'still_image'): thumbnails.append({ 'url': a['url'], 'resolution': '%dx%d' % (a['width'], a['height']), }) - continue - if atype == 'preview': - continue - formats.append({ - 'format_id': atype, - 'url': a['url'], - 'width': a['width'], - 'height': a['height'], - 'filesize': a['size'], - 'ext': a['ext'], - 'preference': 1 if atype == 'original' else None, - }) + else: + formats.append({ + 'format_id': atype, + 'url': a['url'], + 'tbr': int_or_none(a.get('bitrate')), + 'vbr': int_or_none(a.get('opt_vbitrate')), + 'width': int_or_none(a.get('width')), + 'height': int_or_none(a.get('height')), + 'filesize': int_or_none(a.get('size')), + 'vcodec': a.get('codec'), + 'container': a.get('container'), + 'ext': a.get('ext'), + 'preference': 1 if atype == 'original' else None, + }) self._sort_formats(formats) return { 'id': video_id, - 'title': data['name'], + 'title': title, + 'description': data.get('seoDescription'), 'formats': formats, 'thumbnails': thumbnails, - 'duration': data.get('duration'), + 'duration': int_or_none(data.get('duration')), + 'timestamp': int_or_none(data.get('createdAt')), } diff --git a/youtube_dl/extractor/xbef.py b/youtube_dl/extractor/xbef.py index 4ff99e5ca..e4a2baad2 100644 --- a/youtube_dl/extractor/xbef.py +++ b/youtube_dl/extractor/xbef.py @@ -5,7 +5,7 @@ from ..compat import compat_urllib_parse_unquote class XBefIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?xbef\.com/video/(?P<id>[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?xbef\.com/video/(?P<id>[0-9]+)' _TEST = { 'url': 'http://xbef.com/video/5119-glamourous-lesbians-smoking-drinking-and-fucking', 'md5': 'a478b565baff61634a98f5e5338be995', diff --git a/youtube_dl/extractor/xfileshare.py b/youtube_dl/extractor/xfileshare.py index 94abdb4f3..2d1504eaa 100644 --- a/youtube_dl/extractor/xfileshare.py +++ b/youtube_dl/extractor/xfileshare.py @@ -4,12 +4,11 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_urllib_parse from ..utils import ( ExtractorError, - encode_dict, int_or_none, sanitized_Request, + urlencode_postdata, ) @@ -109,7 +108,7 @@ class XFileShareIE(InfoExtractor): if countdown: self._sleep(countdown, video_id) - post = compat_urllib_parse.urlencode(encode_dict(fields)) + post = urlencode_postdata(fields) req = sanitized_Request(url, post) req.add_header('Content-type', 'application/x-www-form-urlencoded') diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py index fd43e8854..b3547174d 100644 --- a/youtube_dl/extractor/xhamster.py +++ b/youtube_dl/extractor/xhamster.py @@ -4,6 +4,7 @@ import re from .common import InfoExtractor from ..utils import ( + dict_get, float_or_none, int_or_none, unified_strdate, @@ -170,6 +171,12 @@ class XHamsterEmbedIE(InfoExtractor): video_url = self._search_regex( r'href="(https?://xhamster\.com/movies/%s/[^"]+\.html[^"]*)"' % video_id, - webpage, 'xhamster url') + webpage, 'xhamster url', default=None) + + if not video_url: + vars = self._parse_json( + self._search_regex(r'vars\s*:\s*({.+?})\s*,\s*\n', webpage, 'vars'), + video_id) + video_url = dict_get(vars, ('downloadLink', 'homepageLink', 'commentsLink', 'shareUrl')) return self.url_result(video_url, 'XHamster') diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py index 4c6142927..b2d8f4b48 100644 --- a/youtube_dl/extractor/yahoo.py +++ b/youtube_dl/extractor/yahoo.py @@ -8,6 +8,7 @@ import re from .common import InfoExtractor, SearchInfoExtractor from ..compat import ( compat_urllib_parse, + compat_urllib_parse_urlencode, compat_urlparse, ) from ..utils import ( @@ -303,7 +304,7 @@ class YahooIE(InfoExtractor): region = self._search_regex( r'\\?"region\\?"\s*:\s*\\?"([^"]+?)\\?"', webpage, 'region', fatal=False, default='US') - data = compat_urllib_parse.urlencode({ + data = compat_urllib_parse_urlencode({ 'protocol': 'http', 'region': region, }) diff --git a/youtube_dl/extractor/yam.py b/youtube_dl/extractor/yam.py index 001ee17b6..63bbc0634 100644 --- a/youtube_dl/extractor/yam.py +++ b/youtube_dl/extractor/yam.py @@ -15,7 +15,7 @@ from ..utils import ( class YamIE(InfoExtractor): IE_DESC = '蕃薯藤yam天空部落' - _VALID_URL = r'http://mymedia.yam.com/m/(?P<id>\d+)' + _VALID_URL = r'https?://mymedia.yam.com/m/(?P<id>\d+)' _TESTS = [{ # An audio hosted on Yam diff --git a/youtube_dl/extractor/yandexmusic.py b/youtube_dl/extractor/yandexmusic.py index e699e663f..025716958 100644 --- a/youtube_dl/extractor/yandexmusic.py +++ b/youtube_dl/extractor/yandexmusic.py @@ -5,15 +5,13 @@ import re import hashlib from .common import InfoExtractor -from ..compat import ( - compat_str, - compat_urllib_parse, -) +from ..compat import compat_str from ..utils import ( ExtractorError, int_or_none, float_or_none, sanitized_Request, + urlencode_postdata, ) @@ -170,14 +168,14 @@ class YandexMusicPlaylistIE(YandexMusicPlaylistBaseIE): missing_track_ids = set(map(compat_str, track_ids)) - set(present_track_ids) request = sanitized_Request( 'https://music.yandex.ru/handlers/track-entries.jsx', - compat_urllib_parse.urlencode({ + urlencode_postdata({ 'entries': ','.join(missing_track_ids), 'lang': mu.get('settings', {}).get('lang', 'en'), 'external-domain': 'music.yandex.ru', 'overembed': 'false', 'sign': mu.get('authData', {}).get('user', {}).get('sign'), 'strict': 'true', - }).encode('utf-8')) + })) request.add_header('Referer', url) request.add_header('X-Requested-With', 'XMLHttpRequest') diff --git a/youtube_dl/extractor/ynet.py b/youtube_dl/extractor/ynet.py index 869f3e819..0d943c343 100644 --- a/youtube_dl/extractor/ynet.py +++ b/youtube_dl/extractor/ynet.py @@ -9,7 +9,7 @@ from ..compat import compat_urllib_parse_unquote_plus class YnetIE(InfoExtractor): - _VALID_URL = r'http://(?:.+?\.)?ynet\.co\.il/(?:.+?/)?0,7340,(?P<id>L(?:-[0-9]+)+),00\.html' + _VALID_URL = r'https?://(?:.+?\.)?ynet\.co\.il/(?:.+?/)?0,7340,(?P<id>L(?:-[0-9]+)+),00\.html' _TESTS = [ { 'url': 'http://hot.ynet.co.il/home/0,7340,L-11659-99244,00.html', @@ -41,10 +41,12 @@ class YnetIE(InfoExtractor): m = re.search(r'ynet - HOT -- (["\']+)(?P<title>.+?)\1', title) if m: title = m.group('title') + formats = self._extract_f4m_formats(f4m_url, video_id) + self._sort_formats(formats) return { 'id': video_id, 'title': title, - 'formats': self._extract_f4m_formats(f4m_url, video_id), + 'formats': formats, 'thumbnail': self._og_search_thumbnail(webpage), } diff --git a/youtube_dl/extractor/youku.py b/youtube_dl/extractor/youku.py index 900eb2aba..fd7eb5a6d 100644 --- a/youtube_dl/extractor/youku.py +++ b/youtube_dl/extractor/youku.py @@ -8,7 +8,7 @@ import time from .common import InfoExtractor from ..compat import ( - compat_urllib_parse, + compat_urllib_parse_urlencode, compat_ord, ) from ..utils import ( @@ -138,7 +138,7 @@ class YoukuIE(InfoExtractor): '_00' + \ '/st/' + self.parse_ext_l(format) + \ '/fileid/' + get_fileid(format, n) + '?' + \ - compat_urllib_parse.urlencode(param) + compat_urllib_parse_urlencode(param) video_urls.append(video_url) video_urls_dict[format] = video_urls diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 27e67feb4..28355bf46 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -17,16 +17,15 @@ from ..swfinterp import SWFInterpreter from ..compat import ( compat_chr, compat_parse_qs, - compat_urllib_parse, compat_urllib_parse_unquote, compat_urllib_parse_unquote_plus, + compat_urllib_parse_urlencode, compat_urllib_parse_urlparse, compat_urlparse, compat_str, ) from ..utils import ( clean_html, - encode_dict, error_to_compat_str, ExtractorError, float_or_none, @@ -45,6 +44,7 @@ from ..utils import ( unified_strdate, unsmuggle_url, uppercase_escape, + urlencode_postdata, ISO3166Utils, ) @@ -116,7 +116,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): 'hl': 'en_US', } - login_data = compat_urllib_parse.urlencode(encode_dict(login_form_strs)).encode('ascii') + login_data = urlencode_postdata(login_form_strs) req = sanitized_Request(self._LOGIN_URL, login_data) login_results = self._download_webpage( @@ -149,7 +149,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): 'TrustDevice': 'on', }) - tfa_data = compat_urllib_parse.urlencode(encode_dict(tfa_form_strs)).encode('ascii') + tfa_data = urlencode_postdata(tfa_form_strs) tfa_req = sanitized_Request(self._TWOFACTOR_URL, tfa_data) tfa_results = self._download_webpage( @@ -234,7 +234,9 @@ class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor): class YoutubePlaylistsBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor): def _process_page(self, content): - for playlist_id in orderedSet(re.findall(r'href="/?playlist\?list=([0-9A-Za-z-_]{10,})"', content)): + for playlist_id in orderedSet(re.findall( + r'<h3[^>]+class="[^"]*yt-lockup-title[^"]*"[^>]*><a[^>]+href="/?playlist\?list=([0-9A-Za-z-_]{10,})"', + content)): yield self.url_result( 'https://www.youtube.com/playlist?list=%s' % playlist_id, 'YoutubePlaylist') @@ -309,6 +311,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20}, # Apple HTTP Live Streaming + '91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10}, '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10}, '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10}, '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10}, @@ -1006,7 +1009,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): continue sub_formats = [] for ext in self._SUBTITLE_FORMATS: - params = compat_urllib_parse.urlencode({ + params = compat_urllib_parse_urlencode({ 'lang': lang, 'v': video_id, 'fmt': ext, @@ -1055,7 +1058,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if caption_url: timestamp = args['timestamp'] # We get the available subtitles - list_params = compat_urllib_parse.urlencode({ + list_params = compat_urllib_parse_urlencode({ 'type': 'list', 'tlangs': 1, 'asrs': 1, @@ -1074,7 +1077,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): sub_lang = lang_node.attrib['lang_code'] sub_formats = [] for ext in self._SUBTITLE_FORMATS: - params = compat_urllib_parse.urlencode({ + params = compat_urllib_parse_urlencode({ 'lang': original_lang, 'tlang': sub_lang, 'fmt': ext, @@ -1093,7 +1096,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): caption_tracks = args['caption_tracks'] caption_translation_languages = args['caption_translation_languages'] caption_url = compat_parse_qs(caption_tracks.split(',')[0])['u'][0] - parsed_caption_url = compat_urlparse.urlparse(caption_url) + parsed_caption_url = compat_urllib_parse_urlparse(caption_url) caption_qs = compat_parse_qs(parsed_caption_url.query) sub_lang_list = {} @@ -1109,7 +1112,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'fmt': [ext], }) sub_url = compat_urlparse.urlunparse(parsed_caption_url._replace( - query=compat_urllib_parse.urlencode(caption_qs, True))) + query=compat_urllib_parse_urlencode(caption_qs, True))) sub_formats.append({ 'url': sub_url, 'ext': ext, @@ -1139,7 +1142,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'cpn': [cpn], }) playback_url = compat_urlparse.urlunparse( - parsed_playback_url._replace(query=compat_urllib_parse.urlencode(qs, True))) + parsed_playback_url._replace(query=compat_urllib_parse_urlencode(qs, True))) self._download_webpage( playback_url, video_id, 'Marking watched', @@ -1224,7 +1227,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # this can be viewed without login into Youtube url = proto + '://www.youtube.com/embed/%s' % video_id embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage') - data = compat_urllib_parse.urlencode({ + data = compat_urllib_parse_urlencode({ 'video_id': video_id, 'eurl': 'https://youtube.googleapis.com/v/' + video_id, 'sts': self._search_regex( @@ -1910,7 +1913,8 @@ class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor): @classmethod def suitable(cls, url): - return False if YoutubePlaylistsIE.suitable(url) else super(YoutubeChannelIE, cls).suitable(url) + return (False if YoutubePlaylistsIE.suitable(url) or YoutubeLiveIE.suitable(url) + else super(YoutubeChannelIE, cls).suitable(url)) def _real_extract(self, url): channel_id = self._match_id(url) @@ -1985,6 +1989,51 @@ class YoutubeUserIE(YoutubeChannelIE): return super(YoutubeUserIE, cls).suitable(url) +class YoutubeLiveIE(YoutubeBaseInfoExtractor): + IE_DESC = 'YouTube.com live streams' + _VALID_URL = r'(?P<base_url>https?://(?:\w+\.)?youtube\.com/(?:user|channel)/(?P<id>[^/]+))/live' + IE_NAME = 'youtube:live' + + _TESTS = [{ + 'url': 'http://www.youtube.com/user/TheYoungTurks/live', + 'info_dict': { + 'id': 'a48o2S1cPoo', + 'ext': 'mp4', + 'title': 'The Young Turks - Live Main Show', + 'uploader': 'The Young Turks', + 'uploader_id': 'TheYoungTurks', + 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks', + 'upload_date': '20150715', + 'license': 'Standard YouTube License', + 'description': 'md5:438179573adcdff3c97ebb1ee632b891', + 'categories': ['News & Politics'], + 'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'], + 'like_count': int, + 'dislike_count': int, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + channel_id = mobj.group('id') + base_url = mobj.group('base_url') + webpage = self._download_webpage(url, channel_id, fatal=False) + if webpage: + page_type = self._og_search_property( + 'type', webpage, 'page type', default=None) + video_id = self._html_search_meta( + 'videoId', webpage, 'video id', default=None) + if page_type == 'video' and video_id and re.match(r'^[0-9A-Za-z_-]{11}$', video_id): + return self.url_result(video_id, YoutubeIE.ie_key()) + return self.url_result(base_url) + + class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor): IE_DESC = 'YouTube.com user/channel playlists' _VALID_URL = r'https?://(?:\w+\.)?youtube\.com/(?:user|channel)/(?P<id>[^/]+)/playlists' @@ -2038,7 +2087,7 @@ class YoutubeSearchIE(SearchInfoExtractor, YoutubePlaylistIE): 'spf': 'navigate', } url_query.update(self._EXTRA_QUERY_ARGS) - result_url = 'https://www.youtube.com/results?' + compat_urllib_parse.urlencode(url_query) + result_url = 'https://www.youtube.com/results?' + compat_urllib_parse_urlencode(url_query) data = self._download_json( result_url, video_id='query "%s"' % query, note='Downloading page %s' % pagenum, diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 9dd7a8034..7819f14ab 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -399,6 +399,10 @@ def parseOpts(overrideArguments=None): '-R', '--retries', dest='retries', metavar='RETRIES', default=10, help='Number of retries (default is %default), or "infinite".') + downloader.add_option( + '--fragment-retries', + dest='fragment_retries', metavar='RETRIES', default=10, + help='Number of retries for a fragment (default is %default), or "infinite" (DASH only)') downloader.add_option( '--buffer-size', dest='buffersize', metavar='SIZE', default='1024', @@ -720,7 +724,7 @@ def parseOpts(overrideArguments=None): postproc.add_option( '--embed-subs', action='store_true', dest='embedsubtitles', default=False, - help='Embed subtitles in the video (only for mkv and mp4 videos)') + help='Embed subtitles in the video (only for mp4, webm and mkv videos)') postproc.add_option( '--embed-thumbnail', action='store_true', dest='embedthumbnail', default=False, diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py index a8819f258..b64cd396b 100644 --- a/youtube_dl/postprocessor/ffmpeg.py +++ b/youtube_dl/postprocessor/ffmpeg.py @@ -331,17 +331,34 @@ class FFmpegVideoConvertorPP(FFmpegPostProcessor): class FFmpegEmbedSubtitlePP(FFmpegPostProcessor): def run(self, information): - if information['ext'] not in ['mp4', 'mkv']: - self._downloader.to_screen('[ffmpeg] Subtitles can only be embedded in mp4 or mkv files') + if information['ext'] not in ('mp4', 'webm', 'mkv'): + self._downloader.to_screen('[ffmpeg] Subtitles can only be embedded in mp4, webm or mkv files') return [], information subtitles = information.get('requested_subtitles') if not subtitles: self._downloader.to_screen('[ffmpeg] There aren\'t any subtitles to embed') return [], information - sub_langs = list(subtitles.keys()) filename = information['filepath'] - sub_filenames = [subtitles_filename(filename, lang, sub_info['ext']) for lang, sub_info in subtitles.items()] + + ext = information['ext'] + sub_langs = [] + sub_filenames = [] + webm_vtt_warn = False + + for lang, sub_info in subtitles.items(): + sub_ext = sub_info['ext'] + if ext != 'webm' or ext == 'webm' and sub_ext == 'vtt': + sub_langs.append(lang) + sub_filenames.append(subtitles_filename(filename, lang, sub_ext)) + else: + if not webm_vtt_warn and ext == 'webm' and sub_ext != 'vtt': + webm_vtt_warn = True + self._downloader.to_screen('[ffmpeg] Only WebVTT subtitles can be embedded in webm files') + + if not sub_langs: + return [], information + input_files = [filename] + sub_filenames opts = [ @@ -519,7 +536,7 @@ class FFmpegSubtitlesConvertorPP(FFmpegPostProcessor): sub_filenames.append(old_file) new_file = subtitles_filename(filename, lang, new_ext) - if ext == 'dfxp' or ext == 'ttml': + if ext == 'dfxp' or ext == 'ttml' or ext == 'tt': self._downloader.report_warning( 'You have requested to convert dfxp (TTML) subtitles into another format, ' 'which results in style information loss') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 9fd0ec8d5..5c4ab2748 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -35,6 +35,7 @@ import xml.etree.ElementTree import zlib from .compat import ( + compat_HTMLParser, compat_basestring, compat_chr, compat_etree_fromstring, @@ -46,9 +47,11 @@ from .compat import ( compat_str, compat_urllib_error, compat_urllib_parse, + compat_urllib_parse_urlencode, compat_urllib_parse_urlparse, compat_urllib_request, compat_urlparse, + compat_xpath, shlex_quote, ) @@ -164,12 +167,7 @@ if sys.version_info >= (2, 7): return node.find(expr) else: def find_xpath_attr(node, xpath, key, val=None): - # Here comes the crazy part: In 2.6, if the xpath is a unicode, - # .//node does not match if a node is a direct child of . ! - if isinstance(xpath, compat_str): - xpath = xpath.encode('ascii') - - for f in node.findall(xpath): + for f in node.findall(compat_xpath(xpath)): if key not in f.attrib: continue if val is None or f.attrib.get(key) == val: @@ -194,9 +192,7 @@ def xpath_with_ns(path, ns_map): def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT): def _find_xpath(xpath): - if sys.version_info < (2, 7): # Crazy 2.6 - xpath = xpath.encode('ascii') - return node.find(xpath) + return node.find(compat_xpath(xpath)) if isinstance(xpath, (str, compat_str)): n = _find_xpath(xpath) @@ -273,6 +269,38 @@ def get_element_by_attribute(attribute, value, html): return unescapeHTML(res) +class HTMLAttributeParser(compat_HTMLParser): + """Trivial HTML parser to gather the attributes for a single element""" + def __init__(self): + self.attrs = {} + compat_HTMLParser.__init__(self) + + def handle_starttag(self, tag, attrs): + self.attrs = dict(attrs) + + +def extract_attributes(html_element): + """Given a string for an HTML element such as + <el + a="foo" B="bar" c="&98;az" d=boz + empty= noval entity="&" + sq='"' dq="'" + > + Decode and return a dictionary of attributes. + { + 'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz', + 'empty': '', 'noval': None, 'entity': '&', + 'sq': '"', 'dq': '\'' + }. + NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions, + but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5. + """ + parser = HTMLAttributeParser() + parser.feed(html_element) + parser.close() + return parser.attrs + + def clean_html(html): """Clean an HTML snippet into a readable string""" @@ -389,9 +417,12 @@ def sanitize_path(s): # Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of # unwanted failures due to missing protocol +def sanitize_url(url): + return 'http:%s' % url if url.startswith('//') else url + + def sanitized_Request(url, *args, **kwargs): - return compat_urllib_request.Request( - 'http:%s' % url if url.startswith('//') else url, *args, **kwargs) + return compat_urllib_request.Request(sanitize_url(url), *args, **kwargs) def orderedSet(iterable): @@ -747,12 +778,7 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler): # Substitute URL if any change after escaping if url != url_escaped: - req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request - new_req = req_type( - url_escaped, data=req.data, headers=req.headers, - origin_req_host=req.origin_req_host, unverifiable=req.unverifiable) - new_req.timeout = req.timeout - req = new_req + req = update_Request(req, url=url_escaped) for h, v in std_headers.items(): # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275 @@ -1288,7 +1314,7 @@ def shell_quote(args): def smuggle_url(url, data): """ Pass additional data in a URL for internal use. """ - sdata = compat_urllib_parse.urlencode( + sdata = compat_urllib_parse_urlencode( {'__youtubedl_smuggle': json.dumps(data)}) return url + '#' + sdata @@ -1319,7 +1345,7 @@ def format_bytes(bytes): def lookup_unit_table(unit_table, s): units_re = '|'.join(re.escape(u) for u in unit_table) m = re.match( - r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s) + r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s) if not m: return None num_str = m.group('num').replace(',', '.') @@ -1719,6 +1745,7 @@ def escape_url(url): """Escape URL as suggested by RFC 3986""" url_parsed = compat_urllib_parse_urlparse(url) return url_parsed._replace( + netloc=url_parsed.netloc.encode('idna').decode('ascii'), path=escape_rfc3986(url_parsed.path), params=escape_rfc3986(url_parsed.params), query=escape_rfc3986(url_parsed.query), @@ -1728,7 +1755,8 @@ def escape_url(url): try: struct.pack('!I', 0) except TypeError: - # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument + # In Python 2.6 and 2.7.x < 2.7.7, struct requires a bytes argument + # See https://bugs.python.org/issue19099 def struct_pack(spec, *args): if isinstance(spec, compat_str): spec = spec.encode('ascii') @@ -1760,22 +1788,29 @@ def read_batch_urls(batch_fd): def urlencode_postdata(*args, **kargs): - return compat_urllib_parse.urlencode(*args, **kargs).encode('ascii') + return compat_urllib_parse_urlencode(*args, **kargs).encode('ascii') def update_url_query(url, query): parsed_url = compat_urlparse.urlparse(url) qs = compat_parse_qs(parsed_url.query) qs.update(query) - qs = encode_dict(qs) return compat_urlparse.urlunparse(parsed_url._replace( - query=compat_urllib_parse.urlencode(qs, True))) + query=compat_urllib_parse_urlencode(qs, True))) -def encode_dict(d, encoding='utf-8'): - def encode(v): - return v.encode(encoding) if isinstance(v, compat_basestring) else v - return dict((encode(k), encode(v)) for k, v in d.items()) +def update_Request(req, url=None, data=None, headers={}, query={}): + req_headers = req.headers.copy() + req_headers.update(headers) + req_data = data or req.data + req_url = update_url_query(url or req.get_full_url(), query) + req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request + new_req = req_type( + req_url, data=req_data, headers=req_headers, + origin_req_host=req.origin_req_host, unverifiable=req.unverifiable) + if hasattr(req, 'timeout'): + new_req.timeout = req.timeout + return new_req def dict_get(d, key_or_keys, default=None, skip_false_values=True): diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 9216fa547..d9e1cb2a8 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2016.03.14' +__version__ = '2016.04.01'