diff --git a/.gitignore b/.gitignore index 0422adf44..26dbde73d 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ *.pyc *.pyo +*.class *~ *.DS_Store wine-py2exe/ @@ -32,4 +33,4 @@ test/testdata .tox youtube-dl.zsh .idea -.idea/* \ No newline at end of file +.idea/* diff --git a/AUTHORS b/AUTHORS index b51e23f2d..ea8d39978 100644 --- a/AUTHORS +++ b/AUTHORS @@ -161,3 +161,9 @@ Jens Wille Robin Houtevelts Patrick Griffis Aidan Rowe +mutantmonkey +Ben Congdon +Kacper Michajłow +José Joaquín Atria +Viťas Strádal +Kagami Hiiragi diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index c996f03ab..0df6193fb 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -85,7 +85,7 @@ To run the test, simply invoke your favorite test runner, or execute a test file If you want to create a build of youtube-dl yourself, you'll need * python -* make +* make (both GNU make and BSD make are supported) * pandoc * zip * nosetests diff --git a/Makefile b/Makefile index cb449b7e6..6689ec06f 100644 --- a/Makefile +++ b/Makefile @@ -3,6 +3,7 @@ all: youtube-dl README.md CONTRIBUTING.md README.txt youtube-dl.1 youtube-dl.bas clean: rm -rf youtube-dl.1.temp.md youtube-dl.1 youtube-dl.bash-completion README.txt MANIFEST build/ dist/ .coverage cover/ youtube-dl.tar.gz youtube-dl.zsh youtube-dl.fish *.dump *.part *.info.json *.mp4 *.flv *.mp3 *.avi CONTRIBUTING.md.tmp youtube-dl youtube-dl.exe find . -name "*.pyc" -delete + find . 
-name "*.class" -delete PREFIX ?= /usr/local BINDIR ?= $(PREFIX)/bin @@ -11,15 +12,7 @@ SHAREDIR ?= $(PREFIX)/share PYTHON ?= /usr/bin/env python # set SYSCONFDIR to /etc if PREFIX=/usr or PREFIX=/usr/local -ifeq ($(PREFIX),/usr) - SYSCONFDIR=/etc -else - ifeq ($(PREFIX),/usr/local) - SYSCONFDIR=/etc - else - SYSCONFDIR=$(PREFIX)/etc - endif -endif +SYSCONFDIR != if [ $(PREFIX) = /usr -o $(PREFIX) = /usr/local ]; then echo /etc; else echo $(PREFIX)/etc; fi install: youtube-dl youtube-dl.1 youtube-dl.bash-completion youtube-dl.zsh youtube-dl.fish install -d $(DESTDIR)$(BINDIR) @@ -44,7 +37,7 @@ test: ot: offlinetest offlinetest: codetest - nosetests --verbose test --exclude test_download.py --exclude test_age_restriction.py --exclude test_subtitles.py --exclude test_write_annotations.py --exclude test_youtube_lists.py --exclude test_iqiyi_sdk_interpreter.py + $(PYTHON) -m nose --verbose test --exclude test_download.py --exclude test_age_restriction.py --exclude test_subtitles.py --exclude test_write_annotations.py --exclude test_youtube_lists.py --exclude test_iqiyi_sdk_interpreter.py tar: youtube-dl.tar.gz diff --git a/README.md b/README.md index 45d4c29ca..eff3ac6c6 100644 --- a/README.md +++ b/README.md @@ -80,6 +80,8 @@ which means you can modify it, redistribute it or use it however you like. on Windows) --flat-playlist Do not extract the videos of a playlist, only list them. + --mark-watched Mark videos watched (YouTube only) + --no-mark-watched Do not mark videos watched (YouTube only) --no-color Do not emit color codes in output ## Network Options: @@ -172,6 +174,8 @@ which means you can modify it, redistribute it or use it however you like. (e.g. 50K or 4.2M) -R, --retries RETRIES Number of retries (default is 10), or "infinite". + --fragment-retries RETRIES Number of retries for a fragment (default + is 10), or "infinite" (DASH only) --buffer-size SIZE Size of download buffer (e.g. 
1024 or 16K) (default is 1024) --no-resize-buffer Do not automatically adjust the buffer @@ -189,7 +193,7 @@ which means you can modify it, redistribute it or use it however you like. to play it) --external-downloader COMMAND Use the specified external downloader. Currently supports - aria2c,axel,curl,httpie,wget + aria2c,avconv,axel,curl,ffmpeg,httpie,wget --external-downloader-args ARGS Give these arguments to the external downloader @@ -384,8 +388,8 @@ which means you can modify it, redistribute it or use it however you like. --no-post-overwrites Do not overwrite post-processed files; the post-processed files are overwritten by default - --embed-subs Embed subtitles in the video (only for mkv - and mp4 videos) + --embed-subs Embed subtitles in the video (only for mp4, + webm and mkv videos) --embed-thumbnail Embed thumbnail in the audio as cover art --add-metadata Write metadata to the video file --metadata-from-title FORMAT Parse additional metadata like song title / @@ -839,7 +843,7 @@ To run the test, simply invoke your favorite test runner, or execute a test file If you want to create a build of youtube-dl yourself, you'll need * python -* make +* make (both GNU make and BSD make are supported) * pandoc * zip * nosetests diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 43403233d..00b8c247c 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -54,6 +54,7 @@ - **AtresPlayer** - **ATTTechChannel** - **AudiMedia** + - **AudioBoom** - **audiomack** - **audiomack:album** - **Azubu** @@ -73,6 +74,7 @@ - **Bigflix** - **Bild**: Bild.de - **BiliBili** + - **BioBioChileTV** - **BleacherReport** - **BleacherReportCMS** - **blinkx** @@ -80,6 +82,7 @@ - **BokeCC** - **Bpb**: Bundeszentrale für politische Bildung - **BR**: Bayerischer Rundfunk Mediathek + - **BravoTV** - **Break** - **brightcove:legacy** - **brightcove:new** @@ -98,6 +101,7 @@ - **CBSNews**: CBS News - **CBSNewsLiveVideo**: CBS News Live Videos - **CBSSports** + - **CDA** 
- **CeskaTelevize** - **channel9**: Channel 9 - **Chaturbate** @@ -167,6 +171,8 @@ - **Dump** - **Dumpert** - **dvtv**: http://video.aktualne.cz/ + - **dw** + - **dw:article** - **EaglePlatform** - **EbaumsWorld** - **EchoMsk** @@ -190,10 +196,10 @@ - **ExpoTV** - **ExtremeTube** - **facebook** - - **facebook:post** - **faz.net** - **fc2** - **Fczenit** + - **features.aol.com** - **fernsehkritik.tv** - **Firstpost** - **FiveTV** @@ -240,6 +246,7 @@ - **GPUTechConf** - **Groupon** - **Hark** + - **HBO** - **HearThisAt** - **Heise** - **HellPorno** @@ -293,6 +300,7 @@ - **kontrtube**: KontrTube.ru - Труба зовёт - **KrasView**: Красвью - **Ku6** + - **KUSI** - **kuwo:album**: 酷我音乐 - 专辑 - **kuwo:category**: 酷我音乐 - 分类 - **kuwo:chart**: 酷我音乐 - 排行榜 @@ -301,12 +309,11 @@ - **kuwo:song**: 酷我音乐 - **la7.tv** - **Laola1Tv** + - **Le**: 乐视网 - **Lecture2Go** - **Lemonde** - - **Letv**: 乐视网 + - **LePlaylist** - **LetvCloud**: 乐视云 - - **LetvPlaylist** - - **LetvTv** - **Libsyn** - **life:embed** - **lifenews**: LIFE | NEWS @@ -324,6 +331,7 @@ - **m6** - **macgamestore**: MacGameStore trailers - **mailru**: Видео@Mail.Ru + - **MakersChannel** - **MakerTV** - **Malemotion** - **MatchTV** @@ -334,10 +342,12 @@ - **Mgoon** - **Minhateca** - **MinistryGrid** + - **Minoto** - **miomio.tv** - **MiTele**: mitele.es - **mixcloud** - **MLB** + - **Mnet** - **MoeVideo**: LetitBit video services: moevideo.net, playreplay.net and videochart.net - **Mofosex** - **Mojvideo** @@ -421,6 +431,7 @@ - **Npr** - **NRK** - **NRKPlaylist** + - **NRKSkole**: NRK Skole - **NRKTV**: NRK TV and NRK Radio - **ntv.ru** - **Nuvid** @@ -433,6 +444,7 @@ - **OnionStudios** - **Ooyala** - **OoyalaExternal** + - **Openload** - **OraTV** - **orf:fm4**: radio FM4 - **orf:iptv**: iptv.ORF.at @@ -493,6 +505,7 @@ - **Restudy** - **ReverbNation** - **Revision3** + - **RICE** - **RingTV** - **RottenTomatoes** - **Roxwel** @@ -517,6 +530,7 @@ - **RUTV**: RUTV.RU - **Ruutu** - **safari**: safaribooksonline.com online video 
+ - **safari:api** - **safari:course**: safaribooksonline.com online courses - **Sandia**: Sandia National Laboratories - **Sapo**: SAPO Vídeos @@ -610,7 +624,9 @@ - **TheOnion** - **ThePlatform** - **ThePlatformFeed** + - **TheScene** - **TheSixtyOne** + - **TheStar** - **ThisAmericanLife** - **ThisAV** - **THVideo** @@ -644,6 +660,7 @@ - **tv.dfb.de** - **TV2** - **TV2Article** + - **TV3** - **TV4**: tv4.se and tv4play.se - **TVC** - **TVCArticle** @@ -669,6 +686,7 @@ - **UDNEmbed**: 聯合影音 - **Unistra** - **Urort**: NRK P3 Urørt + - **USAToday** - **ustream** - **ustream:channel** - **Ustudio** @@ -682,6 +700,7 @@ - **VGTV**: VGTV, BTTV, FTV, Aftenposten and Aftonbladet - **vh1.com** - **Vice** + - **ViceShow** - **Viddler** - **video.google:search**: Google Video search - **video.mit.edu** @@ -709,6 +728,7 @@ - **vimeo:channel** - **vimeo:group** - **vimeo:likes**: Vimeo user likes + - **vimeo:ondemand** - **vimeo:review**: Review pages on vimeo - **vimeo:user** - **vimeo:watchlater**: Vimeo watch later list, "vimeowatchlater" keyword (requires authentication) @@ -773,6 +793,7 @@ - **youtube:channel**: YouTube.com channels - **youtube:favorites**: YouTube.com favourite videos, ":ytfav" for short (requires authentication) - **youtube:history**: Youtube watch history, ":ythistory" for short (requires authentication) + - **youtube:live**: YouTube.com live streams - **youtube:playlist**: YouTube.com playlists - **youtube:playlists**: YouTube.com user/channel playlists - **youtube:recommended**: YouTube.com recommended videos, ":ytrec" for short (requires authentication) diff --git a/test/helper.py b/test/helper.py index bdd7acca4..f2d878212 100644 --- a/test/helper.py +++ b/test/helper.py @@ -11,8 +11,11 @@ import sys import youtube_dl.extractor from youtube_dl import YoutubeDL -from youtube_dl.utils import ( +from youtube_dl.compat import ( + compat_os_name, compat_str, +) +from youtube_dl.utils import ( preferredencoding, write_string, ) @@ -42,7 +45,7 @@ def 
report_warning(message): Print the message to stderr, it will be prefixed with 'WARNING:' If stderr is a tty file the 'WARNING:' will be colored ''' - if sys.stderr.isatty() and os.name != 'nt': + if sys.stderr.isatty() and compat_os_name != 'nt': _msg_header = '\033[0;33mWARNING:\033[0m' else: _msg_header = 'WARNING:' diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index 59f7ab49d..ca25025e2 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -222,6 +222,11 @@ class TestFormatSelection(unittest.TestCase): downloaded = ydl.downloaded_info_dicts[0] self.assertEqual(downloaded['format_id'], 'dash-video-low') + ydl = YDL({'format': 'bestvideo[format_id^=dash][format_id$=low]'}) + ydl.process_ie_result(info_dict.copy()) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded['format_id'], 'dash-video-low') + formats = [ {'format_id': 'vid-vcodec-dot', 'ext': 'mp4', 'preference': 1, 'vcodec': 'avc1.123456', 'acodec': 'none', 'url': TEST_URL}, ] @@ -502,6 +507,9 @@ class TestYoutubeDL(unittest.TestCase): assertRegexpMatches(self, ydl._format_note({ 'vbr': 10, }), '^\s*10k$') + assertRegexpMatches(self, ydl._format_note({ + 'fps': 30, + }), '^30fps$') def test_postprocessors(self): filename = 'post-processor-testfile.mp4' diff --git a/test/test_http.py b/test/test_http.py index f2e305b6f..15e0ad369 100644 --- a/test/test_http.py +++ b/test/test_http.py @@ -1,4 +1,5 @@ #!/usr/bin/env python +# coding: utf-8 from __future__ import unicode_literals # Allow direct execution @@ -52,7 +53,12 @@ class TestHTTP(unittest.TestCase): ('localhost', 0), HTTPTestRequestHandler) self.httpd.socket = ssl.wrap_socket( self.httpd.socket, certfile=certfn, server_side=True) - self.port = self.httpd.socket.getsockname()[1] + if os.name == 'java': + # In Jython SSLSocket is not a subclass of socket.socket + sock = self.httpd.socket.sock + else: + sock = self.httpd.socket + self.port = sock.getsockname()[1] self.server_thread = 
threading.Thread(target=self.httpd.serve_forever) self.server_thread.daemon = True self.server_thread.start() @@ -115,5 +121,14 @@ class TestProxy(unittest.TestCase): response = ydl.urlopen(req).read().decode('utf-8') self.assertEqual(response, 'cn: {0}'.format(url)) + def test_proxy_with_idn(self): + ydl = YoutubeDL({ + 'proxy': 'localhost:{0}'.format(self.port), + }) + url = 'http://中文.tw/' + response = ydl.urlopen(url).read().decode('utf-8') + # b'xn--fiq228c' is '中文'.encode('idna') + self.assertEqual(response, 'normal: http://xn--fiq228c.tw/') + if __name__ == '__main__': unittest.main() diff --git a/test/test_utils.py b/test/test_utils.py index 22597f415..f91628f36 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -28,6 +28,7 @@ from youtube_dl.utils import ( encodeFilename, escape_rfc3986, escape_url, + extract_attributes, ExtractorError, find_xpath_attr, fix_xml_ampersands, @@ -41,6 +42,7 @@ from youtube_dl.utils import ( orderedSet, parse_duration, parse_filesize, + parse_count, parse_iso8601, read_batch_urls, sanitize_filename, @@ -61,6 +63,7 @@ from youtube_dl.utils import ( lowercase_escape, url_basename, urlencode_postdata, + update_url_query, version_tuple, xpath_with_ns, xpath_element, @@ -75,7 +78,10 @@ from youtube_dl.utils import ( cli_bool_option, ) from youtube_dl.compat import ( + compat_chr, compat_etree_fromstring, + compat_urlparse, + compat_parse_qs, ) @@ -454,6 +460,40 @@ class TestUtil(unittest.TestCase): data = urlencode_postdata({'username': 'foo@bar.com', 'password': '1234'}) self.assertTrue(isinstance(data, bytes)) + def test_update_url_query(self): + def query_dict(url): + return compat_parse_qs(compat_urlparse.urlparse(url).query) + self.assertEqual(query_dict(update_url_query( + 'http://example.com/path', {'quality': ['HD'], 'format': ['mp4']})), + query_dict('http://example.com/path?quality=HD&format=mp4')) + self.assertEqual(query_dict(update_url_query( + 'http://example.com/path', {'system': ['LINUX', 'WINDOWS']})), + 
query_dict('http://example.com/path?system=LINUX&system=WINDOWS')) + self.assertEqual(query_dict(update_url_query( + 'http://example.com/path', {'fields': 'id,formats,subtitles'})), + query_dict('http://example.com/path?fields=id,formats,subtitles')) + self.assertEqual(query_dict(update_url_query( + 'http://example.com/path', {'fields': ('id,formats,subtitles', 'thumbnails')})), + query_dict('http://example.com/path?fields=id,formats,subtitles&fields=thumbnails')) + self.assertEqual(query_dict(update_url_query( + 'http://example.com/path?manifest=f4m', {'manifest': []})), + query_dict('http://example.com/path')) + self.assertEqual(query_dict(update_url_query( + 'http://example.com/path?system=LINUX&system=WINDOWS', {'system': 'LINUX'})), + query_dict('http://example.com/path?system=LINUX')) + self.assertEqual(query_dict(update_url_query( + 'http://example.com/path', {'fields': b'id,formats,subtitles'})), + query_dict('http://example.com/path?fields=id,formats,subtitles')) + self.assertEqual(query_dict(update_url_query( + 'http://example.com/path', {'width': 1080, 'height': 720})), + query_dict('http://example.com/path?width=1080&height=720')) + self.assertEqual(query_dict(update_url_query( + 'http://example.com/path', {'bitrate': 5020.43})), + query_dict('http://example.com/path?bitrate=5020.43')) + self.assertEqual(query_dict(update_url_query( + 'http://example.com/path', {'test': '第二行тест'})), + query_dict('http://example.com/path?test=%E7%AC%AC%E4%BA%8C%E8%A1%8C%D1%82%D0%B5%D1%81%D1%82')) + def test_dict_get(self): FALSE_VALUES = { 'none': None, @@ -537,11 +577,11 @@ class TestUtil(unittest.TestCase): ) self.assertEqual( escape_url('http://тест.рф/фрагмент'), - 'http://тест.рф/%D1%84%D1%80%D0%B0%D0%B3%D0%BC%D0%B5%D0%BD%D1%82' + 'http://xn--e1aybc.xn--p1ai/%D1%84%D1%80%D0%B0%D0%B3%D0%BC%D0%B5%D0%BD%D1%82' ) self.assertEqual( escape_url('http://тест.рф/абв?абв=абв#абв'), - 'http://тест.рф/%D0%B0%D0%B1%D0%B2?%D0%B0%D0%B1%D0%B2=%D0%B0%D0%B1%D0%B2#%D0%B0%D0%B1%D0%B2' 
+ 'http://xn--e1aybc.xn--p1ai/%D0%B0%D0%B1%D0%B2?%D0%B0%D0%B1%D0%B2=%D0%B0%D0%B1%D0%B2#%D0%B0%D0%B1%D0%B2' ) self.assertEqual(escape_url('http://vimeo.com/56015672#at=0'), 'http://vimeo.com/56015672#at=0') @@ -591,6 +631,44 @@ class TestUtil(unittest.TestCase): on = js_to_json('{"abc": "def",}') self.assertEqual(json.loads(on), {'abc': 'def'}) + def test_extract_attributes(self): + self.assertEqual(extract_attributes('<e x="y">'), {'x': 'y'}) + self.assertEqual(extract_attributes("<e x='y'>"), {'x': 'y'}) + self.assertEqual(extract_attributes('<e x=y>'), {'x': 'y'}) + self.assertEqual(extract_attributes('<e x="a \'b\' c">'), {'x': "a 'b' c"}) + self.assertEqual(extract_attributes('<e x=\'a "b" c\'>'), {'x': 'a "b" c'}) + self.assertEqual(extract_attributes('<e x="&#121;">'), {'x': 'y'}) + self.assertEqual(extract_attributes('<e x="&#x79;">'), {'x': 'y'}) + self.assertEqual(extract_attributes('<e x="&amp;">'), {'x': '&'}) # XML + self.assertEqual(extract_attributes('<e x="&quot;">'), {'x': '"'}) + self.assertEqual(extract_attributes('<e x="&pound;">'), {'x': '£'}) # HTML 3.2 + self.assertEqual(extract_attributes('<e x="&lambda;">'), {'x': 'λ'}) # HTML 4.0 + self.assertEqual(extract_attributes('<e x="&foo">'), {'x': '&foo'}) + self.assertEqual(extract_attributes('<e x="\'">'), {'x': "'"}) + self.assertEqual(extract_attributes('<e x=\'"\'>'), {'x': '"'}) + self.assertEqual(extract_attributes('<e x >'), {'x': None}) + self.assertEqual(extract_attributes('<e x=y a>'), {'x': 'y', 'a': None}) + self.assertEqual(extract_attributes('<e x= y>'), {'x': 'y'}) + self.assertEqual(extract_attributes('<e x=1 y=2 x=3>'), {'y': '2', 'x': '3'}) + self.assertEqual(extract_attributes('<e \nx=\ny\n>'), {'x': 'y'}) + self.assertEqual(extract_attributes('<e \nx=\n"y"\n>'), {'x': 'y'}) + self.assertEqual(extract_attributes("<e \nx=\n'y'\n>"), {'x': 'y'}) + self.assertEqual(extract_attributes('<e \nx="\ny\n">'), {'x': '\ny\n'}) + self.assertEqual(extract_attributes('<e CAPS=x>'), {'caps': 'x'}) # Names lowercased + self.assertEqual(extract_attributes('<e x=1 X=2>'), {'x': '2'}) + self.assertEqual(extract_attributes('<e X=1 x=2>'), {'x': '2'}) + self.assertEqual(extract_attributes('<e _:funny-name1=1>'), {'_:funny-name1': '1'}) + self.assertEqual(extract_attributes('<e x="Fáilte 世界 \U0001f600">'), {'x': 'Fáilte 世界 \U0001f600'}) + 
self.assertEqual(extract_attributes('<e x="décompose&#769;">'), {'x': 'décompose\u0301'}) + # "Narrow" Python builds don't support unicode code points outside BMP. + try: + compat_chr(0x10000) + supports_outside_bmp = True + except ValueError: + supports_outside_bmp = False + if supports_outside_bmp: + self.assertEqual(extract_attributes('<e x="Smile &#128512;!">'), {'x': 'Smile \U0001f600!'}) + def test_clean_html(self): self.assertEqual(clean_html('a:\nb'), 'a: b') self.assertEqual(clean_html('a:\n "b"'), 'a: "b"') @@ -616,6 +694,17 @@ class TestUtil(unittest.TestCase): self.assertEqual(parse_filesize('1.2Tb'), 1200000000000) self.assertEqual(parse_filesize('1,24 KB'), 1240) + def test_parse_count(self): + self.assertEqual(parse_count(None), None) + self.assertEqual(parse_count(''), None) + self.assertEqual(parse_count('0'), 0) + self.assertEqual(parse_count('1000'), 1000) + self.assertEqual(parse_count('1.000'), 1000) + self.assertEqual(parse_count('1.1k'), 1100) + self.assertEqual(parse_count('1.1kk'), 1100000) + self.assertEqual(parse_count('1.1kk '), 1100000) + self.assertEqual(parse_count('1.1kk views'), 1100000) + def test_version_tuple(self): self.assertEqual(version_tuple('1'), (1,)) self.assertEqual(version_tuple('10.23.344'), (10, 23, 344)) diff --git a/tox.ini b/tox.ini index 48504329f..2d7134005 100644 --- a/tox.ini +++ b/tox.ini @@ -8,6 +8,6 @@ deps = passenv = HOME defaultargs = test --exclude test_download.py --exclude test_age_restriction.py --exclude test_subtitles.py --exclude test_write_annotations.py - --exclude test_youtube_lists.py + --exclude test_youtube_lists.py --exclude test_iqiyi_sdk_interpreter.py commands = nosetests --verbose {posargs:{[testenv]defaultargs}} # --with-coverage --cover-package=youtube_dl --cover-html # test.test_download:TestDownload.test_NowVideo diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 8f3a8b9e3..29d7a3106 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -24,9 +24,6 @@ import time import tokenize import traceback 
-if os.name == 'nt': - import ctypes - from .compat import ( compat_basestring, compat_cookiejar, @@ -34,6 +31,7 @@ from .compat import ( compat_get_terminal_size, compat_http_client, compat_kwargs, + compat_os_name, compat_str, compat_tokenize_tokenize, compat_urllib_error, @@ -87,6 +85,7 @@ from .extractor import get_info_extractor, gen_extractors from .downloader import get_suitable_downloader from .downloader.rtmp import rtmpdump_version from .postprocessor import ( + FFmpegFixupM3u8PP, FFmpegFixupM4aPP, FFmpegFixupStretchedPP, FFmpegMergerPP, @@ -95,6 +94,9 @@ from .postprocessor import ( ) from .version import __version__ +if compat_os_name == 'nt': + import ctypes + class YoutubeDL(object): """YoutubeDL class. @@ -450,7 +452,7 @@ class YoutubeDL(object): def to_console_title(self, message): if not self.params.get('consoletitle', False): return - if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow(): + if compat_os_name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow(): # c_wchar_p() might not be necessary if `message` is # already of type unicode() ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message)) @@ -521,7 +523,7 @@ class YoutubeDL(object): else: if self.params.get('no_warnings'): return - if not self.params.get('no_color') and self._err_file.isatty() and os.name != 'nt': + if not self.params.get('no_color') and self._err_file.isatty() and compat_os_name != 'nt': _msg_header = '\033[0;33mWARNING:\033[0m' else: _msg_header = 'WARNING:' @@ -533,7 +535,7 @@ class YoutubeDL(object): Do the same as trouble, but prefixes the message with 'ERROR:', colored in red if stderr is a tty file. 
- if not self.params.get('no_color') and self._err_file.isatty() and os.name != 'nt': + if not self.params.get('no_color') and self._err_file.isatty() and compat_os_name != 'nt': _msg_header = '\033[0;31mERROR:\033[0m' else: _msg_header = 'ERROR:' @@ -566,7 +568,7 @@ class YoutubeDL(object): elif template_dict.get('height'): template_dict['resolution'] = '%sp' % template_dict['height'] elif template_dict.get('width'): - template_dict['resolution'] = '?x%d' % template_dict['width'] + template_dict['resolution'] = '%dx?' % template_dict['width'] sanitize = lambda k, v: sanitize_filename( compat_str(v), @@ -903,7 +905,7 @@ class YoutubeDL(object): '*=': lambda attr, value: value in attr, } str_operator_rex = re.compile(r'''(?x) - \s*(?P<key>ext|acodec|vcodec|container|protocol) + \s*(?P<key>ext|acodec|vcodec|container|protocol|format_id) \s*(?P<op>%s)(?P<none_inclusive>\s*\?)? \s*(?P<value>[a-zA-Z0-9._-]+) \s*$ @@ -1232,6 +1234,10 @@ class YoutubeDL(object): if t.get('id') is None: t['id'] = '%d' % i + if self.params.get('list_thumbnails'): + self.list_thumbnails(info_dict) + return + if thumbnails and 'thumbnail' not in info_dict: info_dict['thumbnail'] = thumbnails[-1]['url'] @@ -1333,9 +1339,6 @@ class YoutubeDL(object): if self.params.get('listformats'): self.list_formats(info_dict) return - if self.params.get('list_thumbnails'): - self.list_thumbnails(info_dict) - return req_format = self.params.get('format') if req_format is None: @@ -1637,6 +1640,8 @@ class YoutubeDL(object): if fixup_policy is None: fixup_policy = 'detect_or_warn' + INSTALL_FFMPEG_MESSAGE = 'Install ffmpeg or avconv to fix this automatically.' + stretched_ratio = info_dict.get('stretched_ratio') if stretched_ratio is not None and stretched_ratio != 1: if fixup_policy == 'warn': @@ -1649,15 +1654,18 @@ class YoutubeDL(object): info_dict['__postprocessors'].append(stretched_pp) else: self.report_warning( - '%s: Non-uniform pixel ratio (%s). Install ffmpeg or avconv to fix this automatically.' 
% ( - info_dict['id'], stretched_ratio)) + '%s: Non-uniform pixel ratio (%s). %s' + % (info_dict['id'], stretched_ratio, INSTALL_FFMPEG_MESSAGE)) else: assert fixup_policy in ('ignore', 'never') - if info_dict.get('requested_formats') is None and info_dict.get('container') == 'm4a_dash': + if (info_dict.get('requested_formats') is None and + info_dict.get('container') == 'm4a_dash'): if fixup_policy == 'warn': - self.report_warning('%s: writing DASH m4a. Only some players support this container.' % ( - info_dict['id'])) + self.report_warning( + '%s: writing DASH m4a. ' + 'Only some players support this container.' + % info_dict['id']) elif fixup_policy == 'detect_or_warn': fixup_pp = FFmpegFixupM4aPP(self) if fixup_pp.available: @@ -1665,8 +1673,27 @@ class YoutubeDL(object): info_dict['__postprocessors'].append(fixup_pp) else: self.report_warning( - '%s: writing DASH m4a. Only some players support this container. Install ffmpeg or avconv to fix this automatically.' % ( - info_dict['id'])) + '%s: writing DASH m4a. ' + 'Only some players support this container. %s' + % (info_dict['id'], INSTALL_FFMPEG_MESSAGE)) + else: + assert fixup_policy in ('ignore', 'never') + + if (info_dict.get('protocol') == 'm3u8_native' or + info_dict.get('protocol') == 'm3u8' and + self.params.get('hls_prefer_native')): + if fixup_policy == 'warn': + self.report_warning('%s: malformated aac bitstream.' % ( + info_dict['id'])) + elif fixup_policy == 'detect_or_warn': + fixup_pp = FFmpegFixupM3u8PP(self) + if fixup_pp.available: + info_dict.setdefault('__postprocessors', []) + info_dict['__postprocessors'].append(fixup_pp) + else: + self.report_warning( + '%s: malformated aac bitstream. 
%s' + % (info_dict['id'], INSTALL_FFMPEG_MESSAGE)) else: assert fixup_policy in ('ignore', 'never') @@ -1809,7 +1836,7 @@ class YoutubeDL(object): if fdict.get('language'): if res: res += ' ' - res += '[%s]' % fdict['language'] + res += '[%s] ' % fdict['language'] if fdict.get('format_note') is not None: res += fdict['format_note'] + ' ' if fdict.get('tbr') is not None: @@ -1830,7 +1857,9 @@ class YoutubeDL(object): if fdict.get('vbr') is not None: res += '%4dk' % fdict['vbr'] if fdict.get('fps') is not None: - res += ', %sfps' % fdict['fps'] + if res: + res += ', ' + res += '%sfps' % fdict['fps'] if fdict.get('acodec') is not None: if res: res += ', ' @@ -1873,13 +1902,8 @@ class YoutubeDL(object): def list_thumbnails(self, info_dict): thumbnails = info_dict.get('thumbnails') if not thumbnails: - tn_url = info_dict.get('thumbnail') - if tn_url: - thumbnails = [{'id': '0', 'url': tn_url}] - else: - self.to_screen( - '[info] No thumbnails present for %s' % info_dict['id']) - return + self.to_screen('[info] No thumbnails present for %s' % info_dict['id']) + return self.to_screen( '[info] Thumbnails for %s:' % info_dict['id']) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 79b389840..737f6545d 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -144,14 +144,20 @@ def _real_main(argv=None): if numeric_limit is None: parser.error('invalid max_filesize specified') opts.max_filesize = numeric_limit - if opts.retries is not None: - if opts.retries in ('inf', 'infinite'): - opts_retries = float('inf') + + def parse_retries(retries): + if retries in ('inf', 'infinite'): + parsed_retries = float('inf') else: try: - opts_retries = int(opts.retries) + parsed_retries = int(retries) except (TypeError, ValueError): parser.error('invalid retry count specified') + return parsed_retries + if opts.retries is not None: + opts.retries = parse_retries(opts.retries) + if opts.fragment_retries is not None: + opts.fragment_retries = 
parse_retries(opts.fragment_retries) if opts.buffersize is not None: numeric_buffersize = FileDownloader.parse_bytes(opts.buffersize) if numeric_buffersize is None: @@ -299,7 +305,8 @@ def _real_main(argv=None): 'force_generic_extractor': opts.force_generic_extractor, 'ratelimit': opts.ratelimit, 'nooverwrites': opts.nooverwrites, - 'retries': opts_retries, + 'retries': opts.retries, + 'fragment_retries': opts.fragment_retries, 'buffersize': opts.buffersize, 'noresizebuffer': opts.noresizebuffer, 'continuedl': opts.continue_dl, diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index b497da696..dbb91a6ef 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -77,6 +77,11 @@ try: except ImportError: # Python 2 from urllib import urlretrieve as compat_urlretrieve +try: + from html.parser import HTMLParser as compat_HTMLParser +except ImportError: # Python 2 + from HTMLParser import HTMLParser as compat_HTMLParser + try: from subprocess import DEVNULL @@ -251,6 +256,16 @@ else: el.text = el.text.decode('utf-8') return doc +if sys.version_info < (2, 7): + # Here comes the crazy part: In 2.6, if the xpath is a unicode, + # .//node does not match if a node is a direct child of . ! + def compat_xpath(xpath): + if isinstance(xpath, compat_str): + xpath = xpath.encode('ascii') + return xpath +else: + compat_xpath = lambda xpath: xpath + try: from urllib.parse import parse_qs as compat_parse_qs except ImportError: # Python 2 @@ -326,6 +341,9 @@ def compat_ord(c): return ord(c) +compat_os_name = os._name if os.name == 'java' else os.name + + if sys.version_info >= (3, 0): compat_getenv = os.getenv compat_expanduser = os.path.expanduser @@ -346,7 +364,7 @@ else: # The following are os.path.expanduser implementations from cpython 2.7.8 stdlib # for different platforms with correct environment variables decoding. - if os.name == 'posix': + if compat_os_name == 'posix': def compat_expanduser(path): """Expand ~ and ~user constructions. 
If user or $HOME is unknown, do nothing.""" @@ -370,7 +388,7 @@ else: userhome = pwent.pw_dir userhome = userhome.rstrip('/') return (userhome + path[i:]) or '/' - elif os.name == 'nt' or os.name == 'ce': + elif compat_os_name == 'nt' or compat_os_name == 'ce': def compat_expanduser(path): """Expand ~ and ~user constructs. @@ -540,6 +558,7 @@ else: from tokenize import generate_tokens as compat_tokenize_tokenize __all__ = [ + 'compat_HTMLParser', 'compat_HTTPError', 'compat_basestring', 'compat_chr', @@ -556,6 +575,7 @@ __all__ = [ 'compat_itertools_count', 'compat_kwargs', 'compat_ord', + 'compat_os_name', 'compat_parse_qs', 'compat_print', 'compat_shlex_split', @@ -575,6 +595,7 @@ __all__ = [ 'compat_urlparse', 'compat_urlretrieve', 'compat_xml_parse_error', + 'compat_xpath', 'shlex_quote', 'subprocess_check_output', 'workaround_optparse_bug9161', diff --git a/youtube_dl/downloader/__init__.py b/youtube_dl/downloader/__init__.py index dccc59212..73b34fdae 100644 --- a/youtube_dl/downloader/__init__.py +++ b/youtube_dl/downloader/__init__.py @@ -1,14 +1,16 @@ from __future__ import unicode_literals from .common import FileDownloader -from .external import get_external_downloader from .f4m import F4mFD from .hls import HlsFD -from .hls import NativeHlsFD from .http import HttpFD -from .rtsp import RtspFD from .rtmp import RtmpFD from .dash import DashSegmentsFD +from .rtsp import RtspFD +from .external import ( + get_external_downloader, + FFmpegFD, +) from ..utils import ( determine_protocol, @@ -16,8 +18,8 @@ from ..utils import ( PROTOCOL_MAP = { 'rtmp': RtmpFD, - 'm3u8_native': NativeHlsFD, - 'm3u8': HlsFD, + 'm3u8_native': HlsFD, + 'm3u8': FFmpegFD, 'mms': RtspFD, 'rtsp': RtspFD, 'f4m': F4mFD, @@ -30,14 +32,17 @@ def get_suitable_downloader(info_dict, params={}): protocol = determine_protocol(info_dict) info_dict['protocol'] = protocol + # if (info_dict.get('start_time') or info_dict.get('end_time')) and not info_dict.get('requested_formats') and 
FFmpegFD.can_download(info_dict): + # return FFmpegFD + external_downloader = params.get('external_downloader') if external_downloader is not None: ed = get_external_downloader(external_downloader) - if ed.supports(info_dict): + if ed.can_download(info_dict): return ed if protocol == 'm3u8' and params.get('hls_prefer_native'): - return NativeHlsFD + return HlsFD return PROTOCOL_MAP.get(protocol, HttpFD) diff --git a/youtube_dl/downloader/common.py b/youtube_dl/downloader/common.py index 2d5154051..1dba9f49a 100644 --- a/youtube_dl/downloader/common.py +++ b/youtube_dl/downloader/common.py @@ -5,6 +5,7 @@ import re import sys import time +from ..compat import compat_os_name from ..utils import ( encodeFilename, error_to_compat_str, @@ -114,6 +115,10 @@ class FileDownloader(object): return '%10s' % '---b/s' return '%10s' % ('%s/s' % format_bytes(speed)) + @staticmethod + def format_retries(retries): + return 'inf' if retries == float('inf') else '%.0f' % retries + @staticmethod def best_block_size(elapsed_time, bytes): new_min = max(bytes / 2.0, 1.0) @@ -219,7 +224,7 @@ class FileDownloader(object): if self.params.get('progress_with_newline', False): self.to_screen(fullmsg) else: - if os.name == 'nt': + if compat_os_name == 'nt': prev_len = getattr(self, '_report_progress_prev_line_length', 0) if prev_len > len(fullmsg): @@ -296,7 +301,9 @@ class FileDownloader(object): def report_retry(self, count, retries): """Report retry in case of HTTP error 5xx""" - self.to_screen('[download] Got server HTTP error. Retrying (attempt %d of %.0f)...' % (count, retries)) + self.to_screen( + '[download] Got server HTTP error. Retrying (attempt %d of %s)...' 
+ % (count, self.format_retries(retries))) def report_file_already_downloaded(self, file_name): """Report file has already been fully downloaded.""" diff --git a/youtube_dl/downloader/dash.py b/youtube_dl/downloader/dash.py index 8b1b17c6e..8bbab9dbc 100644 --- a/youtube_dl/downloader/dash.py +++ b/youtube_dl/downloader/dash.py @@ -4,6 +4,7 @@ import os import re from .fragment import FragmentFD +from ..compat import compat_urllib_error from ..utils import ( sanitize_open, encodeFilename, @@ -36,20 +37,41 @@ class DashSegmentsFD(FragmentFD): segments_filenames = [] - def append_url_to_file(target_url, target_filename): - success = ctx['dl'].download(target_filename, {'url': combine_url(base_url, target_url)}) - if not success: + fragment_retries = self.params.get('fragment_retries', 0) + + def append_url_to_file(target_url, tmp_filename, segment_name): + target_filename = '%s-%s' % (tmp_filename, segment_name) + count = 0 + while count <= fragment_retries: + try: + success = ctx['dl'].download(target_filename, {'url': combine_url(base_url, target_url)}) + if not success: + return False + down, target_sanitized = sanitize_open(target_filename, 'rb') + ctx['dest_stream'].write(down.read()) + down.close() + segments_filenames.append(target_sanitized) + break + except (compat_urllib_error.HTTPError, ) as err: + # YouTube may often return 404 HTTP error for a fragment causing the + # whole download to fail. However if the same fragment is immediately + # retried with the same request data this usually succeeds (1-2 attempts + # is usually enough) thus allowing to download the whole file successfully. + # So, we will retry all fragments that fail with 404 HTTP error for now. 
+ if err.code != 404: + raise + # Retry fragment + count += 1 + if count <= fragment_retries: + self.report_retry_fragment(segment_name, count, fragment_retries) + if count > fragment_retries: + self.report_error('giving up after %s fragment retries' % fragment_retries) return False - down, target_sanitized = sanitize_open(target_filename, 'rb') - ctx['dest_stream'].write(down.read()) - down.close() - segments_filenames.append(target_sanitized) if initialization_url: - append_url_to_file(initialization_url, ctx['tmpfilename'] + '-Init') + append_url_to_file(initialization_url, ctx['tmpfilename'], 'Init') for i, segment_url in enumerate(segment_urls): - segment_filename = '%s-Seg%d' % (ctx['tmpfilename'], i) - append_url_to_file(segment_url, segment_filename) + append_url_to_file(segment_url, ctx['tmpfilename'], 'Seg%d' % i) self._finish_frag_download(ctx) diff --git a/youtube_dl/downloader/external.py b/youtube_dl/downloader/external.py index 2bc011266..30277dc20 100644 --- a/youtube_dl/downloader/external.py +++ b/youtube_dl/downloader/external.py @@ -2,8 +2,11 @@ from __future__ import unicode_literals import os.path import subprocess +import sys +import re from .common import FileDownloader +from ..postprocessor.ffmpeg import FFmpegPostProcessor, EXT_TO_OUT_FORMATS from ..utils import ( cli_option, cli_valueless_option, @@ -11,6 +14,8 @@ from ..utils import ( cli_configuration_args, encodeFilename, encodeArgument, + handle_youtubedl_headers, + check_executable, ) @@ -45,10 +50,18 @@ class ExternalFD(FileDownloader): def exe(self): return self.params.get('external_downloader') + @classmethod + def available(cls): + return check_executable(cls.get_basename(), [cls.AVAILABLE_OPT]) + @classmethod def supports(cls, info_dict): return info_dict['protocol'] in ('http', 'https', 'ftp', 'ftps') + @classmethod + def can_download(cls, info_dict): + return cls.available() and cls.supports(info_dict) + def _option(self, command_option, param): return cli_option(self.params, 
command_option, param) @@ -76,6 +89,8 @@ class ExternalFD(FileDownloader): class CurlFD(ExternalFD): + AVAILABLE_OPT = '-V' + def _make_cmd(self, tmpfilename, info_dict): cmd = [self.exe, '--location', '-o', tmpfilename] for key, val in info_dict['http_headers'].items(): @@ -89,6 +104,8 @@ class CurlFD(ExternalFD): class AxelFD(ExternalFD): + AVAILABLE_OPT = '-V' + def _make_cmd(self, tmpfilename, info_dict): cmd = [self.exe, '-o', tmpfilename] for key, val in info_dict['http_headers'].items(): @@ -99,6 +116,8 @@ class AxelFD(ExternalFD): class WgetFD(ExternalFD): + AVAILABLE_OPT = '--version' + def _make_cmd(self, tmpfilename, info_dict): cmd = [self.exe, '-O', tmpfilename, '-nv', '--no-cookies'] for key, val in info_dict['http_headers'].items(): @@ -112,6 +131,8 @@ class WgetFD(ExternalFD): class Aria2cFD(ExternalFD): + AVAILABLE_OPT = '-v' + def _make_cmd(self, tmpfilename, info_dict): cmd = [self.exe, '-c'] cmd += self._configuration_args([ @@ -130,12 +151,112 @@ class Aria2cFD(ExternalFD): class HttpieFD(ExternalFD): + @classmethod + def available(cls): + return check_executable('http', ['--version']) + def _make_cmd(self, tmpfilename, info_dict): cmd = ['http', '--download', '--output', tmpfilename, info_dict['url']] for key, val in info_dict['http_headers'].items(): cmd += ['%s:%s' % (key, val)] return cmd + +class FFmpegFD(ExternalFD): + @classmethod + def supports(cls, info_dict): + return info_dict['protocol'] in ('http', 'https', 'ftp', 'ftps', 'm3u8', 'rtsp', 'rtmp', 'mms') + + @classmethod + def available(cls): + return FFmpegPostProcessor().available + + def _call_downloader(self, tmpfilename, info_dict): + url = info_dict['url'] + ffpp = FFmpegPostProcessor(downloader=self) + if not ffpp.available: + self.report_error('m3u8 download detected but ffmpeg or avconv could not be found. 
Please install one.') + return False + ffpp.check_version() + + args = [ffpp.executable, '-y'] + + args += self._configuration_args() + + # start_time = info_dict.get('start_time') or 0 + # if start_time: + # args += ['-ss', compat_str(start_time)] + # end_time = info_dict.get('end_time') + # if end_time: + # args += ['-t', compat_str(end_time - start_time)] + + if info_dict['http_headers'] and re.match(r'^https?://', url): + # Trailing \r\n after each HTTP header is important to prevent warning from ffmpeg/avconv: + # [http @ 00000000003d2fa0] No trailing CRLF found in HTTP header. + headers = handle_youtubedl_headers(info_dict['http_headers']) + args += [ + '-headers', + ''.join('%s: %s\r\n' % (key, val) for key, val in headers.items())] + + protocol = info_dict.get('protocol') + + if protocol == 'rtmp': + player_url = info_dict.get('player_url') + page_url = info_dict.get('page_url') + app = info_dict.get('app') + play_path = info_dict.get('play_path') + tc_url = info_dict.get('tc_url') + flash_version = info_dict.get('flash_version') + live = info_dict.get('rtmp_live', False) + if player_url is not None: + args += ['-rtmp_swfverify', player_url] + if page_url is not None: + args += ['-rtmp_pageurl', page_url] + if app is not None: + args += ['-rtmp_app', app] + if play_path is not None: + args += ['-rtmp_playpath', play_path] + if tc_url is not None: + args += ['-rtmp_tcurl', tc_url] + if flash_version is not None: + args += ['-rtmp_flashver', flash_version] + if live: + args += ['-rtmp_live', 'live'] + + args += ['-i', url, '-c', 'copy'] + if protocol == 'm3u8': + if self.params.get('hls_use_mpegts', False): + args += ['-f', 'mpegts'] + else: + args += ['-f', 'mp4', '-bsf:a', 'aac_adtstoasc'] + elif protocol == 'rtmp': + args += ['-f', 'flv'] + else: + args += ['-f', EXT_TO_OUT_FORMATS.get(info_dict['ext'], info_dict['ext'])] + + args = [encodeArgument(opt) for opt in args] + args.append(encodeFilename(ffpp._ffmpeg_filename_argument(tmpfilename), True)) + + 
self._debug_cmd(args) + + proc = subprocess.Popen(args, stdin=subprocess.PIPE) + try: + retval = proc.wait() + except KeyboardInterrupt: + # subprocess.run would send the SIGKILL signal to ffmpeg and the + # mp4 file couldn't be played, but if we ask ffmpeg to quit it + # produces a file that is playable (this is mostly useful for live + # streams). Note that Windows is not affected and produces playable + # files (see https://github.com/rg3/youtube-dl/issues/8300). + if sys.platform != 'win32': + proc.communicate(b'q') + raise + return retval + + +class AVconvFD(FFmpegFD): + pass + _BY_NAME = dict( (klass.get_basename(), klass) for name, klass in globals().items() diff --git a/youtube_dl/downloader/fragment.py b/youtube_dl/downloader/fragment.py index 5bc99492b..ba903ae10 100644 --- a/youtube_dl/downloader/fragment.py +++ b/youtube_dl/downloader/fragment.py @@ -19,8 +19,17 @@ class HttpQuietDownloader(HttpFD): class FragmentFD(FileDownloader): """ A base file downloader class for fragmented media (e.g. f4m/m3u8 manifests). + + Available options: + + fragment_retries: Number of times to retry a fragment for HTTP error (DASH only) """ + def report_retry_fragment(self, fragment_name, count, retries): + self.to_screen( + '[download] Got server HTTP error. Retrying fragment %s (attempt %d of %s)...' 
+ % (fragment_name, count, self.format_retries(retries))) + def _prepare_and_start_frag_download(self, ctx): self._prepare_frag_download(ctx) self._start_frag_download(ctx) @@ -99,7 +108,8 @@ class FragmentFD(FileDownloader): state['eta'] = self.calc_eta( start, time_now, estimated_size, state['downloaded_bytes']) - state['speed'] = s.get('speed') + state['speed'] = s.get('speed') or ctx.get('speed') + ctx['speed'] = state['speed'] ctx['prev_frag_downloaded_bytes'] = frag_downloaded_bytes self._hook_progress(state) diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py index 2a775bf00..a01dac031 100644 --- a/youtube_dl/downloader/hls.py +++ b/youtube_dl/downloader/hls.py @@ -1,87 +1,19 @@ from __future__ import unicode_literals -import os +import os.path import re -import subprocess -import sys -from .common import FileDownloader from .fragment import FragmentFD from ..compat import compat_urlparse -from ..postprocessor.ffmpeg import FFmpegPostProcessor from ..utils import ( - encodeArgument, encodeFilename, sanitize_open, - handle_youtubedl_headers, ) -class HlsFD(FileDownloader): - def real_download(self, filename, info_dict): - url = info_dict['url'] - self.report_destination(filename) - tmpfilename = self.temp_name(filename) - - ffpp = FFmpegPostProcessor(downloader=self) - if not ffpp.available: - self.report_error('m3u8 download detected but ffmpeg or avconv could not be found. Please install one.') - return False - ffpp.check_version() - - args = [ffpp.executable, '-y'] - - if info_dict['http_headers'] and re.match(r'^https?://', url): - # Trailing \r\n after each HTTP header is important to prevent warning from ffmpeg/avconv: - # [http @ 00000000003d2fa0] No trailing CRLF found in HTTP header. 
- headers = handle_youtubedl_headers(info_dict['http_headers']) - args += [ - '-headers', - ''.join('%s: %s\r\n' % (key, val) for key, val in headers.items())] - - args += ['-i', url, '-c', 'copy'] - if self.params.get('hls_use_mpegts', False): - args += ['-f', 'mpegts'] - else: - args += ['-f', 'mp4', '-bsf:a', 'aac_adtstoasc'] - - args = [encodeArgument(opt) for opt in args] - args.append(encodeFilename(ffpp._ffmpeg_filename_argument(tmpfilename), True)) - - self._debug_cmd(args) - - proc = subprocess.Popen(args, stdin=subprocess.PIPE) - try: - retval = proc.wait() - except KeyboardInterrupt: - # subprocces.run would send the SIGKILL signal to ffmpeg and the - # mp4 file couldn't be played, but if we ask ffmpeg to quit it - # produces a file that is playable (this is mostly useful for live - # streams). Note that Windows is not affected and produces playable - # files (see https://github.com/rg3/youtube-dl/issues/8300). - if sys.platform != 'win32': - proc.communicate(b'q') - raise - if retval == 0: - fsize = os.path.getsize(encodeFilename(tmpfilename)) - self.to_screen('\r[%s] %s bytes' % (args[0], fsize)) - self.try_rename(tmpfilename, filename) - self._hook_progress({ - 'downloaded_bytes': fsize, - 'total_bytes': fsize, - 'filename': filename, - 'status': 'finished', - }) - return True - else: - self.to_stderr('\n') - self.report_error('%s exited with code %d' % (ffpp.basename, retval)) - return False - - -class NativeHlsFD(FragmentFD): - """ A more limited implementation that does not require ffmpeg """ +class HlsFD(FragmentFD): + """ A limited implementation that does not require ffmpeg """ FD_NAME = 'hlsnative' diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 037654a23..1e4b078a4 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -23,7 +23,10 @@ from .alphaporno import AlphaPornoIE from .animeondemand import AnimeOnDemandIE from .anitube import AnitubeIE from .anysex import AnySexIE 
-from .aol import AolIE +from .aol import ( + AolIE, + AolFeaturesIE, +) from .allocine import AllocineIE from .aparat import AparatIE from .appleconnect import AppleConnectIE @@ -51,6 +54,7 @@ from .arte import ( from .atresplayer import AtresPlayerIE from .atttechchannel import ATTTechChannelIE from .audimedia import AudiMediaIE +from .audioboom import AudioBoomIE from .audiomack import AudiomackIE, AudiomackAlbumIE from .azubu import AzubuIE, AzubuLiveIE from .baidu import BaiduVideoIE @@ -68,6 +72,7 @@ from .bet import BetIE from .bigflix import BigflixIE from .bild import BildIE from .bilibili import BiliBiliIE +from .biobiochiletv import BioBioChileTVIE from .bleacherreport import ( BleacherReportIE, BleacherReportCMSIE, @@ -77,6 +82,7 @@ from .bloomberg import BloombergIE from .bokecc import BokeCCIE from .bpb import BpbIE from .br import BRIE +from .bravotv import BravoTVIE from .breakcom import BreakIE from .brightcove import ( BrightcoveLegacyIE, @@ -103,6 +109,7 @@ from .cbsnews import ( ) from .cbssports import CBSSportsIE from .ccc import CCCIE +from .cda import CDAIE from .ceskatelevize import CeskaTelevizeIE from .channel9 import Channel9IE from .chaturbate import ChaturbateIE @@ -131,6 +138,7 @@ from .collegerama import CollegeRamaIE from .comedycentral import ComedyCentralIE, ComedyCentralShowsIE from .comcarcoff import ComCarCoffIE from .commonmistakes import CommonMistakesIE, UnicodeBOMIE +from .commonprotocols import RtmpIE from .condenast import CondeNastIE from .cracked import CrackedIE from .crackle import CrackleIE @@ -185,6 +193,10 @@ from .dumpert import DumpertIE from .defense import DefenseGouvFrIE from .discovery import DiscoveryIE from .dropbox import DropboxIE +from .dw import ( + DWIE, + DWArticleIE, +) from .eagleplatform import EaglePlatformIE from .ebaumsworld import EbaumsWorldIE from .echomsk import EchoMskIE @@ -209,10 +221,7 @@ from .everyonesmixtape import EveryonesMixtapeIE from .exfm import ExfmIE from .expotv import 
ExpoTVIE from .extremetube import ExtremeTubeIE -from .facebook import ( - FacebookIE, - FacebookPostIE, -) +from .facebook import FacebookIE from .faz import FazIE from .fc2 import FC2IE from .fczenit import FczenitIE @@ -277,6 +286,7 @@ from .goshgay import GoshgayIE from .gputechconf import GPUTechConfIE from .groupon import GrouponIE from .hark import HarkIE +from .hbo import HBOIE from .hearthisat import HearThisAtIE from .heise import HeiseIE from .hellporno import HellPornoIE @@ -340,6 +350,7 @@ from .konserthusetplay import KonserthusetPlayIE from .kontrtube import KontrTubeIE from .krasview import KrasViewIE from .ku6 import Ku6IE +from .kusi import KUSIIE from .kuwo import ( KuwoIE, KuwoAlbumIE, @@ -383,6 +394,7 @@ from .lynda import ( from .m6 import M6IE from .macgamestore import MacGameStoreIE from .mailru import MailRuIE +from .makerschannel import MakersChannelIE from .makertv import MakerTVIE from .malemotion import MalemotionIE from .matchtv import MatchTVIE @@ -392,11 +404,13 @@ from .metacritic import MetacriticIE from .mgoon import MgoonIE from .minhateca import MinhatecaIE from .ministrygrid import MinistryGridIE +from .minoto import MinotoIE from .miomio import MioMioIE from .mit import TechTVMITIE, MITIE, OCWMITIE from .mitele import MiTeleIE from .mixcloud import MixcloudIE from .mlb import MLBIE +from .mnet import MnetIE from .mpora import MporaIE from .moevideo import MoeVideoIE from .mofosex import MofosexIE @@ -522,6 +536,7 @@ from .ooyala import ( OoyalaIE, OoyalaExternalIE, ) +from .openload import OpenloadIE from .ora import OraTVIE from .orf import ( ORFTVthekIE, @@ -590,6 +605,7 @@ from .regiotv import RegioTVIE from .restudy import RestudyIE from .reverbnation import ReverbNationIE from .revision3 import Revision3IE +from .rice import RICEIE from .ringtv import RingTVIE from .ro220 import Ro220IE from .rottentomatoes import RottenTomatoesIE @@ -616,6 +632,7 @@ from .ruutu import RuutuIE from .sandia import SandiaIE from .safari 
import ( SafariIE, + SafariApiIE, SafariCourseIE, ) from .sapo import SapoIE @@ -727,7 +744,9 @@ from .theplatform import ( ThePlatformIE, ThePlatformFeedIE, ) +from .thescene import TheSceneIE from .thesixtyone import TheSixtyOneIE +from .thestar import TheStarIE from .thisamericanlife import ThisAmericanLifeIE from .thisav import ThisAVIE from .tinypic import TinyPicIE @@ -774,6 +793,7 @@ from .tv2 import ( TV2IE, TV2ArticleIE, ) +from .tv3 import TV3IE from .tv4 import TV4IE from .tvc import ( TVCIE, @@ -813,6 +833,7 @@ from .udn import UDNEmbedIE from .digiteka import DigitekaIE from .unistra import UnistraIE from .urort import UrortIE +from .usatoday import USATodayIE from .ustream import UstreamIE, UstreamChannelIE from .ustudio import UstudioIE from .varzesh3 import Varzesh3IE @@ -828,7 +849,10 @@ from .vgtv import ( VGTVIE, ) from .vh1 import VH1IE -from .vice import ViceIE +from .vice import ( + ViceIE, + ViceShowIE, +) from .viddler import ViddlerIE from .videodetective import VideoDetectiveIE from .videofyme import VideofyMeIE @@ -855,6 +879,7 @@ from .vimeo import ( VimeoChannelIE, VimeoGroupsIE, VimeoLikesIE, + VimeoOndemandIE, VimeoReviewIE, VimeoUserIE, VimeoWatchLaterIE, @@ -936,7 +961,9 @@ from .youtube import ( YoutubeChannelIE, YoutubeFavouritesIE, YoutubeHistoryIE, + YoutubeLiveIE, YoutubePlaylistIE, + YoutubePlaylistsIE, YoutubeRecommendedIE, YoutubeSearchDateIE, YoutubeSearchIE, @@ -946,7 +973,6 @@ from .youtube import ( YoutubeTruncatedIDIE, YoutubeTruncatedURLIE, YoutubeUserIE, - YoutubePlaylistsIE, YoutubeWatchLaterIE, ) from .zapiks import ZapiksIE diff --git a/youtube_dl/extractor/abc.py b/youtube_dl/extractor/abc.py index 6a29e587f..b584277be 100644 --- a/youtube_dl/extractor/abc.py +++ b/youtube_dl/extractor/abc.py @@ -12,7 +12,7 @@ from ..utils import ( class ABCIE(InfoExtractor): IE_NAME = 'abc.net.au' - _VALID_URL = r'http://www\.abc\.net\.au/news/(?:[^/]+/){1,2}(?P\d+)' + _VALID_URL = 
r'https?://www\.abc\.net\.au/news/(?:[^/]+/){1,2}(?P\d+)' _TESTS = [{ 'url': 'http://www.abc.net.au/news/2014-11-05/australia-to-staff-ebola-treatment-centre-in-sierra-leone/5868334', diff --git a/youtube_dl/extractor/addanime.py b/youtube_dl/extractor/addanime.py index e3e6d2113..fb1cc02e1 100644 --- a/youtube_dl/extractor/addanime.py +++ b/youtube_dl/extractor/addanime.py @@ -16,7 +16,7 @@ from ..utils import ( class AddAnimeIE(InfoExtractor): - _VALID_URL = r'http://(?:\w+\.)?add-anime\.net/(?:watch_video\.php\?(?:.*?)v=|video/)(?P[\w_]+)' + _VALID_URL = r'https?://(?:\w+\.)?add-anime\.net/(?:watch_video\.php\?(?:.*?)v=|video/)(?P[\w_]+)' _TESTS = [{ 'url': 'http://www.add-anime.net/watch_video.php?v=24MR3YO5SAS9', 'md5': '72954ea10bc979ab5e2eb288b21425a0', diff --git a/youtube_dl/extractor/aftonbladet.py b/youtube_dl/extractor/aftonbladet.py index e0518cf26..d548592fe 100644 --- a/youtube_dl/extractor/aftonbladet.py +++ b/youtube_dl/extractor/aftonbladet.py @@ -6,7 +6,7 @@ from ..utils import int_or_none class AftonbladetIE(InfoExtractor): - _VALID_URL = r'http://tv\.aftonbladet\.se/abtv/articles/(?P[0-9]+)' + _VALID_URL = r'https?://tv\.aftonbladet\.se/abtv/articles/(?P[0-9]+)' _TEST = { 'url': 'http://tv.aftonbladet.se/abtv/articles/36015', 'info_dict': { diff --git a/youtube_dl/extractor/aljazeera.py b/youtube_dl/extractor/aljazeera.py index 5b2c0dc9a..b081695d8 100644 --- a/youtube_dl/extractor/aljazeera.py +++ b/youtube_dl/extractor/aljazeera.py @@ -4,7 +4,7 @@ from .common import InfoExtractor class AlJazeeraIE(InfoExtractor): - _VALID_URL = r'http://www\.aljazeera\.com/programmes/.*?/(?P[^/]+)\.html' + _VALID_URL = r'https?://www\.aljazeera\.com/programmes/.*?/(?P[^/]+)\.html' _TEST = { 'url': 'http://www.aljazeera.com/programmes/the-slum/2014/08/deliverance-201482883754237240.html', @@ -13,24 +13,18 @@ class AlJazeeraIE(InfoExtractor): 'ext': 'mp4', 'title': 'The Slum - Episode 1: Deliverance', 'description': 'As a birth attendant advocating for family 
planning, Remy is on the frontline of Tondo\'s battle with overcrowding.', - 'uploader': 'Al Jazeera English', + 'uploader_id': '665003303001', + 'timestamp': 1411116829, + 'upload_date': '20140919', }, - 'add_ie': ['BrightcoveLegacy'], + 'add_ie': ['BrightcoveNew'], 'skip': 'Not accessible from Travis CI server', } + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/665003303001/default_default/index.html?videoId=%s' def _real_extract(self, url): program_name = self._match_id(url) webpage = self._download_webpage(url, program_name) brightcove_id = self._search_regex( r'RenderPagesVideo\(\'(.+?)\'', webpage, 'brightcove id') - - return { - '_type': 'url', - 'url': ( - 'brightcove:' - 'playerKey=AQ~~%2CAAAAmtVJIFk~%2CTVGOQ5ZTwJbeMWnq5d_H4MOM57xfzApc' - '&%40videoPlayer={0}'.format(brightcove_id) - ), - 'ie_key': 'BrightcoveLegacy', - } + return self.url_result(self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, 'BrightcoveNew', brightcove_id) diff --git a/youtube_dl/extractor/animeondemand.py b/youtube_dl/extractor/animeondemand.py index a7d8daf7b..2cede55a7 100644 --- a/youtube_dl/extractor/animeondemand.py +++ b/youtube_dl/extractor/animeondemand.py @@ -3,10 +3,14 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_urlparse +from ..compat import ( + compat_urlparse, + compat_str, +) from ..utils import ( determine_ext, encode_dict, + extract_attributes, ExtractorError, sanitized_Request, urlencode_postdata, @@ -18,7 +22,7 @@ class AnimeOnDemandIE(InfoExtractor): _LOGIN_URL = 'https://www.anime-on-demand.de/users/sign_in' _APPLY_HTML5_URL = 'https://www.anime-on-demand.de/html5apply' _NETRC_MACHINE = 'animeondemand' - _TEST = { + _TESTS = [{ 'url': 'https://www.anime-on-demand.de/anime/161', 'info_dict': { 'id': '161', @@ -26,7 +30,19 @@ class AnimeOnDemandIE(InfoExtractor): 'description': 'md5:6681ce3c07c7189d255ac6ab23812d31', }, 'playlist_mincount': 4, - } + }, { + # Film wording is used instead of 
Episode + 'url': 'https://www.anime-on-demand.de/anime/39', + 'only_matching': True, + }, { + # Episodes without titles + 'url': 'https://www.anime-on-demand.de/anime/162', + 'only_matching': True, + }, { + # ger/jap, Dub/OmU, account required + 'url': 'https://www.anime-on-demand.de/anime/169', + 'only_matching': True, + }] def _login(self): (username, password) = self._get_login_info() @@ -36,6 +52,10 @@ class AnimeOnDemandIE(InfoExtractor): login_page = self._download_webpage( self._LOGIN_URL, None, 'Downloading login page') + if '>Our licensing terms allow the distribution of animes only to German-speaking countries of Europe' in login_page: + self.raise_geo_restricted( + '%s is only available in German-speaking countries of Europe' % self.IE_NAME) + login_form = self._form_hidden_inputs('new_user', login_page) login_form.update({ @@ -91,14 +111,22 @@ class AnimeOnDemandIE(InfoExtractor): entries = [] - for episode_html in re.findall(r'(?s)]+class="episodebox-title".+?>Episodeninhalt<', webpage): - m = re.search( - r'class="episodebox-title"[^>]+title="Episode (?P\d+) - (?P.+?)"', episode_html) - if not m: + for num, episode_html in enumerate(re.findall( + r'(?s)<h3[^>]+class="episodebox-title".+?>Episodeninhalt<', webpage), 1): + episodebox_title = self._search_regex( + (r'class="episodebox-title"[^>]+title=(["\'])(?P<title>.+?)\1', + r'class="episodebox-title"[^>]+>(?P<title>.+?)<'), + episode_html, 'episodebox title', default=None, group='title') + if not episodebox_title: continue - episode_number = int(m.group('number')) - episode_title = m.group('title') + episode_number = int(self._search_regex( + r'(?:Episode|Film)\s*(\d+)', + episodebox_title, 'episode number', default=num)) + episode_title = self._search_regex( + r'(?:Episode|Film)\s*\d+\s*-\s*(.+)', + episodebox_title, 'episode title', default=None) + video_id = 'episode-%d' % episode_number common_info = { @@ -110,33 +138,86 @@ class AnimeOnDemandIE(InfoExtractor): formats = [] - playlist_url = 
self._search_regex( - r'data-playlist=(["\'])(?P<url>.+?)\1', - episode_html, 'data playlist', default=None, group='url') - if playlist_url: - request = sanitized_Request( - compat_urlparse.urljoin(url, playlist_url), - headers={ - 'X-Requested-With': 'XMLHttpRequest', - 'X-CSRF-Token': csrf_token, - 'Referer': url, - 'Accept': 'application/json, text/javascript, */*; q=0.01', - }) + for input_ in re.findall( + r'<input[^>]+class=["\'].*?streamstarter_html5[^>]+>', episode_html): + attributes = extract_attributes(input_) + playlist_urls = [] + for playlist_key in ('data-playlist', 'data-otherplaylist'): + playlist_url = attributes.get(playlist_key) + if isinstance(playlist_url, compat_str) and re.match( + r'/?[\da-zA-Z]+', playlist_url): + playlist_urls.append(attributes[playlist_key]) + if not playlist_urls: + continue - playlist = self._download_json( - request, video_id, 'Downloading playlist JSON', fatal=False) - if playlist: - playlist = playlist['playlist'][0] - title = playlist['title'] + lang = attributes.get('data-lang') + lang_note = attributes.get('value') + + for playlist_url in playlist_urls: + kind = self._search_regex( + r'videomaterialurl/\d+/([^/]+)/', + playlist_url, 'media kind', default=None) + format_id_list = [] + if lang: + format_id_list.append(lang) + if kind: + format_id_list.append(kind) + if not format_id_list: + format_id_list.append(compat_str(num)) + format_id = '-'.join(format_id_list) + format_note = ', '.join(filter(None, (kind, lang_note))) + request = sanitized_Request( + compat_urlparse.urljoin(url, playlist_url), + headers={ + 'X-Requested-With': 'XMLHttpRequest', + 'X-CSRF-Token': csrf_token, + 'Referer': url, + 'Accept': 'application/json, text/javascript, */*; q=0.01', + }) + playlist = self._download_json( + request, video_id, 'Downloading %s playlist JSON' % format_id, + fatal=False) + if not playlist: + continue + start_video = playlist.get('startvideo', 0) + playlist = playlist.get('playlist') + if not playlist or not 
isinstance(playlist, list): + continue + playlist = playlist[start_video] + title = playlist.get('title') + if not title: + continue description = playlist.get('description') for source in playlist.get('sources', []): file_ = source.get('file') - if file_ and determine_ext(file_) == 'm3u8': - formats = self._extract_m3u8_formats( + if not file_: + continue + ext = determine_ext(file_) + format_id_list = [lang, kind] + if ext == 'm3u8': + format_id_list.append('hls') + elif source.get('type') == 'video/dash' or ext == 'mpd': + format_id_list.append('dash') + format_id = '-'.join(filter(None, format_id_list)) + if ext == 'm3u8': + file_formats = self._extract_m3u8_formats( file_, video_id, 'mp4', - entry_protocol='m3u8_native', m3u8_id='hls') + entry_protocol='m3u8_native', m3u8_id=format_id, fatal=False) + elif source.get('type') == 'video/dash' or ext == 'mpd': + continue + file_formats = self._extract_mpd_formats( + file_, video_id, mpd_id=format_id, fatal=False) + else: + continue + for f in file_formats: + f.update({ + 'language': lang, + 'format_note': format_note, + }) + formats.extend(file_formats) if formats: + self._sort_formats(formats) f = common_info.copy() f.update({ 'title': title, @@ -145,16 +226,18 @@ class AnimeOnDemandIE(InfoExtractor): }) entries.append(f) - m = re.search( - r'data-dialog-header=(["\'])(?P<title>.+?)\1[^>]+href=(["\'])(?P<href>.+?)\3[^>]*>Teaser<', - episode_html) - if m: - f = common_info.copy() - f.update({ - 'id': '%s-teaser' % f['id'], - 'title': m.group('title'), - 'url': compat_urlparse.urljoin(url, m.group('href')), - }) - entries.append(f) + # Extract teaser only when full episode is not available + if not formats: + m = re.search( + r'data-dialog-header=(["\'])(?P<title>.+?)\1[^>]+href=(["\'])(?P<href>.+?)\3[^>]*>Teaser<', + episode_html) + if m: + f = common_info.copy() + f.update({ + 'id': '%s-teaser' % f['id'], + 'title': m.group('title'), + 'url': compat_urlparse.urljoin(url, m.group('href')), + }) + entries.append(f) 
return self.playlist_result(entries, anime_id, anime_title, anime_description) diff --git a/youtube_dl/extractor/aol.py b/youtube_dl/extractor/aol.py index b51eafc45..95a99c6b0 100644 --- a/youtube_dl/extractor/aol.py +++ b/youtube_dl/extractor/aol.py @@ -1,24 +1,11 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor class AolIE(InfoExtractor): IE_NAME = 'on.aol.com' - _VALID_URL = r'''(?x) - (?: - aol-video:| - http://on\.aol\.com/ - (?: - video/.*-| - playlist/(?P<playlist_display_id>[^/?#]+?)-(?P<playlist_id>[0-9]+)[?#].*_videoid= - ) - ) - (?P<id>[0-9]+) - (?:$|\?) - ''' + _VALID_URL = r'(?:aol-video:|https?://on\.aol\.com/video/.*-)(?P<id>[0-9]+)(?:$|\?)' _TESTS = [{ 'url': 'http://on.aol.com/video/u-s--official-warns-of-largest-ever-irs-phone-scam-518167793?icid=OnHomepageC2Wide_MustSee_Img', @@ -29,42 +16,31 @@ class AolIE(InfoExtractor): 'title': 'U.S. Official Warns Of \'Largest Ever\' IRS Phone Scam', }, 'add_ie': ['FiveMin'], - }, { - 'url': 'http://on.aol.com/playlist/brace-yourself---todays-weirdest-news-152147?icid=OnHomepageC4_Omg_Img#_videoid=518184316', - 'info_dict': { - 'id': '152147', - 'title': 'Brace Yourself - Today\'s Weirdest News', - }, - 'playlist_mincount': 10, }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - playlist_id = mobj.group('playlist_id') - if not playlist_id or self._downloader.params.get('noplaylist'): - return self.url_result('5min:%s' % video_id) + video_id = self._match_id(url) + return self.url_result('5min:%s' % video_id) - self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id)) - webpage = self._download_webpage(url, playlist_id) - title = self._html_search_regex( - r'<h1 class="video-title[^"]*">(.+?)</h1>', webpage, 'title') - playlist_html = self._search_regex( - r"(?s)<ul\s+class='video-related[^']*'>(.*?)</ul>", webpage, - 'playlist HTML') - entries = [{ - '_type': 
'url', - 'url': 'aol-video:%s' % m.group('id'), - 'ie_key': 'Aol', - } for m in re.finditer( - r"<a\s+href='.*videoid=(?P<id>[0-9]+)'\s+class='video-thumb'>", - playlist_html)] +class AolFeaturesIE(InfoExtractor): + IE_NAME = 'features.aol.com' + _VALID_URL = r'https?://features\.aol\.com/video/(?P<id>[^/?#]+)' - return { - '_type': 'playlist', - 'id': playlist_id, - 'display_id': mobj.group('playlist_display_id'), - 'title': title, - 'entries': entries, - } + _TESTS = [{ + 'url': 'http://features.aol.com/video/behind-secret-second-careers-late-night-talk-show-hosts', + 'md5': '7db483bb0c09c85e241f84a34238cc75', + 'info_dict': { + 'id': '519507715', + 'ext': 'mp4', + 'title': 'What To Watch - February 17, 2016', + }, + 'add_ie': ['FiveMin'], + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + return self.url_result(self._search_regex( + r'<script type="text/javascript" src="(https?://[^/]*?5min\.com/Scripts/PlayerSeed\.js[^"]+)"', + webpage, '5min embed url'), 'FiveMin') diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index efde7e207..ae0f27dcb 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -23,7 +23,7 @@ from ..utils import ( class ArteTvIE(InfoExtractor): - _VALID_URL = r'http://videos\.arte\.tv/(?P<lang>fr|de|en|es)/.*-(?P<id>.*?)\.html' + _VALID_URL = r'https?://videos\.arte\.tv/(?P<lang>fr|de|en|es)/.*-(?P<id>.*?)\.html' IE_NAME = 'arte.tv' def _real_extract(self, url): @@ -121,15 +121,18 @@ class ArteTVPlus7IE(InfoExtractor): json_url = compat_parse_qs( compat_urllib_parse_urlparse(iframe_url).query)['json_url'][0] if json_url: - return self._extract_from_json_url(json_url, video_id, lang) - # Differend kind of embed URL (e.g. 
+ title = self._search_regex( + r'<h3[^>]+title=(["\'])(?P<title>.+?)\1', + webpage, 'title', default=None, group='title') + return self._extract_from_json_url(json_url, video_id, lang, title=title) + # Different kind of embed URL (e.g. # http://www.arte.tv/magazine/trepalium/fr/episode-0406-replay-trepalium) embed_url = self._search_regex( r'<iframe[^>]+src=(["\'])(?P<url>.+?)\1', webpage, 'embed url', group='url') return self.url_result(embed_url) - def _extract_from_json_url(self, json_url, video_id, lang): + def _extract_from_json_url(self, json_url, video_id, lang, title=None): info = self._download_json(json_url, video_id) player_info = info['videoJsonPlayer'] @@ -137,7 +140,7 @@ class ArteTVPlus7IE(InfoExtractor): if not upload_date_str: upload_date_str = (player_info.get('VRA') or player_info.get('VDA') or '').split(' ')[0] - title = player_info['VTI'].strip() + title = (player_info.get('VTI') or title or player_info['VID']).strip() subtitle = player_info.get('VSU', '').strip() if subtitle: title += ' - %s' % subtitle diff --git a/youtube_dl/extractor/audimedia.py b/youtube_dl/extractor/audimedia.py index 3b2effa15..aa6925623 100644 --- a/youtube_dl/extractor/audimedia.py +++ b/youtube_dl/extractor/audimedia.py @@ -10,9 +10,9 @@ from ..utils import ( class AudiMediaIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?audimedia\.tv/(?:en|de)/vid/(?P<id>[^/?#]+)' + _VALID_URL = r'https?://(?:www\.)?audi-mediacenter\.com/(?:en|de)/audimediatv/(?P<id>[^/?#]+)' _TEST = { - 'url': 'https://audimedia.tv/en/vid/60-seconds-of-audi-sport-104-2015-wec-bahrain-rookie-test', + 'url': 'https://www.audi-mediacenter.com/en/audimediatv/60-seconds-of-audi-sport-104-2015-wec-bahrain-rookie-test-1467', 'md5': '79a8b71c46d49042609795ab59779b66', 'info_dict': { 'id': '1565', @@ -32,7 +32,10 @@ class AudiMediaIE(InfoExtractor): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - raw_payload = 
self._search_regex(r'<script[^>]+class="amtv-embed"[^>]+id="([^"]+)"', webpage, 'raw payload') + raw_payload = self._search_regex([ + r'class="amtv-embed"[^>]+id="([^"]+)"', + r'class=\\"amtv-embed\\"[^>]+id=\\"([^"]+)\\"', + ], webpage, 'raw payload') _, stage_mode, video_id, lang = raw_payload.split('-') # TODO: handle s and e stage_mode (live streams and ended live streams) @@ -59,13 +62,19 @@ class AudiMediaIE(InfoExtractor): video_version_url = video_version.get('download_url') or video_version.get('stream_url') if not video_version_url: continue - formats.append({ + f = { 'url': video_version_url, 'width': int_or_none(video_version.get('width')), 'height': int_or_none(video_version.get('height')), 'abr': int_or_none(video_version.get('audio_bitrate')), 'vbr': int_or_none(video_version.get('video_bitrate')), - }) + } + bitrate = self._search_regex(r'(\d+)k', video_version_url, 'bitrate', default=None) + if bitrate: + f.update({ + 'format_id': 'http-%s' % bitrate, + }) + formats.append(f) self._sort_formats(formats) return { diff --git a/youtube_dl/extractor/audioboom.py b/youtube_dl/extractor/audioboom.py new file mode 100644 index 000000000..2ec2d7092 --- /dev/null +++ b/youtube_dl/extractor/audioboom.py @@ -0,0 +1,66 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import float_or_none + + +class AudioBoomIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?audioboom\.com/boos/(?P<id>[0-9]+)' + _TEST = { + 'url': 'https://audioboom.com/boos/4279833-3-09-2016-czaban-hour-3?t=0', + 'md5': '63a8d73a055c6ed0f1e51921a10a5a76', + 'info_dict': { + 'id': '4279833', + 'ext': 'mp3', + 'title': '3/09/2016 Czaban Hour 3', + 'description': 'Guest: Nate Davis - NFL free agency, Guest: Stan Gans', + 'duration': 2245.72, + 'uploader': 'Steve Czaban', + 'uploader_url': 're:https?://(?:www\.)?audioboom\.com/channel/steveczabanyahoosportsradio', + } + } + + def _real_extract(self, url): + video_id = 
self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + clip = None + + clip_store = self._parse_json( + self._search_regex( + r'data-new-clip-store=(["\'])(?P<json>{.*?"clipId"\s*:\s*%s.*?})\1' % video_id, + webpage, 'clip store', default='{}', group='json'), + video_id, fatal=False) + if clip_store: + clips = clip_store.get('clips') + if clips and isinstance(clips, list) and isinstance(clips[0], dict): + clip = clips[0] + + def from_clip(field): + if clip: + return clip.get(field) + + audio_url = from_clip('clipURLPriorToLoading') or self._og_search_property( + 'audio', webpage, 'audio url') + title = from_clip('title') or self._og_search_title(webpage) + description = from_clip('description') or self._og_search_description(webpage) + + duration = float_or_none(from_clip('duration') or self._html_search_meta( + 'weibo:audio:duration', webpage)) + + uploader = from_clip('author') or self._og_search_property( + 'audio:artist', webpage, 'uploader', fatal=False) + uploader_url = from_clip('author_url') or self._html_search_meta( + 'audioboo:channel', webpage, 'uploader url') + + return { + 'id': video_id, + 'url': audio_url, + 'title': title, + 'description': description, + 'duration': duration, + 'uploader': uploader, + 'uploader_url': uploader_url, + } diff --git a/youtube_dl/extractor/azubu.py b/youtube_dl/extractor/azubu.py index 011edf128..1805b7312 100644 --- a/youtube_dl/extractor/azubu.py +++ b/youtube_dl/extractor/azubu.py @@ -98,7 +98,7 @@ class AzubuIE(InfoExtractor): class AzubuLiveIE(InfoExtractor): - _VALID_URL = r'http://www.azubu.tv/(?P<id>[^/]+)$' + _VALID_URL = r'https?://www.azubu.tv/(?P<id>[^/]+)$' _TEST = { 'url': 'http://www.azubu.tv/MarsTVMDLen', diff --git a/youtube_dl/extractor/baidu.py b/youtube_dl/extractor/baidu.py index 76b21e596..234a661d3 100644 --- a/youtube_dl/extractor/baidu.py +++ b/youtube_dl/extractor/baidu.py @@ -9,7 +9,7 @@ from ..utils import unescapeHTML class BaiduVideoIE(InfoExtractor): IE_DESC = '百度视频' - 
_VALID_URL = r'http://v\.baidu\.com/(?P<type>[a-z]+)/(?P<id>\d+)\.htm' + _VALID_URL = r'https?://v\.baidu\.com/(?P<type>[a-z]+)/(?P<id>\d+)\.htm' _TESTS = [{ 'url': 'http://v.baidu.com/comic/1069.htm?frp=bdbrand&q=%E4%B8%AD%E5%8D%8E%E5%B0%8F%E5%BD%93%E5%AE%B6', 'info_dict': { diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 9d0dfb961..2dfcee98d 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -10,7 +10,6 @@ from ..utils import ( int_or_none, parse_duration, parse_iso8601, - remove_end, unescapeHTML, ) from ..compat import ( @@ -561,7 +560,7 @@ class BBCIE(BBCCoUkIE): 'url': 'http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460', 'info_dict': { 'id': '3662a707-0af9-3149-963f-47bea720b460', - 'title': 'BBC Blogs - Adam Curtis - BUGGER', + 'title': 'BUGGER', }, 'playlist_count': 18, }, { @@ -670,9 +669,17 @@ class BBCIE(BBCCoUkIE): 'url': 'http://www.bbc.com/sport/0/football/34475836', 'info_dict': { 'id': '34475836', - 'title': 'What Liverpool can expect from Klopp', + 'title': 'Jurgen Klopp: Furious football from a witty and winning coach', }, 'playlist_count': 3, + }, { + # school report article with single video + 'url': 'http://www.bbc.co.uk/schoolreport/35744779', + 'info_dict': { + 'id': '35744779', + 'title': 'School which breaks down barriers in Jerusalem', + }, + 'playlist_count': 1, }, { # single video with playlist URL from weather section 'url': 'http://www.bbc.com/weather/features/33601775', @@ -735,8 +742,17 @@ class BBCIE(BBCCoUkIE): json_ld_info = self._search_json_ld(webpage, playlist_id, default=None) timestamp = json_ld_info.get('timestamp') + playlist_title = json_ld_info.get('title') - playlist_description = json_ld_info.get('description') + if not playlist_title: + playlist_title = self._og_search_title( + webpage, default=None) or self._html_search_regex( + r'<title>(.+?)', webpage, 'playlist title', default=None) + if playlist_title: + playlist_title = 
re.sub(r'(.+)\s*-\s*BBC.*?$', r'\1', playlist_title).strip() + + playlist_description = json_ld_info.get( + 'description') or self._og_search_description(webpage, default=None) if not timestamp: timestamp = parse_iso8601(self._search_regex( @@ -797,8 +813,6 @@ class BBCIE(BBCCoUkIE): playlist.get('progressiveDownloadUrl'), playlist_id, timestamp)) if entries: - playlist_title = playlist_title or remove_end(self._og_search_title(webpage), ' - BBC News') - playlist_description = playlist_description or self._og_search_description(webpage, default=None) return self.playlist_result(entries, playlist_id, playlist_title, playlist_description) # single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret) @@ -829,10 +843,6 @@ class BBCIE(BBCCoUkIE): 'subtitles': subtitles, } - playlist_title = self._html_search_regex( - r'(.*?)(?:\s*-\s*BBC [^ ]+)?', webpage, 'playlist title') - playlist_description = self._og_search_description(webpage, default=None) - def extract_all(pattern): return list(filter(None, map( lambda s: self._parse_json(s, playlist_id, fatal=False), @@ -932,7 +942,7 @@ class BBCIE(BBCCoUkIE): class BBCCoUkArticleIE(InfoExtractor): - _VALID_URL = 'http://www.bbc.co.uk/programmes/articles/(?P[a-zA-Z0-9]+)' + _VALID_URL = r'https?://www.bbc.co.uk/programmes/articles/(?P[a-zA-Z0-9]+)' IE_NAME = 'bbc.co.uk:article' IE_DESC = 'BBC articles' diff --git a/youtube_dl/extractor/behindkink.py b/youtube_dl/extractor/behindkink.py index 1bdc25812..9bca853b3 100644 --- a/youtube_dl/extractor/behindkink.py +++ b/youtube_dl/extractor/behindkink.py @@ -8,7 +8,7 @@ from ..utils import url_basename class BehindKinkIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?behindkink\.com/(?P[0-9]{4})/(?P[0-9]{2})/(?P[0-9]{2})/(?P[^/#?_]+)' + _VALID_URL = r'https?://(?:www\.)?behindkink\.com/(?P[0-9]{4})/(?P[0-9]{2})/(?P[0-9]{2})/(?P[^/#?_]+)' _TEST = { 'url': 'http://www.behindkink.com/2014/12/05/what-are-you-passionate-about-marley-blaze/', 'md5': 
'507b57d8fdcd75a41a9a7bdb7989c762', diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py index 59beb11bc..8baff2041 100644 --- a/youtube_dl/extractor/bilibili.py +++ b/youtube_dl/extractor/bilibili.py @@ -14,7 +14,7 @@ from ..utils import ( class BiliBiliIE(InfoExtractor): - _VALID_URL = r'http://www\.bilibili\.(?:tv|com)/video/av(?P\d+)(?:/index_(?P\d+).html)?' + _VALID_URL = r'https?://www\.bilibili\.(?:tv|com)/video/av(?P\d+)(?:/index_(?P\d+).html)?' _TESTS = [{ 'url': 'http://www.bilibili.tv/video/av1074402/', diff --git a/youtube_dl/extractor/biobiochiletv.py b/youtube_dl/extractor/biobiochiletv.py new file mode 100644 index 000000000..133228133 --- /dev/null +++ b/youtube_dl/extractor/biobiochiletv.py @@ -0,0 +1,86 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import remove_end + + +class BioBioChileTVIE(InfoExtractor): + _VALID_URL = r'https?://tv\.biobiochile\.cl/notas/(?:[^/]+/)+(?P[^/]+)\.shtml' + + _TESTS = [{ + 'url': 'http://tv.biobiochile.cl/notas/2015/10/21/sobre-camaras-y-camarillas-parlamentarias.shtml', + 'md5': '26f51f03cf580265defefb4518faec09', + 'info_dict': { + 'id': 'sobre-camaras-y-camarillas-parlamentarias', + 'ext': 'mp4', + 'title': 'Sobre Cámaras y camarillas parlamentarias', + 'thumbnail': 're:^https?://.*\.jpg$', + 'uploader': 'Fernando Atria', + }, + }, { + # different uploader layout + 'url': 'http://tv.biobiochile.cl/notas/2016/03/18/natalia-valdebenito-repasa-a-diputado-hasbun-paso-a-la-categoria-de-hablar-brutalidades.shtml', + 'md5': 'edc2e6b58974c46d5b047dea3c539ff3', + 'info_dict': { + 'id': 'natalia-valdebenito-repasa-a-diputado-hasbun-paso-a-la-categoria-de-hablar-brutalidades', + 'ext': 'mp4', + 'title': 'Natalia Valdebenito repasa a diputado Hasbún: Pasó a la categoría de hablar brutalidades', + 'thumbnail': 're:^https?://.*\.jpg$', + 'uploader': 'Piangella Obrador', + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 
'http://tv.biobiochile.cl/notas/2015/10/22/ninos-transexuales-de-quien-es-la-decision.shtml', + 'only_matching': True, + }, { + 'url': 'http://tv.biobiochile.cl/notas/2015/10/21/exclusivo-hector-pinto-formador-de-chupete-revela-version-del-ex-delantero-albo.shtml', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + title = remove_end(self._og_search_title(webpage), ' - BioBioChile TV') + + file_url = self._search_regex( + r'loadFWPlayerVideo\([^,]+,\s*(["\'])(?P.+?)\1', + webpage, 'file url', group='url') + + base_url = self._search_regex( + r'file\s*:\s*(["\'])(?P.+?)\1\s*\+\s*fileURL', webpage, + 'base url', default='http://unlimited2-cl.digitalproserver.com/bbtv/', + group='url') + + formats = self._extract_m3u8_formats( + '%s%s/playlist.m3u8' % (base_url, file_url), video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls', fatal=False) + f = { + 'url': '%s%s' % (base_url, file_url), + 'format_id': 'http', + 'protocol': 'http', + 'preference': 1, + } + if formats: + f_copy = formats[-1].copy() + f_copy.update(f) + f = f_copy + formats.append(f) + self._sort_formats(formats) + + thumbnail = self._og_search_thumbnail(webpage) + uploader = self._html_search_regex( + r']+href=["\']https?://busca\.biobiochile\.cl/author[^>]+>(.+?)', + webpage, 'uploader', fatal=False) + + return { + 'id': video_id, + 'title': title, + 'thumbnail': thumbnail, + 'uploader': uploader, + 'formats': formats, + } diff --git a/youtube_dl/extractor/bleacherreport.py b/youtube_dl/extractor/bleacherreport.py index 38bda3af5..7a8e1f60b 100644 --- a/youtube_dl/extractor/bleacherreport.py +++ b/youtube_dl/extractor/bleacherreport.py @@ -28,10 +28,10 @@ class BleacherReportIE(InfoExtractor): 'add_ie': ['Ooyala'], }, { 'url': 'http://bleacherreport.com/articles/2586817-aussie-golfers-get-fright-of-their-lives-after-being-chased-by-angry-kangaroo', - 'md5': 'af5f90dc9c7ba1c19d0a3eac806bbf50', 
+ 'md5': '6a5cd403418c7b01719248ca97fb0692', 'info_dict': { 'id': '2586817', - 'ext': 'mp4', + 'ext': 'webm', 'title': 'Aussie Golfers Get Fright of Their Lives After Being Chased by Angry Kangaroo', 'timestamp': 1446839961, 'uploader': 'Sean Fay', @@ -93,10 +93,14 @@ class BleacherReportCMSIE(AMPIE): 'md5': '8c2c12e3af7805152675446c905d159b', 'info_dict': { 'id': '8fd44c2f-3dc5-4821-9118-2c825a98c0e1', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Cena vs. Rollins Would Expose the Heavyweight Division', 'description': 'md5:984afb4ade2f9c0db35f3267ed88b36e', }, + 'params': { + # m3u8 download + 'skip_download': True, + }, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/bokecc.py b/youtube_dl/extractor/bokecc.py index 122a1cbb6..86a7f4d7d 100644 --- a/youtube_dl/extractor/bokecc.py +++ b/youtube_dl/extractor/bokecc.py @@ -33,7 +33,7 @@ class BokeCCBaseIE(InfoExtractor): class BokeCCIE(BokeCCBaseIE): _IE_DESC = 'CC视频' - _VALID_URL = r'http://union\.bokecc\.com/playvideo\.bo\?(?P.*)' + _VALID_URL = r'https?://union\.bokecc\.com/playvideo\.bo\?(?P.*)' _TESTS = [{ 'url': 'http://union.bokecc.com/playvideo.bo?vid=E44D40C15E65EA30&uid=CD0C5D3C8614B28B', diff --git a/youtube_dl/extractor/bpb.py b/youtube_dl/extractor/bpb.py index c28e72927..6ad45a1e6 100644 --- a/youtube_dl/extractor/bpb.py +++ b/youtube_dl/extractor/bpb.py @@ -12,7 +12,7 @@ from ..utils import ( class BpbIE(InfoExtractor): IE_DESC = 'Bundeszentrale für politische Bildung' - _VALID_URL = r'http://www\.bpb\.de/mediathek/(?P[0-9]+)/' + _VALID_URL = r'https?://www\.bpb\.de/mediathek/(?P[0-9]+)/' _TEST = { 'url': 'http://www.bpb.de/mediathek/297/joachim-gauck-zu-1989-und-die-erinnerung-an-die-ddr', diff --git a/youtube_dl/extractor/bravotv.py b/youtube_dl/extractor/bravotv.py new file mode 100644 index 000000000..34d451f38 --- /dev/null +++ b/youtube_dl/extractor/bravotv.py @@ -0,0 +1,28 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from 
..utils import smuggle_url + + +class BravoTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?bravotv\.com/(?:[^/]+/)+videos/(?P[^/?]+)' + _TEST = { + 'url': 'http://www.bravotv.com/last-chance-kitchen/season-5/videos/lck-ep-12-fishy-finale', + 'md5': 'd60cdf68904e854fac669bd26cccf801', + 'info_dict': { + 'id': 'LitrBdX64qLn', + 'ext': 'mp4', + 'title': 'Last Chance Kitchen Returns', + 'description': 'S13: Last Chance Kitchen Returns for Top Chef Season 13', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + account_pid = self._search_regex(r'"account_pid"\s*:\s*"([^"]+)"', webpage, 'account pid') + release_pid = self._search_regex(r'"release_pid"\s*:\s*"([^"]+)"', webpage, 'release pid') + return self.url_result(smuggle_url( + 'http://link.theplatform.com/s/%s/%s?mbr=true&switch=progressive' % (account_pid, release_pid), + {'force_smil_url': True}), 'ThePlatform', release_pid) diff --git a/youtube_dl/extractor/breakcom.py b/youtube_dl/extractor/breakcom.py index aa08051b1..725859b4d 100644 --- a/youtube_dl/extractor/breakcom.py +++ b/youtube_dl/extractor/breakcom.py @@ -11,7 +11,7 @@ from ..utils import ( class BreakIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?break\.com/video/(?:[^/]+/)*.+-(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?break\.com/video/(?:[^/]+/)*.+-(?P\d+)' _TESTS = [{ 'url': 'http://www.break.com/video/when-girls-act-like-guys-2468056', 'info_dict': { diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index c947337f9..59e8008f9 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -9,10 +9,10 @@ from ..compat import ( compat_etree_fromstring, compat_parse_qs, compat_str, - compat_urllib_parse, compat_urllib_parse_urlparse, compat_urlparse, compat_xml_parse_error, + compat_HTTPError, ) from ..utils import ( determine_ext, @@ -23,16 +23,16 @@ from ..utils import ( js_to_json, int_or_none, 
parse_iso8601, - sanitized_Request, unescapeHTML, unsmuggle_url, + update_url_query, ) class BrightcoveLegacyIE(InfoExtractor): IE_NAME = 'brightcove:legacy' _VALID_URL = r'(?:https?://.*brightcove\.com/(services|viewer).*?\?|brightcove:)(?P.*)' - _FEDERATED_URL_TEMPLATE = 'http://c.brightcove.com/services/viewer/htmlFederated?%s' + _FEDERATED_URL = 'http://c.brightcove.com/services/viewer/htmlFederated' _TESTS = [ { @@ -155,8 +155,8 @@ class BrightcoveLegacyIE(InfoExtractor): # Not all pages define this value if playerKey is not None: params['playerKey'] = playerKey - # The three fields hold the id of the video - videoPlayer = find_param('@videoPlayer') or find_param('videoId') or find_param('videoID') + # These fields hold the id of the video + videoPlayer = find_param('@videoPlayer') or find_param('videoId') or find_param('videoID') or find_param('@videoList') if videoPlayer is not None: params['@videoPlayer'] = videoPlayer linkBase = find_param('linkBaseURL') @@ -184,8 +184,7 @@ class BrightcoveLegacyIE(InfoExtractor): @classmethod def _make_brightcove_url(cls, params): - data = compat_urllib_parse.urlencode(params) - return cls._FEDERATED_URL_TEMPLATE % data + return update_url_query(cls._FEDERATED_URL, params) @classmethod def _extract_brightcove_url(cls, webpage): @@ -239,7 +238,7 @@ class BrightcoveLegacyIE(InfoExtractor): # We set the original url as the default 'Referer' header referer = smuggled_data.get('Referer', url) return self._get_video_info( - videoPlayer[0], query_str, query, referer=referer) + videoPlayer[0], query, referer=referer) elif 'playerKey' in query: player_key = query['playerKey'] return self._get_playlist_info(player_key[0]) @@ -248,15 +247,14 @@ class BrightcoveLegacyIE(InfoExtractor): 'Cannot find playerKey= variable. 
Did you forget quotes in a shell invocation?', expected=True) - def _get_video_info(self, video_id, query_str, query, referer=None): - request_url = self._FEDERATED_URL_TEMPLATE % query_str - req = sanitized_Request(request_url) + def _get_video_info(self, video_id, query, referer=None): + headers = {} linkBase = query.get('linkBaseURL') if linkBase is not None: referer = linkBase[0] if referer is not None: - req.add_header('Referer', referer) - webpage = self._download_webpage(req, video_id) + headers['Referer'] = referer + webpage = self._download_webpage(self._FEDERATED_URL, video_id, headers=headers, query=query) error_msg = self._html_search_regex( r"

We're sorry.

([\s\n]*

.*?

)+", webpage, @@ -355,7 +353,7 @@ class BrightcoveLegacyIE(InfoExtractor): class BrightcoveNewIE(InfoExtractor): IE_NAME = 'brightcove:new' - _VALID_URL = r'https?://players\.brightcove\.net/(?P\d+)/(?P[^/]+)_(?P[^/]+)/index\.html\?.*videoId=(?P(?:ref:)?\d+)' + _VALID_URL = r'https?://players\.brightcove\.net/(?P\d+)/(?P[^/]+)_(?P[^/]+)/index\.html\?.*videoId=(?P\d+|ref:[^&]+)' _TESTS = [{ 'url': 'http://players.brightcove.net/929656772001/e41d32dc-ec74-459e-a845-6c69f7b724ea_default/index.html?videoId=4463358922001', 'md5': 'c8100925723840d4b0d243f7025703be', @@ -391,6 +389,10 @@ class BrightcoveNewIE(InfoExtractor): # ref: prefixed video id 'url': 'http://players.brightcove.net/3910869709001/21519b5c-4b3b-4363-accb-bdc8f358f823_default/index.html?videoId=ref:7069442', 'only_matching': True, + }, { + # non numeric ref: prefixed video id + 'url': 'http://players.brightcove.net/710858724001/default_default/index.html?videoId=ref:event-stream-356', + 'only_matching': True, }] @staticmethod @@ -410,8 +412,8 @@ class BrightcoveNewIE(InfoExtractor): # Look for iframe embeds [1] for _, url in re.findall( - r']+src=(["\'])((?:https?:)//players\.brightcove\.net/\d+/[^/]+/index\.html.+?)\1', webpage): - entries.append(url) + r']+src=(["\'])((?:https?:)?//players\.brightcove\.net/\d+/[^/]+/index\.html.+?)\1', webpage): + entries.append(url if url.startswith('http') else 'http:' + url) # Look for embed_in_page embeds [2] for video_id, account_id, player_id, embed in re.findall( @@ -420,11 +422,11 @@ class BrightcoveNewIE(InfoExtractor): # According to [4] data-video-id may be prefixed with ref: r'''(?sx) ]+ - data-video-id=["\']((?:ref:)?\d+)["\'][^>]*>.*? + data-video-id=["\'](\d+|ref:[^"\']+)["\'][^>]*>.*? .*? 
]+ src=["\'](?:https?:)?//players\.brightcove\.net/ - (\d+)/([\da-f-]+)_([^/]+)/index\.min\.js + (\d+)/([\da-f-]+)_([^/]+)/index(?:\.min)?\.js ''', webpage): entries.append( 'http://players.brightcove.net/%s/%s_%s/index.html?videoId=%s' @@ -454,24 +456,33 @@ class BrightcoveNewIE(InfoExtractor): r'policyKey\s*:\s*(["\'])(?P.+?)\1', webpage, 'policy key', group='pk') - req = sanitized_Request( - 'https://edge.api.brightcove.com/playback/v1/accounts/%s/videos/%s' - % (account_id, video_id), - headers={'Accept': 'application/json;pk=%s' % policy_key}) - json_data = self._download_json(req, video_id) + api_url = 'https://edge.api.brightcove.com/playback/v1/accounts/%s/videos/%s' % (account_id, video_id) + try: + json_data = self._download_json(api_url, video_id, headers={ + 'Accept': 'application/json;pk=%s' % policy_key + }) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + json_data = self._parse_json(e.cause.read().decode(), video_id) + raise ExtractorError(json_data[0]['message'], expected=True) + raise title = json_data['name'] formats = [] for source in json_data.get('sources', []): + container = source.get('container') source_type = source.get('type') src = source.get('src') - if source_type == 'application/x-mpegURL': + if source_type == 'application/x-mpegURL' or container == 'M2TS': if not src: continue formats.extend(self._extract_m3u8_formats( - src, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False)) + src, video_id, 'mp4', m3u8_id='hls', fatal=False)) + elif source_type == 'application/dash+xml': + if not src: + continue + formats.extend(self._extract_mpd_formats(src, video_id, 'dash', fatal=False)) else: streaming_src = source.get('streaming_src') stream_name, app_name = source.get('stream_name'), source.get('app_name') @@ -479,15 +490,23 @@ class BrightcoveNewIE(InfoExtractor): continue tbr = float_or_none(source.get('avg_bitrate'), 1000) height = 
int_or_none(source.get('height')) + width = int_or_none(source.get('width')) f = { 'tbr': tbr, - 'width': int_or_none(source.get('width')), - 'height': height, 'filesize': int_or_none(source.get('size')), - 'container': source.get('container'), - 'vcodec': source.get('codec'), - 'ext': source.get('container').lower(), + 'container': container, + 'ext': container.lower(), } + if width == 0 and height == 0: + f.update({ + 'vcodec': 'none', + }) + else: + f.update({ + 'width': width, + 'height': height, + 'vcodec': source.get('codec'), + }) def build_format_id(kind): format_id = kind diff --git a/youtube_dl/extractor/camdemy.py b/youtube_dl/extractor/camdemy.py index 897f3a104..dd4d96cec 100644 --- a/youtube_dl/extractor/camdemy.py +++ b/youtube_dl/extractor/camdemy.py @@ -16,7 +16,7 @@ from ..utils import ( class CamdemyIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?camdemy\.com/media/(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?camdemy\.com/media/(?P\d+)' _TESTS = [{ # single file 'url': 'http://www.camdemy.com/media/5181/', @@ -104,7 +104,7 @@ class CamdemyIE(InfoExtractor): class CamdemyFolderIE(InfoExtractor): - _VALID_URL = r'http://www.camdemy.com/folder/(?P\d+)' + _VALID_URL = r'https?://www.camdemy.com/folder/(?P\d+)' _TESTS = [{ # links with trailing slash 'url': 'http://www.camdemy.com/folder/450', diff --git a/youtube_dl/extractor/cbsnews.py b/youtube_dl/extractor/cbsnews.py index 7319ee1b7..f23bac9a1 100644 --- a/youtube_dl/extractor/cbsnews.py +++ b/youtube_dl/extractor/cbsnews.py @@ -11,7 +11,7 @@ from ..utils import ( class CBSNewsIE(ThePlatformIE): IE_DESC = 'CBS News' - _VALID_URL = r'http://(?:www\.)?cbsnews\.com/(?:news|videos)/(?P[\da-z_-]+)' + _VALID_URL = r'https?://(?:www\.)?cbsnews\.com/(?:news|videos)/(?P[\da-z_-]+)' _TESTS = [ { @@ -78,7 +78,7 @@ class CBSNewsIE(ThePlatformIE): pid = item.get('media' + format_id) if not pid: continue - release_url = 'http://link.theplatform.com/s/dJ5BDC/%s?format=SMIL&mbr=true' % pid + release_url = 
'http://link.theplatform.com/s/dJ5BDC/%s?mbr=true' % pid tp_formats, tp_subtitles = self._extract_theplatform_smil(release_url, video_id, 'Downloading %s SMIL data' % pid) formats.extend(tp_formats) subtitles = self._merge_subtitles(subtitles, tp_subtitles) @@ -96,7 +96,7 @@ class CBSNewsIE(ThePlatformIE): class CBSNewsLiveVideoIE(InfoExtractor): IE_DESC = 'CBS News Live Videos' - _VALID_URL = r'http://(?:www\.)?cbsnews\.com/live/video/(?P[\da-z_-]+)' + _VALID_URL = r'https?://(?:www\.)?cbsnews\.com/live/video/(?P[\da-z_-]+)' _TEST = { 'url': 'http://www.cbsnews.com/live/video/clinton-sanders-prepare-to-face-off-in-nh/', diff --git a/youtube_dl/extractor/cbssports.py b/youtube_dl/extractor/cbssports.py index ae47e74cc..549ae32f3 100644 --- a/youtube_dl/extractor/cbssports.py +++ b/youtube_dl/extractor/cbssports.py @@ -6,7 +6,7 @@ from .common import InfoExtractor class CBSSportsIE(InfoExtractor): - _VALID_URL = r'http://www\.cbssports\.com/video/player/(?P
[^/]+)/(?P[^/]+)' + _VALID_URL = r'https?://www\.cbssports\.com/video/player/(?P
[^/]+)/(?P[^/]+)' _TEST = { 'url': 'http://www.cbssports.com/video/player/tennis/318462531970/0/us-open-flashbacks-1990s', diff --git a/youtube_dl/extractor/cda.py b/youtube_dl/extractor/cda.py new file mode 100755 index 000000000..498d2c0d8 --- /dev/null +++ b/youtube_dl/extractor/cda.py @@ -0,0 +1,96 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + decode_packed_codes, + ExtractorError, + parse_duration +) + + +class CDAIE(InfoExtractor): + _VALID_URL = r'https?://(?:(?:www\.)?cda\.pl/video|ebd\.cda\.pl/[0-9]+x[0-9]+)/(?P[0-9a-z]+)' + _TESTS = [{ + 'url': 'http://www.cda.pl/video/5749950c', + 'md5': '6f844bf51b15f31fae165365707ae970', + 'info_dict': { + 'id': '5749950c', + 'ext': 'mp4', + 'height': 720, + 'title': 'Oto dlaczego przed zakrętem należy zwolnić.', + 'duration': 39 + } + }, { + 'url': 'http://www.cda.pl/video/57413289', + 'md5': 'a88828770a8310fc00be6c95faf7f4d5', + 'info_dict': { + 'id': '57413289', + 'ext': 'mp4', + 'title': 'Lądowanie na lotnisku na Maderze', + 'duration': 137 + } + }, { + 'url': 'http://ebd.cda.pl/0x0/5749950c', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage('http://ebd.cda.pl/0x0/' + video_id, video_id) + + if 'Ten film jest dostępny dla użytkowników premium' in webpage: + raise ExtractorError('This video is only available for premium users.', expected=True) + + title = self._html_search_regex(r'(.+?)', webpage, 'title') + + formats = [] + + info_dict = { + 'id': video_id, + 'title': title, + 'formats': formats, + 'duration': None, + } + + def extract_format(page, version): + unpacked = decode_packed_codes(page) + format_url = self._search_regex( + r"url:\\'(.+?)\\'", unpacked, '%s url' % version, fatal=False) + if not format_url: + return + f = { + 'url': format_url, + } + m = re.search( + 
r']+data-quality="(?P[^"]+)"[^>]+href="[^"]+"[^>]+class="[^"]*quality-btn-active[^"]*">(?P[0-9]+)p', + page) + if m: + f.update({ + 'format_id': m.group('format_id'), + 'height': int(m.group('height')), + }) + info_dict['formats'].append(f) + if not info_dict['duration']: + info_dict['duration'] = parse_duration(self._search_regex( + r"duration:\\'(.+?)\\'", unpacked, 'duration', fatal=False)) + + extract_format(webpage, 'default') + + for href, resolution in re.findall( + r']+data-quality="[^"]+"[^>]+href="([^"]+)"[^>]+class="quality-btn"[^>]*>([0-9]+p)', + webpage): + webpage = self._download_webpage( + href, video_id, 'Downloading %s version information' % resolution, fatal=False) + if not webpage: + # Manually report warning because empty page is returned when + # invalid version is requested. + self.report_warning('Unable to download %s version information' % resolution) + continue + extract_format(webpage, resolution) + + self._sort_formats(formats) + + return info_dict diff --git a/youtube_dl/extractor/ceskatelevize.py b/youtube_dl/extractor/ceskatelevize.py index b27b4e670..b355111cb 100644 --- a/youtube_dl/extractor/ceskatelevize.py +++ b/youtube_dl/extractor/ceskatelevize.py @@ -129,7 +129,8 @@ class CeskaTelevizeIE(InfoExtractor): formats = [] for format_id, stream_url in item['streamUrls'].items(): formats.extend(self._extract_m3u8_formats( - stream_url, playlist_id, 'mp4', entry_protocol='m3u8_native')) + stream_url, playlist_id, 'mp4', + entry_protocol='m3u8_native', fatal=False)) self._sort_formats(formats) item_id = item.get('id') or item['assetId'] diff --git a/youtube_dl/extractor/cinemassacre.py b/youtube_dl/extractor/cinemassacre.py index 6d9cd8abd..042c4f2f1 100644 --- a/youtube_dl/extractor/cinemassacre.py +++ b/youtube_dl/extractor/cinemassacre.py @@ -21,6 +21,10 @@ class CinemassacreIE(InfoExtractor): 'title': '“Angry Video Game Nerd: The Movie” – Trailer', 'description': 'md5:fb87405fcb42a331742a0dce2708560b', }, + 'params': { + # m3u8 
download + 'skip_download': True, + }, }, { 'url': 'http://cinemassacre.com/2013/10/02/the-mummys-hand-1940', @@ -31,14 +35,18 @@ class CinemassacreIE(InfoExtractor): 'upload_date': '20131002', 'title': 'The Mummy’s Hand (1940)', }, + 'params': { + # m3u8 download + 'skip_download': True, + }, }, { # Youtube embedded video 'url': 'http://cinemassacre.com/2006/12/07/chronologically-confused-about-bad-movie-and-video-game-sequel-titles/', - 'md5': 'df4cf8a1dcedaec79a73d96d83b99023', + 'md5': 'ec9838a5520ef5409b3e4e42fcb0a3b9', 'info_dict': { 'id': 'OEVzPCY2T-g', - 'ext': 'mp4', + 'ext': 'webm', 'title': 'AVGN: Chronologically Confused about Bad Movie and Video Game Sequel Titles', 'upload_date': '20061207', 'uploader': 'Cinemassacre', @@ -49,12 +57,12 @@ class CinemassacreIE(InfoExtractor): { # Youtube embedded video 'url': 'http://cinemassacre.com/2006/09/01/mckids/', - 'md5': '6eb30961fa795fedc750eac4881ad2e1', + 'md5': '7393c4e0f54602ad110c793eb7a6513a', 'info_dict': { 'id': 'FnxsNhuikpo', - 'ext': 'mp4', + 'ext': 'webm', 'upload_date': '20060901', - 'uploader': 'Cinemassacre Extras', + 'uploader': 'Cinemassacre Extra', 'description': 'md5:de9b751efa9e45fbaafd9c8a1123ed53', 'uploader_id': 'Cinemassacre', 'title': 'AVGN: McKids', @@ -69,7 +77,11 @@ class CinemassacreIE(InfoExtractor): 'description': 'Let’s Play Mario Kart 64 !! Mario Kart 64 is a classic go-kart racing game released for the Nintendo 64 (N64). 
Today James & Mike do 4 player Battle Mode with Kyle and Bootsy!', 'title': 'Mario Kart 64 (Nintendo 64) James & Mike Mondays', 'upload_date': '20150525', - } + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, } ] diff --git a/youtube_dl/extractor/cliphunter.py b/youtube_dl/extractor/cliphunter.py index 2996b6b09..19f8b397e 100644 --- a/youtube_dl/extractor/cliphunter.py +++ b/youtube_dl/extractor/cliphunter.py @@ -19,7 +19,7 @@ def _decode(s): class CliphunterIE(InfoExtractor): IE_NAME = 'cliphunter' - _VALID_URL = r'''(?x)http://(?:www\.)?cliphunter\.com/w/ + _VALID_URL = r'''(?x)https?://(?:www\.)?cliphunter\.com/w/ (?P[0-9]+)/ (?P.+?)(?:$|[#\?]) ''' diff --git a/youtube_dl/extractor/clipsyndicate.py b/youtube_dl/extractor/clipsyndicate.py index 8306d6fb7..0b6ad895f 100644 --- a/youtube_dl/extractor/clipsyndicate.py +++ b/youtube_dl/extractor/clipsyndicate.py @@ -8,7 +8,7 @@ from ..utils import ( class ClipsyndicateIE(InfoExtractor): - _VALID_URL = r'http://(?:chic|www)\.clipsyndicate\.com/video/play(list/\d+)?/(?P\d+)' + _VALID_URL = r'https?://(?:chic|www)\.clipsyndicate\.com/video/play(list/\d+)?/(?P\d+)' _TESTS = [{ 'url': 'http://www.clipsyndicate.com/video/play/4629301/brick_briscoe', diff --git a/youtube_dl/extractor/clubic.py b/youtube_dl/extractor/clubic.py index 1dfa7c12e..2fba93543 100644 --- a/youtube_dl/extractor/clubic.py +++ b/youtube_dl/extractor/clubic.py @@ -12,7 +12,7 @@ from ..utils import ( class ClubicIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?clubic\.com/video/(?:[^/]+/)*video.*-(?P[0-9]+)\.html' + _VALID_URL = r'https?://(?:www\.)?clubic\.com/video/(?:[^/]+/)*video.*-(?P[0-9]+)\.html' _TESTS = [{ 'url': 'http://www.clubic.com/video/clubic-week/video-clubic-week-2-0-le-fbi-se-lance-dans-la-photo-d-identite-448474.html', diff --git a/youtube_dl/extractor/cnet.py b/youtube_dl/extractor/cnet.py index 5c3908f72..c154b3e19 100644 --- a/youtube_dl/extractor/cnet.py +++ b/youtube_dl/extractor/cnet.py @@ -51,9 +51,7 @@ 
class CNETIE(ThePlatformIE): uploader = None uploader_id = None - mpx_account = data['config']['uvpConfig']['default']['mpx_account'] - - metadata = self.get_metadata('%s/%s' % (mpx_account, list(vdata['files'].values())[0]), video_id) + metadata = self.get_metadata('kYEXFC/%s' % list(vdata['files'].values())[0], video_id) description = vdata.get('description') or metadata.get('description') duration = int_or_none(vdata.get('duration')) or metadata.get('duration') @@ -62,7 +60,7 @@ class CNETIE(ThePlatformIE): for (fkey, vid) in vdata['files'].items(): if fkey == 'hls_phone' and 'hls_tablet' in vdata['files']: continue - release_url = 'http://link.theplatform.com/s/%s/%s?format=SMIL&mbr=true' % (mpx_account, vid) + release_url = 'http://link.theplatform.com/s/kYEXFC/%s?mbr=true' % vid if fkey == 'hds': release_url += '&manifest=f4m' tp_formats, tp_subtitles = self._extract_theplatform_smil(release_url, video_id, 'Downloading %s SMIL data' % fkey) diff --git a/youtube_dl/extractor/comcarcoff.py b/youtube_dl/extractor/comcarcoff.py index 7dff68492..e697d1410 100644 --- a/youtube_dl/extractor/comcarcoff.py +++ b/youtube_dl/extractor/comcarcoff.py @@ -11,7 +11,7 @@ from ..utils import ( class ComCarCoffIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?comediansincarsgettingcoffee\.com/(?P[a-z0-9\-]*)' + _VALID_URL = r'https?://(?:www\.)?comediansincarsgettingcoffee\.com/(?P[a-z0-9\-]*)' _TESTS = [{ 'url': 'http://comediansincarsgettingcoffee.com/miranda-sings-happy-thanksgiving-miranda/', 'info_dict': { diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 402f2f436..770105a5b 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -15,13 +15,14 @@ import math from ..compat import ( compat_cookiejar, compat_cookies, + compat_etree_fromstring, compat_getpass, compat_http_client, + compat_os_name, + compat_str, compat_urllib_error, compat_urllib_parse, compat_urlparse, - compat_str, - compat_etree_fromstring, ) 
from ..utils import ( NO_DEFAULT, @@ -47,6 +48,7 @@ from ..utils import ( determine_protocol, parse_duration, mimetype2ext, + update_url_query, ) @@ -104,7 +106,7 @@ class InfoExtractor(object): * protocol The protocol that will be used for the actual download, lower-case. "http", "https", "rtsp", "rtmp", "rtmpe", - "m3u8", or "m3u8_native". + "m3u8", "m3u8_native" or "http_dash_segments". * preference Order number of this format. If this field is present and not None, the formats get sorted by this field, regardless of all other values. @@ -344,7 +346,7 @@ class InfoExtractor(object): def IE_NAME(self): return compat_str(type(self).__name__[:-2]) - def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True): + def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers=None, query=None): """ Returns the response handle """ if note is None: self.report_download_webpage(video_id) @@ -353,6 +355,12 @@ class InfoExtractor(object): self.to_screen('%s' % (note,)) else: self.to_screen('%s: %s' % (video_id, note)) + # data, headers and query params will be ignored for `Request` objects + if isinstance(url_or_request, compat_str): + if query: + url_or_request = update_url_query(url_or_request, query) + if data or headers: + url_or_request = sanitized_Request(url_or_request, data, headers or {}) try: return self._downloader.urlopen(url_or_request) except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: @@ -368,13 +376,13 @@ class InfoExtractor(object): self._downloader.report_warning(errmsg) return False - def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None): + def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers=None, query=None): """ Returns a tuple (page content as string, URL handle) """ # Strip hashes from the URL 
(#1038) if isinstance(url_or_request, (compat_str, str)): url_or_request = url_or_request.partition('#')[0] - urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal) + urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query) if urlh is False: assert not fatal return False @@ -427,7 +435,7 @@ class InfoExtractor(object): self.to_screen('Saving request to ' + filename) # Working around MAX_PATH limitation on Windows (see # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx) - if os.name == 'nt': + if compat_os_name == 'nt': absfilepath = os.path.abspath(filename) if len(absfilepath) > 259: filename = '\\\\?\\' + absfilepath @@ -461,13 +469,13 @@ class InfoExtractor(object): return content - def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None): + def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None, data=None, headers=None, query=None): """ Returns the data of the page as a string """ success = False try_count = 0 while success is False: try: - res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding) + res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding, data=data, headers=headers, query=query) success = True except compat_http_client.IncompleteRead as e: try_count += 1 @@ -482,10 +490,10 @@ class InfoExtractor(object): def _download_xml(self, url_or_request, video_id, note='Downloading XML', errnote='Unable to download XML', - transform_source=None, fatal=True, encoding=None): + transform_source=None, fatal=True, encoding=None, data=None, headers=None, query=None): """Return the xml as an xml.etree.ElementTree.Element""" xml_string = self._download_webpage( - url_or_request, video_id, note, errnote, fatal=fatal, 
encoding=encoding) + url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding, data=data, headers=headers, query=query) if xml_string is False: return xml_string if transform_source: @@ -496,10 +504,10 @@ class InfoExtractor(object): note='Downloading JSON metadata', errnote='Unable to download JSON metadata', transform_source=None, - fatal=True, encoding=None): + fatal=True, encoding=None, data=None, headers=None, query=None): json_string = self._download_webpage( url_or_request, video_id, note, errnote, fatal=fatal, - encoding=encoding) + encoding=encoding, data=data, headers=headers, query=query) if (not fatal) and json_string is False: return None return self._parse_json( @@ -596,7 +604,7 @@ class InfoExtractor(object): if mobj: break - if not self._downloader.params.get('no_color') and os.name != 'nt' and sys.stderr.isatty(): + if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty(): _name = '\033[0;34m%s\033[0m' % name else: _name = name @@ -854,6 +862,7 @@ class InfoExtractor(object): proto_preference = 0 if determine_protocol(f) in ['http', 'https'] else -0.1 if f.get('vcodec') == 'none': # audio only + preference -= 50 if self._downloader.params.get('prefer_free_formats'): ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus'] else: @@ -864,6 +873,8 @@ class InfoExtractor(object): except ValueError: audio_ext_preference = -1 else: + if f.get('acodec') == 'none': # video only + preference -= 40 if self._downloader.params.get('prefer_free_formats'): ORDER = ['flv', 'mp4', 'webm'] else: @@ -965,6 +976,13 @@ class InfoExtractor(object): if manifest is False: return [] + return self._parse_f4m_formats( + manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id, + transform_source=transform_source, fatal=fatal) + + def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None, + transform_source=lambda s: fix_xml_ampersands(s).strip(), + fatal=True): formats = [] 
manifest_version = '1.0' media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media') @@ -990,7 +1008,8 @@ class InfoExtractor(object): # bitrate in f4m downloader if determine_ext(manifest_url) == 'f4m': formats.extend(self._extract_f4m_formats( - manifest_url, video_id, preference, f4m_id, fatal=fatal)) + manifest_url, video_id, preference=preference, f4m_id=f4m_id, + transform_source=transform_source, fatal=fatal)) continue tbr = int_or_none(media_el.attrib.get('bitrate')) formats.append({ @@ -1139,8 +1158,8 @@ class InfoExtractor(object): out.append('{%s}%s' % (namespace, c)) return '/'.join(out) - def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None): - smil = self._download_smil(smil_url, video_id, fatal=fatal) + def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None): + smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source) if smil is False: assert not fatal @@ -1157,10 +1176,10 @@ class InfoExtractor(object): return {} return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params) - def _download_smil(self, smil_url, video_id, fatal=True): + def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None): return self._download_xml( smil_url, video_id, 'Downloading SMIL file', - 'Unable to download SMIL file', fatal=fatal) + 'Unable to download SMIL file', fatal=fatal, transform_source=transform_source) def _parse_smil(self, smil, smil_url, video_id, f4m_params=None): namespace = self._parse_smil_namespace(smil) @@ -1446,8 +1465,9 @@ class InfoExtractor(object): continue representation_attrib = adaptation_set.attrib.copy() representation_attrib.update(representation.attrib) - mime_type = representation_attrib.get('mimeType') - content_type = mime_type.split('/')[0] if mime_type else representation_attrib.get('contentType') + # According to page 41 of ISO/IEC 29001-1:2014, @mimeType is mandatory + mime_type = 
representation_attrib['mimeType'] + content_type = mime_type.split('/')[0] if content_type == 'text': # TODO implement WebVTT downloading pass @@ -1470,6 +1490,7 @@ class InfoExtractor(object): f = { 'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id, 'url': base_url, + 'ext': mimetype2ext(mime_type), 'width': int_or_none(representation_attrib.get('width')), 'height': int_or_none(representation_attrib.get('height')), 'tbr': int_or_none(representation_attrib.get('bandwidth'), 1000), diff --git a/youtube_dl/extractor/commonprotocols.py b/youtube_dl/extractor/commonprotocols.py new file mode 100644 index 000000000..5d130a170 --- /dev/null +++ b/youtube_dl/extractor/commonprotocols.py @@ -0,0 +1,36 @@ +from __future__ import unicode_literals + +import os + +from .common import InfoExtractor +from ..compat import ( + compat_urllib_parse_unquote, + compat_urlparse, +) +from ..utils import url_basename + + +class RtmpIE(InfoExtractor): + IE_DESC = False # Do not list + _VALID_URL = r'(?i)rtmp[est]?://.+' + + _TESTS = [{ + 'url': 'rtmp://cp44293.edgefcs.net/ondemand?auth=daEcTdydfdqcsb8cZcDbAaCbhamacbbawaS-bw7dBb-bWG-GqpGFqCpNCnGoyL&aifp=v001&slist=public/unsecure/audio/2c97899446428e4301471a8cb72b4b97--audio--pmg-20110908-0900a_flv_aac_med_int.mp4', + 'only_matching': True, + }, { + 'url': 'rtmp://edge.live.hitbox.tv/live/dimak', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0]) + title = compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]) + return { + 'id': video_id, + 'title': title, + 'formats': [{ + 'url': url, + 'ext': 'flv', + 'format_id': compat_urlparse.urlparse(url).scheme, + }], + } diff --git a/youtube_dl/extractor/condenast.py b/youtube_dl/extractor/condenast.py index 6f92ae2ed..054978ff2 100644 --- a/youtube_dl/extractor/condenast.py +++ b/youtube_dl/extractor/condenast.py @@ -45,7 +45,7 @@ class 
CondeNastIE(InfoExtractor): 'wmagazine': 'W Magazine', } - _VALID_URL = r'http://(?:video|www|player)\.(?P%s)\.com/(?Pwatch|series|video|embed(?:js)?)/(?P[^/?#]+)' % '|'.join(_SITES.keys()) + _VALID_URL = r'https?://(?:video|www|player)\.(?P%s)\.com/(?Pwatch|series|video|embed(?:js)?)/(?P[^/?#]+)' % '|'.join(_SITES.keys()) IE_DESC = 'Condé Nast media group: %s' % ', '.join(sorted(_SITES.values())) EMBED_URL = r'(?:https?:)?//player\.(?P%s)\.com/(?Pembed(?:js)?)/.+?' % '|'.join(_SITES.keys()) diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index c7032ffa2..85fa7a725 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -54,7 +54,7 @@ class CrunchyrollBaseIE(InfoExtractor): def _real_initialize(self): self._login() - def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None): + def _download_webpage(self, url_or_request, *args, **kwargs): request = (url_or_request if isinstance(url_or_request, compat_urllib_request.Request) else sanitized_Request(url_or_request)) # Accept-Language must be set explicitly to accept any language to avoid issues @@ -65,8 +65,7 @@ class CrunchyrollBaseIE(InfoExtractor): # Crunchyroll to not work in georestriction cases in some browsers that don't place # the locale lang first in header. However allowing any language seems to workaround the issue. 
request.add_header('Accept-Language', '*') - return super(CrunchyrollBaseIE, self)._download_webpage( - request, video_id, note, errnote, fatal, tries, timeout, encoding) + return super(CrunchyrollBaseIE, self)._download_webpage(request, *args, **kwargs) @staticmethod def _add_skip_wall(url): diff --git a/youtube_dl/extractor/cspan.py b/youtube_dl/extractor/cspan.py index b8b9d058d..84b36f44c 100644 --- a/youtube_dl/extractor/cspan.py +++ b/youtube_dl/extractor/cspan.py @@ -15,7 +15,7 @@ from .senateisvp import SenateISVPIE class CSpanIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?c-span\.org/video/\?(?P[0-9a-f]+)' + _VALID_URL = r'https?://(?:www\.)?c-span\.org/video/\?(?P[0-9a-f]+)' IE_DESC = 'C-SPAN' _TESTS = [{ 'url': 'http://www.c-span.org/video/?313572-1/HolderonV', diff --git a/youtube_dl/extractor/ctsnews.py b/youtube_dl/extractor/ctsnews.py index 45049bf37..1622fc844 100644 --- a/youtube_dl/extractor/ctsnews.py +++ b/youtube_dl/extractor/ctsnews.py @@ -8,7 +8,7 @@ from ..utils import parse_iso8601, ExtractorError class CtsNewsIE(InfoExtractor): IE_DESC = '華視新聞' # https connection failed (Connection reset) - _VALID_URL = r'http://news\.cts\.com\.tw/[a-z]+/[a-z]+/\d+/(?P\d+)\.html' + _VALID_URL = r'https?://news\.cts\.com\.tw/[a-z]+/[a-z]+/\d+/(?P\d+)\.html' _TESTS = [{ 'url': 'http://news.cts.com.tw/cts/international/201501/201501291578109.html', 'md5': 'a9875cb790252b08431186d741beaabe', diff --git a/youtube_dl/extractor/dctp.py b/youtube_dl/extractor/dctp.py index aa2c09eb6..9099f5046 100644 --- a/youtube_dl/extractor/dctp.py +++ b/youtube_dl/extractor/dctp.py @@ -6,7 +6,7 @@ from ..compat import compat_str class DctpTvIE(InfoExtractor): - _VALID_URL = r'http://www.dctp.tv/(#/)?filme/(?P.+?)/$' + _VALID_URL = r'https?://www.dctp.tv/(#/)?filme/(?P.+?)/$' _TEST = { 'url': 'http://www.dctp.tv/filme/videoinstallation-fuer-eine-kaufhausfassade/', 'info_dict': { diff --git a/youtube_dl/extractor/defense.py b/youtube_dl/extractor/defense.py index 
98e3aedfd..9fe144e14 100644 --- a/youtube_dl/extractor/defense.py +++ b/youtube_dl/extractor/defense.py @@ -5,7 +5,7 @@ from .common import InfoExtractor class DefenseGouvFrIE(InfoExtractor): IE_NAME = 'defense.gouv.fr' - _VALID_URL = r'http://.*?\.defense\.gouv\.fr/layout/set/ligthboxvideo/base-de-medias/webtv/(?P[^/?#]*)' + _VALID_URL = r'https?://.*?\.defense\.gouv\.fr/layout/set/ligthboxvideo/base-de-medias/webtv/(?P[^/?#]*)' _TEST = { 'url': 'http://www.defense.gouv.fr/layout/set/ligthboxvideo/base-de-medias/webtv/attaque-chimique-syrienne-du-21-aout-2013-1', diff --git a/youtube_dl/extractor/discovery.py b/youtube_dl/extractor/discovery.py index ce680a9f3..fdce1429a 100644 --- a/youtube_dl/extractor/discovery.py +++ b/youtube_dl/extractor/discovery.py @@ -9,7 +9,7 @@ from ..compat import compat_str class DiscoveryIE(InfoExtractor): - _VALID_URL = r'''(?x)http://(?:www\.)?(?: + _VALID_URL = r'''(?x)https?://(?:www\.)?(?: discovery| investigationdiscovery| discoverylife| diff --git a/youtube_dl/extractor/douyutv.py b/youtube_dl/extractor/douyutv.py index 373b3b4b4..3915cb182 100644 --- a/youtube_dl/extractor/douyutv.py +++ b/youtube_dl/extractor/douyutv.py @@ -10,7 +10,7 @@ from ..compat import (compat_str, compat_basestring) class DouyuTVIE(InfoExtractor): IE_DESC = '斗鱼' - _VALID_URL = r'http://(?:www\.)?douyutv\.com/(?P[A-Za-z0-9]+)' + _VALID_URL = r'https?://(?:www\.)?douyu(?:tv)?\.com/(?P[A-Za-z0-9]+)' _TESTS = [{ 'url': 'http://www.douyutv.com/iseven', 'info_dict': { @@ -18,7 +18,7 @@ class DouyuTVIE(InfoExtractor): 'display_id': 'iseven', 'ext': 'flv', 'title': 're:^清晨醒脑!T-ara根本停不下来! 
[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', - 'description': 'md5:c93d6692dde6fe33809a46edcbecca44', + 'description': 'md5:f34981259a03e980a3c6404190a3ed61', 'thumbnail': 're:^https?://.*\.jpg$', 'uploader': '7师傅', 'uploader_id': '431925', @@ -26,7 +26,7 @@ class DouyuTVIE(InfoExtractor): }, 'params': { 'skip_download': True, - } + }, }, { 'url': 'http://www.douyutv.com/85982', 'info_dict': { @@ -42,7 +42,27 @@ class DouyuTVIE(InfoExtractor): }, 'params': { 'skip_download': True, - } + }, + 'skip': 'Romm not found', + }, { + 'url': 'http://www.douyutv.com/17732', + 'info_dict': { + 'id': '17732', + 'display_id': '17732', + 'ext': 'flv', + 'title': 're:^清晨醒脑!T-ara根本停不下来! [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'description': 'md5:f34981259a03e980a3c6404190a3ed61', + 'thumbnail': 're:^https?://.*\.jpg$', + 'uploader': '7师傅', + 'uploader_id': '431925', + 'is_live': True, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://www.douyu.com/xiaocang', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/dplay.py b/youtube_dl/extractor/dplay.py index a638c827c..1e7dcada6 100644 --- a/youtube_dl/extractor/dplay.py +++ b/youtube_dl/extractor/dplay.py @@ -10,7 +10,7 @@ from ..utils import int_or_none class DPlayIE(InfoExtractor): - _VALID_URL = r'http://(?Pit\.dplay\.com|www\.dplay\.(?:dk|se|no))/[^/]+/(?P[^/?#]+)' + _VALID_URL = r'https?://(?Pit\.dplay\.com|www\.dplay\.(?:dk|se|no))/[^/]+/(?P[^/?#]+)' _TESTS = [{ 'url': 'http://it.dplay.com/take-me-out/stagione-1-episodio-25/', diff --git a/youtube_dl/extractor/dreisat.py b/youtube_dl/extractor/dreisat.py index 028144f20..0040e70d4 100644 --- a/youtube_dl/extractor/dreisat.py +++ b/youtube_dl/extractor/dreisat.py @@ -7,7 +7,7 @@ from .zdf import ZDFIE class DreiSatIE(ZDFIE): IE_NAME = '3sat' - _VALID_URL = r'(?:http://)?(?:www\.)?3sat\.de/mediathek/(?:index\.php|mediathek\.php)?\?(?:(?:mode|display)=[^&]+&)*obj=(?P[0-9]+)$' + _VALID_URL = 
r'(?:https?://)?(?:www\.)?3sat\.de/mediathek/(?:index\.php|mediathek\.php)?\?(?:(?:mode|display)=[^&]+&)*obj=(?P[0-9]+)$' _TESTS = [ { 'url': 'http://www.3sat.de/mediathek/index.php?mode=play&obj=45918', diff --git a/youtube_dl/extractor/dvtv.py b/youtube_dl/extractor/dvtv.py index c1a4bc757..974c69dbc 100644 --- a/youtube_dl/extractor/dvtv.py +++ b/youtube_dl/extractor/dvtv.py @@ -15,7 +15,7 @@ class DVTVIE(InfoExtractor): IE_NAME = 'dvtv' IE_DESC = 'http://video.aktualne.cz/' - _VALID_URL = r'http://video\.aktualne\.cz/(?:[^/]+/)+r~(?P[0-9a-f]{32})' + _VALID_URL = r'https?://video\.aktualne\.cz/(?:[^/]+/)+r~(?P[0-9a-f]{32})' _TESTS = [{ 'url': 'http://video.aktualne.cz/dvtv/vondra-o-ceskem-stoleti-pri-pohledu-na-havla-mi-bylo-trapne/r~e5efe9ca855511e4833a0025900fea04/', diff --git a/youtube_dl/extractor/dw.py b/youtube_dl/extractor/dw.py new file mode 100644 index 000000000..b6c985547 --- /dev/null +++ b/youtube_dl/extractor/dw.py @@ -0,0 +1,85 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import int_or_none +from ..compat import compat_urlparse + + +class DWIE(InfoExtractor): + IE_NAME = 'dw' + _VALID_URL = r'https?://(?:www\.)?dw\.com/(?:[^/]+/)+av-(?P\d+)' + _TESTS = [{ + # video + 'url': 'http://www.dw.com/en/intelligent-light/av-19112290', + 'md5': '7372046e1815c5a534b43f3c3c36e6e9', + 'info_dict': { + 'id': '19112290', + 'ext': 'mp4', + 'title': 'Intelligent light', + 'description': 'md5:90e00d5881719f2a6a5827cb74985af1', + 'upload_date': '20160311', + } + }, { + # audio + 'url': 'http://www.dw.com/en/worldlink-my-business/av-19111941', + 'md5': '2814c9a1321c3a51f8a7aeb067a360dd', + 'info_dict': { + 'id': '19111941', + 'ext': 'mp3', + 'title': 'WorldLink: My business', + 'description': 'md5:bc9ca6e4e063361e21c920c53af12405', + 'upload_date': '20160311', + } + }] + + def _real_extract(self, url): + media_id = self._match_id(url) + webpage = self._download_webpage(url, media_id) + hidden_inputs 
= self._hidden_inputs(webpage) + title = hidden_inputs['media_title'] + + formats = [] + if hidden_inputs.get('player_type') == 'video' and hidden_inputs.get('stream_file') == '1': + formats = self._extract_smil_formats( + 'http://www.dw.com/smil/v-%s' % media_id, media_id, + transform_source=lambda s: s.replace( + 'rtmp://tv-od.dw.de/flash/', + 'http://tv-download.dw.de/dwtv_video/flv/')) + else: + formats = [{'url': hidden_inputs['file_name']}] + + return { + 'id': media_id, + 'title': title, + 'description': self._og_search_description(webpage), + 'thumbnail': hidden_inputs.get('preview_image'), + 'duration': int_or_none(hidden_inputs.get('file_duration')), + 'upload_date': hidden_inputs.get('display_date'), + 'formats': formats, + } + + +class DWArticleIE(InfoExtractor): + IE_NAME = 'dw:article' + _VALID_URL = r'https?://(?:www\.)?dw\.com/(?:[^/]+/)+a-(?P\d+)' + _TEST = { + 'url': 'http://www.dw.com/en/no-hope-limited-options-for-refugees-in-idomeni/a-19111009', + 'md5': '8ca657f9d068bbef74d6fc38b97fc869', + 'info_dict': { + 'id': '19105868', + 'ext': 'mp4', + 'title': 'The harsh life of refugees in Idomeni', + 'description': 'md5:196015cc7e48ebf474db9399420043c7', + 'upload_date': '20160310', + } + } + + def _real_extract(self, url): + article_id = self._match_id(url) + webpage = self._download_webpage(url, article_id) + hidden_inputs = self._hidden_inputs(webpage) + media_id = hidden_inputs['media_id'] + media_path = self._search_regex(r'href="([^"]+av-%s)"\s+class="overlayLink"' % media_id, webpage, 'media url') + media_url = compat_urlparse.urljoin(url, media_path) + return self.url_result(media_url, 'DW', media_id) diff --git a/youtube_dl/extractor/echomsk.py b/youtube_dl/extractor/echomsk.py index d2d94049d..6b7cc652f 100644 --- a/youtube_dl/extractor/echomsk.py +++ b/youtube_dl/extractor/echomsk.py @@ -7,7 +7,7 @@ from .common import InfoExtractor class EchoMskIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?echo\.msk\.ru/sounds/(?P\d+)' + _VALID_URL 
= r'https?://(?:www\.)?echo\.msk\.ru/sounds/(?P\d+)' _TEST = { 'url': 'http://www.echo.msk.ru/sounds/1464134.html', 'md5': '2e44b3b78daff5b458e4dbc37f191f7c', diff --git a/youtube_dl/extractor/elpais.py b/youtube_dl/extractor/elpais.py index 00a69e631..8c725a4e6 100644 --- a/youtube_dl/extractor/elpais.py +++ b/youtube_dl/extractor/elpais.py @@ -9,7 +9,7 @@ class ElPaisIE(InfoExtractor): _VALID_URL = r'https?://(?:[^.]+\.)?elpais\.com/.*/(?P[^/#?]+)\.html(?:$|[?#])' IE_DESC = 'El País' - _TEST = { + _TESTS = [{ 'url': 'http://blogs.elpais.com/la-voz-de-inaki/2014/02/tiempo-nuevo-recetas-viejas.html', 'md5': '98406f301f19562170ec071b83433d55', 'info_dict': { @@ -19,30 +19,41 @@ class ElPaisIE(InfoExtractor): 'description': 'De lunes a viernes, a partir de las ocho de la mañana, Iñaki Gabilondo nos cuenta su visión de la actualidad nacional e internacional.', 'upload_date': '20140206', } - } + }, { + 'url': 'http://elcomidista.elpais.com/elcomidista/2016/02/24/articulo/1456340311_668921.html#?id_externo_nwl=newsletter_diaria20160303t', + 'md5': '3bd5b09509f3519d7d9e763179b013de', + 'info_dict': { + 'id': '1456340311_668921', + 'ext': 'mp4', + 'title': 'Cómo hacer el mejor café con cafetera italiana', + 'description': 'Que sí, que las cápsulas son cómodas. Pero si le pides algo más a la vida, quizá deberías aprender a usar bien la cafetera italiana. 
No tienes más que ver este vídeo y seguir sus siete normas básicas.', + 'upload_date': '20160303', + } + }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) prefix = self._html_search_regex( - r'var url_cache = "([^"]+)";', webpage, 'URL prefix') + r'var\s+url_cache\s*=\s*"([^"]+)";', webpage, 'URL prefix') video_suffix = self._search_regex( - r"URLMediaFile = url_cache \+ '([^']+)'", webpage, 'video URL') + r"(?:URLMediaFile|urlVideo_\d+)\s*=\s*url_cache\s*\+\s*'([^']+)'", webpage, 'video URL') video_url = prefix + video_suffix thumbnail_suffix = self._search_regex( - r"URLMediaStill = url_cache \+ '([^']+)'", webpage, 'thumbnail URL', - fatal=False) + r"(?:URLMediaStill|urlFotogramaFijo_\d+)\s*=\s*url_cache\s*\+\s*'([^']+)'", + webpage, 'thumbnail URL', fatal=False) thumbnail = ( None if thumbnail_suffix is None else prefix + thumbnail_suffix) title = self._html_search_regex( - '

', - webpage, 'upload date', fatal=False) - upload_date = (None if date_str is None else unified_strdate(date_str)) + webpage, 'upload date', default=None) or self._html_search_meta( + 'datePublished', webpage, 'timestamp')) return { 'id': video_id, diff --git a/youtube_dl/extractor/engadget.py b/youtube_dl/extractor/engadget.py index e4180701d..e5e57d485 100644 --- a/youtube_dl/extractor/engadget.py +++ b/youtube_dl/extractor/engadget.py @@ -1,21 +1,13 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor -from ..utils import ( - url_basename, -) class EngadgetIE(InfoExtractor): - _VALID_URL = r'''(?x)https?://www.engadget.com/ - (?:video(?:/5min)?/(?P\d+)| - [\d/]+/.*?) - ''' + _VALID_URL = r'https?://www.engadget.com/video/(?P\d+)' _TEST = { - 'url': 'http://www.engadget.com/video/5min/518153925/', + 'url': 'http://www.engadget.com/video/518153925/', 'md5': 'c6820d4828a5064447a4d9fc73f312c9', 'info_dict': { 'id': '518153925', @@ -27,15 +19,4 @@ class EngadgetIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - - if video_id is not None: - return self.url_result('5min:%s' % video_id) - else: - title = url_basename(url) - webpage = self._download_webpage(url, title) - ids = re.findall(r']+?playList=(\d+)', webpage) - return { - '_type': 'playlist', - 'title': title, - 'entries': [self.url_result('5min:%s' % vid) for vid in ids] - } + return self.url_result('5min:%s' % video_id) diff --git a/youtube_dl/extractor/exfm.py b/youtube_dl/extractor/exfm.py index 0c0fe6d65..09ed4f2b5 100644 --- a/youtube_dl/extractor/exfm.py +++ b/youtube_dl/extractor/exfm.py @@ -8,7 +8,7 @@ from .common import InfoExtractor class ExfmIE(InfoExtractor): IE_NAME = 'exfm' IE_DESC = 'ex.fm' - _VALID_URL = r'http://(?:www\.)?ex\.fm/song/(?P[^/]+)' + _VALID_URL = r'https?://(?:www\.)?ex\.fm/song/(?P[^/]+)' _SOUNDCLOUD_URL = r'http://(?:www\.)?api\.soundcloud\.com/tracks/([^/]+)/stream' _TESTS = [ { diff --git 
a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 6c6c3b1bd..f5bbd39d2 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -37,7 +37,9 @@ class FacebookIE(InfoExtractor): video/embed| story\.php )\?(?:.*?)(?:v|video_id|story_fbid)=| - [^/]+/videos/(?:[^/]+/)? + [^/]+/videos/(?:[^/]+/)?| + [^/]+/posts/| + groups/[^/]+/permalink/ )| facebook: ) @@ -50,6 +52,8 @@ class FacebookIE(InfoExtractor): _CHROME_USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.97 Safari/537.36' + _VIDEO_PAGE_TEMPLATE = 'https://www.facebook.com/video/video.php?v=%s' + _TESTS = [{ 'url': 'https://www.facebook.com/video.php?v=637842556329505&fref=nf', 'md5': '6a40d33c0eccbb1af76cf0485a052659', @@ -81,6 +85,33 @@ class FacebookIE(InfoExtractor): 'title': 'When you post epic content on instagram.com/433 8 million followers, this is ...', 'uploader': 'Demy de Zeeuw', }, + }, { + 'url': 'https://www.facebook.com/maxlayn/posts/10153807558977570', + 'md5': '037b1fa7f3c2d02b7a0d7bc16031ecc6', + 'info_dict': { + 'id': '544765982287235', + 'ext': 'mp4', + 'title': '"What are you doing running in the snow?"', + 'uploader': 'FailArmy', + } + }, { + 'url': 'https://m.facebook.com/story.php?story_fbid=1035862816472149&id=116132035111903', + 'md5': '1deb90b6ac27f7efcf6d747c8a27f5e3', + 'info_dict': { + 'id': '1035862816472149', + 'ext': 'mp4', + 'title': 'What the Flock Is Going On In New Zealand Credit: ViralHog', + 'uploader': 'S. 
Saint', + }, + }, { + 'note': 'swf params escaped', + 'url': 'https://www.facebook.com/barackobama/posts/10153664894881749', + 'md5': '97ba073838964d12c70566e0085c2b91', + 'info_dict': { + 'id': '10153664894881749', + 'ext': 'mp4', + 'title': 'Facebook video #10153664894881749', + }, }, { 'url': 'https://www.facebook.com/video.php?v=10204634152394104', 'only_matching': True, @@ -94,7 +125,7 @@ class FacebookIE(InfoExtractor): 'url': 'facebook:544765982287235', 'only_matching': True, }, { - 'url': 'https://m.facebook.com/story.php?story_fbid=1035862816472149&id=116132035111903', + 'url': 'https://www.facebook.com/groups/164828000315060/permalink/764967300301124/', 'only_matching': True, }] @@ -164,19 +195,19 @@ class FacebookIE(InfoExtractor): def _real_initialize(self): self._login() - def _real_extract(self, url): - video_id = self._match_id(url) - req = sanitized_Request('https://www.facebook.com/video/video.php?v=%s' % video_id) + def _extract_from_url(self, url, video_id, fatal_if_no_video=True): + req = sanitized_Request(url) req.add_header('User-Agent', self._CHROME_USER_AGENT) webpage = self._download_webpage(req, video_id) video_data = None - BEFORE = '{swf.addParam(param[0], param[1]);});\n' + BEFORE = '{swf.addParam(param[0], param[1]);});' AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});' - m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage) + m = re.search(re.escape(BEFORE) + '(?:\n|\\\\n)(.*?)' + re.escape(AFTER), webpage) if m: - data = dict(json.loads(m.group(1))) + swf_params = m.group(1).replace('\\\\', '\\').replace('\\"', '"') + data = dict(json.loads(swf_params)) params_raw = compat_urllib_parse_unquote(data['params']) video_data = json.loads(params_raw)['video_data'] @@ -189,13 +220,15 @@ class FacebookIE(InfoExtractor): if not video_data: server_js_data = self._parse_json(self._search_regex( - r'handleServerJS\(({.+})\);', webpage, 'server js data'), video_id) + 
r'handleServerJS\(({.+})\);', webpage, 'server js data', default='{}'), video_id) for item in server_js_data.get('instances', []): if item[1][0] == 'VideoConfig': video_data = video_data_list2dict(item[2][0]['videoData']) break if not video_data: + if not fatal_if_no_video: + return webpage, False m_msg = re.search(r'class="[^"]*uiInterstitialContent[^"]*">
(.*?)
', webpage) if m_msg is not None: raise ExtractorError( @@ -241,39 +274,36 @@ class FacebookIE(InfoExtractor): video_title = 'Facebook video #%s' % video_id uploader = clean_html(get_element_by_id('fbPhotoPageAuthorName', webpage)) - return { + info_dict = { 'id': video_id, 'title': video_title, 'formats': formats, 'uploader': uploader, } - -class FacebookPostIE(InfoExtractor): - IE_NAME = 'facebook:post' - _VALID_URL = r'https?://(?:\w+\.)?facebook\.com/[^/]+/posts/(?P\d+)' - _TEST = { - 'url': 'https://www.facebook.com/maxlayn/posts/10153807558977570', - 'md5': '037b1fa7f3c2d02b7a0d7bc16031ecc6', - 'info_dict': { - 'id': '544765982287235', - 'ext': 'mp4', - 'title': '"What are you doing running in the snow?"', - 'uploader': 'FailArmy', - } - } + return webpage, info_dict def _real_extract(self, url): - post_id = self._match_id(url) + video_id = self._match_id(url) - webpage = self._download_webpage(url, post_id) + real_url = self._VIDEO_PAGE_TEMPLATE % video_id if url.startswith('facebook:') else url + webpage, info_dict = self._extract_from_url(real_url, video_id, fatal_if_no_video=False) - entries = [ - self.url_result('facebook:%s' % video_id, FacebookIE.ie_key()) - for video_id in self._parse_json( - self._search_regex( - r'(["\'])video_ids\1\s*:\s*(?P\[.+?\])', - webpage, 'video ids', group='ids'), - post_id)] + if info_dict: + return info_dict - return self.playlist_result(entries, post_id) + if '/posts/' in url: + entries = [ + self.url_result('facebook:%s' % vid, FacebookIE.ie_key()) + for vid in self._parse_json( + self._search_regex( + r'(["\'])video_ids\1\s*:\s*(?P\[.+?\])', + webpage, 'video ids', group='ids'), + video_id)] + + return self.playlist_result(entries, video_id) + else: + _, info_dict = self._extract_from_url( + self._VIDEO_PAGE_TEMPLATE % video_id, + video_id, fatal_if_no_video=True) + return info_dict diff --git a/youtube_dl/extractor/fc2.py b/youtube_dl/extractor/fc2.py index 9580f5c0c..508684d2e 100644 --- a/youtube_dl/extractor/fc2.py 
+++ b/youtube_dl/extractor/fc2.py @@ -17,7 +17,7 @@ from ..utils import ( class FC2IE(InfoExtractor): - _VALID_URL = r'^http://video\.fc2\.com/(?:[^/]+/)*content/(?P[^/]+)' + _VALID_URL = r'^https?://video\.fc2\.com/(?:[^/]+/)*content/(?P[^/]+)' IE_NAME = 'fc2' _NETRC_MACHINE = 'fc2' _TESTS = [{ diff --git a/youtube_dl/extractor/firstpost.py b/youtube_dl/extractor/firstpost.py index 298227d57..e8936cb24 100644 --- a/youtube_dl/extractor/firstpost.py +++ b/youtube_dl/extractor/firstpost.py @@ -4,7 +4,7 @@ from .common import InfoExtractor class FirstpostIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?firstpost\.com/[^/]+/.*-(?P[0-9]+)\.html' + _VALID_URL = r'https?://(?:www\.)?firstpost\.com/[^/]+/.*-(?P[0-9]+)\.html' _TEST = { 'url': 'http://www.firstpost.com/india/india-to-launch-indigenous-aircraft-carrier-monday-1025403.html', diff --git a/youtube_dl/extractor/firsttv.py b/youtube_dl/extractor/firsttv.py index 510d4b108..98b165143 100644 --- a/youtube_dl/extractor/firsttv.py +++ b/youtube_dl/extractor/firsttv.py @@ -8,7 +8,7 @@ from ..utils import int_or_none class FirstTVIE(InfoExtractor): IE_NAME = '1tv' IE_DESC = 'Первый канал' - _VALID_URL = r'http://(?:www\.)?1tv\.ru/(?:[^/]+/)+(?P.+)' + _VALID_URL = r'https?://(?:www\.)?1tv\.ru/(?:[^/]+/)+(?P.+)' _TESTS = [{ 'url': 'http://www.1tv.ru/videoarchive/73390', diff --git a/youtube_dl/extractor/fivemin.py b/youtube_dl/extractor/fivemin.py index 2955965d9..67d50a386 100644 --- a/youtube_dl/extractor/fivemin.py +++ b/youtube_dl/extractor/fivemin.py @@ -1,5 +1,7 @@ from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..compat import ( compat_urllib_parse, @@ -16,12 +18,7 @@ from ..utils import ( class FiveMinIE(InfoExtractor): IE_NAME = '5min' - _VALID_URL = r'''(?x) - (?:https?://[^/]*?5min\.com/Scripts/PlayerSeed\.js\?(?:.*?&)?playList=| - https?://(?:(?:massively|www)\.)?joystiq\.com/video/| - 5min:) - (?P\d+) - ''' + _VALID_URL = 
r'(?:5min:(?P\d+)(?::(?P\d+))?|https?://[^/]*?5min\.com/Scripts/PlayerSeed\.js\?(?P.*))' _TESTS = [ { @@ -45,6 +42,7 @@ class FiveMinIE(InfoExtractor): 'title': 'How to Make a Next-Level Fruit Salad', 'duration': 184, }, + 'skip': 'no longer available', }, ] _ERRORS = { @@ -91,20 +89,33 @@ class FiveMinIE(InfoExtractor): } def _real_extract(self, url): - video_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + sid = mobj.group('sid') + + if mobj.group('query'): + qs = compat_parse_qs(mobj.group('query')) + if not qs.get('playList'): + raise ExtractorError('Invalid URL', expected=True) + video_id = qs['playList'][0] + if qs.get('sid'): + sid = qs['sid'][0] + embed_url = 'https://embed.5min.com/playerseed/?playList=%s' % video_id - embed_page = self._download_webpage(embed_url, video_id, - 'Downloading embed page') - sid = self._search_regex(r'sid=(\d+)', embed_page, 'sid') - query = compat_urllib_parse.urlencode({ - 'func': 'GetResults', - 'playlist': video_id, - 'sid': sid, - 'isPlayerSeed': 'true', - 'url': embed_url, - }) + if not sid: + embed_page = self._download_webpage(embed_url, video_id, + 'Downloading embed page') + sid = self._search_regex(r'sid=(\d+)', embed_page, 'sid') + response = self._download_json( - 'https://syn.5min.com/handlers/SenseHandler.ashx?' + query, + 'https://syn.5min.com/handlers/SenseHandler.ashx?' 
+ + compat_urllib_parse.urlencode({ + 'func': 'GetResults', + 'playlist': video_id, + 'sid': sid, + 'isPlayerSeed': 'true', + 'url': embed_url, + }), video_id) if not response['success']: raise ExtractorError( @@ -118,9 +129,7 @@ class FiveMinIE(InfoExtractor): parsed_video_url = compat_urllib_parse_urlparse(compat_parse_qs( compat_urllib_parse_urlparse(info['EmbededURL']).query)['videoUrl'][0]) for rendition in info['Renditions']: - if rendition['RenditionType'] == 'm3u8': - formats.extend(self._extract_m3u8_formats(rendition['Url'], video_id, m3u8_id='hls')) - elif rendition['RenditionType'] == 'aac': + if rendition['RenditionType'] == 'aac' or rendition['RenditionType'] == 'm3u8': continue else: rendition_url = compat_urlparse.urlunparse(parsed_video_url._replace(path=replace_extension(parsed_video_url.path.replace('//', '/%s/' % rendition['ID']), rendition['RenditionType']))) diff --git a/youtube_dl/extractor/fktv.py b/youtube_dl/extractor/fktv.py index 5f6e65dae..a3a291599 100644 --- a/youtube_dl/extractor/fktv.py +++ b/youtube_dl/extractor/fktv.py @@ -10,7 +10,7 @@ from ..utils import ( class FKTVIE(InfoExtractor): IE_NAME = 'fernsehkritik.tv' - _VALID_URL = r'http://(?:www\.)?fernsehkritik\.tv/folge-(?P[0-9]+)(?:/.*)?' + _VALID_URL = r'https?://(?:www\.)?fernsehkritik\.tv/folge-(?P[0-9]+)(?:/.*)?' 
_TEST = { 'url': 'http://fernsehkritik.tv/folge-1', diff --git a/youtube_dl/extractor/footyroom.py b/youtube_dl/extractor/footyroom.py index 370fd006f..d2503ae2e 100644 --- a/youtube_dl/extractor/footyroom.py +++ b/youtube_dl/extractor/footyroom.py @@ -5,7 +5,7 @@ from .common import InfoExtractor class FootyRoomIE(InfoExtractor): - _VALID_URL = r'http://footyroom\.com/(?P[^/]+)' + _VALID_URL = r'https?://footyroom\.com/(?P[^/]+)' _TESTS = [{ 'url': 'http://footyroom.com/schalke-04-0-2-real-madrid-2015-02/', 'info_dict': { diff --git a/youtube_dl/extractor/foxgay.py b/youtube_dl/extractor/foxgay.py index 08b8ea362..70c1a815d 100644 --- a/youtube_dl/extractor/foxgay.py +++ b/youtube_dl/extractor/foxgay.py @@ -4,7 +4,7 @@ from .common import InfoExtractor class FoxgayIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?foxgay\.com/videos/(?:\S+-)?(?P\d+)\.shtml' + _VALID_URL = r'https?://(?:www\.)?foxgay\.com/videos/(?:\S+-)?(?P\d+)\.shtml' _TEST = { 'url': 'http://foxgay.com/videos/fuck-turkish-style-2582.shtml', 'md5': '80d72beab5d04e1655a56ad37afe6841', diff --git a/youtube_dl/extractor/foxnews.py b/youtube_dl/extractor/foxnews.py index 318ac013d..1dc50318c 100644 --- a/youtube_dl/extractor/foxnews.py +++ b/youtube_dl/extractor/foxnews.py @@ -36,6 +36,10 @@ class FoxNewsIE(AMPIE): # 'upload_date': '20141204', 'thumbnail': 're:^https?://.*\.jpg$', }, + 'params': { + # m3u8 download + 'skip_download': True, + }, }, { 'url': 'http://video.foxnews.com/v/video-embed.html?video_id=3937480&d=video.foxnews.com', diff --git a/youtube_dl/extractor/franceinter.py b/youtube_dl/extractor/franceinter.py index 0388ba00c..2369f868d 100644 --- a/youtube_dl/extractor/franceinter.py +++ b/youtube_dl/extractor/franceinter.py @@ -6,7 +6,7 @@ from ..utils import int_or_none class FranceInterIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?franceinter\.fr/player/reecouter\?play=(?P[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?franceinter\.fr/player/reecouter\?play=(?P[0-9]+)' _TEST = 
{ 'url': 'http://www.franceinter.fr/player/reecouter?play=793962', 'md5': '4764932e466e6f6c79c317d2e74f6884', diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index 3f4ac3093..ad94e31f3 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -60,28 +60,31 @@ class FranceTVBaseInfoExtractor(InfoExtractor): video_id, 'Downloading f4m manifest token', fatal=False) if f4m_url: formats.extend(self._extract_f4m_formats( - f4m_url + '&hdcore=3.7.0&plugin=aasp-3.7.0.39.44', video_id, 1, format_id)) + f4m_url + '&hdcore=3.7.0&plugin=aasp-3.7.0.39.44', + video_id, f4m_id=format_id, fatal=False)) elif ext == 'm3u8': - formats.extend(self._extract_m3u8_formats(video_url, video_id, 'mp4', m3u8_id=format_id)) + formats.extend(self._extract_m3u8_formats( + video_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id=format_id, fatal=False)) elif video_url.startswith('rtmp'): formats.append({ 'url': video_url, 'format_id': 'rtmp-%s' % format_id, 'ext': 'flv', - 'preference': 1, }) else: - formats.append({ - 'url': video_url, - 'format_id': format_id, - 'preference': -1, - }) + if self._is_valid_url(video_url, video_id, format_id): + formats.append({ + 'url': video_url, + 'format_id': format_id, + }) self._sort_formats(formats) title = info['titre'] subtitle = info.get('sous_titre') if subtitle: title += ' - %s' % subtitle + title = title.strip() subtitles = {} subtitles_list = [{ @@ -125,13 +128,13 @@ class PluzzIE(FranceTVBaseInfoExtractor): class FranceTvInfoIE(FranceTVBaseInfoExtractor): IE_NAME = 'francetvinfo.fr' - _VALID_URL = r'https?://(?:www|mobile)\.francetvinfo\.fr/.*/(?P.+)\.html' + _VALID_URL = r'https?://(?:www|mobile|france3-regions)\.francetvinfo\.fr/.*/(?P<title>.+)\.html' _TESTS = [{ 'url': 'http://www.francetvinfo.fr/replay-jt/france-3/soir-3/jt-grand-soir-3-lundi-26-aout-2013_393427.html', 'info_dict': { 'id': '84981923', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Soir 3', 'upload_date': 
'20130826', 'timestamp': 1377548400, @@ -139,6 +142,10 @@ class FranceTvInfoIE(FranceTVBaseInfoExtractor): 'fr': 'mincount:2', }, }, + 'params': { + # m3u8 downloads + 'skip_download': True, + }, }, { 'url': 'http://www.francetvinfo.fr/elections/europeennes/direct-europeennes-regardez-le-debat-entre-les-candidats-a-la-presidence-de-la-commission_600639.html', 'info_dict': { @@ -155,11 +162,32 @@ class FranceTvInfoIE(FranceTVBaseInfoExtractor): 'url': 'http://www.francetvinfo.fr/economie/entreprises/les-entreprises-familiales-le-secret-de-la-reussite_933271.html', 'md5': 'f485bda6e185e7d15dbc69b72bae993e', 'info_dict': { - 'id': '556e03339473995ee145930c', + 'id': 'NI_173343', 'ext': 'mp4', 'title': 'Les entreprises familiales : le secret de la réussite', 'thumbnail': 're:^https?://.*\.jpe?g$', - } + 'timestamp': 1433273139, + 'upload_date': '20150602', + }, + 'params': { + # m3u8 downloads + 'skip_download': True, + }, + }, { + 'url': 'http://france3-regions.francetvinfo.fr/bretagne/cotes-d-armor/thalassa-echappee-breizh-ce-venredi-dans-les-cotes-d-armor-954961.html', + 'md5': 'f485bda6e185e7d15dbc69b72bae993e', + 'info_dict': { + 'id': 'NI_657393', + 'ext': 'mp4', + 'title': 'Olivier Monthus, réalisateur de "Bretagne, le choix de l’Armor"', + 'description': 'md5:a3264114c9d29aeca11ced113c37b16c', + 'thumbnail': 're:^https?://.*\.jpe?g$', + 'timestamp': 1458300695, + 'upload_date': '20160318', + }, + 'params': { + 'skip_download': True, + }, }] def _real_extract(self, url): @@ -172,7 +200,9 @@ class FranceTvInfoIE(FranceTVBaseInfoExtractor): return self.url_result(dmcloud_url, 'DailymotionCloud') video_id, catalogue = self._search_regex( - r'id-video=([^@]+@[^"]+)', webpage, 'video id').split('@') + (r'id-video=([^@]+@[^"]+)', + r'<a[^>]+href="(?:https?:)?//videos\.francetv\.fr/video/([^@]+@[^"]+)"'), + webpage, 'video id').split('@') return self._extract_video(video_id, catalogue) diff --git a/youtube_dl/extractor/freespeech.py b/youtube_dl/extractor/freespeech.py 
index c210177f7..1477708bb 100644 --- a/youtube_dl/extractor/freespeech.py +++ b/youtube_dl/extractor/freespeech.py @@ -14,7 +14,7 @@ class FreespeechIE(InfoExtractor): 'url': 'https://www.freespeech.org/video/obama-romney-campaign-colorado-ahead-debate-0', 'info_dict': { 'id': 'poKsVCZ64uU', - 'ext': 'mp4', + 'ext': 'webm', 'title': 'Obama, Romney Campaign in Colorado Ahead of Debate', 'description': 'Obama, Romney Campaign in Colorado Ahead of Debate', 'uploader': 'freespeechtv', diff --git a/youtube_dl/extractor/freevideo.py b/youtube_dl/extractor/freevideo.py index c7bec027b..cd8423a6f 100644 --- a/youtube_dl/extractor/freevideo.py +++ b/youtube_dl/extractor/freevideo.py @@ -5,7 +5,7 @@ from ..utils import ExtractorError class FreeVideoIE(InfoExtractor): - _VALID_URL = r'^http://www.freevideo.cz/vase-videa/(?P<id>[^.]+)\.html(?:$|[?#])' + _VALID_URL = r'^https?://www.freevideo.cz/vase-videa/(?P<id>[^.]+)\.html(?:$|[?#])' _TEST = { 'url': 'http://www.freevideo.cz/vase-videa/vysukany-zadecek-22033.html', diff --git a/youtube_dl/extractor/gameinformer.py b/youtube_dl/extractor/gameinformer.py index 25870c131..a66e309de 100644 --- a/youtube_dl/extractor/gameinformer.py +++ b/youtube_dl/extractor/gameinformer.py @@ -2,42 +2,27 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import compat_str -from ..utils import int_or_none class GameInformerIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?gameinformer\.com/(?:[^/]+/)*(?P<id>.+)\.aspx' _TEST = { 'url': 'http://www.gameinformer.com/b/features/archive/2015/09/26/replay-animal-crossing.aspx', + 'md5': '292f26da1ab4beb4c9099f1304d2b071', 'info_dict': { 'id': '4515472681001', - 'ext': 'm3u8', + 'ext': 'mp4', 'title': 'Replay - Animal Crossing', 'description': 'md5:2e211891b215c85d061adc7a4dd2d930', - 'timestamp': 1443457610706, - }, - 'params': { - # m3u8 download - 'skip_download': True, + 'timestamp': 1443457610, + 'upload_date': '20150928', + 'uploader_id': 
'694940074001', }, } + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/694940074001/default_default/index.html?videoId=%s' def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - - bc_api_url = self._search_regex(r"getVideo\('([^']+)'", webpage, 'brightcove api url') - json_data = self._download_json( - bc_api_url + '&video_fields=id,name,shortDescription,publishedDate,videoStillURL,length,IOSRenditions', - display_id) - - return { - 'id': compat_str(json_data['id']), - 'display_id': display_id, - 'url': json_data['IOSRenditions'][0]['url'], - 'title': json_data['name'], - 'description': json_data.get('shortDescription'), - 'timestamp': int_or_none(json_data.get('publishedDate')), - 'duration': int_or_none(json_data.get('length')), - } + brightcove_id = self._search_regex(r"getVideo\('[^']+video_id=(\d+)", webpage, 'brightcove id') + return self.url_result(self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, 'BrightcoveNew', brightcove_id) diff --git a/youtube_dl/extractor/gamekings.py b/youtube_dl/extractor/gamekings.py index f6b9046f9..cbcddcb7c 100644 --- a/youtube_dl/extractor/gamekings.py +++ b/youtube_dl/extractor/gamekings.py @@ -10,7 +10,7 @@ from .youtube import YoutubeIE class GamekingsIE(InfoExtractor): - _VALID_URL = r'http://www\.gamekings\.nl/(?:videos|nieuws)/(?P<id>[^/]+)' + _VALID_URL = r'https?://www\.gamekings\.nl/(?:videos|nieuws)/(?P<id>[^/]+)' _TESTS = [{ # YouTube embed video 'url': 'http://www.gamekings.nl/videos/phoenix-wright-ace-attorney-dual-destinies-review/', diff --git a/youtube_dl/extractor/gamespot.py b/youtube_dl/extractor/gamespot.py index b3f1bafcc..4ffdd7515 100644 --- a/youtube_dl/extractor/gamespot.py +++ b/youtube_dl/extractor/gamespot.py @@ -14,7 +14,7 @@ from ..utils import ( class GameSpotIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?gamespot\.com/.*-(?P<id>\d+)/?' + _VALID_URL = r'https?://(?:www\.)?gamespot\.com/.*-(?P<id>\d+)/?' 
_TESTS = [{ 'url': 'http://www.gamespot.com/videos/arma-3-community-guide-sitrep-i/2300-6410818/', 'md5': 'b2a30deaa8654fcccd43713a6b6a4825', diff --git a/youtube_dl/extractor/gamestar.py b/youtube_dl/extractor/gamestar.py index 590ccf526..69058a583 100644 --- a/youtube_dl/extractor/gamestar.py +++ b/youtube_dl/extractor/gamestar.py @@ -13,7 +13,7 @@ from ..utils import ( class GameStarIE(InfoExtractor): - _VALID_URL = r'http://www\.gamestar\.de/videos/.*,(?P<id>[0-9]+)\.html' + _VALID_URL = r'https?://www\.gamestar\.de/videos/.*,(?P<id>[0-9]+)\.html' _TEST = { 'url': 'http://www.gamestar.de/videos/trailer,3/hobbit-3-die-schlacht-der-fuenf-heere,76110.html', 'md5': '96974ecbb7fd8d0d20fca5a00810cea7', diff --git a/youtube_dl/extractor/gametrailers.py b/youtube_dl/extractor/gametrailers.py index c3f031d9c..1e7948ab8 100644 --- a/youtube_dl/extractor/gametrailers.py +++ b/youtube_dl/extractor/gametrailers.py @@ -9,7 +9,7 @@ from ..utils import ( class GametrailersIE(InfoExtractor): - _VALID_URL = r'http://www\.gametrailers\.com/videos/view/[^/]+/(?P<id>.+)' + _VALID_URL = r'https?://www\.gametrailers\.com/videos/view/[^/]+/(?P<id>.+)' _TEST = { 'url': 'http://www.gametrailers.com/videos/view/gametrailers-com/116437-Just-Cause-3-Review', diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index ca745ae41..12f2309fc 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -59,6 +59,7 @@ from .videomore import VideomoreIE from .googledrive import GoogleDriveIE from .jwplatform import JWPlatformIE from .digiteka import DigitekaIE +from .instagram import InstagramIE class GenericIE(InfoExtractor): @@ -239,6 +240,35 @@ class GenericIE(InfoExtractor): 'format': 'bestvideo', }, }, + # m3u8 served with Content-Type: audio/x-mpegURL; charset=utf-8 + { + 'url': 
'http://once.unicornmedia.com/now/master/playlist/bb0b18ba-64f5-4b1b-a29f-0ac252f06b68/77a785f3-5188-4806-b788-0893a61634ed/93677179-2d99-4ef4-9e17-fe70d49abfbf/content.m3u8', + 'info_dict': { + 'id': 'content', + 'ext': 'mp4', + 'title': 'content', + 'formats': 'mincount:8', + }, + 'params': { + # m3u8 downloads + 'skip_download': True, + } + }, + # m3u8 served with Content-Type: text/plain + { + 'url': 'http://www.nacentapps.com/m3u8/index.m3u8', + 'info_dict': { + 'id': 'index', + 'ext': 'mp4', + 'title': 'index', + 'upload_date': '20140720', + 'formats': 'mincount:11', + }, + 'params': { + # m3u8 downloads + 'skip_download': True, + } + }, # google redirect { 'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE', @@ -1242,28 +1272,30 @@ class GenericIE(InfoExtractor): full_response = self._request_webpage(request, video_id) head_response = full_response + info_dict = { + 'id': video_id, + 'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]), + 'upload_date': unified_strdate(head_response.headers.get('Last-Modified')) + } + # Check for direct link to a video - content_type = head_response.headers.get('Content-Type', '') - m = re.match(r'^(?P<type>audio|video|application(?=/(?:ogg$|(?:vnd\.apple\.|x-)?mpegurl)))/(?P<format_id>.+)$', content_type) + content_type = head_response.headers.get('Content-Type', '').lower() + m = re.match(r'^(?P<type>audio|video|application(?=/(?:ogg$|(?:vnd\.apple\.|x-)?mpegurl)))/(?P<format_id>[^;\s]+)', content_type) if m: - upload_date = unified_strdate( - head_response.headers.get('Last-Modified')) - formats = [] - if m.group('format_id').endswith('mpegurl'): + format_id = m.group('format_id') + if format_id.endswith('mpegurl'): formats = self._extract_m3u8_formats(url, video_id, 'mp4') + elif format_id == 'f4m': + formats = 
self._extract_f4m_formats(url, video_id) else: formats = [{ 'format_id': m.group('format_id'), 'url': url, 'vcodec': 'none' if m.group('type') == 'audio' else None }] - return { - 'id': video_id, - 'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]), - 'direct': True, - 'formats': formats, - 'upload_date': upload_date, - } + info_dict['direct'] = True + info_dict['formats'] = formats + return info_dict if not self._downloader.params.get('test', False) and not is_intentional: force = self._downloader.params.get('force_generic_extractor', False) @@ -1283,21 +1315,23 @@ class GenericIE(InfoExtractor): request.add_header('Accept-Encoding', '*') full_response = self._request_webpage(request, video_id) + first_bytes = full_response.read(512) + + # Is it an M3U playlist? + if first_bytes.startswith(b'#EXTM3U'): + info_dict['formats'] = self._extract_m3u8_formats(url, video_id, 'mp4') + return info_dict + # Maybe it's a direct link to a video? # Be careful not to download the whole thing! 
- first_bytes = full_response.read(512) if not is_html(first_bytes): self._downloader.report_warning( 'URL could be a direct video link, returning it as such.') - upload_date = unified_strdate( - head_response.headers.get('Last-Modified')) - return { - 'id': video_id, - 'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]), + info_dict.update({ 'direct': True, 'url': url, - 'upload_date': upload_date, - } + }) + return info_dict webpage = self._webpage_read_content( full_response, url, video_id, prefix=first_bytes) @@ -1314,12 +1348,12 @@ class GenericIE(InfoExtractor): elif doc.tag == '{http://xspf.org/ns/0/}playlist': return self.playlist_result(self._parse_xspf(doc, video_id), video_id) elif re.match(r'(?i)^(?:{[^}]+})?MPD$', doc.tag): - return { - 'id': video_id, - 'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]), - 'formats': self._parse_mpd_formats( - doc, video_id, mpd_base_url=url.rpartition('/')[0]), - } + info_dict['formats'] = self._parse_mpd_formats( + doc, video_id, mpd_base_url=url.rpartition('/')[0]) + return info_dict + elif re.match(r'^{http://ns\.adobe\.com/f4m/[12]\.0}manifest$', doc.tag): + info_dict['formats'] = self._parse_f4m_formats(doc, url, video_id) + return info_dict except compat_xml_parse_error: pass @@ -1876,6 +1910,19 @@ class GenericIE(InfoExtractor): self._proto_relative_url(unescapeHTML(mobj.group(1))), 'AdobeTVVideo') + # Look for Vine embeds + mobj = re.search( + r'<iframe[^>]+src=[\'"]((?:https?:)?//(?:www\.)?vine\.co/v/[^/]+/embed/(?:simple|postcard))', + webpage) + if mobj is not None: + return self.url_result( + self._proto_relative_url(unescapeHTML(mobj.group(1))), 'Vine') + + # Look for Instagram embeds + instagram_embed_url = InstagramIE._extract_embed_url(webpage) + if instagram_embed_url is not None: + return self.url_result(instagram_embed_url, InstagramIE.ie_key()) + def check_video(vurl): if YoutubeIE.suitable(vurl): return True @@ -1985,6 +2032,8 @@ class 
GenericIE(InfoExtractor): entry_info_dict['formats'] = self._extract_m3u8_formats(video_url, video_id, ext='mp4') elif ext == 'mpd': entry_info_dict['formats'] = self._extract_mpd_formats(video_url, video_id) + elif ext == 'f4m': + entry_info_dict['formats'] = self._extract_f4m_formats(video_url, video_id) else: entry_info_dict['url'] = video_url diff --git a/youtube_dl/extractor/googledrive.py b/youtube_dl/extractor/googledrive.py index 37be34091..766fc26d0 100644 --- a/youtube_dl/extractor/googledrive.py +++ b/youtube_dl/extractor/googledrive.py @@ -10,8 +10,8 @@ from ..utils import ( class GoogleDriveIE(InfoExtractor): - _VALID_URL = r'https?://(?:(?:docs|drive)\.google\.com/(?:uc\?.*?id=|file/d/)|video\.google\.com/get_player\?.*?docid=)(?P<id>[a-zA-Z0-9_-]{28})' - _TEST = { + _VALID_URL = r'https?://(?:(?:docs|drive)\.google\.com/(?:uc\?.*?id=|file/d/)|video\.google\.com/get_player\?.*?docid=)(?P<id>[a-zA-Z0-9_-]{28,})' + _TESTS = [{ 'url': 'https://drive.google.com/file/d/0ByeS4oOUV-49Zzh4R1J6R09zazQ/edit?pli=1', 'md5': '881f7700aec4f538571fa1e0eed4a7b6', 'info_dict': { @@ -20,7 +20,11 @@ class GoogleDriveIE(InfoExtractor): 'title': 'Big Buck Bunny.mp4', 'duration': 46, } - } + }, { + # video id is longer than 28 characters + 'url': 'https://drive.google.com/file/d/1ENcQ_jeCuj7y19s66_Ou9dRP4GKGsodiDQ/edit', + 'only_matching': True, + }] _FORMATS_EXT = { '5': 'flv', '6': 'flv', @@ -43,7 +47,7 @@ class GoogleDriveIE(InfoExtractor): @staticmethod def _extract_url(webpage): mobj = re.search( - r'<iframe[^>]+src="https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P<id>[a-zA-Z0-9_-]{28})', + r'<iframe[^>]+src="https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P<id>[a-zA-Z0-9_-]{28,})', webpage) if mobj: return 'https://drive.google.com/file/d/%s' % mobj.group('id') diff --git a/youtube_dl/extractor/hbo.py b/youtube_dl/extractor/hbo.py new file mode 100644 index 000000000..dad0f3994 --- 
/dev/null +++ b/youtube_dl/extractor/hbo.py @@ -0,0 +1,122 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + xpath_text, + xpath_element, + int_or_none, + parse_duration, +) + + +class HBOIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?hbo\.com/video/video\.html\?.*vid=(?P<id>[0-9]+)' + _TEST = { + 'url': 'http://www.hbo.com/video/video.html?autoplay=true&g=u&vid=1437839', + 'md5': '1c33253f0c7782142c993c0ba62a8753', + 'info_dict': { + 'id': '1437839', + 'ext': 'mp4', + 'title': 'Ep. 64 Clip: Encryption', + } + } + _FORMATS_INFO = { + '1920': { + 'width': 1280, + 'height': 720, + }, + '640': { + 'width': 768, + 'height': 432, + }, + 'highwifi': { + 'width': 640, + 'height': 360, + }, + 'high3g': { + 'width': 640, + 'height': 360, + }, + 'medwifi': { + 'width': 400, + 'height': 224, + }, + 'med3g': { + 'width': 400, + 'height': 224, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + video_data = self._download_xml( + 'http://render.lv3.hbo.com/data/content/global/videos/data/%s.xml' % video_id, video_id) + title = xpath_text(video_data, 'title', 'title', True) + + formats = [] + for source in xpath_element(video_data, 'videos', 'sources', True): + if source.tag == 'size': + path = xpath_text(source, './/path') + if not path: + continue + width = source.attrib.get('width') + format_info = self._FORMATS_INFO.get(width, {}) + height = format_info.get('height') + fmt = { + 'url': path, + 'format_id': 'http%s' % ('-%dp' % height if height else ''), + 'width': format_info.get('width'), + 'height': height, + } + rtmp = re.search(r'^(?P<url>rtmpe?://[^/]+/(?P<app>.+))/(?P<playpath>mp4:.+)$', path) + if rtmp: + fmt.update({ + 'url': rtmp.group('url'), + 'play_path': rtmp.group('playpath'), + 'app': rtmp.group('app'), + 'ext': 'flv', + 'format_id': fmt['format_id'].replace('http', 'rtmp'), + }) + formats.append(fmt) + else: + video_url = source.text + if not 
video_url: + continue + if source.tag == 'tarball': + formats.extend(self._extract_m3u8_formats( + video_url.replace('.tar', '/base_index_w8.m3u8'), + video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) + else: + format_info = self._FORMATS_INFO.get(source.tag, {}) + formats.append({ + 'format_id': 'http-%s' % source.tag, + 'url': video_url, + 'width': format_info.get('width'), + 'height': format_info.get('height'), + }) + self._sort_formats(formats, ('width', 'height', 'tbr', 'format_id')) + + thumbnails = [] + card_sizes = xpath_element(video_data, 'titleCardSizes') + if card_sizes is not None: + for size in card_sizes: + path = xpath_text(size, 'path') + if not path: + continue + width = int_or_none(size.get('width')) + thumbnails.append({ + 'id': width, + 'url': path, + 'width': width, + }) + + return { + 'id': video_id, + 'title': title, + 'duration': parse_duration(xpath_element(video_data, 'duration/tv14')), + 'formats': formats, + 'thumbnails': thumbnails, + } diff --git a/youtube_dl/extractor/hotnewhiphop.py b/youtube_dl/extractor/hotnewhiphop.py index 31e219945..efc3e8429 100644 --- a/youtube_dl/extractor/hotnewhiphop.py +++ b/youtube_dl/extractor/hotnewhiphop.py @@ -12,7 +12,7 @@ from ..utils import ( class HotNewHipHopIE(InfoExtractor): - _VALID_URL = r'http://www\.hotnewhiphop\.com/.*\.(?P<id>.*)\.html' + _VALID_URL = r'https?://www\.hotnewhiphop\.com/.*\.(?P<id>.*)\.html' _TEST = { 'url': 'http://www.hotnewhiphop.com/freddie-gibbs-lay-it-down-song.1435540.html', 'md5': '2c2cd2f76ef11a9b3b581e8b232f3d96', diff --git a/youtube_dl/extractor/hypem.py b/youtube_dl/extractor/hypem.py index b3706fe6d..e0ab31802 100644 --- a/youtube_dl/extractor/hypem.py +++ b/youtube_dl/extractor/hypem.py @@ -12,7 +12,7 @@ from ..utils import ( class HypemIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?hypem\.com/track/(?P<id>[^/]+)/' + _VALID_URL = r'https?://(?:www\.)?hypem\.com/track/(?P<id>[^/]+)/' _TEST = { 'url': 
'http://hypem.com/track/1v6ga/BODYWORK+-+TAME', 'md5': 'b9cc91b5af8995e9f0c1cee04c575828', diff --git a/youtube_dl/extractor/imdb.py b/youtube_dl/extractor/imdb.py index b61b2dc4e..8bed8ccd0 100644 --- a/youtube_dl/extractor/imdb.py +++ b/youtube_dl/extractor/imdb.py @@ -12,7 +12,7 @@ from ..utils import ( class ImdbIE(InfoExtractor): IE_NAME = 'imdb' IE_DESC = 'Internet Movie Database trailers' - _VALID_URL = r'http://(?:www|m)\.imdb\.com/video/imdb/vi(?P<id>\d+)' + _VALID_URL = r'https?://(?:www|m)\.imdb\.com/video/imdb/vi(?P<id>\d+)' _TEST = { 'url': 'http://www.imdb.com/video/imdb/vi2524815897', @@ -70,7 +70,7 @@ class ImdbIE(InfoExtractor): class ImdbListIE(InfoExtractor): IE_NAME = 'imdb:list' IE_DESC = 'Internet Movie Database lists' - _VALID_URL = r'http://www\.imdb\.com/list/(?P<id>[\da-zA-Z_-]{11})' + _VALID_URL = r'https?://www\.imdb\.com/list/(?P<id>[\da-zA-Z_-]{11})' _TEST = { 'url': 'http://www.imdb.com/list/JFs9NWw6XI0', 'info_dict': { diff --git a/youtube_dl/extractor/indavideo.py b/youtube_dl/extractor/indavideo.py index 12fb5e8e1..9622f198a 100644 --- a/youtube_dl/extractor/indavideo.py +++ b/youtube_dl/extractor/indavideo.py @@ -73,7 +73,7 @@ class IndavideoEmbedIE(InfoExtractor): 'url': self._proto_relative_url(thumbnail) } for thumbnail in video.get('thumbnails', [])] - tags = [tag['title'] for tag in video.get('tags', [])] + tags = [tag['title'] for tag in video.get('tags') or []] return { 'id': video.get('id') or video_id, diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py index ed3e07118..4e62098b0 100644 --- a/youtube_dl/extractor/instagram.py +++ b/youtube_dl/extractor/instagram.py @@ -4,8 +4,10 @@ import re from .common import InfoExtractor from ..utils import ( + get_element_by_attribute, int_or_none, limit_length, + lowercase_escape, ) @@ -38,6 +40,18 @@ class InstagramIE(InfoExtractor): 'only_matching': True, }] + @staticmethod + def _extract_embed_url(webpage): + blockquote_el = 
get_element_by_attribute( + 'class', 'instagram-media', webpage) + if blockquote_el is None: + return + + mobj = re.search( + r'<a[^>]+href=([\'"])(?P<link>[^\'"]+)\1', blockquote_el) + if mobj: + return mobj.group('link') + def _real_extract(self, url): video_id = self._match_id(url) @@ -46,6 +60,8 @@ class InstagramIE(InfoExtractor): webpage, 'uploader id', fatal=False) desc = self._search_regex( r'"caption":"(.+?)"', webpage, 'description', default=None) + if desc is not None: + desc = lowercase_escape(desc) return { 'id': video_id, diff --git a/youtube_dl/extractor/iprima.py b/youtube_dl/extractor/iprima.py index 61a0de472..788bbe0d5 100644 --- a/youtube_dl/extractor/iprima.py +++ b/youtube_dl/extractor/iprima.py @@ -1,4 +1,4 @@ -# -*- coding: utf-8 -*- +# coding: utf-8 from __future__ import unicode_literals import re @@ -6,6 +6,8 @@ import time from .common import InfoExtractor from ..utils import ( + determine_ext, + js_to_json, sanitized_Request, ) @@ -30,8 +32,7 @@ class IPrimaIE(InfoExtractor): }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) @@ -43,9 +44,42 @@ class IPrimaIE(InfoExtractor): req.add_header('Referer', url) playerpage = self._download_webpage(req, video_id, note='Downloading player') - m3u8_url = self._search_regex(r"'src': '([^']+\.m3u8)'", playerpage, 'm3u8 url') + formats = [] - formats = self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4') + def extract_formats(format_url, format_key=None, lang=None): + ext = determine_ext(format_url) + new_formats = [] + if format_key == 'hls' or ext == 'm3u8': + new_formats = self._extract_m3u8_formats( + format_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False) + elif format_key == 'dash' or ext == 'mpd': + return + new_formats = self._extract_mpd_formats( + format_url, video_id, mpd_id='dash', fatal=False) + if lang: + for f in 
new_formats: + if not f.get('language'): + f['language'] = lang + formats.extend(new_formats) + + options = self._parse_json( + self._search_regex( + r'(?s)var\s+playerOptions\s*=\s*({.+?});', + playerpage, 'player options', default='{}'), + video_id, transform_source=js_to_json, fatal=False) + if options: + for key, tracks in options.get('tracks', {}).items(): + if not isinstance(tracks, list): + continue + for track in tracks: + src = track.get('src') + if src: + extract_formats(src, key.lower(), track.get('lang')) + + if not formats: + for _, src in re.findall(r'src["\']\s*:\s*(["\'])(.+?)\1', playerpage): + extract_formats(src) self._sort_formats(formats) diff --git a/youtube_dl/extractor/iqiyi.py b/youtube_dl/extractor/iqiyi.py index d3bee3a19..ffcea30ad 100644 --- a/youtube_dl/extractor/iqiyi.py +++ b/youtube_dl/extractor/iqiyi.py @@ -165,7 +165,7 @@ class IqiyiIE(InfoExtractor): IE_NAME = 'iqiyi' IE_DESC = '爱奇艺' - _VALID_URL = r'http://(?:[^.]+\.)?iqiyi\.com/.+\.html' + _VALID_URL = r'https?://(?:[^.]+\.)?iqiyi\.com/.+\.html' _NETRC_MACHINE = 'iqiyi' @@ -501,7 +501,7 @@ class IqiyiIE(InfoExtractor): def get_enc_key(self, video_id): # TODO: automatic key extraction # last update at 2016-01-22 for Zombie::bite - enc_key = '6ab6d0280511493ba85594779759d4ed' + enc_key = '4a1caba4b4465345366f28da7c117d20' return enc_key def _extract_playlist(self, webpage): diff --git a/youtube_dl/extractor/jadorecettepub.py b/youtube_dl/extractor/jadorecettepub.py index 063e86de4..158c09a33 100644 --- a/youtube_dl/extractor/jadorecettepub.py +++ b/youtube_dl/extractor/jadorecettepub.py @@ -9,7 +9,7 @@ from .youtube import YoutubeIE class JadoreCettePubIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?jadorecettepub\.com/[0-9]{4}/[0-9]{2}/(?P<id>.*?)\.html' + _VALID_URL = r'https?://(?:www\.)?jadorecettepub\.com/[0-9]{4}/[0-9]{2}/(?P<id>.*?)\.html' _TEST = { 'url': 'http://www.jadorecettepub.com/2010/12/star-wars-massacre-par-les-japonais.html', diff --git 
a/youtube_dl/extractor/jeuxvideo.py b/youtube_dl/extractor/jeuxvideo.py index eef7daa29..1a4227f6b 100644 --- a/youtube_dl/extractor/jeuxvideo.py +++ b/youtube_dl/extractor/jeuxvideo.py @@ -8,7 +8,7 @@ from .common import InfoExtractor class JeuxVideoIE(InfoExtractor): - _VALID_URL = r'http://.*?\.jeuxvideo\.com/.*/(.*?)\.htm' + _VALID_URL = r'https?://.*?\.jeuxvideo\.com/.*/(.*?)\.htm' _TESTS = [{ 'url': 'http://www.jeuxvideo.com/reportages-videos-jeux/0004/00046170/tearaway-playstation-vita-gc-2013-tearaway-nous-presente-ses-papiers-d-identite-00115182.htm', @@ -30,7 +30,7 @@ class JeuxVideoIE(InfoExtractor): webpage = self._download_webpage(url, title) title = self._html_search_meta('name', webpage) or self._og_search_title(webpage) config_url = self._html_search_regex( - r'data-src="(/contenu/medias/video.php.*?)"', + r'data-src(?:set-video)?="(/contenu/medias/video.php.*?)"', webpage, 'config URL') config_url = 'http://www.jeuxvideo.com' + config_url diff --git a/youtube_dl/extractor/kaltura.py b/youtube_dl/extractor/kaltura.py index ccbc39c66..44d7c84a1 100644 --- a/youtube_dl/extractor/kaltura.py +++ b/youtube_dl/extractor/kaltura.py @@ -8,6 +8,7 @@ from .common import InfoExtractor from ..compat import ( compat_urllib_parse, compat_urlparse, + compat_parse_qs, ) from ..utils import ( clean_html, @@ -20,21 +21,17 @@ from ..utils import ( class KalturaIE(InfoExtractor): _VALID_URL = r'''(?x) (?: - kaltura:(?P<partner_id_s>\d+):(?P<id_s>[0-9a-z_]+)| + kaltura:(?P<partner_id>\d+):(?P<id>[0-9a-z_]+)| https?:// (:?(?:www|cdnapi(?:sec)?)\.)?kaltura\.com/ (?: (?: # flash player - index\.php/kwidget/ - (?:[^/]+/)*?wid/_(?P<partner_id>\d+)/ - (?:[^/]+/)*?entry_id/(?P<id>[0-9a-z_]+)| + index\.php/kwidget| # html5 player - html5/html5lib/ - (?:[^/]+/)*?entry_id/(?P<id_html5>[0-9a-z_]+) - .*\?.*\bwid=_(?P<partner_id_html5>\d+) + html5/html5lib/[^/]+/mwEmbedFrame\.php ) - ) + )(?:/(?P<path>[^?]+))?(?:\?(?P<query>.*))? 
) ''' _API_BASE = 'http://cdnapi.kaltura.com/api_v3/index.php?' @@ -127,10 +124,41 @@ class KalturaIE(InfoExtractor): url, smuggled_data = unsmuggle_url(url, {}) mobj = re.match(self._VALID_URL, url) - partner_id = mobj.group('partner_id_s') or mobj.group('partner_id') or mobj.group('partner_id_html5') - entry_id = mobj.group('id_s') or mobj.group('id') or mobj.group('id_html5') - - info, flavor_assets = self._get_video_info(entry_id, partner_id) + partner_id, entry_id = mobj.group('partner_id', 'id') + ks = None + if partner_id and entry_id: + info, flavor_assets = self._get_video_info(entry_id, partner_id) + else: + path, query = mobj.group('path', 'query') + if not path and not query: + raise ExtractorError('Invalid URL', expected=True) + params = {} + if query: + params = compat_parse_qs(query) + if path: + splitted_path = path.split('/') + params.update(dict((zip(splitted_path[::2], [[v] for v in splitted_path[1::2]])))) + if 'wid' in params: + partner_id = params['wid'][0][1:] + elif 'p' in params: + partner_id = params['p'][0] + else: + raise ExtractorError('Invalid URL', expected=True) + if 'entry_id' in params: + entry_id = params['entry_id'][0] + info, flavor_assets = self._get_video_info(entry_id, partner_id) + elif 'uiconf_id' in params and 'flashvars[referenceId]' in params: + reference_id = params['flashvars[referenceId]'][0] + webpage = self._download_webpage(url, reference_id) + entry_data = self._parse_json(self._search_regex( + r'window\.kalturaIframePackageData\s*=\s*({.*});', + webpage, 'kalturaIframePackageData'), + reference_id)['entryResult'] + info, flavor_assets = entry_data['meta'], entry_data['contextData']['flavorAssets'] + entry_id = info['id'] + else: + raise ExtractorError('Invalid URL', expected=True) + ks = params.get('flashvars[ks]', [None])[0] source_url = smuggled_data.get('source_url') if source_url: @@ -140,14 +168,19 @@ class KalturaIE(InfoExtractor): else: referrer = None + def sign_url(unsigned_url): + if ks: + unsigned_url 
+= '/ks/%s' % ks + if referrer: + unsigned_url += '?referrer=%s' % referrer + return unsigned_url + formats = [] for f in flavor_assets: # Continue if asset is not ready if f['status'] != 2: continue - video_url = '%s/flavorId/%s' % (info['dataUrl'], f['id']) - if referrer: - video_url += '?referrer=%s' % referrer + video_url = sign_url('%s/flavorId/%s' % (info['dataUrl'], f['id'])) formats.append({ 'format_id': '%(fileExt)s-%(bitrate)s' % f, 'ext': f.get('fileExt'), @@ -160,9 +193,7 @@ class KalturaIE(InfoExtractor): 'width': int_or_none(f.get('width')), 'url': video_url, }) - m3u8_url = info['dataUrl'].replace('format/url', 'format/applehttp') - if referrer: - m3u8_url += '?referrer=%s' % referrer + m3u8_url = sign_url(info['dataUrl'].replace('format/url', 'format/applehttp')) formats.extend(self._extract_m3u8_formats( m3u8_url, entry_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) diff --git a/youtube_dl/extractor/karaoketv.py b/youtube_dl/extractor/karaoketv.py index 06daf5a89..b4c30b7f3 100644 --- a/youtube_dl/extractor/karaoketv.py +++ b/youtube_dl/extractor/karaoketv.py @@ -9,7 +9,7 @@ from ..utils import ( class KaraoketvIE(InfoExtractor): - _VALID_URL = r'http://karaoketv\.co\.il/\?container=songs&id=(?P<id>[0-9]+)' + _VALID_URL = r'https?://karaoketv\.co\.il/\?container=songs&id=(?P<id>[0-9]+)' _TEST = { 'url': 'http://karaoketv.co.il/?container=songs&id=171568', 'info_dict': { diff --git a/youtube_dl/extractor/karrierevideos.py b/youtube_dl/extractor/karrierevideos.py index bed94bc93..2cb04e533 100644 --- a/youtube_dl/extractor/karrierevideos.py +++ b/youtube_dl/extractor/karrierevideos.py @@ -12,7 +12,7 @@ from ..utils import ( class KarriereVideosIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?karrierevideos\.at(?:/[^/]+)+/(?P<id>[^/]+)' + _VALID_URL = r'https?://(?:www\.)?karrierevideos\.at(?:/[^/]+)+/(?P<id>[^/]+)' _TESTS = [{ 'url': 'http://www.karrierevideos.at/berufsvideos/mittlere-hoehere-schulen/altenpflegerin', 'info_dict': { diff 
--git a/youtube_dl/extractor/khanacademy.py b/youtube_dl/extractor/khanacademy.py index 08a671fa8..61739efa7 100644 --- a/youtube_dl/extractor/khanacademy.py +++ b/youtube_dl/extractor/khanacademy.py @@ -14,10 +14,10 @@ class KhanAcademyIE(InfoExtractor): _TESTS = [{ 'url': 'http://www.khanacademy.org/video/one-time-pad', - 'md5': '7021db7f2d47d4fff89b13177cb1e8f4', + 'md5': '7b391cce85e758fb94f763ddc1bbb979', 'info_dict': { 'id': 'one-time-pad', - 'ext': 'mp4', + 'ext': 'webm', 'title': 'The one-time pad', 'description': 'The perfect cipher', 'duration': 176, diff --git a/youtube_dl/extractor/kontrtube.py b/youtube_dl/extractor/kontrtube.py index a59c529f4..704bd7b34 100644 --- a/youtube_dl/extractor/kontrtube.py +++ b/youtube_dl/extractor/kontrtube.py @@ -13,7 +13,7 @@ from ..utils import ( class KontrTubeIE(InfoExtractor): IE_NAME = 'kontrtube' IE_DESC = 'KontrTube.ru - Труба зовёт' - _VALID_URL = r'http://(?:www\.)?kontrtube\.ru/videos/(?P<id>\d+)/(?P<display_id>[^/]+)/' + _VALID_URL = r'https?://(?:www\.)?kontrtube\.ru/videos/(?P<id>\d+)/(?P<display_id>[^/]+)/' _TEST = { 'url': 'http://www.kontrtube.ru/videos/2678/nad-olimpiyskoy-derevney-v-sochi-podnyat-rossiyskiy-flag/', diff --git a/youtube_dl/extractor/ku6.py b/youtube_dl/extractor/ku6.py index a602980a1..a574408e5 100644 --- a/youtube_dl/extractor/ku6.py +++ b/youtube_dl/extractor/ku6.py @@ -4,7 +4,7 @@ from .common import InfoExtractor class Ku6IE(InfoExtractor): - _VALID_URL = r'http://v\.ku6\.com/show/(?P<id>[a-zA-Z0-9\-\_]+)(?:\.)*html' + _VALID_URL = r'https?://v\.ku6\.com/show/(?P<id>[a-zA-Z0-9\-\_]+)(?:\.)*html' _TEST = { 'url': 'http://v.ku6.com/show/JG-8yS14xzBr4bCn1pu0xw...html', 'md5': '01203549b9efbb45f4b87d55bdea1ed1', diff --git a/youtube_dl/extractor/kusi.py b/youtube_dl/extractor/kusi.py new file mode 100644 index 000000000..12cc56e44 --- /dev/null +++ b/youtube_dl/extractor/kusi.py @@ -0,0 +1,99 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import random +import re + 
+from .common import InfoExtractor +from ..compat import compat_urllib_parse_unquote_plus +from ..utils import ( + int_or_none, + float_or_none, + timeconvert, + update_url_query, + xpath_text, +) + + +class KUSIIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?kusi\.com/(?P<path>story/.+|video\?clipId=(?P<clipId>\d+))' + _TESTS = [{ + 'url': 'http://www.kusi.com/story/31183873/turko-files-case-closed-put-on-hold', + 'md5': 'f926e7684294cf8cb7bdf8858e1b3988', + 'info_dict': { + 'id': '12203019', + 'ext': 'mp4', + 'title': 'Turko Files: Case Closed! & Put On Hold!', + 'duration': 231.0, + 'upload_date': '20160210', + 'timestamp': 1455087571, + 'thumbnail': 're:^https?://.*\.jpg$' + }, + }, { + 'url': 'http://kusi.com/video?clipId=12203019', + 'info_dict': { + 'id': '12203019', + 'ext': 'mp4', + 'title': 'Turko Files: Case Closed! & Put On Hold!', + 'duration': 231.0, + 'upload_date': '20160210', + 'timestamp': 1455087571, + 'thumbnail': 're:^https?://.*\.jpg$' + }, + 'params': { + 'skip_download': True, # Same as previous one + }, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + clip_id = mobj.group('clipId') + video_id = clip_id or mobj.group('path') + + webpage = self._download_webpage(url, video_id) + + if clip_id is None: + video_id = clip_id = self._html_search_regex( + r'"clipId"\s*,\s*"(\d+)"', webpage, 'clip id') + + affiliate_id = self._search_regex( + r'affiliateId\s*:\s*\'([^\']+)\'', webpage, 'affiliate id') + + # See __Packages/worldnow/model/GalleryModel.as of WNGallery.swf + xml_url = update_url_query('http://www.kusi.com/build.asp', { + 'buildtype': 'buildfeaturexmlrequest', + 'featureType': 'Clip', + 'featureid': clip_id, + 'affiliateno': affiliate_id, + 'clientgroupid': '1', + 'rnd': int(round(random.random() * 1000000)), + }) + + doc = self._download_xml(xml_url, video_id) + + video_title = xpath_text(doc, 'HEADLINE', fatal=True) + duration = float_or_none(xpath_text(doc, 'DURATION'), scale=1000) + description 
= xpath_text(doc, 'ABSTRACT') + thumbnail = xpath_text(doc, './THUMBNAILIMAGE/FILENAME') + createtion_time = timeconvert(xpath_text(doc, 'rfc822creationdate')) + + quality_options = doc.find('{http://search.yahoo.com/mrss/}group').findall('{http://search.yahoo.com/mrss/}content') + formats = [] + for quality in quality_options: + formats.append({ + 'url': compat_urllib_parse_unquote_plus(quality.attrib['url']), + 'height': int_or_none(quality.attrib.get('height')), + 'width': int_or_none(quality.attrib.get('width')), + 'vbr': float_or_none(quality.attrib.get('bitratebits'), scale=1000), + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': video_title, + 'description': description, + 'duration': duration, + 'formats': formats, + 'thumbnail': thumbnail, + 'timestamp': createtion_time, + } diff --git a/youtube_dl/extractor/kuwo.py b/youtube_dl/extractor/kuwo.py index 700e44b63..a586308b2 100644 --- a/youtube_dl/extractor/kuwo.py +++ b/youtube_dl/extractor/kuwo.py @@ -2,13 +2,13 @@ from __future__ import unicode_literals import re -import itertools from .common import InfoExtractor from ..utils import ( get_element_by_id, clean_html, ExtractorError, + InAdvancePagedList, remove_start, ) @@ -23,7 +23,7 @@ class KuwoBaseIE(InfoExtractor): {'format': 'aac', 'ext': 'aac', 'abr': 48, 'preference': 10} ] - def _get_formats(self, song_id): + def _get_formats(self, song_id, tolerate_ip_deny=False): formats = [] for file_format in self._FORMATS: song_url = self._download_webpage( @@ -32,7 +32,7 @@ class KuwoBaseIE(InfoExtractor): song_id, note='Download %s url info' % file_format['format'], ) - if song_url == 'IPDeny': + if song_url == 'IPDeny' and not tolerate_ip_deny: raise ExtractorError('This song is blocked in this region', expected=True) if song_url.startswith('http://') or song_url.startswith('https://'): @@ -43,14 +43,19 @@ class KuwoBaseIE(InfoExtractor): 'preference': file_format['preference'], 'abr': file_format.get('abr'), }) - 
self._sort_formats(formats) + + # XXX _sort_formats fails if there are not formats, while it's not the + # desired behavior if 'IPDeny' is ignored + # This check can be removed if https://github.com/rg3/youtube-dl/pull/8051 is merged + if not tolerate_ip_deny: + self._sort_formats(formats) return formats class KuwoIE(KuwoBaseIE): IE_NAME = 'kuwo:song' IE_DESC = '酷我音乐' - _VALID_URL = r'http://www\.kuwo\.cn/yinyue/(?P<id>\d+?)/' + _VALID_URL = r'https?://www\.kuwo\.cn/yinyue/(?P<id>\d+?)' _TESTS = [{ 'url': 'http://www.kuwo.cn/yinyue/635632/', 'info_dict': { @@ -75,6 +80,9 @@ class KuwoIE(KuwoBaseIE): 'params': { 'format': 'mp3-320' }, + }, { + 'url': 'http://www.kuwo.cn/yinyue/3197154?catalog=yueku2016', + 'only_matching': True, }] def _real_extract(self, url): @@ -126,7 +134,7 @@ class KuwoIE(KuwoBaseIE): class KuwoAlbumIE(InfoExtractor): IE_NAME = 'kuwo:album' IE_DESC = '酷我音乐 - 专辑' - _VALID_URL = r'http://www\.kuwo\.cn/album/(?P<id>\d+?)/' + _VALID_URL = r'https?://www\.kuwo\.cn/album/(?P<id>\d+?)/' _TEST = { 'url': 'http://www.kuwo.cn/album/502294/', 'info_dict': { @@ -162,13 +170,11 @@ class KuwoAlbumIE(InfoExtractor): class KuwoChartIE(InfoExtractor): IE_NAME = 'kuwo:chart' IE_DESC = '酷我音乐 - 排行榜' - _VALID_URL = r'http://yinyue\.kuwo\.cn/billboard_(?P<id>[^.]+).htm' + _VALID_URL = r'https?://yinyue\.kuwo\.cn/billboard_(?P<id>[^.]+).htm' _TEST = { 'url': 'http://yinyue.kuwo.cn/billboard_香港中文龙虎榜.htm', 'info_dict': { 'id': '香港中文龙虎榜', - 'title': '香港中文龙虎榜', - 'description': 're:\d{4}第\d{2}期', }, 'playlist_mincount': 10, } @@ -179,30 +185,24 @@ class KuwoChartIE(InfoExtractor): url, chart_id, note='Download chart info', errnote='Unable to get chart info') - chart_name = self._html_search_regex( - r'<h1[^>]+class="unDis">([^<]+)</h1>', webpage, 'chart name') - - chart_desc = self._html_search_regex( - r'<p[^>]+class="tabDef">(\d{4}第\d{2}期)</p>', webpage, 'chart desc') - entries = [ self.url_result(song_url, 'Kuwo') for song_url in re.findall( - 
r'<a[^>]+href="(http://www\.kuwo\.cn/yinyue/\d+)/"', webpage) + r'<a[^>]+href="(http://www\.kuwo\.cn/yinyue/\d+)', webpage) ] - return self.playlist_result(entries, chart_id, chart_name, chart_desc) + return self.playlist_result(entries, chart_id) class KuwoSingerIE(InfoExtractor): IE_NAME = 'kuwo:singer' IE_DESC = '酷我音乐 - 歌手' - _VALID_URL = r'http://www\.kuwo\.cn/mingxing/(?P<id>[^/]+)' + _VALID_URL = r'https?://www\.kuwo\.cn/mingxing/(?P<id>[^/]+)' _TESTS = [{ 'url': 'http://www.kuwo.cn/mingxing/bruno+mars/', 'info_dict': { 'id': 'bruno+mars', 'title': 'Bruno Mars', }, - 'playlist_count': 10, + 'playlist_mincount': 329, }, { 'url': 'http://www.kuwo.cn/mingxing/Ali/music.htm', 'info_dict': { @@ -213,6 +213,8 @@ class KuwoSingerIE(InfoExtractor): 'skip': 'Regularly stalls travis build', # See https://travis-ci.org/rg3/youtube-dl/jobs/78878540 }] + PAGE_SIZE = 15 + def _real_extract(self, url): singer_id = self._match_id(url) webpage = self._download_webpage( @@ -220,25 +222,28 @@ class KuwoSingerIE(InfoExtractor): errnote='Unable to get singer info') singer_name = self._html_search_regex( - r'<div class="title clearfix">\s*<h1>([^<]+)<span', webpage, 'singer name' - ) + r'<h1>([^<]+)</h1>', webpage, 'singer name') - entries = [] - first_page_only = False if re.search(r'/music(?:_\d+)?\.htm', url) else True - for page_num in itertools.count(1): + artist_id = self._html_search_regex( + r'data-artistid="(\d+)"', webpage, 'artist id') + + page_count = int(self._html_search_regex( + r'data-page="(\d+)"', webpage, 'page count')) + + def page_func(page_num): webpage = self._download_webpage( - 'http://www.kuwo.cn/mingxing/%s/music_%d.htm' % (singer_id, page_num), - singer_id, note='Download song list page #%d' % page_num, - errnote='Unable to get song list page #%d' % page_num) + 'http://www.kuwo.cn/artist/contentMusicsAjax', + singer_id, note='Download song list page #%d' % (page_num + 1), + errnote='Unable to get song list page #%d' % (page_num + 1), + 
query={'artistId': artist_id, 'pn': page_num, 'rn': self.PAGE_SIZE}) - entries.extend([ + return [ self.url_result(song_url, 'Kuwo') for song_url in re.findall( - r'<p[^>]+class="m_name"><a[^>]+href="(http://www\.kuwo\.cn/yinyue/\d+)/', + r'<div[^>]+class="name"><a[^>]+href="(http://www\.kuwo\.cn/yinyue/\d+)', webpage) - ][:10 if first_page_only else None]) + ] - if first_page_only or not re.search(r'<a[^>]+href="[^"]+">下一页</a>', webpage): - break + entries = InAdvancePagedList(page_func, page_count, self.PAGE_SIZE) return self.playlist_result(entries, singer_id, singer_name) @@ -246,7 +251,7 @@ class KuwoSingerIE(InfoExtractor): class KuwoCategoryIE(InfoExtractor): IE_NAME = 'kuwo:category' IE_DESC = '酷我音乐 - 分类' - _VALID_URL = r'http://yinyue\.kuwo\.cn/yy/cinfo_(?P<id>\d+?).htm' + _VALID_URL = r'https?://yinyue\.kuwo\.cn/yy/cinfo_(?P<id>\d+?).htm' _TEST = { 'url': 'http://yinyue.kuwo.cn/yy/cinfo_86375.htm', 'info_dict': { @@ -283,15 +288,21 @@ class KuwoCategoryIE(InfoExtractor): class KuwoMvIE(KuwoBaseIE): IE_NAME = 'kuwo:mv' IE_DESC = '酷我音乐 - MV' - _VALID_URL = r'http://www\.kuwo\.cn/mv/(?P<id>\d+?)/' + _VALID_URL = r'https?://www\.kuwo\.cn/mv/(?P<id>\d+?)/' _TEST = { 'url': 'http://www.kuwo.cn/mv/6480076/', 'info_dict': { 'id': '6480076', - 'ext': 'mkv', - 'title': '我们家MV', + 'ext': 'mp4', + 'title': 'My HouseMV', 'creator': '2PM', }, + # In this video, music URLs (anti.s) are blocked outside China and + # USA, while the MV URL (mvurl) is available globally, so force the MV + # URL for consistent results in different countries + 'params': { + 'format': 'mv', + }, } _FORMATS = KuwoBaseIE._FORMATS + [ {'format': 'mkv', 'ext': 'mkv', 'preference': 250}, @@ -313,7 +324,17 @@ class KuwoMvIE(KuwoBaseIE): else: raise ExtractorError('Unable to find song or singer names') - formats = self._get_formats(song_id) + formats = self._get_formats(song_id, tolerate_ip_deny=True) + + mv_url = self._download_webpage( + 'http://www.kuwo.cn/yy/st/mvurl?rid=MUSIC_%s' % song_id, + 
song_id, note='Download %s MV URL' % song_id) + formats.append({ + 'url': mv_url, + 'format_id': 'mv', + }) + + self._sort_formats(formats) return { 'id': song_id, diff --git a/youtube_dl/extractor/laola1tv.py b/youtube_dl/extractor/laola1tv.py index 5d8ebbeb3..41d80bc12 100644 --- a/youtube_dl/extractor/laola1tv.py +++ b/youtube_dl/extractor/laola1tv.py @@ -19,7 +19,7 @@ from ..utils import ( class Laola1TvIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?laola1\.tv/(?P<lang>[a-z]+)-(?P<portal>[a-z]+)/[^/]+/(?P<slug>[^/?#&]+)' + _VALID_URL = r'https?://(?:www\.)?laola1\.tv/(?P<lang>[a-z]+)-(?P<portal>[a-z]+)/(?P<kind>[^/]+)/(?P<slug>[^/?#&]+)' _TESTS = [{ 'url': 'http://www.laola1.tv/de-de/video/straubing-tigers-koelner-haie/227883.html', 'info_dict': { @@ -33,7 +33,7 @@ class Laola1TvIE(InfoExtractor): }, 'params': { 'skip_download': True, - } + }, }, { 'url': 'http://www.laola1.tv/de-de/video/straubing-tigers-koelner-haie', 'info_dict': { @@ -47,12 +47,28 @@ class Laola1TvIE(InfoExtractor): }, 'params': { 'skip_download': True, - } + }, + }, { + 'url': 'http://www.laola1.tv/de-de/livestream/2016-03-22-belogorie-belgorod-trentino-diatec-lde', + 'info_dict': { + 'id': '487850', + 'display_id': '2016-03-22-belogorie-belgorod-trentino-diatec-lde', + 'ext': 'flv', + 'title': 'Belogorie BELGOROD - TRENTINO Diatec', + 'upload_date': '20160322', + 'uploader': 'CEV - Europäischer Volleyball Verband', + 'is_live': True, + 'categories': ['Volleyball'], + }, + 'params': { + 'skip_download': True, + }, }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) display_id = mobj.group('slug') + kind = mobj.group('kind') lang = mobj.group('lang') portal = mobj.group('portal') @@ -85,12 +101,17 @@ class Laola1TvIE(InfoExtractor): _v = lambda x, **k: xpath_text(hd_doc, './/video/' + x, **k) title = _v('title', fatal=True) + VS_TARGETS = { + 'video': '2', + 'livestream': '17', + } + req = sanitized_Request( 
'https://club.laola1.tv/sp/laola1/api/v3/user/session/premium/player/stream-access?%s' % compat_urllib_parse.urlencode({ 'videoId': video_id, - 'target': '2', - 'label': 'laola1tv', + 'target': VS_TARGETS.get(kind, '2'), + 'label': _v('label'), 'area': _v('area'), }), urlencode_postdata( diff --git a/youtube_dl/extractor/leeco.py b/youtube_dl/extractor/leeco.py index d0cd3f591..462b752dd 100644 --- a/youtube_dl/extractor/leeco.py +++ b/youtube_dl/extractor/leeco.py @@ -28,7 +28,7 @@ from ..utils import ( class LeIE(InfoExtractor): IE_DESC = '乐视网' - _VALID_URL = r'http://www\.le\.com/ptv/vplay/(?P<id>\d+)\.html' + _VALID_URL = r'https?://www\.le\.com/ptv/vplay/(?P<id>\d+)\.html' _URL_TEMPLATE = 'http://www.le.com/ptv/vplay/%s.html' @@ -196,7 +196,7 @@ class LeIE(InfoExtractor): class LePlaylistIE(InfoExtractor): - _VALID_URL = r'http://[a-z]+\.le\.com/[a-z]+/(?P<id>[a-z0-9_]+)' + _VALID_URL = r'https?://[a-z]+\.le\.com/[a-z]+/(?P<id>[a-z0-9_]+)' _TESTS = [{ 'url': 'http://www.le.com/tv/46177.html', @@ -217,14 +217,8 @@ class LePlaylistIE(InfoExtractor): 'playlist_mincount': 96 }, { 'url': 'http://tv.le.com/pzt/lswjzzjc/index.shtml', - 'info_dict': { - 'id': 'lswjzzjc', - # The title should be "劲舞青春", but I can't find a simple way to - # determine the playlist title - 'title': '乐视午间自制剧场', - 'description': 'md5:b1eef244f45589a7b5b1af9ff25a4489' - }, - 'playlist_mincount': 7 + # This series is moved to http://www.le.com/tv/10005297.html + 'only_matching': True, }, { 'url': 'http://www.le.com/comic/92063.html', 'only_matching': True, @@ -338,7 +332,7 @@ class LetvCloudIE(InfoExtractor): formats.append({ 'url': url, 'ext': determine_ext(decoded_url), - 'format_id': int_or_none(play_url.get('vtype')), + 'format_id': str_or_none(play_url.get('vtype')), 'format_note': str_or_none(play_url.get('definition')), 'width': int_or_none(play_url.get('vwidth')), 'height': int_or_none(play_url.get('vheight')), diff --git a/youtube_dl/extractor/lifenews.py 
b/youtube_dl/extractor/lifenews.py index a8fd639cc..ba2f80a75 100644 --- a/youtube_dl/extractor/lifenews.py +++ b/youtube_dl/extractor/lifenews.py @@ -17,7 +17,7 @@ from ..utils import ( class LifeNewsIE(InfoExtractor): IE_NAME = 'lifenews' IE_DESC = 'LIFE | NEWS' - _VALID_URL = r'http://lifenews\.ru/(?:mobile/)?(?P<section>news|video)/(?P<id>\d+)' + _VALID_URL = r'https?://lifenews\.ru/(?:mobile/)?(?P<section>news|video)/(?P<id>\d+)' _TESTS = [{ # single video embedded via video/source @@ -159,7 +159,7 @@ class LifeNewsIE(InfoExtractor): class LifeEmbedIE(InfoExtractor): IE_NAME = 'life:embed' - _VALID_URL = r'http://embed\.life\.ru/embed/(?P<id>[\da-f]{32})' + _VALID_URL = r'https?://embed\.life\.ru/embed/(?P<id>[\da-f]{32})' _TEST = { 'url': 'http://embed.life.ru/embed/e50c2dec2867350528e2574c899b8291', diff --git a/youtube_dl/extractor/limelight.py b/youtube_dl/extractor/limelight.py index 1a0625ac3..2599d45c3 100644 --- a/youtube_dl/extractor/limelight.py +++ b/youtube_dl/extractor/limelight.py @@ -123,7 +123,7 @@ class LimelightBaseIE(InfoExtractor): class LimelightMediaIE(LimelightBaseIE): IE_NAME = 'limelight' - _VALID_URL = r'(?:limelight:media:|http://link\.videoplatform\.limelight\.com/media/\??\bmediaId=)(?P<id>[a-z0-9]{32})' + _VALID_URL = r'(?:limelight:media:|https?://link\.videoplatform\.limelight\.com/media/\??\bmediaId=)(?P<id>[a-z0-9]{32})' _TESTS = [{ 'url': 'http://link.videoplatform.limelight.com/media/?mediaId=3ffd040b522b4485b6d84effc750cd86', 'info_dict': { @@ -176,7 +176,7 @@ class LimelightMediaIE(LimelightBaseIE): class LimelightChannelIE(LimelightBaseIE): IE_NAME = 'limelight:channel' - _VALID_URL = r'(?:limelight:channel:|http://link\.videoplatform\.limelight\.com/media/\??\bchannelId=)(?P<id>[a-z0-9]{32})' + _VALID_URL = r'(?:limelight:channel:|https?://link\.videoplatform\.limelight\.com/media/\??\bchannelId=)(?P<id>[a-z0-9]{32})' _TEST = { 'url': 
'http://link.videoplatform.limelight.com/media/?channelId=ab6a524c379342f9b23642917020c082', 'info_dict': { @@ -207,7 +207,7 @@ class LimelightChannelIE(LimelightBaseIE): class LimelightChannelListIE(LimelightBaseIE): IE_NAME = 'limelight:channel_list' - _VALID_URL = r'(?:limelight:channel_list:|http://link\.videoplatform\.limelight\.com/media/\?.*?\bchannelListId=)(?P<id>[a-z0-9]{32})' + _VALID_URL = r'(?:limelight:channel_list:|https?://link\.videoplatform\.limelight\.com/media/\?.*?\bchannelListId=)(?P<id>[a-z0-9]{32})' _TEST = { 'url': 'http://link.videoplatform.limelight.com/media/?channelListId=301b117890c4465c8179ede21fd92e2b', 'info_dict': { diff --git a/youtube_dl/extractor/livestream.py b/youtube_dl/extractor/livestream.py index 38fb3d9e4..eada7c299 100644 --- a/youtube_dl/extractor/livestream.py +++ b/youtube_dl/extractor/livestream.py @@ -14,6 +14,7 @@ from ..utils import ( xpath_with_ns, xpath_text, orderedSet, + update_url_query, int_or_none, float_or_none, parse_iso8601, @@ -64,7 +65,7 @@ class LivestreamIE(InfoExtractor): def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None): base_ele = find_xpath_attr( smil, self._xpath_ns('.//meta', namespace), 'name', 'httpBase') - base = base_ele.get('content') if base_ele else 'http://livestreamvod-f.akamaihd.net/' + base = base_ele.get('content') if base_ele is not None else 'http://livestreamvod-f.akamaihd.net/' formats = [] video_nodes = smil.findall(self._xpath_ns('.//video', namespace)) @@ -72,7 +73,10 @@ class LivestreamIE(InfoExtractor): for vn in video_nodes: tbr = int_or_none(vn.attrib.get('system-bitrate'), 1000) furl = ( - '%s%s?v=3.0.3&fp=WIN%%2014,0,0,145' % (base, vn.attrib['src'])) + update_url_query(compat_urlparse.urljoin(base, vn.attrib['src']), { + 'v': '3.0.3', + 'fp': 'WIN% 14,0,0,145', + })) if 'clipBegin' in vn.attrib: furl += '&ssek=' + vn.attrib['clipBegin'] formats.append({ diff --git a/youtube_dl/extractor/m6.py 
b/youtube_dl/extractor/m6.py index 7e025831b..d5945ad66 100644 --- a/youtube_dl/extractor/m6.py +++ b/youtube_dl/extractor/m6.py @@ -8,7 +8,7 @@ from .common import InfoExtractor class M6IE(InfoExtractor): IE_NAME = 'm6' - _VALID_URL = r'http://(?:www\.)?m6\.fr/[^/]+/videos/(?P<id>\d+)-[^\.]+\.html' + _VALID_URL = r'https?://(?:www\.)?m6\.fr/[^/]+/videos/(?P<id>\d+)-[^\.]+\.html' _TEST = { 'url': 'http://www.m6.fr/emission-les_reines_du_shopping/videos/11323908-emeline_est_la_reine_du_shopping_sur_le_theme_ma_fete_d_8217_anniversaire.html', diff --git a/youtube_dl/extractor/mailru.py b/youtube_dl/extractor/mailru.py index 71085f279..46eb00492 100644 --- a/youtube_dl/extractor/mailru.py +++ b/youtube_dl/extractor/mailru.py @@ -13,7 +13,7 @@ from ..utils import ( class MailRuIE(InfoExtractor): IE_NAME = 'mailru' IE_DESC = 'Видео@Mail.Ru' - _VALID_URL = r'http://(?:www\.)?my\.mail\.ru/(?:video/.*#video=/?(?P<idv1>(?:[^/]+/){3}\d+)|(?:(?P<idv2prefix>(?:[^/]+/){2})video/(?P<idv2suffix>[^/]+/\d+))\.html)' + _VALID_URL = r'https?://(?:www\.)?my\.mail\.ru/(?:video/.*#video=/?(?P<idv1>(?:[^/]+/){3}\d+)|(?:(?P<idv2prefix>(?:[^/]+/){2})video/(?P<idv2suffix>[^/]+/\d+))\.html)' _TESTS = [ { diff --git a/youtube_dl/extractor/makerschannel.py b/youtube_dl/extractor/makerschannel.py new file mode 100644 index 000000000..f5d00e61d --- /dev/null +++ b/youtube_dl/extractor/makerschannel.py @@ -0,0 +1,40 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class MakersChannelIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?makerschannel\.com/.*(?P<id_type>video|production)_id=(?P<id>[0-9]+)' + _TEST = { + 'url': 'http://makerschannel.com/en/zoomin/community-highlights?video_id=849', + 'md5': '624a512c6969236b5967bf9286345ad1', + 'info_dict': { + 'id': '849', + 'ext': 'mp4', + 'title': 'Landing a bus on a plane is an epic win', + 'uploader': 'ZoomIn', + 'description': 'md5:cd9cca2ea7b69b78be81d07020c97139', + } + } + 
+ def _real_extract(self, url): + id_type, url_id = re.match(self._VALID_URL, url).groups() + webpage = self._download_webpage(url, url_id) + video_data = self._html_search_regex(r'<div([^>]+data-%s-id="%s"[^>]+)>' % (id_type, url_id), webpage, 'video data') + + def extract_data_val(attr, fatal=False): + return self._html_search_regex(r'data-%s\s*=\s*"([^"]+)"' % attr, video_data, attr, fatal=fatal) + minoto_id = self._search_regex(r'/id/([a-zA-Z0-9]+)', extract_data_val('video-src', True), 'minoto id') + + return { + '_type': 'url_transparent', + 'url': 'minoto:%s' % minoto_id, + 'id': extract_data_val('video-id', True), + 'title': extract_data_val('title', True), + 'description': extract_data_val('description'), + 'thumbnail': extract_data_val('image'), + 'uploader': extract_data_val('channel'), + } diff --git a/youtube_dl/extractor/metacafe.py b/youtube_dl/extractor/metacafe.py index 67d6271e1..c31e8798a 100644 --- a/youtube_dl/extractor/metacafe.py +++ b/youtube_dl/extractor/metacafe.py @@ -17,7 +17,7 @@ from ..utils import ( class MetacafeIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*' + _VALID_URL = r'https?://(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*' _DISCLAIMER = 'http://www.metacafe.com/family_filter/' _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user' IE_NAME = 'metacafe' diff --git a/youtube_dl/extractor/minoto.py b/youtube_dl/extractor/minoto.py new file mode 100644 index 000000000..959a10589 --- /dev/null +++ b/youtube_dl/extractor/minoto.py @@ -0,0 +1,56 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import int_or_none + + +class MinotoIE(InfoExtractor): + _VALID_URL = r'(?:minoto:|https?://(?:play|iframe|embed)\.minoto-video\.com/(?P<player_id>[0-9]+)/)(?P<id>[a-zA-Z0-9]+)' + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + player_id = 
mobj.group('player_id') or '1' + video_id = mobj.group('id') + video_data = self._download_json('http://play.minoto-video.com/%s/%s.js' % (player_id, video_id), video_id) + video_metadata = video_data['video-metadata'] + formats = [] + for fmt in video_data['video-files']: + fmt_url = fmt.get('url') + if not fmt_url: + continue + container = fmt.get('container') + if container == 'hls': + formats.extend(fmt_url, video_id, 'mp4', m3u8_id='hls', fatal=False) + else: + fmt_profile = fmt.get('profile') or {} + f = { + 'format_id': fmt_profile.get('name-short'), + 'format_note': fmt_profile.get('name'), + 'url': fmt_url, + 'container': container, + 'tbr': int_or_none(fmt.get('bitrate')), + 'filesize': int_or_none(fmt.get('filesize')), + 'width': int_or_none(fmt.get('width')), + 'height': int_or_none(fmt.get('height')), + } + codecs = fmt.get('codecs') + if codecs: + codecs = codecs.split(',') + if len(codecs) == 2: + f.update({ + 'vcodec': codecs[0], + 'acodec': codecs[1], + }) + formats.append(f) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': video_metadata['title'], + 'description': video_metadata.get('description'), + 'thumbnail': video_metadata.get('video-poster', {}).get('url'), + 'formats': formats, + } diff --git a/youtube_dl/extractor/mit.py b/youtube_dl/extractor/mit.py index 29ca45778..1aea78d11 100644 --- a/youtube_dl/extractor/mit.py +++ b/youtube_dl/extractor/mit.py @@ -91,7 +91,7 @@ class MITIE(TechTVMITIE): class OCWMITIE(InfoExtractor): IE_NAME = 'ocw.mit.edu' - _VALID_URL = r'^http://ocw\.mit\.edu/courses/(?P<topic>[a-z0-9\-]+)' + _VALID_URL = r'^https?://ocw\.mit\.edu/courses/(?P<topic>[a-z0-9\-]+)' _BASE_URL = 'http://ocw.mit.edu/' _TESTS = [ @@ -99,7 +99,7 @@ class OCWMITIE(InfoExtractor): 'url': 'http://ocw.mit.edu/courses/electrical-engineering-and-computer-science/6-041-probabilistic-systems-analysis-and-applied-probability-fall-2010/video-lectures/lecture-7-multiple-variables-expectations-independence/', 'info_dict': { 
'id': 'EObHWIEKGjA', - 'ext': 'mp4', + 'ext': 'webm', 'title': 'Lecture 7: Multiple Discrete Random Variables: Expectations, Conditioning, Independence', 'description': 'In this lecture, the professor discussed multiple random variables, expectations, and binomial distribution.', 'upload_date': '20121109', diff --git a/youtube_dl/extractor/mitele.py b/youtube_dl/extractor/mitele.py index c595f2077..9e584860a 100644 --- a/youtube_dl/extractor/mitele.py +++ b/youtube_dl/extractor/mitele.py @@ -14,7 +14,7 @@ from ..utils import ( class MiTeleIE(InfoExtractor): IE_DESC = 'mitele.es' - _VALID_URL = r'http://www\.mitele\.es/[^/]+/[^/]+/[^/]+/(?P<id>[^/]+)/' + _VALID_URL = r'https?://www\.mitele\.es/[^/]+/[^/]+/[^/]+/(?P<id>[^/]+)/' _TESTS = [{ 'url': 'http://www.mitele.es/programas-tv/diario-de/la-redaccion/programa-144/', diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py index c2b7ed9ab..101497118 100644 --- a/youtube_dl/extractor/mixcloud.py +++ b/youtube_dl/extractor/mixcloud.py @@ -7,6 +7,7 @@ from ..compat import compat_urllib_parse_unquote from ..utils import ( ExtractorError, HEADRequest, + parse_count, str_to_int, ) @@ -85,8 +86,8 @@ class MixcloudIE(InfoExtractor): uploader_id = self._search_regex( r'\s+"profile": "([^"]+)",', webpage, 'uploader id', fatal=False) description = self._og_search_description(webpage) - like_count = str_to_int(self._search_regex( - r'\bbutton-favorite\b[^>]+m-ajax-toggle-count="([^"]+)"', + like_count = parse_count(self._search_regex( + r'\bbutton-favorite[^>]+>.*?<span[^>]+class=["\']toggle-number[^>]+>\s*([^<]+)', webpage, 'like count', fatal=False)) view_count = str_to_int(self._search_regex( [r'<meta itemprop="interactionCount" content="UserPlays:([0-9]+)"', diff --git a/youtube_dl/extractor/mnet.py b/youtube_dl/extractor/mnet.py new file mode 100644 index 000000000..e3f42e7bd --- /dev/null +++ b/youtube_dl/extractor/mnet.py @@ -0,0 +1,81 @@ +# coding: utf-8 +from __future__ import unicode_literals 
+ +from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_duration, + parse_iso8601, +) + + +class MnetIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?mnet\.(?:com|interest\.me)/tv/vod/(?:.*?\bclip_id=)?(?P<id>[0-9]+)' + _TESTS = [{ + 'url': 'http://www.mnet.com/tv/vod/171008', + 'info_dict': { + 'id': '171008', + 'title': 'SS_이해인@히든박스', + 'description': 'md5:b9efa592c3918b615ba69fe9f8a05c55', + 'duration': 88, + 'upload_date': '20151231', + 'timestamp': 1451564040, + 'age_limit': 0, + 'thumbnails': 'mincount:5', + 'thumbnail': 're:^https?://.*\.jpg$', + 'ext': 'flv', + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + }, { + 'url': 'http://mnet.interest.me/tv/vod/172790', + 'only_matching': True, + }, { + 'url': 'http://www.mnet.com/tv/vod/vod_view.asp?clip_id=172790&tabMenu=', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + info = self._download_json( + 'http://content.api.mnet.com/player/vodConfig?id=%s&ctype=CLIP' % video_id, + video_id, 'Downloading vod config JSON')['data']['info'] + + title = info['title'] + + rtmp_info = self._download_json( + info['cdn'], video_id, 'Downloading vod cdn JSON') + + formats = [{ + 'url': rtmp_info['serverurl'] + rtmp_info['fileurl'], + 'ext': 'flv', + 'page_url': url, + 'player_url': 'http://flvfile.mnet.com/service/player/201602/cjem_player_tv.swf?v=201602191318', + }] + + description = info.get('ment') + duration = parse_duration(info.get('time')) + timestamp = parse_iso8601(info.get('date'), delimiter=' ') + age_limit = info.get('adult') + if age_limit is not None: + age_limit = 0 if age_limit == 'N' else 18 + thumbnails = [{ + 'id': thumb_format, + 'url': thumb['url'], + 'width': int_or_none(thumb.get('width')), + 'height': int_or_none(thumb.get('height')), + } for thumb_format, thumb in info.get('cover', {}).items() if thumb.get('url')] + + return { + 'id': video_id, + 'title': title, + 'description': description, + 
'duration': duration, + 'timestamp': timestamp, + 'age_limit': age_limit, + 'thumbnails': thumbnails, + 'formats': formats, + } diff --git a/youtube_dl/extractor/mooshare.py b/youtube_dl/extractor/mooshare.py index 7cc7f054f..f010f52d5 100644 --- a/youtube_dl/extractor/mooshare.py +++ b/youtube_dl/extractor/mooshare.py @@ -13,7 +13,7 @@ from ..utils import ( class MooshareIE(InfoExtractor): IE_NAME = 'mooshare' IE_DESC = 'Mooshare.biz' - _VALID_URL = r'http://(?:www\.)?mooshare\.biz/(?P<id>[\da-z]{12})' + _VALID_URL = r'https?://(?:www\.)?mooshare\.biz/(?P<id>[\da-z]{12})' _TESTS = [ { diff --git a/youtube_dl/extractor/motherless.py b/youtube_dl/extractor/motherless.py index 0b4787c1d..5e1a8a71a 100644 --- a/youtube_dl/extractor/motherless.py +++ b/youtube_dl/extractor/motherless.py @@ -12,7 +12,7 @@ from ..utils import ( class MotherlessIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?motherless\.com/(?:g/[a-z0-9_]+/)?(?P<id>[A-Z0-9]+)' + _VALID_URL = r'https?://(?:www\.)?motherless\.com/(?:g/[a-z0-9_]+/)?(?P<id>[A-Z0-9]+)' _TESTS = [{ 'url': 'http://motherless.com/AC3FFE1', 'md5': '310f62e325a9fafe64f68c0bccb6e75f', @@ -69,6 +69,9 @@ class MotherlessIE(InfoExtractor): ">The page you're looking for cannot be found.<")): raise ExtractorError('Video %s does not exist' % video_id, expected=True) + if '>The content you are trying to view is for friends only.' 
in webpage: + raise ExtractorError('Video %s is for friends only' % video_id, expected=True) + title = self._html_search_regex( r'id="view-upload-title">\s+([^<]+)<', webpage, 'title') video_url = self._html_search_regex( diff --git a/youtube_dl/extractor/motorsport.py b/youtube_dl/extractor/motorsport.py index c1a482dba..370328b36 100644 --- a/youtube_dl/extractor/motorsport.py +++ b/youtube_dl/extractor/motorsport.py @@ -9,7 +9,7 @@ from ..compat import ( class MotorsportIE(InfoExtractor): IE_DESC = 'motorsport.com' - _VALID_URL = r'http://www\.motorsport\.com/[^/?#]+/video/(?:[^/?#]+/)(?P<id>[^/]+)/?(?:$|[?#])' + _VALID_URL = r'https?://www\.motorsport\.com/[^/?#]+/video/(?:[^/?#]+/)(?P<id>[^/]+)/?(?:$|[?#])' _TEST = { 'url': 'http://www.motorsport.com/f1/video/main-gallery/red-bull-racing-2014-rules-explained/', 'info_dict': { diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index ed068365d..824bbcb4e 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -17,6 +17,7 @@ from ..utils import ( unescapeHTML, url_basename, RegexNotFoundError, + xpath_text, ) @@ -130,11 +131,7 @@ class MTVServicesInfoExtractor(InfoExtractor): message += item.text raise ExtractorError(message, expected=True) - description_node = itemdoc.find('description') - if description_node is not None: - description = description_node.text.strip() - else: - description = None + description = xpath_text(itemdoc, 'description') title_el = None if title_el is None: diff --git a/youtube_dl/extractor/myspass.py b/youtube_dl/extractor/myspass.py index f936b92bb..1ca7b1a9e 100644 --- a/youtube_dl/extractor/myspass.py +++ b/youtube_dl/extractor/myspass.py @@ -11,7 +11,7 @@ from ..utils import ( class MySpassIE(InfoExtractor): - _VALID_URL = r'http://www\.myspass\.de/.*' + _VALID_URL = r'https?://www\.myspass\.de/.*' _TEST = { 'url': 'http://www.myspass.de/myspass/shows/tvshows/absolute-mehrheit/Absolute-Mehrheit-vom-17022013-Die-Highlights-Teil-2--/11741/', 
'md5': '0b49f4844a068f8b33f4b7c88405862b', diff --git a/youtube_dl/extractor/myvideo.py b/youtube_dl/extractor/myvideo.py index 1e21cf98a..c83a1eab5 100644 --- a/youtube_dl/extractor/myvideo.py +++ b/youtube_dl/extractor/myvideo.py @@ -20,7 +20,7 @@ from ..utils import ( class MyVideoIE(InfoExtractor): _WORKING = False - _VALID_URL = r'http://(?:www\.)?myvideo\.de/(?:[^/]+/)?watch/(?P<id>[0-9]+)/[^?/]+.*' + _VALID_URL = r'https?://(?:www\.)?myvideo\.de/(?:[^/]+/)?watch/(?P<id>[0-9]+)/[^?/]+.*' IE_NAME = 'myvideo' _TEST = { 'url': 'http://www.myvideo.de/watch/8229274/bowling_fail_or_win', diff --git a/youtube_dl/extractor/myvidster.py b/youtube_dl/extractor/myvidster.py index a94ab8358..731c24542 100644 --- a/youtube_dl/extractor/myvidster.py +++ b/youtube_dl/extractor/myvidster.py @@ -4,7 +4,7 @@ from .common import InfoExtractor class MyVidsterIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?myvidster\.com/video/(?P<id>\d+)/' + _VALID_URL = r'https?://(?:www\.)?myvidster\.com/video/(?P<id>\d+)/' _TEST = { 'url': 'http://www.myvidster.com/video/32059805/Hot_chemistry_with_raw_love_making', diff --git a/youtube_dl/extractor/nationalgeographic.py b/youtube_dl/extractor/nationalgeographic.py index 6fc9e7b05..d5e53365c 100644 --- a/youtube_dl/extractor/nationalgeographic.py +++ b/youtube_dl/extractor/nationalgeographic.py @@ -8,7 +8,7 @@ from ..utils import ( class NationalGeographicIE(InfoExtractor): - _VALID_URL = r'http://video\.nationalgeographic\.com/.*?' + _VALID_URL = r'https?://video\.nationalgeographic\.com/.*?' 
_TESTS = [ { @@ -48,7 +48,7 @@ class NationalGeographicIE(InfoExtractor): theplatform_id = url_basename(content.attrib.get('url')) return self.url_result(smuggle_url( - 'http://link.theplatform.com/s/ngs/%s?format=SMIL&formats=MPEG4&manifest=f4m' % theplatform_id, + 'http://link.theplatform.com/s/ngs/%s?formats=MPEG4&manifest=f4m' % theplatform_id, # For some reason, the normal links don't work and we must force # the use of f4m {'force_smil_url': True})) diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py index 2202cfa33..a622f2212 100644 --- a/youtube_dl/extractor/nbc.py +++ b/youtube_dl/extractor/nbc.py @@ -3,13 +3,16 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_HTTPError +from .theplatform import ThePlatformIE from ..utils import ( - ExtractorError, find_xpath_attr, lowercase_escape, smuggle_url, unescapeHTML, + update_url_query, + int_or_none, + HEADRequest, + parse_iso8601, ) @@ -112,7 +115,7 @@ class NBCSportsVPlayerIE(InfoExtractor): class NBCSportsIE(InfoExtractor): # Does not include https because its certificate is invalid - _VALID_URL = r'http://www\.nbcsports\.com//?(?:[^/]+/)+(?P<id>[0-9a-z-]+)' + _VALID_URL = r'https?://www\.nbcsports\.com//?(?:[^/]+/)+(?P<id>[0-9a-z-]+)' _TEST = { 'url': 'http://www.nbcsports.com//college-basketball/ncaab/tom-izzo-michigan-st-has-so-much-respect-duke', @@ -131,10 +134,10 @@ class NBCSportsIE(InfoExtractor): NBCSportsVPlayerIE._extract_url(webpage), 'NBCSportsVPlayer') -class NBCNewsIE(InfoExtractor): +class NBCNewsIE(ThePlatformIE): _VALID_URL = r'''(?x)https?://(?:www\.)?nbcnews\.com/ (?:video/.+?/(?P<id>\d+)| - (?:watch|feature|nightly-news)/[^/]+/(?P<title>.+)) + ([^/]+/)*(?P<display_id>[^/?]+)) ''' _TESTS = [ @@ -149,15 +152,14 @@ class NBCNewsIE(InfoExtractor): }, }, { - 'url': 'http://www.nbcnews.com/feature/edward-snowden-interview/how-twitter-reacted-snowden-interview-n117236', - 'md5': 
'b2421750c9f260783721d898f4c42063', + 'url': 'http://www.nbcnews.com/watch/nbcnews-com/how-twitter-reacted-to-the-snowden-interview-269389891880', + 'md5': 'af1adfa51312291a017720403826bb64', 'info_dict': { - 'id': 'I1wpAI_zmhsQ', + 'id': '269389891880', 'ext': 'mp4', 'title': 'How Twitter Reacted To The Snowden Interview', 'description': 'md5:65a0bd5d76fe114f3c2727aa3a81fe64', }, - 'add_ie': ['ThePlatform'], }, { 'url': 'http://www.nbcnews.com/feature/dateline-full-episodes/full-episode-family-business-n285156', @@ -168,17 +170,29 @@ class NBCNewsIE(InfoExtractor): 'title': 'FULL EPISODE: Family Business', 'description': 'md5:757988edbaae9d7be1d585eb5d55cc04', }, + 'skip': 'This page is unavailable.', }, { 'url': 'http://www.nbcnews.com/nightly-news/video/nightly-news-with-brian-williams-full-broadcast-february-4-394064451844', - 'md5': 'b5dda8cddd8650baa0dcb616dd2cf60d', + 'md5': '73135a2e0ef819107bbb55a5a9b2a802', 'info_dict': { - 'id': 'sekXqyTVnmN3', + 'id': '394064451844', 'ext': 'mp4', 'title': 'Nightly News with Brian Williams Full Broadcast (February 4)', 'description': 'md5:1c10c1eccbe84a26e5debb4381e2d3c5', }, }, + { + 'url': 'http://www.nbcnews.com/business/autos/volkswagen-11-million-vehicles-could-have-suspect-software-emissions-scandal-n431456', + 'md5': 'a49e173825e5fcd15c13fc297fced39d', + 'info_dict': { + 'id': '529953347624', + 'ext': 'mp4', + 'title': 'Volkswagen U.S. 
Chief: We \'Totally Screwed Up\'', + 'description': 'md5:d22d1281a24f22ea0880741bb4dd6301', + }, + 'expected_warnings': ['http-6000 is not available'] + }, { 'url': 'http://www.nbcnews.com/watch/dateline/full-episode--deadly-betrayal-386250819952', 'only_matching': True, @@ -202,55 +216,86 @@ class NBCNewsIE(InfoExtractor): } else: # "feature" and "nightly-news" pages use theplatform.com - title = mobj.group('title') - webpage = self._download_webpage(url, title) + display_id = mobj.group('display_id') + webpage = self._download_webpage(url, display_id) + info = None bootstrap_json = self._search_regex( - r'var\s+(?:bootstrapJson|playlistData)\s*=\s*({.+});?\s*$', - webpage, 'bootstrap json', flags=re.MULTILINE) - bootstrap = self._parse_json(bootstrap_json, video_id) - info = bootstrap['results'][0]['video'] - mpxid = info['mpxId'] + r'(?m)var\s+(?:bootstrapJson|playlistData)\s*=\s*({.+});?\s*$', + webpage, 'bootstrap json', default=None) + if bootstrap_json: + bootstrap = self._parse_json(bootstrap_json, display_id) + info = bootstrap['results'][0]['video'] + else: + player_instance_json = self._search_regex( + r'videoObj\s*:\s*({.+})', webpage, 'player instance') + info = self._parse_json(player_instance_json, display_id) + video_id = info['mpxId'] + title = info['title'] - base_urls = [ - info['fallbackPlaylistUrl'], - info['associatedPlaylistUrl'], - ] + subtitles = {} + caption_links = info.get('captionLinks') + if caption_links: + for (sub_key, sub_ext) in (('smpte-tt', 'ttml'), ('web-vtt', 'vtt'), ('srt', 'srt')): + sub_url = caption_links.get(sub_key) + if sub_url: + subtitles.setdefault('en', []).append({ + 'url': sub_url, + 'ext': sub_ext, + }) - for base_url in base_urls: - if not base_url: + formats = [] + for video_asset in info['videoAssets']: + video_url = video_asset.get('publicUrl') + if not video_url: continue - playlist_url = base_url + '?form=MPXNBCNewsAPI' - - try: - all_videos = self._download_json(playlist_url, title) - except ExtractorError 
as ee: - if isinstance(ee.cause, compat_HTTPError): - continue - raise - - if not all_videos or 'videos' not in all_videos: + container = video_asset.get('format') + asset_type = video_asset.get('assetType') or '' + if container == 'ISM' or asset_type == 'FireTV-Once': continue - - try: - info = next(v for v in all_videos['videos'] if v['mpxId'] == mpxid) - break - except StopIteration: - continue - - if info is None: - raise ExtractorError('Could not find video in playlists') + elif asset_type == 'OnceURL': + tp_formats, tp_subtitles = self._extract_theplatform_smil( + video_url, video_id) + formats.extend(tp_formats) + subtitles = self._merge_subtitles(subtitles, tp_subtitles) + else: + tbr = int_or_none(video_asset.get('bitRate'), 1000) + format_id = 'http%s' % ('-%d' % tbr if tbr else '') + video_url = update_url_query( + video_url, {'format': 'redirect'}) + # resolve the url so that we can check availability and detect the correct extension + head = self._request_webpage( + HEADRequest(video_url), video_id, + 'Checking %s url' % format_id, + '%s is not available' % format_id, + fatal=False) + if head: + video_url = head.geturl() + formats.append({ + 'format_id': format_id, + 'url': video_url, + 'width': int_or_none(video_asset.get('width')), + 'height': int_or_none(video_asset.get('height')), + 'tbr': tbr, + 'container': video_asset.get('format'), + }) + self._sort_formats(formats) return { - '_type': 'url', - # We get the best quality video - 'url': info['videoAssets'][-1]['publicUrl'], - 'ie_key': 'ThePlatform', + 'id': video_id, + 'title': title, + 'description': info.get('description'), + 'thumbnail': info.get('description'), + 'thumbnail': info.get('thumbnail'), + 'duration': int_or_none(info.get('duration')), + 'timestamp': parse_iso8601(info.get('pubDate')), + 'formats': formats, + 'subtitles': subtitles, } class MSNBCIE(InfoExtractor): # https URLs redirect to corresponding http ones - _VALID_URL = r'http://www\.msnbc\.com/[^/]+/watch/(?P<id>[^/]+)' + 
_VALID_URL = r'https?://www\.msnbc\.com/[^/]+/watch/(?P<id>[^/]+)' _TEST = { 'url': 'http://www.msnbc.com/all-in-with-chris-hayes/watch/the-chaotic-gop-immigration-vote-314487875924', 'md5': '6d236bf4f3dddc226633ce6e2c3f814d', diff --git a/youtube_dl/extractor/nextmedia.py b/youtube_dl/extractor/nextmedia.py index d1688457f..aae7aeeeb 100644 --- a/youtube_dl/extractor/nextmedia.py +++ b/youtube_dl/extractor/nextmedia.py @@ -7,7 +7,7 @@ from ..utils import parse_iso8601 class NextMediaIE(InfoExtractor): IE_DESC = '蘋果日報' - _VALID_URL = r'http://hk.apple.nextmedia.com/[^/]+/[^/]+/(?P<date>\d+)/(?P<id>\d+)' + _VALID_URL = r'https?://hk.apple.nextmedia.com/[^/]+/[^/]+/(?P<date>\d+)/(?P<id>\d+)' _TESTS = [{ 'url': 'http://hk.apple.nextmedia.com/realtime/news/20141108/53109199', 'md5': 'dff9fad7009311c421176d1ac90bfe4f', @@ -68,7 +68,7 @@ class NextMediaIE(InfoExtractor): class NextMediaActionNewsIE(NextMediaIE): IE_DESC = '蘋果日報 - 動新聞' - _VALID_URL = r'http://hk.dv.nextmedia.com/actionnews/[^/]+/(?P<date>\d+)/(?P<id>\d+)/\d+' + _VALID_URL = r'https?://hk.dv.nextmedia.com/actionnews/[^/]+/(?P<date>\d+)/(?P<id>\d+)/\d+' _TESTS = [{ 'url': 'http://hk.dv.nextmedia.com/actionnews/hit/20150121/19009428/20061460', 'md5': '05fce8ffeed7a5e00665d4b7cf0f9201', @@ -93,7 +93,7 @@ class NextMediaActionNewsIE(NextMediaIE): class AppleDailyIE(NextMediaIE): IE_DESC = '臺灣蘋果日報' - _VALID_URL = r'http://(www|ent).appledaily.com.tw/(?:animation|appledaily|enews|realtimenews)/[^/]+/[^/]+/(?P<date>\d+)/(?P<id>\d+)(/.*)?' + _VALID_URL = r'https?://(www|ent).appledaily.com.tw/(?:animation|appledaily|enews|realtimenews)/[^/]+/[^/]+/(?P<date>\d+)/(?P<id>\d+)(/.*)?' 
_TESTS = [{ 'url': 'http://ent.appledaily.com.tw/enews/article/entertainment/20150128/36354694', 'md5': 'a843ab23d150977cc55ef94f1e2c1e4d', diff --git a/youtube_dl/extractor/noco.py b/youtube_dl/extractor/noco.py index d440313d5..ec7317a2f 100644 --- a/youtube_dl/extractor/noco.py +++ b/youtube_dl/extractor/noco.py @@ -22,7 +22,7 @@ from ..utils import ( class NocoIE(InfoExtractor): - _VALID_URL = r'http://(?:(?:www\.)?noco\.tv/emission/|player\.noco\.tv/\?idvideo=)(?P<id>\d+)' + _VALID_URL = r'https?://(?:(?:www\.)?noco\.tv/emission/|player\.noco\.tv/\?idvideo=)(?P<id>\d+)' _LOGIN_URL = 'http://noco.tv/do.php' _API_URL_TEMPLATE = 'https://api.noco.tv/1.1/%s?ts=%s&tk=%s' _SUB_LANG_TEMPLATE = '&sub_lang=%s' diff --git a/youtube_dl/extractor/normalboots.py b/youtube_dl/extractor/normalboots.py index 5952d136f..77e091072 100644 --- a/youtube_dl/extractor/normalboots.py +++ b/youtube_dl/extractor/normalboots.py @@ -9,7 +9,7 @@ from ..utils import ( class NormalbootsIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?normalboots\.com/video/(?P<id>[0-9a-z-]*)/?$' + _VALID_URL = r'https?://(?:www\.)?normalboots\.com/video/(?P<id>[0-9a-z-]*)/?$' _TEST = { 'url': 'http://normalboots.com/video/home-alone-games-jontron/', 'md5': '8bf6de238915dd501105b44ef5f1e0f6', diff --git a/youtube_dl/extractor/nova.py b/youtube_dl/extractor/nova.py index 3f9c776ef..17671ad39 100644 --- a/youtube_dl/extractor/nova.py +++ b/youtube_dl/extractor/nova.py @@ -12,7 +12,7 @@ from ..utils import ( class NovaIE(InfoExtractor): IE_DESC = 'TN.cz, Prásk.tv, Nova.cz, Novaplus.cz, FANDA.tv, Krásná.cz and Doma.cz' - _VALID_URL = 'http://(?:[^.]+\.)?(?P<site>tv(?:noviny)?|tn|novaplus|vymena|fanda|krasna|doma|prask)\.nova\.cz/(?:[^/]+/)+(?P<id>[^/]+?)(?:\.html|/|$)' + _VALID_URL = r'https?://(?:[^.]+\.)?(?P<site>tv(?:noviny)?|tn|novaplus|vymena|fanda|krasna|doma|prask)\.nova\.cz/(?:[^/]+/)+(?P<id>[^/]+?)(?:\.html|/|$)' _TESTS = [{ 'url': 
'http://tvnoviny.nova.cz/clanek/novinky/co-na-sebe-sportaci-praskli-vime-jestli-pujde-hrdlicka-na-materskou.html?utm_source=tvnoviny&utm_medium=cpfooter&utm_campaign=novaplus', 'info_dict': { diff --git a/youtube_dl/extractor/noz.py b/youtube_dl/extractor/noz.py index 0ffb44b47..c47a33d15 100644 --- a/youtube_dl/extractor/noz.py +++ b/youtube_dl/extractor/noz.py @@ -2,10 +2,15 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import compat_urllib_parse_unquote +from ..compat import ( + compat_urllib_parse_unquote, + compat_xpath, +) from ..utils import ( int_or_none, + find_xpath_attr, xpath_text, + update_url_query, ) @@ -45,18 +50,33 @@ class NozIE(InfoExtractor): duration = int_or_none(xpath_text( doc, './/article/movie/file/duration')) formats = [] - for qnode in doc.findall('.//article/movie/file/qualities/qual'): - video_node = qnode.find('./html_urls/video_url[@format="video/mp4"]') - if video_node is None: - continue # auto - formats.append({ - 'url': video_node.text, - 'format_name': xpath_text(qnode, './name'), - 'format_id': xpath_text(qnode, './id'), - 'height': int_or_none(xpath_text(qnode, './height')), - 'width': int_or_none(xpath_text(qnode, './width')), - 'tbr': int_or_none(xpath_text(qnode, './bitrate'), scale=1000), - }) + for qnode in doc.findall(compat_xpath('.//article/movie/file/qualities/qual')): + http_url_ele = find_xpath_attr( + qnode, './html_urls/video_url', 'format', 'video/mp4') + http_url = http_url_ele.text if http_url_ele is not None else None + if http_url: + formats.append({ + 'url': http_url, + 'format_name': xpath_text(qnode, './name'), + 'format_id': '%s-%s' % ('http', xpath_text(qnode, './id')), + 'height': int_or_none(xpath_text(qnode, './height')), + 'width': int_or_none(xpath_text(qnode, './width')), + 'tbr': int_or_none(xpath_text(qnode, './bitrate'), scale=1000), + }) + else: + f4m_url = xpath_text(qnode, 'url_hd2') + if f4m_url: + formats.extend(self._extract_f4m_formats( + 
update_url_query(f4m_url, {'hdcore': '3.4.0'}), + video_id, f4m_id='hds', fatal=False)) + m3u8_url_ele = find_xpath_attr( + qnode, './html_urls/video_url', + 'format', 'application/vnd.apple.mpegurl') + m3u8_url = m3u8_url_ele.text if m3u8_url_ele is not None else None + if m3u8_url: + formats.extend(self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) self._sort_formats(formats) return { diff --git a/youtube_dl/extractor/npr.py b/youtube_dl/extractor/npr.py index 125c7010b..a3f0abb4e 100644 --- a/youtube_dl/extractor/npr.py +++ b/youtube_dl/extractor/npr.py @@ -9,7 +9,7 @@ from ..utils import ( class NprIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?npr\.org/player/v2/mediaPlayer\.html\?.*\bid=(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?npr\.org/player/v2/mediaPlayer\.html\?.*\bid=(?P<id>\d+)' _TESTS = [{ 'url': 'http://www.npr.org/player/v2/mediaPlayer.html?id=449974205', 'info_dict': { diff --git a/youtube_dl/extractor/ntvru.py b/youtube_dl/extractor/ntvru.py index 2cd924d05..0895d7ea4 100644 --- a/youtube_dl/extractor/ntvru.py +++ b/youtube_dl/extractor/ntvru.py @@ -11,7 +11,7 @@ from ..utils import ( class NTVRuIE(InfoExtractor): IE_NAME = 'ntv.ru' - _VALID_URL = r'http://(?:www\.)?ntv\.ru/(?P<id>.+)' + _VALID_URL = r'https?://(?:www\.)?ntv\.ru/(?P<id>.+)' _TESTS = [ { diff --git a/youtube_dl/extractor/nytimes.py b/youtube_dl/extractor/nytimes.py index 7f254b867..681683e86 100644 --- a/youtube_dl/extractor/nytimes.py +++ b/youtube_dl/extractor/nytimes.py @@ -18,8 +18,9 @@ class NYTimesBaseIE(InfoExtractor): description = video_data.get('summary') duration = float_or_none(video_data.get('duration'), 1000) - uploader = video_data['byline'] - timestamp = parse_iso8601(video_data['publication_date'][:-8]) + uploader = video_data.get('byline') + publication_date = video_data.get('publication_date') + timestamp = parse_iso8601(publication_date[:-8]) if publication_date else None def 
get_file_size(file_size): if isinstance(file_size, int): @@ -37,7 +38,7 @@ class NYTimesBaseIE(InfoExtractor): 'width': int_or_none(video.get('width')), 'height': int_or_none(video.get('height')), 'filesize': get_file_size(video.get('fileSize')), - } for video in video_data['renditions'] + } for video in video_data['renditions'] if video.get('url') ] self._sort_formats(formats) @@ -46,7 +47,7 @@ class NYTimesBaseIE(InfoExtractor): 'url': 'http://www.nytimes.com/%s' % image['url'], 'width': int_or_none(image.get('width')), 'height': int_or_none(image.get('height')), - } for image in video_data['images'] + } for image in video_data.get('images', []) if image.get('url') ] return { diff --git a/youtube_dl/extractor/once.py b/youtube_dl/extractor/once.py new file mode 100644 index 000000000..5db949b17 --- /dev/null +++ b/youtube_dl/extractor/once.py @@ -0,0 +1,42 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class OnceIE(InfoExtractor): + _VALID_URL = r'https?://once\.unicornmedia\.com/now/[^/]+/[^/]+/(?P<domain_id>[^/]+)/(?P<application_id>[^/]+)/(?:[^/]+/)?(?P<media_item_id>[^/]+)/content\.(?:once|m3u8|mp4)' + ADAPTIVE_URL_TEMPLATE = 'http://once.unicornmedia.com/now/master/playlist/%s/%s/%s/content.m3u8' + PROGRESSIVE_URL_TEMPLATE = 'http://once.unicornmedia.com/now/media/progressive/%s/%s/%s/%s/content.mp4' + + def _extract_once_formats(self, url): + domain_id, application_id, media_item_id = re.match( + OnceIE._VALID_URL, url).groups() + formats = self._extract_m3u8_formats( + self.ADAPTIVE_URL_TEMPLATE % ( + domain_id, application_id, media_item_id), + media_item_id, 'mp4', m3u8_id='hls', fatal=False) + progressive_formats = [] + for adaptive_format in formats: + # Prevent advertisement from embedding into m3u8 playlist (see + # https://github.com/rg3/youtube-dl/issues/8893#issuecomment-199912684) + adaptive_format['url'] = re.sub( + r'\badsegmentlength=\d+', r'adsegmentlength=0', 
adaptive_format['url']) + rendition_id = self._search_regex( + r'/now/media/playlist/[^/]+/[^/]+/([^/]+)', + adaptive_format['url'], 'redition id', default=None) + if rendition_id: + progressive_format = adaptive_format.copy() + progressive_format.update({ + 'url': self.PROGRESSIVE_URL_TEMPLATE % ( + domain_id, application_id, rendition_id, media_item_id), + 'format_id': adaptive_format['format_id'].replace( + 'hls', 'http'), + 'protocol': 'http', + }) + progressive_formats.append(progressive_format) + self._check_formats(progressive_formats, media_item_id) + formats.extend(progressive_formats) + return formats diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py new file mode 100644 index 000000000..4468f31fc --- /dev/null +++ b/youtube_dl/extractor/openload.py @@ -0,0 +1,107 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_chr +from ..utils import ( + encode_base_n, + ExtractorError, +) + + +class OpenloadIE(InfoExtractor): + _VALID_URL = r'https://openload.(?:co|io)/(?:f|embed)/(?P<id>[a-zA-Z0-9-]+)' + + _TESTS = [{ + 'url': 'https://openload.co/f/kUEfGclsU9o', + 'md5': 'bf1c059b004ebc7a256f89408e65c36e', + 'info_dict': { + 'id': 'kUEfGclsU9o', + 'ext': 'mp4', + 'title': 'skyrim_no-audio_1080.mp4', + 'thumbnail': 're:^https?://.*\.jpg$', + }, + }, { + 'url': 'https://openload.co/embed/kUEfGclsU9o/skyrim_no-audio_1080.mp4', + 'only_matching': True, + }, { + 'url': 'https://openload.io/f/ZAn6oz-VZGE/', + 'only_matching': True, + }] + + @staticmethod + def openload_level2_debase(m): + radix, num = int(m.group(1)) + 27, int(m.group(2)) + return '"' + encode_base_n(num, radix) + '"' + + @classmethod + def openload_level2(cls, txt): + # The function name is ǃ \u01c3 + # Using escaped unicode literals does not work in Python 3.2 + return re.sub(r'ǃ\((\d+),(\d+)\)', cls.openload_level2_debase, txt, re.UNICODE).replace('"+"', '') + + # Openload uses a 
variant of aadecode + # openload_decode and related functions are originally written by + # vitas@matfyz.cz and released with public domain + # See https://github.com/rg3/youtube-dl/issues/8489 + @classmethod + def openload_decode(cls, txt): + symbol_table = [ + ('_', '(゚Д゚) [゚Θ゚]'), + ('a', '(゚Д゚) [゚ω゚ノ]'), + ('b', '(゚Д゚) [゚Θ゚ノ]'), + ('c', '(゚Д゚) [\'c\']'), + ('d', '(゚Д゚) [゚ー゚ノ]'), + ('e', '(゚Д゚) [゚Д゚ノ]'), + ('f', '(゚Д゚) [1]'), + + ('o', '(゚Д゚) [\'o\']'), + ('u', '(o゚ー゚o)'), + ('c', '(゚Д゚) [\'c\']'), + + ('7', '((゚ー゚) + (o^_^o))'), + ('6', '((o^_^o) +(o^_^o) +(c^_^o))'), + ('5', '((゚ー゚) + (゚Θ゚))'), + ('4', '(-~3)'), + ('3', '(-~-~1)'), + ('2', '(-~1)'), + ('1', '(-~0)'), + ('0', '((c^_^o)-(c^_^o))'), + ] + delim = '(゚Д゚)[゚ε゚]+' + ret = '' + for aachar in txt.split(delim): + for val, pat in symbol_table: + aachar = aachar.replace(pat, val) + aachar = aachar.replace('+ ', '') + m = re.match(r'^\d+', aachar) + if m: + ret += compat_chr(int(m.group(0), 8)) + else: + m = re.match(r'^u([\da-f]+)', aachar) + if m: + ret += compat_chr(int(m.group(1), 16)) + return cls.openload_level2(ret) + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + if 'File not found' in webpage: + raise ExtractorError('File not found', expected=True) + + code = self._search_regex( + r'<video[^>]+>\s*<script[^>]+>([^<]+)</script>', + webpage, 'JS code') + + video_url = self._search_regex( + r'return\s+"(https?://[^"]+)"', self.openload_decode(code), 'video URL') + + return { + 'id': video_id, + 'title': self._og_search_title(webpage), + 'thumbnail': self._og_search_thumbnail(webpage), + 'url': video_url, + } diff --git a/youtube_dl/extractor/orf.py b/youtube_dl/extractor/orf.py index 958eb398b..66c75f8b3 100644 --- a/youtube_dl/extractor/orf.py +++ b/youtube_dl/extractor/orf.py @@ -137,7 +137,7 @@ class ORFTVthekIE(InfoExtractor): class ORFOE1IE(InfoExtractor): IE_NAME = 'orf:oe1' IE_DESC = 'Radio Österreich 1' - _VALID_URL = 
r'http://oe1\.orf\.at/(?:programm/|konsole.*?#\?track_id=)(?P<id>[0-9]+)' + _VALID_URL = r'https?://oe1\.orf\.at/(?:programm/|konsole.*?#\?track_id=)(?P<id>[0-9]+)' # Audios on ORF radio are only available for 7 days, so we can't add tests. _TEST = { @@ -171,7 +171,7 @@ class ORFOE1IE(InfoExtractor): class ORFFM4IE(InfoExtractor): IE_NAME = 'orf:fm4' IE_DESC = 'radio FM4' - _VALID_URL = r'http://fm4\.orf\.at/(?:7tage/?#|player/)(?P<date>[0-9]+)/(?P<show>\w+)' + _VALID_URL = r'https?://fm4\.orf\.at/(?:7tage/?#|player/)(?P<date>[0-9]+)/(?P<show>\w+)' _TEST = { 'url': 'http://fm4.orf.at/player/20160110/IS/', @@ -222,7 +222,7 @@ class ORFFM4IE(InfoExtractor): class ORFIPTVIE(InfoExtractor): IE_NAME = 'orf:iptv' IE_DESC = 'iptv.ORF.at' - _VALID_URL = r'http://iptv\.orf\.at/(?:#/)?stories/(?P<id>\d+)' + _VALID_URL = r'https?://iptv\.orf\.at/(?:#/)?stories/(?P<id>\d+)' _TEST = { 'url': 'http://iptv.orf.at/stories/2275236/', diff --git a/youtube_dl/extractor/philharmoniedeparis.py b/youtube_dl/extractor/philharmoniedeparis.py index 6e60e5fe9..f1008ae51 100644 --- a/youtube_dl/extractor/philharmoniedeparis.py +++ b/youtube_dl/extractor/philharmoniedeparis.py @@ -12,7 +12,7 @@ from ..utils import ( class PhilharmonieDeParisIE(InfoExtractor): IE_DESC = 'Philharmonie de Paris' - _VALID_URL = r'http://live\.philharmoniedeparis\.fr/(?:[Cc]oncert/|misc/Playlist\.ashx\?id=)(?P<id>\d+)' + _VALID_URL = r'https?://live\.philharmoniedeparis\.fr/(?:[Cc]oncert/|misc/Playlist\.ashx\?id=)(?P<id>\d+)' _TESTS = [{ 'url': 'http://live.philharmoniedeparis.fr/concert/1032066.html', 'info_dict': { diff --git a/youtube_dl/extractor/photobucket.py b/youtube_dl/extractor/photobucket.py index 788411ccc..6c8bbe1d9 100644 --- a/youtube_dl/extractor/photobucket.py +++ b/youtube_dl/extractor/photobucket.py @@ -8,7 +8,7 @@ from ..compat import compat_urllib_parse_unquote class PhotobucketIE(InfoExtractor): - _VALID_URL = 
r'http://(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))' + _VALID_URL = r'https?://(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))' _TEST = { 'url': 'http://media.photobucket.com/user/rachaneronas/media/TiredofLinkBuildingTryBacklinkMyDomaincom_zpsc0c3b9fa.mp4.html?filters[term]=search&filters[primary]=videos&filters[secondary]=images&sort=1&o=0', 'md5': '7dabfb92b0a31f6c16cebc0f8e60ff99', diff --git a/youtube_dl/extractor/porn91.py b/youtube_dl/extractor/porn91.py index 3e15533e9..63ce87ee3 100644 --- a/youtube_dl/extractor/porn91.py +++ b/youtube_dl/extractor/porn91.py @@ -1,7 +1,10 @@ # encoding: utf-8 from __future__ import unicode_literals -from ..compat import compat_urllib_parse +from ..compat import ( + compat_urllib_parse, + compat_urllib_parse_unquote, +) from .common import InfoExtractor from ..utils import ( parse_duration, @@ -28,9 +31,10 @@ class Porn91IE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - url = 'http://91porn.com/view_video.php?viewkey=%s' % video_id self._set_cookie('91porn.com', 'language', 'cn_CN') - webpage = self._download_webpage(url, video_id, 'get HTML content') + + webpage = self._download_webpage( + 'http://91porn.com/view_video.php?viewkey=%s' % video_id, video_id) if '作为游客,你每天只可观看10个视频' in webpage: raise ExtractorError('91 Porn says: Daily limit 10 videos exceeded', expected=True) @@ -54,8 +58,9 @@ class Porn91IE(InfoExtractor): }) info_cn = self._download_webpage( 'http://91porn.com/getfile.php?' 
+ url_params, video_id, - 'get real video url') - video_url = self._search_regex(r'file=([^&]+)&', info_cn, 'url') + 'Downloading real video url') + video_url = compat_urllib_parse_unquote(self._search_regex( + r'file=([^&]+)&', info_cn, 'url')) duration = parse_duration(self._search_regex( r'时长:\s*</span>\s*(\d+:\d+)', webpage, 'duration', fatal=False)) diff --git a/youtube_dl/extractor/pornhd.py b/youtube_dl/extractor/pornhd.py index 57c78ba52..39b53ecf6 100644 --- a/youtube_dl/extractor/pornhd.py +++ b/youtube_dl/extractor/pornhd.py @@ -12,7 +12,7 @@ from ..utils import ( class PornHdIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?pornhd\.com/(?:[a-z]{2,4}/)?videos/(?P<id>\d+)(?:/(?P<display_id>.+))?' + _VALID_URL = r'https?://(?:www\.)?pornhd\.com/(?:[a-z]{2,4}/)?videos/(?P<id>\d+)(?:/(?P<display_id>.+))?' _TEST = { 'url': 'http://www.pornhd.com/videos/1962/sierra-day-gets-his-cum-all-over-herself-hd-porn-video', 'md5': '956b8ca569f7f4d8ec563e2c41598441', diff --git a/youtube_dl/extractor/pornovoisines.py b/youtube_dl/extractor/pornovoisines.py index 1a53fd71c..6b51e5c54 100644 --- a/youtube_dl/extractor/pornovoisines.py +++ b/youtube_dl/extractor/pornovoisines.py @@ -13,7 +13,7 @@ from ..utils import ( class PornoVoisinesIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?pornovoisines\.com/showvideo/(?P<id>\d+)/(?P<display_id>[^/]+)' + _VALID_URL = r'https?://(?:www\.)?pornovoisines\.com/showvideo/(?P<id>\d+)/(?P<display_id>[^/]+)' _VIDEO_URL_TEMPLATE = 'http://stream%d.pornovoisines.com' \ '/static/media/video/transcoded/%s-640x360-1000-trscded.mp4' diff --git a/youtube_dl/extractor/pyvideo.py b/youtube_dl/extractor/pyvideo.py index 6d5732d45..cc0416cb8 100644 --- a/youtube_dl/extractor/pyvideo.py +++ b/youtube_dl/extractor/pyvideo.py @@ -7,19 +7,19 @@ from .common import InfoExtractor class PyvideoIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?pyvideo\.org/video/(?P<id>\d+)/(.*)' + _VALID_URL = 
r'https?://(?:www\.)?pyvideo\.org/video/(?P<id>\d+)/(.*)' _TESTS = [ { 'url': 'http://pyvideo.org/video/1737/become-a-logging-expert-in-30-minutes', - 'md5': 'de317418c8bc76b1fd8633e4f32acbc6', + 'md5': '520915673e53a5c5d487c36e0c4d85b5', 'info_dict': { 'id': '24_4WWkSmNo', - 'ext': 'mp4', + 'ext': 'webm', 'title': 'Become a logging expert in 30 minutes', 'description': 'md5:9665350d466c67fb5b1598de379021f7', 'upload_date': '20130320', - 'uploader': 'NextDayVideo', + 'uploader': 'Next Day Video', 'uploader_id': 'NextDayVideo', }, 'add_ie': ['Youtube'], diff --git a/youtube_dl/extractor/qqmusic.py b/youtube_dl/extractor/qqmusic.py index 45a3c41c5..ff0af9543 100644 --- a/youtube_dl/extractor/qqmusic.py +++ b/youtube_dl/extractor/qqmusic.py @@ -18,7 +18,7 @@ from ..utils import ( class QQMusicIE(InfoExtractor): IE_NAME = 'qqmusic' IE_DESC = 'QQ音乐' - _VALID_URL = r'http://y.qq.com/#type=song&mid=(?P<id>[0-9A-Za-z]+)' + _VALID_URL = r'https?://y.qq.com/#type=song&mid=(?P<id>[0-9A-Za-z]+)' _TESTS = [{ 'url': 'http://y.qq.com/#type=song&mid=004295Et37taLD', 'md5': '9ce1c1c8445f561506d2e3cfb0255705', @@ -172,7 +172,7 @@ class QQPlaylistBaseIE(InfoExtractor): class QQMusicSingerIE(QQPlaylistBaseIE): IE_NAME = 'qqmusic:singer' IE_DESC = 'QQ音乐 - 歌手' - _VALID_URL = r'http://y.qq.com/#type=singer&mid=(?P<id>[0-9A-Za-z]+)' + _VALID_URL = r'https?://y.qq.com/#type=singer&mid=(?P<id>[0-9A-Za-z]+)' _TEST = { 'url': 'http://y.qq.com/#type=singer&mid=001BLpXF2DyJe2', 'info_dict': { @@ -217,7 +217,7 @@ class QQMusicSingerIE(QQPlaylistBaseIE): class QQMusicAlbumIE(QQPlaylistBaseIE): IE_NAME = 'qqmusic:album' IE_DESC = 'QQ音乐 - 专辑' - _VALID_URL = r'http://y.qq.com/#type=album&mid=(?P<id>[0-9A-Za-z]+)' + _VALID_URL = r'https?://y.qq.com/#type=album&mid=(?P<id>[0-9A-Za-z]+)' _TESTS = [{ 'url': 'http://y.qq.com/#type=album&mid=000gXCTb2AhRR1', @@ -260,7 +260,7 @@ class QQMusicAlbumIE(QQPlaylistBaseIE): class QQMusicToplistIE(QQPlaylistBaseIE): IE_NAME = 'qqmusic:toplist' IE_DESC = 'QQ音乐 - 
排行榜' - _VALID_URL = r'http://y\.qq\.com/#type=toplist&p=(?P<id>(top|global)_[0-9]+)' + _VALID_URL = r'https?://y\.qq\.com/#type=toplist&p=(?P<id>(top|global)_[0-9]+)' _TESTS = [{ 'url': 'http://y.qq.com/#type=toplist&p=global_123', @@ -314,7 +314,7 @@ class QQMusicToplistIE(QQPlaylistBaseIE): class QQMusicPlaylistIE(QQPlaylistBaseIE): IE_NAME = 'qqmusic:playlist' IE_DESC = 'QQ音乐 - 歌单' - _VALID_URL = r'http://y\.qq\.com/#type=taoge&id=(?P<id>[0-9]+)' + _VALID_URL = r'https?://y\.qq\.com/#type=taoge&id=(?P<id>[0-9]+)' _TESTS = [{ 'url': 'http://y.qq.com/#type=taoge&id=3462654915', diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py index a4dc5c335..e36ce1aa1 100644 --- a/youtube_dl/extractor/rai.py +++ b/youtube_dl/extractor/rai.py @@ -18,7 +18,7 @@ from ..utils import ( class RaiTVIE(InfoExtractor): - _VALID_URL = r'http://(?:.+?\.)?(?:rai\.it|rai\.tv|rainews\.it)/dl/(?:[^/]+/)+media/.+?-(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html' + _VALID_URL = r'https?://(?:.+?\.)?(?:rai\.it|rai\.tv|rainews\.it)/dl/(?:[^/]+/)+media/.+?-(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html' _TESTS = [ { 'url': 'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-cb27157f-9dd0-4aee-b788-b1f67643a391.html', @@ -175,7 +175,7 @@ class RaiTVIE(InfoExtractor): class RaiIE(InfoExtractor): - _VALID_URL = r'http://(?:.+?\.)?(?:rai\.it|rai\.tv|rainews\.it)/dl/.+?-(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html' + _VALID_URL = r'https?://(?:.+?\.)?(?:rai\.it|rai\.tv|rainews\.it)/dl/.+?-(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html' _TESTS = [ { 'url': 'http://www.report.rai.it/dl/Report/puntata/ContentItem-0c7a664b-d0f4-4b2c-8835-3f82e46f433e.html', diff --git a/youtube_dl/extractor/redtube.py b/youtube_dl/extractor/redtube.py index d6054d717..7ba41ba59 100644 --- a/youtube_dl/extractor/redtube.py +++ b/youtube_dl/extractor/redtube.py @@ -5,7 +5,7 
@@ from ..utils import ExtractorError class RedTubeIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?redtube\.com/(?P<id>[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?redtube\.com/(?P<id>[0-9]+)' _TEST = { 'url': 'http://www.redtube.com/66418', 'md5': '7b8c22b5e7098a3e1c09709df1126d2d', diff --git a/youtube_dl/extractor/revision3.py b/youtube_dl/extractor/revision3.py index b1b8800b9..99979ebe1 100644 --- a/youtube_dl/extractor/revision3.py +++ b/youtube_dl/extractor/revision3.py @@ -19,7 +19,7 @@ class Revision3IE(InfoExtractor): 'url': 'http://www.revision3.com/technobuffalo/5-google-predictions-for-2016', 'md5': 'd94a72d85d0a829766de4deb8daaf7df', 'info_dict': { - 'id': '73034', + 'id': '71089', 'display_id': 'technobuffalo/5-google-predictions-for-2016', 'ext': 'webm', 'title': '5 Google Predictions for 2016', @@ -31,6 +31,7 @@ class Revision3IE(InfoExtractor): 'uploader_id': 'technobuffalo', } }, { + # Show 'url': 'http://testtube.com/brainstuff', 'info_dict': { 'id': '251', @@ -41,7 +42,7 @@ class Revision3IE(InfoExtractor): }, { 'url': 'https://testtube.com/dnews/5-weird-ways-plants-can-eat-animals?utm_source=FB&utm_medium=DNews&utm_campaign=DNewsSocial', 'info_dict': { - 'id': '60163', + 'id': '58227', 'display_id': 'dnews/5-weird-ways-plants-can-eat-animals', 'duration': 275, 'ext': 'webm', @@ -52,18 +53,72 @@ class Revision3IE(InfoExtractor): 'uploader': 'DNews', 'uploader_id': 'dnews', }, + }, { + 'url': 'http://testtube.com/tt-editors-picks/the-israel-palestine-conflict-explained-in-ten-min', + 'info_dict': { + 'id': '71618', + 'ext': 'mp4', + 'display_id': 'tt-editors-picks/the-israel-palestine-conflict-explained-in-ten-min', + 'title': 'The Israel-Palestine Conflict Explained in Ten Minutes', + 'description': 'If you\'d like to learn about the struggle between Israelis and Palestinians, this video is a great place to start', + 'uploader': 'Editors\' Picks', + 'uploader_id': 'tt-editors-picks', + 'timestamp': 1453309200, + 'upload_date': '20160120', 
+ }, + 'add_ie': ['Youtube'], + }, { + # Tag + 'url': 'http://testtube.com/tech-news', + 'info_dict': { + 'id': '21018', + 'title': 'tech news', + }, + 'playlist_mincount': 9, }] _PAGE_DATA_TEMPLATE = 'http://www.%s/apiProxy/ddn/%s?domain=%s' _API_KEY = 'ba9c741bce1b9d8e3defcc22193f3651b8867e62' def _real_extract(self, url): domain, display_id = re.match(self._VALID_URL, url).groups() + site = domain.split('.')[0] page_info = self._download_json( self._PAGE_DATA_TEMPLATE % (domain, display_id, domain), display_id) - if page_info['data']['type'] == 'episode': - episode_data = page_info['data'] - video_id = compat_str(episode_data['video']['data']['id']) + page_data = page_info['data'] + page_type = page_data['type'] + if page_type in ('episode', 'embed'): + show_data = page_data['show']['data'] + page_id = compat_str(page_data['id']) + video_id = compat_str(page_data['video']['data']['id']) + + preference = qualities(['mini', 'small', 'medium', 'large']) + thumbnails = [{ + 'url': image_url, + 'id': image_id, + 'preference': preference(image_id) + } for image_id, image_url in page_data.get('images', {}).items()] + + info = { + 'id': page_id, + 'display_id': display_id, + 'title': unescapeHTML(page_data['name']), + 'description': unescapeHTML(page_data.get('summary')), + 'timestamp': parse_iso8601(page_data.get('publishTime'), ' '), + 'author': page_data.get('author'), + 'uploader': show_data.get('name'), + 'uploader_id': show_data.get('slug'), + 'thumbnails': thumbnails, + 'extractor_key': site, + } + + if page_type == 'embed': + info.update({ + '_type': 'url_transparent', + 'url': page_data['video']['data']['embed'], + }) + return info + video_data = self._download_json( 'http://revision3.com/api/getPlaylist.json?api_key=%s&codecs=h264,vp8,theora&video_id=%s' % (self._API_KEY, video_id), video_id)['items'][0] @@ -84,36 +139,30 @@ class Revision3IE(InfoExtractor): }) self._sort_formats(formats) - preference = qualities(['mini', 'small', 'medium', 'large']) - 
thumbnails = [{ - 'url': image_url, - 'id': image_id, - 'preference': preference(image_id) - } for image_id, image_url in video_data.get('images', {}).items()] - - return { - 'id': video_id, - 'display_id': display_id, + info.update({ 'title': unescapeHTML(video_data['title']), 'description': unescapeHTML(video_data.get('summary')), - 'timestamp': parse_iso8601(episode_data.get('publishTime'), ' '), - 'author': episode_data.get('author'), 'uploader': video_data.get('show', {}).get('name'), 'uploader_id': video_data.get('show', {}).get('slug'), 'duration': int_or_none(video_data.get('duration')), - 'thumbnails': thumbnails, 'formats': formats, - } + }) + return info else: - show_data = page_info['show']['data'] + list_data = page_info[page_type]['data'] episodes_data = page_info['episodes']['data'] num_episodes = page_info['meta']['totalEpisodes'] processed_episodes = 0 entries = [] page_num = 1 while True: - entries.extend([self.url_result( - 'http://%s/%s/%s' % (domain, display_id, episode['slug'])) for episode in episodes_data]) + entries.extend([{ + '_type': 'url', + 'url': 'http://%s%s' % (domain, episode['path']), + 'id': compat_str(episode['id']), + 'ie_key': 'Revision3', + 'extractor_key': site, + } for episode in episodes_data]) processed_episodes += len(episodes_data) if processed_episodes == num_episodes: break @@ -123,5 +172,5 @@ class Revision3IE(InfoExtractor): display_id)['episodes']['data'] return self.playlist_result( - entries, compat_str(show_data['id']), - show_data.get('name'), show_data.get('summary')) + entries, compat_str(list_data['id']), + list_data.get('name'), list_data.get('summary')) diff --git a/youtube_dl/extractor/rice.py b/youtube_dl/extractor/rice.py new file mode 100644 index 000000000..f855719ac --- /dev/null +++ b/youtube_dl/extractor/rice.py @@ -0,0 +1,116 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_parse_qs +from ..utils import ( + 
xpath_text, + xpath_element, + int_or_none, + parse_iso8601, + ExtractorError, +) + + +class RICEIE(InfoExtractor): + _VALID_URL = r'https?://mediahub\.rice\.edu/app/[Pp]ortal/video\.aspx\?(?P<query>.+)' + _TEST = { + 'url': 'https://mediahub.rice.edu/app/Portal/video.aspx?PortalID=25ffd62c-3d01-4b29-8c70-7c94270efb3e&DestinationID=66bc9434-03bd-4725-b47e-c659d8d809db&ContentID=YEWIvbhb40aqdjMD1ALSqw', + 'md5': '9b83b4a2eead4912dc3b7fac7c449b6a', + 'info_dict': { + 'id': 'YEWIvbhb40aqdjMD1ALSqw', + 'ext': 'mp4', + 'title': 'Active Learning in Archeology', + 'upload_date': '20140616', + 'timestamp': 1402926346, + } + } + _NS = 'http://schemas.datacontract.org/2004/07/ensembleVideo.Data.Service.Contracts.Models.Player.Config' + + def _real_extract(self, url): + qs = compat_parse_qs(re.match(self._VALID_URL, url).group('query')) + if not qs.get('PortalID') or not qs.get('DestinationID') or not qs.get('ContentID'): + raise ExtractorError('Invalid URL', expected=True) + + portal_id = qs['PortalID'][0] + playlist_id = qs['DestinationID'][0] + content_id = qs['ContentID'][0] + + content_data = self._download_xml('https://mediahub.rice.edu/api/portal/GetContentTitle', content_id, query={ + 'portalId': portal_id, + 'playlistId': playlist_id, + 'contentId': content_id + }) + metadata = xpath_element(content_data, './/metaData', fatal=True) + title = xpath_text(metadata, 'primaryTitle', fatal=True) + encodings = xpath_element(content_data, './/encodings', fatal=True) + player_data = self._download_xml('https://mediahub.rice.edu/api/player/GetPlayerConfig', content_id, query={ + 'temporaryLinkId': xpath_text(encodings, 'temporaryLinkId', fatal=True), + 'contentId': content_id, + }) + + common_fmt = {} + dimensions = xpath_text(encodings, 'dimensions') + if dimensions: + wh = dimensions.split('x') + if len(wh) == 2: + common_fmt.update({ + 'width': int_or_none(wh[0]), + 'height': int_or_none(wh[1]), + }) + + formats = [] + rtsp_path = xpath_text(player_data, 
self._xpath_ns('RtspPath', self._NS)) + if rtsp_path: + fmt = { + 'url': rtsp_path, + 'format_id': 'rtsp', + } + fmt.update(common_fmt) + formats.append(fmt) + for source in player_data.findall(self._xpath_ns('.//Source', self._NS)): + video_url = xpath_text(source, self._xpath_ns('File', self._NS)) + if not video_url: + continue + if '.m3u8' in video_url: + formats.extend(self._extract_m3u8_formats(video_url, content_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) + else: + fmt = { + 'url': video_url, + 'format_id': video_url.split(':')[0], + } + fmt.update(common_fmt) + rtmp = re.search(r'^(?P<url>rtmp://[^/]+/(?P<app>.+))/(?P<playpath>mp4:.+)$', video_url) + if rtmp: + fmt.update({ + 'url': rtmp.group('url'), + 'play_path': rtmp.group('playpath'), + 'app': rtmp.group('app'), + 'ext': 'flv', + }) + formats.append(fmt) + self._sort_formats(formats) + + thumbnails = [] + for content_asset in content_data.findall('.//contentAssets'): + asset_type = xpath_text(content_asset, 'type') + if asset_type == 'image': + image_url = xpath_text(content_asset, 'httpPath') + if not image_url: + continue + thumbnails.append({ + 'id': xpath_text(content_asset, 'ID'), + 'url': image_url, + }) + + return { + 'id': content_id, + 'title': title, + 'description': xpath_text(metadata, 'abstract'), + 'duration': int_or_none(xpath_text(metadata, 'duration')), + 'timestamp': parse_iso8601(xpath_text(metadata, 'dateUpdated')), + 'thumbnails': thumbnails, + 'formats': formats, + } diff --git a/youtube_dl/extractor/ringtv.py b/youtube_dl/extractor/ringtv.py index 508758075..2c2c707bd 100644 --- a/youtube_dl/extractor/ringtv.py +++ b/youtube_dl/extractor/ringtv.py @@ -6,7 +6,7 @@ from .common import InfoExtractor class RingTVIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?ringtv\.craveonline\.com/(?P<type>news|videos/video)/(?P<id>[^/?#]+)' + _VALID_URL = r'https?://(?:www\.)?ringtv\.craveonline\.com/(?P<type>news|videos/video)/(?P<id>[^/?#]+)' _TEST = { 'url': 
'http://ringtv.craveonline.com/news/310833-luis-collazo-says-victor-ortiz-better-not-quit-on-jan-30', 'md5': 'd25945f5df41cdca2d2587165ac28720', diff --git a/youtube_dl/extractor/rtve.py b/youtube_dl/extractor/rtve.py index 8a8c5d2a0..08cd1ae6c 100644 --- a/youtube_dl/extractor/rtve.py +++ b/youtube_dl/extractor/rtve.py @@ -62,7 +62,7 @@ def _decrypt_url(png): class RTVEALaCartaIE(InfoExtractor): IE_NAME = 'rtve.es:alacarta' IE_DESC = 'RTVE a la carta' - _VALID_URL = r'http://www\.rtve\.es/(m/)?alacarta/videos/[^/]+/[^/]+/(?P<id>\d+)' + _VALID_URL = r'https?://www\.rtve\.es/(m/)?alacarta/videos/[^/]+/[^/]+/(?P<id>\d+)' _TESTS = [{ 'url': 'http://www.rtve.es/alacarta/videos/balonmano/o-swiss-cup-masculina-final-espana-suecia/2491869/', @@ -179,7 +179,7 @@ class RTVEInfantilIE(InfoExtractor): class RTVELiveIE(InfoExtractor): IE_NAME = 'rtve.es:live' IE_DESC = 'RTVE.es live streams' - _VALID_URL = r'http://www\.rtve\.es/directo/(?P<id>[a-zA-Z0-9-]+)' + _VALID_URL = r'https?://www\.rtve\.es/directo/(?P<id>[a-zA-Z0-9-]+)' _TESTS = [{ 'url': 'http://www.rtve.es/directo/la-1/', diff --git a/youtube_dl/extractor/ruhd.py b/youtube_dl/extractor/ruhd.py index 0e470e73f..1f7c26299 100644 --- a/youtube_dl/extractor/ruhd.py +++ b/youtube_dl/extractor/ruhd.py @@ -5,7 +5,7 @@ from .common import InfoExtractor class RUHDIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?ruhd\.ru/play\.php\?vid=(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?ruhd\.ru/play\.php\?vid=(?P<id>\d+)' _TEST = { 'url': 'http://www.ruhd.ru/play.php?vid=207', 'md5': 'd1a9ec4edf8598e3fbd92bb16072ba83', diff --git a/youtube_dl/extractor/rutube.py b/youtube_dl/extractor/rutube.py index c5c47d01e..9ca4ae147 100644 --- a/youtube_dl/extractor/rutube.py +++ b/youtube_dl/extractor/rutube.py @@ -122,7 +122,7 @@ class RutubeEmbedIE(InfoExtractor): class RutubeChannelIE(InfoExtractor): IE_NAME = 'rutube:channel' IE_DESC = 'Rutube channels' - _VALID_URL = r'http://rutube\.ru/tags/video/(?P<id>\d+)' + _VALID_URL = 
r'https?://rutube\.ru/tags/video/(?P<id>\d+)' _TESTS = [{ 'url': 'http://rutube.ru/tags/video/1800/', 'info_dict': { @@ -156,7 +156,7 @@ class RutubeChannelIE(InfoExtractor): class RutubeMovieIE(RutubeChannelIE): IE_NAME = 'rutube:movie' IE_DESC = 'Rutube movies' - _VALID_URL = r'http://rutube\.ru/metainfo/tv/(?P<id>\d+)' + _VALID_URL = r'https?://rutube\.ru/metainfo/tv/(?P<id>\d+)' _TESTS = [] _MOVIE_TEMPLATE = 'http://rutube.ru/api/metainfo/tv/%s/?format=json' @@ -174,7 +174,7 @@ class RutubeMovieIE(RutubeChannelIE): class RutubePersonIE(RutubeChannelIE): IE_NAME = 'rutube:person' IE_DESC = 'Rutube person videos' - _VALID_URL = r'http://rutube\.ru/video/person/(?P<id>\d+)' + _VALID_URL = r'https?://rutube\.ru/video/person/(?P<id>\d+)' _TESTS = [{ 'url': 'http://rutube.ru/video/person/313878/', 'info_dict': { diff --git a/youtube_dl/extractor/rutv.py b/youtube_dl/extractor/rutv.py index f7fe1fece..a2379eb04 100644 --- a/youtube_dl/extractor/rutv.py +++ b/youtube_dl/extractor/rutv.py @@ -14,7 +14,7 @@ class RUTVIE(InfoExtractor): IE_DESC = 'RUTV.RU' _VALID_URL = r'''(?x) https?://player\.(?:rutv\.ru|vgtrk\.com)/ - (?P<path>flash2v/container\.swf\?id= + (?P<path>flash\d+v/container\.swf\?id= |iframe/(?P<type>swf|video|live)/id/ |index/iframe/cast_id/) (?P<id>\d+)''' @@ -109,7 +109,7 @@ class RUTVIE(InfoExtractor): return mobj.group('url') mobj = re.search( - r'<meta[^>]+?property=(["\'])og:video\1[^>]+?content=(["\'])(?P<url>https?://player\.(?:rutv\.ru|vgtrk\.com)/flash2v/container\.swf\?id=.+?\2)', + r'<meta[^>]+?property=(["\'])og:video\1[^>]+?content=(["\'])(?P<url>https?://player\.(?:rutv\.ru|vgtrk\.com)/flash\d+v/container\.swf\?id=.+?\2)', webpage) if mobj: return mobj.group('url') @@ -119,7 +119,7 @@ class RUTVIE(InfoExtractor): video_id = mobj.group('id') video_path = mobj.group('path') - if video_path.startswith('flash2v'): + if re.match(r'flash\d+v', video_path): video_type = 'video' elif video_path.startswith('iframe'): video_type = mobj.group('type') @@ 
-168,7 +168,7 @@ class RUTVIE(InfoExtractor): 'play_path': mobj.group('playpath'), 'app': mobj.group('app'), 'page_url': 'http://player.rutv.ru', - 'player_url': 'http://player.rutv.ru/flash2v/osmf.swf?i=22', + 'player_url': 'http://player.rutv.ru/flash3v/osmf.swf?i=22', 'rtmp_live': True, 'ext': 'flv', 'vbr': int(quality), diff --git a/youtube_dl/extractor/safari.py b/youtube_dl/extractor/safari.py index 7de7b7273..6ba91f202 100644 --- a/youtube_dl/extractor/safari.py +++ b/youtube_dl/extractor/safari.py @@ -4,14 +4,13 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from .brightcove import BrightcoveLegacyIE from ..utils import ( ExtractorError, sanitized_Request, - smuggle_url, std_headers, urlencode_postdata, + update_url_query, ) @@ -20,28 +19,30 @@ class SafariBaseIE(InfoExtractor): _SUCCESSFUL_LOGIN_REGEX = r'<a href="/accounts/logout/"[^>]*>Sign Out</a>' _NETRC_MACHINE = 'safari' - _API_BASE = 'https://www.safaribooksonline.com/api/v1/book' + _API_BASE = 'https://www.safaribooksonline.com/api/v1' _API_FORMAT = 'json' LOGGED_IN = False def _real_initialize(self): - # We only need to log in once for courses or individual videos - if not self.LOGGED_IN: - self._login() - SafariBaseIE.LOGGED_IN = True + self._login() def _login(self): + # We only need to log in once for courses or individual videos + if self.LOGGED_IN: + return + (username, password) = self._get_login_info() if username is None: - self.raise_login_required('safaribooksonline.com account is required') + return - headers = std_headers + headers = std_headers.copy() if 'Referer' not in headers: headers['Referer'] = self._LOGIN_URL + login_page_request = sanitized_Request(self._LOGIN_URL, headers=headers) login_page = self._download_webpage( - self._LOGIN_URL, None, + login_page_request, None, 'Downloading login form') csrf = self._html_search_regex( @@ -66,35 +67,27 @@ class SafariBaseIE(InfoExtractor): 'Login failed; make sure your credentials are correct 
and try again.', expected=True) + SafariBaseIE.LOGGED_IN = True + self.to_screen('Login successful') class SafariIE(SafariBaseIE): IE_NAME = 'safari' IE_DESC = 'safaribooksonline.com online video' - _VALID_URL = r'''(?x)https?:// - (?:www\.)?safaribooksonline\.com/ - (?: - library/view/[^/]+| - api/v1/book - )/ - (?P<course_id>[^/]+)/ - (?:chapter(?:-content)?/)? - (?P<part>part\d+)\.html - ''' + _VALID_URL = r'https?://(?:www\.)?safaribooksonline\.com/library/view/[^/]+/(?P<course_id>[^/]+)/(?P<part>part\d+)\.html' _TESTS = [{ 'url': 'https://www.safaribooksonline.com/library/view/hadoop-fundamentals-livelessons/9780133392838/part00.html', - 'md5': '5b0c4cc1b3c1ba15dda7344085aa5592', + 'md5': 'dcc5a425e79f2564148652616af1f2a3', 'info_dict': { - 'id': '2842601850001', + 'id': '0_qbqx90ic', 'ext': 'mp4', - 'title': 'Introduction', + 'title': 'Introduction to Hadoop Fundamentals LiveLessons', + 'timestamp': 1437758058, + 'upload_date': '20150724', + 'uploader_id': 'stork', }, - 'skip': 'Requires safaribooksonline account credentials', - }, { - 'url': 'https://www.safaribooksonline.com/api/v1/book/9780133392838/chapter/part00.html', - 'only_matching': True, }, { # non-digits in course id 'url': 'https://www.safaribooksonline.com/library/view/create-a-nodejs/100000006A0210/part00.html', @@ -103,18 +96,55 @@ class SafariIE(SafariBaseIE): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - course_id = mobj.group('course_id') - part = mobj.group('part') + video_id = '%s/%s' % (mobj.group('course_id'), mobj.group('part')) - webpage = self._download_webpage( - '%s/%s/chapter-content/%s.html' % (self._API_BASE, course_id, part), - part) + webpage = self._download_webpage(url, video_id) + reference_id = self._search_regex( + r'data-reference-id=(["\'])(?P<id>.+?)\1', + webpage, 'kaltura reference id', group='id') + partner_id = self._search_regex( + r'data-partner-id=(["\'])(?P<id>.+?)\1', + webpage, 'kaltura widget id', group='id') + ui_id = 
self._search_regex( + r'data-ui-id=(["\'])(?P<id>.+?)\1', + webpage, 'kaltura uiconf id', group='id') - bc_url = BrightcoveLegacyIE._extract_brightcove_url(webpage) - if not bc_url: - raise ExtractorError('Could not extract Brightcove URL from %s' % url, expected=True) + query = { + 'wid': '_%s' % partner_id, + 'uiconf_id': ui_id, + 'flashvars[referenceId]': reference_id, + } - return self.url_result(smuggle_url(bc_url, {'Referer': url}), 'BrightcoveLegacy') + if self.LOGGED_IN: + kaltura_session = self._download_json( + '%s/player/kaltura_session/?reference_id=%s' % (self._API_BASE, reference_id), + video_id, 'Downloading kaltura session JSON', + 'Unable to download kaltura session JSON', fatal=False) + if kaltura_session: + session = kaltura_session.get('session') + if session: + query['flashvars[ks]'] = session + + return self.url_result(update_url_query( + 'https://cdnapisec.kaltura.com/html5/html5lib/v2.37.1/mwEmbedFrame.php', query), + 'Kaltura') + + +class SafariApiIE(SafariBaseIE): + IE_NAME = 'safari:api' + _VALID_URL = r'https?://(?:www\.)?safaribooksonline\.com/api/v1/book/(?P<course_id>[^/]+)/chapter(?:-content)?/(?P<part>part\d+)\.html' + + _TEST = { + 'url': 'https://www.safaribooksonline.com/api/v1/book/9780133392838/chapter/part00.html', + 'only_matching': True, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + part = self._download_json( + url, '%s/%s' % (mobj.group('course_id'), mobj.group('part')), + 'Downloading part JSON') + return self.url_result(part['web_url'], SafariIE.ie_key()) class SafariCourseIE(SafariBaseIE): @@ -140,7 +170,7 @@ class SafariCourseIE(SafariBaseIE): course_id = self._match_id(url) course_json = self._download_json( - '%s/%s/?override_format=%s' % (self._API_BASE, course_id, self._API_FORMAT), + '%s/book/%s/?override_format=%s' % (self._API_BASE, course_id, self._API_FORMAT), course_id, 'Downloading course JSON') if 'chapters' not in course_json: @@ -148,7 +178,7 @@ class 
SafariCourseIE(SafariBaseIE): 'No chapters found for course %s' % course_id, expected=True) entries = [ - self.url_result(chapter, 'Safari') + self.url_result(chapter, SafariApiIE.ie_key()) for chapter in course_json['chapters']] course_title = course_json['title'] diff --git a/youtube_dl/extractor/sbs.py b/youtube_dl/extractor/sbs.py index d6ee2d9e2..2f96477ca 100644 --- a/youtube_dl/extractor/sbs.py +++ b/youtube_dl/extractor/sbs.py @@ -2,6 +2,10 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..utils import ( + smuggle_url, + ExtractorError, +) class SBSIE(InfoExtractor): @@ -31,21 +35,28 @@ class SBSIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) + player_params = self._download_json( + 'http://www.sbs.com.au/api/video_pdkvars/id/%s?form=json' % video_id, video_id) - webpage = self._download_webpage( - 'http://www.sbs.com.au/ondemand/video/single/%s?context=web' % video_id, video_id) - - player_params = self._parse_json( - self._search_regex( - r'(?s)var\s+playerParams\s*=\s*({.+?});', webpage, 'playerParams'), - video_id) + error = player_params.get('error') + if error: + error_message = 'Sorry, The video you are looking for does not exist.' + video_data = error.get('results') or {} + error_code = error.get('errorCode') + if error_code == 'ComingSoon': + error_message = '%s is not yet available.' % video_data.get('title', '') + elif error_code in ('Forbidden', 'intranetAccessOnly'): + error_message = 'Sorry, This video cannot be accessed via this website' + elif error_code == 'Expired': + error_message = 'Sorry, %s is no longer available.' 
% video_data.get('title', '') + raise ExtractorError('%s said: %s' % (self.IE_NAME, error_message), expected=True) urls = player_params['releaseUrls'] - theplatform_url = (urls.get('progressive') or urls.get('standard') or - urls.get('html') or player_params['relatedItemsURL']) + theplatform_url = (urls.get('progressive') or urls.get('html') or + urls.get('standard') or player_params['relatedItemsURL']) return { '_type': 'url_transparent', 'id': video_id, - 'url': theplatform_url, + 'url': smuggle_url(theplatform_url, {'force_smil_url': True}), } diff --git a/youtube_dl/extractor/screenjunkies.py b/youtube_dl/extractor/screenjunkies.py index f2af15f6b..dd0a6ba19 100644 --- a/youtube_dl/extractor/screenjunkies.py +++ b/youtube_dl/extractor/screenjunkies.py @@ -11,7 +11,7 @@ from ..utils import ( class ScreenJunkiesIE(InfoExtractor): - _VALID_URL = r'http://www.screenjunkies.com/video/(?P<display_id>[^/]+?)(?:-(?P<id>\d+))?(?:[/?#&]|$)' + _VALID_URL = r'https?://www.screenjunkies.com/video/(?P<display_id>[^/]+?)(?:-(?P<id>\d+))?(?:[/?#&]|$)' _TESTS = [{ 'url': 'http://www.screenjunkies.com/video/best-quentin-tarantino-movie-2841915', 'md5': '5c2b686bec3d43de42bde9ec047536b0', diff --git a/youtube_dl/extractor/senateisvp.py b/youtube_dl/extractor/senateisvp.py index 4d3b58522..c5f474dd1 100644 --- a/youtube_dl/extractor/senateisvp.py +++ b/youtube_dl/extractor/senateisvp.py @@ -48,7 +48,7 @@ class SenateISVPIE(InfoExtractor): ['arch', '', 'http://ussenate-f.akamaihd.net/'] ] _IE_NAME = 'senate.gov' - _VALID_URL = r'http://www\.senate\.gov/isvp/?\?(?P<qs>.+)' + _VALID_URL = r'https?://www\.senate\.gov/isvp/?\?(?P<qs>.+)' _TESTS = [{ 'url': 'http://www.senate.gov/isvp/?comm=judiciary&type=live&stt=&filename=judiciary031715&auto_play=false&wmode=transparent&poster=http%3A%2F%2Fwww.judiciary.senate.gov%2Fthemes%2Fjudiciary%2Fimages%2Fvideo-poster-flash-fit.png', 'info_dict': { diff --git a/youtube_dl/extractor/sexu.py b/youtube_dl/extractor/sexu.py index 
6365a8779..a99b2a8e7 100644 --- a/youtube_dl/extractor/sexu.py +++ b/youtube_dl/extractor/sexu.py @@ -1,7 +1,5 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor @@ -14,7 +12,7 @@ class SexuIE(InfoExtractor): 'id': '961791', 'ext': 'mp4', 'title': 'md5:4d05a19a5fc049a63dbbaf05fb71d91b', - 'description': 'md5:c5ed8625eb386855d5a7967bd7b77a54', + 'description': 'md5:2b75327061310a3afb3fbd7d09e2e403', 'categories': list, # NSFW 'thumbnail': 're:https?://.*\.jpg$', 'age_limit': 18, @@ -25,13 +23,18 @@ class SexuIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - quality_arr = self._search_regex( - r'sources:\s*\[([^\]]+)\]', webpage, 'forrmat string') + jwvideo = self._parse_json( + self._search_regex(r'\.setup\(\s*({.+?})\s*\);', webpage, 'jwvideo'), + video_id) + + sources = jwvideo['sources'] + formats = [{ - 'url': fmt[0].replace('\\', ''), - 'format_id': fmt[1], - 'height': int(fmt[1][:3]), - } for fmt in re.findall(r'"file":"([^"]+)","label":"([^"]+)"', quality_arr)] + 'url': source['file'].replace('\\', ''), + 'format_id': source.get('label'), + 'height': self._search_regex( + r'^(\d+)[pP]', source.get('label', ''), 'height', default=None), + } for source in sources if source.get('file')] self._sort_formats(formats) title = self._html_search_regex( @@ -40,9 +43,7 @@ class SexuIE(InfoExtractor): description = self._html_search_meta( 'description', webpage, 'description') - thumbnail = self._html_search_regex( - r'image:\s*"([^"]+)"', - webpage, 'thumbnail', fatal=False) + thumbnail = jwvideo.get('image') categories_str = self._html_search_meta( 'keywords', webpage, 'categories') diff --git a/youtube_dl/extractor/shared.py b/youtube_dl/extractor/shared.py index 8eda3c864..96fe0b90d 100644 --- a/youtube_dl/extractor/shared.py +++ b/youtube_dl/extractor/shared.py @@ -13,7 +13,7 @@ from ..utils import ( class SharedIE(InfoExtractor): IE_DESC = 'shared.sx and vivo.sx' - _VALID_URL = 
r'http://(?:shared|vivo)\.sx/(?P<id>[\da-z]{10})' + _VALID_URL = r'https?://(?:shared|vivo)\.sx/(?P<id>[\da-z]{10})' _TESTS = [{ 'url': 'http://shared.sx/0060718775', diff --git a/youtube_dl/extractor/sport5.py b/youtube_dl/extractor/sport5.py index dfe50ed45..7e6783306 100644 --- a/youtube_dl/extractor/sport5.py +++ b/youtube_dl/extractor/sport5.py @@ -8,7 +8,7 @@ from ..utils import ExtractorError class Sport5IE(InfoExtractor): - _VALID_URL = r'http://(?:www|vod)?\.sport5\.co\.il/.*\b(?:Vi|docID)=(?P<id>\d+)' + _VALID_URL = r'https?://(?:www|vod)?\.sport5\.co\.il/.*\b(?:Vi|docID)=(?P<id>\d+)' _TESTS = [ { 'url': 'http://vod.sport5.co.il/?Vc=147&Vi=176331&Page=1', diff --git a/youtube_dl/extractor/ssa.py b/youtube_dl/extractor/ssa.py index 13101c714..54d1843f2 100644 --- a/youtube_dl/extractor/ssa.py +++ b/youtube_dl/extractor/ssa.py @@ -8,7 +8,7 @@ from ..utils import ( class SSAIE(InfoExtractor): - _VALID_URL = r'http://ssa\.nls\.uk/film/(?P<id>\d+)' + _VALID_URL = r'https?://ssa\.nls\.uk/film/(?P<id>\d+)' _TEST = { 'url': 'http://ssa.nls.uk/film/3561', 'info_dict': { diff --git a/youtube_dl/extractor/svt.py b/youtube_dl/extractor/svt.py index 399c3b8ee..2ab30e45f 100644 --- a/youtube_dl/extractor/svt.py +++ b/youtube_dl/extractor/svt.py @@ -19,20 +19,25 @@ class SVTBaseIE(InfoExtractor): video_info = info['video'] formats = [] for vr in video_info['videoReferences']: + player_type = vr.get('playerType') vurl = vr['url'] ext = determine_ext(vurl) if ext == 'm3u8': formats.extend(self._extract_m3u8_formats( vurl, video_id, ext='mp4', entry_protocol='m3u8_native', - m3u8_id=vr.get('playerType'))) + m3u8_id=player_type, fatal=False)) elif ext == 'f4m': formats.extend(self._extract_f4m_formats( vurl + '?hdcore=3.3.0', video_id, - f4m_id=vr.get('playerType'))) + f4m_id=player_type, fatal=False)) + elif ext == 'mpd': + if player_type == 'dashhbbtv': + formats.extend(self._extract_mpd_formats( + vurl, video_id, mpd_id=player_type, fatal=False)) else: formats.append({ - 
'format_id': vr.get('playerType'), + 'format_id': player_type, 'url': vurl, }) self._sort_formats(formats) diff --git a/youtube_dl/extractor/sztvhu.py b/youtube_dl/extractor/sztvhu.py index aa5964acb..f562aa6d3 100644 --- a/youtube_dl/extractor/sztvhu.py +++ b/youtube_dl/extractor/sztvhu.py @@ -5,7 +5,7 @@ from .common import InfoExtractor class SztvHuIE(InfoExtractor): - _VALID_URL = r'http://(?:(?:www\.)?sztv\.hu|www\.tvszombathely\.hu)/(?:[^/]+)/.+-(?P<id>[0-9]+)' + _VALID_URL = r'https?://(?:(?:www\.)?sztv\.hu|www\.tvszombathely\.hu)/(?:[^/]+)/.+-(?P<id>[0-9]+)' _TEST = { 'url': 'http://sztv.hu/hirek/cserkeszek-nepszerusitettek-a-kornyezettudatos-eletmodot-a-savaria-teren-20130909', 'md5': 'a6df607b11fb07d0e9f2ad94613375cb', diff --git a/youtube_dl/extractor/teamcoco.py b/youtube_dl/extractor/teamcoco.py index d1b7264b4..b49ab5f5b 100644 --- a/youtube_dl/extractor/teamcoco.py +++ b/youtube_dl/extractor/teamcoco.py @@ -16,7 +16,7 @@ from ..compat import compat_ord class TeamcocoIE(InfoExtractor): - _VALID_URL = r'http://teamcoco\.com/video/(?P<video_id>[0-9]+)?/?(?P<display_id>.*)' + _VALID_URL = r'https?://teamcoco\.com/video/(?P<video_id>[0-9]+)?/?(?P<display_id>.*)' _TESTS = [ { 'url': 'http://teamcoco.com/video/80187/conan-becomes-a-mary-kay-beauty-consultant', diff --git a/youtube_dl/extractor/ted.py b/youtube_dl/extractor/ted.py index a48d77c30..cf8851438 100644 --- a/youtube_dl/extractor/ted.py +++ b/youtube_dl/extractor/ted.py @@ -73,7 +73,7 @@ class TEDIE(InfoExtractor): 'add_ie': ['Youtube'], 'info_dict': { 'id': '_ZG8HBuDjgc', - 'ext': 'mp4', + 'ext': 'webm', 'title': 'Douglas Adams: Parrots the Universe and Everything', 'description': 'md5:01ad1e199c49ac640cb1196c0e9016af', 'uploader': 'University of California Television (UCTV)', diff --git a/youtube_dl/extractor/tele13.py b/youtube_dl/extractor/tele13.py index 4e860db0a..a29a64b6d 100644 --- a/youtube_dl/extractor/tele13.py +++ b/youtube_dl/extractor/tele13.py @@ -11,7 +11,7 @@ from ..utils import 
( class Tele13IE(InfoExtractor): - _VALID_URL = r'^http://(?:www\.)?t13\.cl/videos(?:/[^/]+)+/(?P<id>[\w-]+)' + _VALID_URL = r'^https?://(?:www\.)?t13\.cl/videos(?:/[^/]+)+/(?P<id>[\w-]+)' _TESTS = [ { 'url': 'http://www.t13.cl/videos/actualidad/el-circulo-de-hierro-de-michelle-bachelet-en-su-regreso-a-la-moneda', diff --git a/youtube_dl/extractor/tf1.py b/youtube_dl/extractor/tf1.py index 9ee844684..3f54b2744 100644 --- a/youtube_dl/extractor/tf1.py +++ b/youtube_dl/extractor/tf1.py @@ -6,7 +6,7 @@ from .common import InfoExtractor class TF1IE(InfoExtractor): """TF1 uses the wat.tv player.""" - _VALID_URL = r'http://(?:(?:videos|www|lci)\.tf1|www\.tfou)\.fr/(?:[^/]+/)*(?P<id>.+?)\.html' + _VALID_URL = r'https?://(?:(?:videos|www|lci)\.tf1|www\.tfou)\.fr/(?:[^/]+/)*(?P<id>.+?)\.html' _TESTS = [{ 'url': 'http://videos.tf1.fr/auto-moto/citroen-grand-c4-picasso-2013-presentation-officielle-8062060.html', 'info_dict': { diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index 93d871571..863914299 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -8,13 +8,12 @@ import binascii import hashlib -from .common import InfoExtractor +from .once import OnceIE from ..compat import ( compat_parse_qs, compat_urllib_parse_urlparse, ) from ..utils import ( - determine_ext, ExtractorError, float_or_none, int_or_none, @@ -22,36 +21,34 @@ from ..utils import ( unsmuggle_url, xpath_with_ns, mimetype2ext, + find_xpath_attr, ) default_ns = 'http://www.w3.org/2005/SMIL21/Language' _x = lambda p: xpath_with_ns(p, {'smil': default_ns}) -class ThePlatformBaseIE(InfoExtractor): +class ThePlatformBaseIE(OnceIE): def _extract_theplatform_smil(self, smil_url, video_id, note='Downloading SMIL data'): - meta = self._download_xml(smil_url, video_id, note=note) - try: - error_msg = next( - n.attrib['abstract'] - for n in meta.findall(_x('.//smil:ref')) - if n.attrib.get('title') == 'Geographic Restriction' or 
n.attrib.get('title') == 'Expired') - except StopIteration: - pass - else: - raise ExtractorError(error_msg, expected=True) + meta = self._download_xml(smil_url, video_id, note=note, query={'format': 'SMIL'}) + error_element = find_xpath_attr(meta, _x('.//smil:ref'), 'src') + if error_element is not None and error_element.attrib['src'].startswith( + 'http://link.theplatform.com/s/errorFiles/Unavailable.'): + raise ExtractorError(error_element.attrib['abstract'], expected=True) - formats = self._parse_smil_formats( + smil_formats = self._parse_smil_formats( meta, smil_url, video_id, namespace=default_ns, # the parameters are from syfy.com, other sites may use others, # they also work for nbc.com f4m_params={'g': 'UXWGVKRWHFSP', 'hdcore': '3.0.3'}, transform_rtmp_url=lambda streamer, src: (streamer, 'mp4:' + src)) - for _format in formats: - ext = determine_ext(_format['url']) - if ext == 'once': - _format['ext'] = 'mp4' + formats = [] + for _format in smil_formats: + if OnceIE.suitable(_format['url']): + formats.extend(self._extract_once_formats(_format['url'])) + else: + formats.append(_format) self._sort_formats(formats) @@ -128,7 +125,7 @@ class ThePlatformIE(ThePlatformBaseIE): 'only_matching': True, }, { 'url': 'http://player.theplatform.com/p/2E2eJC/nbcNewsOffsite?guid=tdy_or_siri_150701', - 'md5': '734f3790fb5fc4903da391beeebc4836', + 'md5': 'fb96bb3d85118930a5b055783a3bd992', 'info_dict': { 'id': 'tdy_or_siri_150701', 'ext': 'mp4', @@ -138,7 +135,6 @@ class ThePlatformIE(ThePlatformBaseIE): 'thumbnail': 're:^https?://.*\.jpg$', 'timestamp': 1435752600, 'upload_date': '20150701', - 'categories': ['Today/Shows/Orange Room', 'Today/Sections/Money', 'Today/Topics/Tech', "Today/Topics/Editor's picks"], }, }, { # From http://www.nbc.com/the-blacklist/video/sir-crispin-crandall/2928790?onid=137781#vc137781=1 @@ -216,7 +212,7 @@ class ThePlatformIE(ThePlatformBaseIE): webpage, 'smil url', group='url') path = self._search_regex( 
r'link\.theplatform\.com/s/((?:[^/?#&]+/)+[^/?#&]+)', smil_url, 'path') - smil_url += '?' if '?' not in smil_url else '&' + 'formats=m3u,mpeg4&format=SMIL' + smil_url += '?' if '?' not in smil_url else '&' + 'formats=m3u,mpeg4' elif mobj.group('config'): config_url = url + '&form=json' config_url = config_url.replace('swf/', 'config/') @@ -226,9 +222,9 @@ class ThePlatformIE(ThePlatformBaseIE): release_url = config['releaseUrl'] else: release_url = 'http://link.theplatform.com/s/%s?mbr=true' % path - smil_url = release_url + '&format=SMIL&formats=MPEG4&manifest=f4m' + smil_url = release_url + '&formats=MPEG4&manifest=f4m' else: - smil_url = 'http://link.theplatform.com/s/%s/meta.smil?format=smil&mbr=true' % path + smil_url = 'http://link.theplatform.com/s/%s?mbr=true' % path sig = smuggled_data.get('sig') if sig: @@ -253,7 +249,7 @@ class ThePlatformFeedIE(ThePlatformBaseIE): _TEST = { # From http://player.theplatform.com/p/7wvmTC/MSNBCEmbeddedOffSite?guid=n_hardball_5biden_140207 'url': 'http://feed.theplatform.com/f/7wvmTC/msnbc_video-p-test?form=json&pretty=true&range=-40&byGuid=n_hardball_5biden_140207', - 'md5': '22d2b84f058d3586efcd99e57d59d314', + 'md5': '6e32495b5073ab414471b615c5ded394', 'info_dict': { 'id': 'n_hardball_5biden_140207', 'ext': 'mp4', @@ -283,7 +279,7 @@ class ThePlatformFeedIE(ThePlatformBaseIE): first_video_id = None duration = None for item in entry['media$content']: - smil_url = item['plfile$url'] + '&format=SMIL&mbr=true' + smil_url = item['plfile$url'] + '&mbr=true' cur_video_id = ThePlatformIE._match_id(smil_url) if first_video_id is None: first_video_id = cur_video_id diff --git a/youtube_dl/extractor/thescene.py b/youtube_dl/extractor/thescene.py new file mode 100644 index 000000000..08d666eaf --- /dev/null +++ b/youtube_dl/extractor/thescene.py @@ -0,0 +1,48 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor + +from ..compat import compat_urllib_parse +from ..utils import qualities + + +class 
TheSceneIE(InfoExtractor): + _VALID_URL = r'https://thescene\.com/watch/[^/]+/(?P<id>[^/#?]+)' + + _TEST = { + 'url': 'https://thescene.com/watch/vogue/narciso-rodriguez-spring-2013-ready-to-wear', + 'info_dict': { + 'id': '520e8faac2b4c00e3c6e5f43', + 'ext': 'mp4', + 'title': 'Narciso Rodriguez: Spring 2013 Ready-to-Wear', + 'display_id': 'narciso-rodriguez-spring-2013-ready-to-wear', + }, + } + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + player_url = compat_urllib_parse.urljoin( + url, + self._html_search_regex( + r'id=\'js-player-script\'[^>]+src=\'(.+?)\'', webpage, 'player url')) + + self.to_screen(player_url) + player = self._download_webpage(player_url, player_url) + info = self._parse_json(self._search_regex(r'(?m)var\s+video\s+=\s+({.+?});$', player, 'info json'), display_id) + + qualities_order = qualities(['low', 'high']) + formats = [{ + 'format_id': '{0}-{1}'.format(f['type'].split('/')[0], f['quality']), + 'url': f['src'], + 'quality': qualities_order(f['quality']), + } for f in info['sources'][0]] + self._sort_formats(formats) + + return { + 'id': info['id'], + 'title': info['title'], + 'formats': formats, + 'thumbnail': info.get('poster_frame'), + 'display_id': display_id, + } diff --git a/youtube_dl/extractor/thestar.py b/youtube_dl/extractor/thestar.py new file mode 100644 index 000000000..b7e9af2af --- /dev/null +++ b/youtube_dl/extractor/thestar.py @@ -0,0 +1,31 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from .brightcove import BrightcoveLegacyIE +from ..compat import compat_parse_qs + + +class TheStarIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?thestar\.com/(?:[^/]+/)*(?P<id>.+)\.html' + _TEST = { + 'url': 'http://www.thestar.com/life/2016/02/01/mankind-why-this-woman-started-a-men-s-skincare-line.html', + 'md5': '2c62dd4db2027e35579fefb97a8b6554', + 'info_dict': { + 'id': '4732393888001', + 'ext': 
'mp4', + 'title': 'Mankind: Why this woman started a men\'s skin care line', + 'description': 'Robert Cribb talks to Young Lee, the founder of Uncle Peter\'s MAN.', + 'uploader_id': '794267642001', + 'timestamp': 1454353482, + 'upload_date': '20160201', + } + } + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/794267642001/default_default/index.html?videoId=%s' + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + brightcove_legacy_url = BrightcoveLegacyIE._extract_brightcove_url(webpage) + brightcove_id = compat_parse_qs(brightcove_legacy_url)['@videoPlayer'][0] + return self.url_result(self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, 'BrightcoveNew', brightcove_id) diff --git a/youtube_dl/extractor/thvideo.py b/youtube_dl/extractor/thvideo.py index 496f15d80..406f4a826 100644 --- a/youtube_dl/extractor/thvideo.py +++ b/youtube_dl/extractor/thvideo.py @@ -10,7 +10,7 @@ from ..utils import ( class THVideoIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?thvideo\.tv/(?:v/th|mobile\.php\?cid=)(?P<id>[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?thvideo\.tv/(?:v/th|mobile\.php\?cid=)(?P<id>[0-9]+)' _TEST = { 'url': 'http://thvideo.tv/v/th1987/', 'md5': 'fa107b1f73817e325e9433505a70db50', diff --git a/youtube_dl/extractor/tinypic.py b/youtube_dl/extractor/tinypic.py index e036b8cdf..c43cace24 100644 --- a/youtube_dl/extractor/tinypic.py +++ b/youtube_dl/extractor/tinypic.py @@ -9,7 +9,7 @@ from ..utils import ExtractorError class TinyPicIE(InfoExtractor): IE_NAME = 'tinypic' IE_DESC = 'tinypic.com videos' - _VALID_URL = r'http://(?:.+?\.)?tinypic\.com/player\.php\?v=(?P<id>[^&]+)&s=\d+' + _VALID_URL = r'https?://(?:.+?\.)?tinypic\.com/player\.php\?v=(?P<id>[^&]+)&s=\d+' _TESTS = [ { diff --git a/youtube_dl/extractor/tlc.py b/youtube_dl/extractor/tlc.py index adc05ed5f..abad3ff64 100644 --- a/youtube_dl/extractor/tlc.py +++ b/youtube_dl/extractor/tlc.py @@ -4,12 +4,12 @@ import re from .common 
import InfoExtractor from .brightcove import BrightcoveLegacyIE -from ..compat import compat_urlparse +from ..compat import compat_parse_qs class TlcDeIE(InfoExtractor): IE_NAME = 'tlc.de' - _VALID_URL = r'http://www\.tlc\.de/sendungen/[^/]+/videos/(?P<title>[^/?]+)' + _VALID_URL = r'https?://www\.tlc\.de/(?:[^/]+/)*videos/(?P<title>[^/?#]+)?(?:.*#(?P<id>\d+))?' _TEST = { 'url': 'http://www.tlc.de/sendungen/breaking-amish/videos/#3235167922001', @@ -17,32 +17,23 @@ class TlcDeIE(InfoExtractor): 'id': '3235167922001', 'ext': 'mp4', 'title': 'Breaking Amish: Die Welt da draußen', - 'uploader': 'Discovery Networks - Germany', 'description': ( 'Vier Amische und eine Mennonitin wagen in New York' ' den Sprung in ein komplett anderes Leben. Begleitet sie auf' ' ihrem spannenden Weg.'), + 'timestamp': 1396598084, + 'upload_date': '20140404', + 'uploader_id': '1659832546', }, } + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1659832546/default_default/index.html?videoId=%s' def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - title = mobj.group('title') - webpage = self._download_webpage(url, title) - iframe_url = self._search_regex( - '<iframe src="(http://www\.tlc\.de/wp-content/.+?)"', webpage, - 'iframe url') - # Otherwise we don't get the correct 'BrightcoveExperience' element, - # example: http://www.tlc.de/sendungen/cake-boss/videos/cake-boss-cannoli-drama/ - iframe_url = iframe_url.replace('.htm?', '.php?') - url_fragment = compat_urlparse.urlparse(url).fragment - if url_fragment: - # Since the fragment is not send to the server, we always get the same iframe - iframe_url = re.sub(r'playlist=(\d+)', 'playlist=%s' % url_fragment, iframe_url) - iframe = self._download_webpage(iframe_url, title) - - return { - '_type': 'url', - 'url': BrightcoveLegacyIE._extract_brightcove_url(iframe), - 'ie': BrightcoveLegacyIE.ie_key(), - } + brightcove_id = mobj.group('id') + if not brightcove_id: + title = mobj.group('title') + webpage = 
self._download_webpage(url, title) + brightcove_legacy_url = BrightcoveLegacyIE._extract_brightcove_url(webpage) + brightcove_id = compat_parse_qs(brightcove_legacy_url)['@videoPlayer'][0] + return self.url_result(self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, 'BrightcoveNew', brightcove_id) diff --git a/youtube_dl/extractor/toypics.py b/youtube_dl/extractor/toypics.py index 2756f56d3..2579ba8c6 100644 --- a/youtube_dl/extractor/toypics.py +++ b/youtube_dl/extractor/toypics.py @@ -41,7 +41,7 @@ class ToypicsIE(InfoExtractor): class ToypicsUserIE(InfoExtractor): IE_DESC = 'Toypics user profile' - _VALID_URL = r'http://videos\.toypics\.net/(?P<username>[^/?]+)(?:$|[?#])' + _VALID_URL = r'https?://videos\.toypics\.net/(?P<username>[^/?]+)(?:$|[?#])' _TEST = { 'url': 'http://videos.toypics.net/Mikey', 'info_dict': { diff --git a/youtube_dl/extractor/traileraddict.py b/youtube_dl/extractor/traileraddict.py index 0e01b15fc..747370d12 100644 --- a/youtube_dl/extractor/traileraddict.py +++ b/youtube_dl/extractor/traileraddict.py @@ -7,7 +7,7 @@ from .common import InfoExtractor class TrailerAddictIE(InfoExtractor): _WORKING = False - _VALID_URL = r'(?:http://)?(?:www\.)?traileraddict\.com/(?:trailer|clip)/(?P<movie>.+?)/(?P<trailer_name>.+)' + _VALID_URL = r'(?:https?://)?(?:www\.)?traileraddict\.com/(?:trailer|clip)/(?P<movie>.+?)/(?P<trailer_name>.+)' _TEST = { 'url': 'http://www.traileraddict.com/trailer/prince-avalanche/trailer', 'md5': '41365557f3c8c397d091da510e73ceb4', diff --git a/youtube_dl/extractor/trollvids.py b/youtube_dl/extractor/trollvids.py index d239949a6..657705623 100644 --- a/youtube_dl/extractor/trollvids.py +++ b/youtube_dl/extractor/trollvids.py @@ -7,7 +7,7 @@ from .nuevo import NuevoBaseIE class TrollvidsIE(NuevoBaseIE): - _VALID_URL = r'http://(?:www\.)?trollvids\.com/video/(?P<id>\d+)/(?P<display_id>[^/?#&]+)' + _VALID_URL = r'https?://(?:www\.)?trollvids\.com/video/(?P<id>\d+)/(?P<display_id>[^/?#&]+)' IE_NAME = 'trollvids' _TEST = { 'url': 
'http://trollvids.com/video/2349002/%E3%80%90MMD-R-18%E3%80%91%E3%82%AC%E3%83%BC%E3%83%AB%E3%83%95%E3%83%AC%E3%83%B3%E3%83%89-carrymeoff', diff --git a/youtube_dl/extractor/tudou.py b/youtube_dl/extractor/tudou.py index f56b66d06..9892e8a62 100644 --- a/youtube_dl/extractor/tudou.py +++ b/youtube_dl/extractor/tudou.py @@ -6,6 +6,7 @@ from .common import InfoExtractor from ..compat import compat_str from ..utils import ( int_or_none, + InAdvancePagedList, float_or_none, unescapeHTML, ) @@ -75,15 +76,16 @@ class TudouIE(InfoExtractor): quality = sorted(filter(lambda k: k.isdigit(), segments.keys()), key=lambda k: int(k))[-1] parts = segments[quality] - result = [] len_parts = len(parts) if len_parts > 1: self.to_screen('%s: found %s parts' % (video_id, len_parts)) - for part in parts: + + def part_func(partnum): + part = parts[partnum] part_id = part['k'] final_url = self._url_for_id(part_id, quality) ext = (final_url.split('?')[0]).split('.')[-1] - part_info = { + return [{ 'id': '%s' % part_id, 'url': final_url, 'ext': ext, @@ -97,12 +99,13 @@ class TudouIE(InfoExtractor): 'http_headers': { 'Referer': self._PLAYER_URL, }, - } - result.append(part_info) + }] + + entries = InAdvancePagedList(part_func, len_parts, 1) return { '_type': 'multi_video', - 'entries': result, + 'entries': entries, 'id': video_id, 'title': title, } diff --git a/youtube_dl/extractor/tumblr.py b/youtube_dl/extractor/tumblr.py index 4f844706d..e5bcf7798 100644 --- a/youtube_dl/extractor/tumblr.py +++ b/youtube_dl/extractor/tumblr.py @@ -8,7 +8,7 @@ from ..utils import int_or_none class TumblrIE(InfoExtractor): - _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/(?:post|video)/(?P<id>[0-9]+)(?:$|[/?#])' + _VALID_URL = r'https?://(?P<blog_name>.*?)\.tumblr\.com/(?:post|video)/(?P<id>[0-9]+)(?:$|[/?#])' _TESTS = [{ 'url': 'http://tatianamaslanydaily.tumblr.com/post/54196191430/orphan-black-dvd-extra-behind-the-scenes', 'md5': '479bb068e5b16462f5176a6828829767', @@ -67,6 +67,34 @@ class 
TumblrIE(InfoExtractor): 'uploader_id': 'user32021558', }, 'add_ie': ['Vimeo'], + }, { + 'url': 'http://sutiblr.tumblr.com/post/139638707273', + 'md5': '2dd184b3669e049ba40563a7d423f95c', + 'info_dict': { + 'id': 'ir7qBEIKqvq', + 'ext': 'mp4', + 'title': 'Vine by sutiblr', + 'alt_title': 'Vine by sutiblr', + 'uploader': 'sutiblr', + 'uploader_id': '1198993975374495744', + 'upload_date': '20160220', + 'like_count': int, + 'comment_count': int, + 'repost_count': int, + }, + 'add_ie': ['Vine'], + }, { + 'url': 'http://vitasidorkina.tumblr.com/post/134652425014/joskriver-victoriassecret-invisibility-or', + 'md5': '01c12ceb82cbf6b2fe0703aa56b3ad72', + 'info_dict': { + 'id': '-7LnUPGlSo', + 'ext': 'mp4', + 'title': 'Video by victoriassecret', + 'description': 'Invisibility or flight…which superpower would YOU choose? #VSFashionShow #ThisOrThat', + 'uploader_id': 'victoriassecret', + 'thumbnail': 're:^https?://.*\.jpg' + }, + 'add_ie': ['Instagram'], }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/tunein.py b/youtube_dl/extractor/tunein.py index 8322cc14d..ae4cfaec2 100644 --- a/youtube_dl/extractor/tunein.py +++ b/youtube_dl/extractor/tunein.py @@ -1,7 +1,7 @@ # coding: utf-8 from __future__ import unicode_literals -import json +import re from .common import InfoExtractor from ..utils import ExtractorError @@ -27,10 +27,9 @@ class TuneInBaseIE(InfoExtractor): if not streams_url.startswith('http://'): streams_url = compat_urlparse.urljoin(url, streams_url) - stream_data = self._download_webpage( - streams_url, content_id, note='Downloading stream data') - streams = json.loads(self._search_regex( - r'\((.*)\);', stream_data, 'stream info'))['Streams'] + streams = self._download_json( + streams_url, content_id, note='Downloading stream data', + transform_source=lambda s: re.sub(r'^\s*\((.*)\);\s*$', r'\1', s))['Streams'] is_live = None formats = [] diff --git a/youtube_dl/extractor/tv2.py b/youtube_dl/extractor/tv2.py index 1457e524e..86bb7915d 100644 --- 
a/youtube_dl/extractor/tv2.py +++ b/youtube_dl/extractor/tv2.py @@ -14,7 +14,7 @@ from ..utils import ( class TV2IE(InfoExtractor): - _VALID_URL = 'http://(?:www\.)?tv2\.no/v/(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?tv2\.no/v/(?P<id>\d+)' _TEST = { 'url': 'http://www.tv2.no/v/916509/', 'info_dict': { @@ -100,7 +100,7 @@ class TV2IE(InfoExtractor): class TV2ArticleIE(InfoExtractor): - _VALID_URL = 'http://(?:www\.)?tv2\.no/(?:a|\d{4}/\d{2}/\d{2}(/[^/]+)+)/(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?tv2\.no/(?:a|\d{4}/\d{2}/\d{2}(/[^/]+)+)/(?P<id>\d+)' _TESTS = [{ 'url': 'http://www.tv2.no/2015/05/16/nyheter/alesund/krim/pingvin/6930542', 'info_dict': { diff --git a/youtube_dl/extractor/tv3.py b/youtube_dl/extractor/tv3.py new file mode 100644 index 000000000..d3f690dc7 --- /dev/null +++ b/youtube_dl/extractor/tv3.py @@ -0,0 +1,33 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class TV3IE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?tv3\.co\.nz/(?P<id>[^/]+)/tabid/\d+/articleID/\d+/MCat/\d+/Default\.aspx' + _TEST = { + 'url': 'http://www.tv3.co.nz/MOTORSPORT-SRS-SsangYong-Hampton-Downs-Round-3/tabid/3692/articleID/121615/MCat/2915/Default.aspx', + 'info_dict': { + 'id': '4659127992001', + 'ext': 'mp4', + 'title': 'CRC Motorsport: SRS SsangYong Hampton Downs Round 3 - S2015 Ep3', + 'description': 'SsangYong Racing Series returns for Round 3 with drivers from New Zealand and Australia taking to the grid at Hampton Downs raceway.', + 'uploader_id': '3812193411001', + 'upload_date': '20151213', + 'timestamp': 1449975272, + }, + 'expected_warnings': [ + 'Failed to download MPD manifest' + ], + 'params': { + 'skip_download': True, + }, + } + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/3812193411001/default_default/index.html?videoId=%s' + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + brightcove_id = 
self._search_regex(r'<param\s*name="@videoPlayer"\s*value="(\d+)"', webpage, 'brightcove id') + return self.url_result(self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, 'BrightcoveNew', brightcove_id) diff --git a/youtube_dl/extractor/tvc.py b/youtube_dl/extractor/tvc.py index 3a4f393fc..4065354dd 100644 --- a/youtube_dl/extractor/tvc.py +++ b/youtube_dl/extractor/tvc.py @@ -11,7 +11,7 @@ from ..utils import ( class TVCIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?tvc\.ru/video/iframe/id/(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?tvc\.ru/video/iframe/id/(?P<id>\d+)' _TEST = { 'url': 'http://www.tvc.ru/video/iframe/id/74622/isPlay/false/id_stat/channel/?acc_video_id=/channel/brand/id/17/show/episodes/episode_id/39702', 'md5': 'bbc5ff531d1e90e856f60fc4b3afd708', @@ -64,7 +64,7 @@ class TVCIE(InfoExtractor): class TVCArticleIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?tvc\.ru/(?!video/iframe/id/)(?P<id>[^?#]+)' + _VALID_URL = r'https?://(?:www\.)?tvc\.ru/(?!video/iframe/id/)(?P<id>[^?#]+)' _TESTS = [{ 'url': 'http://www.tvc.ru/channel/brand/id/29/show/episodes/episode_id/39702/', 'info_dict': { diff --git a/youtube_dl/extractor/tvplay.py b/youtube_dl/extractor/tvplay.py index b4683de54..df70a6b23 100644 --- a/youtube_dl/extractor/tvplay.py +++ b/youtube_dl/extractor/tvplay.py @@ -13,7 +13,7 @@ from ..utils import ( class TVPlayIE(InfoExtractor): IE_DESC = 'TV3Play and related services' - _VALID_URL = r'''(?x)http://(?:www\.)? + _VALID_URL = r'''(?x)https?://(?:www\.)? 
(?:tvplay\.lv/parraides| tv3play\.lt/programos| play\.tv3\.lt/programos| diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index 5b8586097..d4169ec6d 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -17,6 +17,7 @@ from ..utils import ( encode_dict, ExtractorError, int_or_none, + orderedSet, parse_duration, parse_iso8601, sanitized_Request, @@ -251,6 +252,7 @@ class TwitchVodIE(TwitchItemBaseIE): self._USHER_BASE, item_id, compat_urllib_parse.urlencode({ 'allow_source': 'true', + 'allow_audio_only': 'true', 'allow_spectre': 'true', 'player': 'twitchweb', 'nauth': access_token['token'], @@ -281,17 +283,37 @@ class TwitchPlaylistBaseIE(TwitchBaseIE): entries = [] offset = 0 limit = self._PAGE_LIMIT + broken_paging_detected = False + counter_override = None for counter in itertools.count(1): response = self._download_json( self._PLAYLIST_URL % (channel_id, offset, limit), - channel_id, 'Downloading %s videos JSON page %d' % (self._PLAYLIST_TYPE, counter)) + channel_id, + 'Downloading %s videos JSON page %s' + % (self._PLAYLIST_TYPE, counter_override or counter)) page_entries = self._extract_playlist_page(response) if not page_entries: break + total = int_or_none(response.get('_total')) + # Since the beginning of March 2016 twitch's paging mechanism + # is completely broken on the twitch side. It simply ignores + # a limit and returns the whole offset number of videos. + # Working around by just requesting all videos at once. + # Upd: pagination bug was fixed by twitch on 15.03.2016. 
+ if not broken_paging_detected and total and len(page_entries) > limit: + self.report_warning( + 'Twitch pagination is broken on twitch side, requesting all videos at once', + channel_id) + broken_paging_detected = True + offset = total + counter_override = '(all at once)' + continue entries.extend(page_entries) + if broken_paging_detected or total and len(page_entries) >= total: + break offset += limit return self.playlist_result( - [self.url_result(entry) for entry in set(entries)], + [self.url_result(entry) for entry in orderedSet(entries)], channel_id, channel_name) def _extract_playlist_page(self, response): @@ -303,7 +325,6 @@ class TwitchPlaylistBaseIE(TwitchBaseIE): class TwitchProfileIE(TwitchPlaylistBaseIE): - _WORKING = False IE_NAME = 'twitch:profile' _VALID_URL = r'%s/(?P<id>[^/]+)/profile/?(?:\#.*)?$' % TwitchBaseIE._VALID_URL_BASE _PLAYLIST_TYPE = 'profile' @@ -319,7 +340,6 @@ class TwitchProfileIE(TwitchPlaylistBaseIE): class TwitchPastBroadcastsIE(TwitchPlaylistBaseIE): - _WORKING = False IE_NAME = 'twitch:past_broadcasts' _VALID_URL = r'%s/(?P<id>[^/]+)/profile/past_broadcasts/?(?:\#.*)?$' % TwitchBaseIE._VALID_URL_BASE _PLAYLIST_URL = TwitchPlaylistBaseIE._PLAYLIST_URL + '&broadcasts=true' @@ -336,7 +356,6 @@ class TwitchPastBroadcastsIE(TwitchPlaylistBaseIE): class TwitchBookmarksIE(TwitchPlaylistBaseIE): - _WORKING = False IE_NAME = 'twitch:bookmarks' _VALID_URL = r'%s/(?P<id>[^/]+)/profile/bookmarks/?(?:\#.*)?$' % TwitchBaseIE._VALID_URL_BASE _PLAYLIST_URL = '%s/api/bookmark/?user=%%s&offset=%%d&limit=%%d' % TwitchBaseIE._API_BASE @@ -414,6 +433,7 @@ class TwitchStreamIE(TwitchBaseIE): query = { 'allow_source': 'true', + 'allow_audio_only': 'true', 'p': random.randint(1000000, 10000000), 'player': 'twitchweb', 'segment_preference': '4', diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py index 5d2b5ec35..e70b2ab3c 100644 --- a/youtube_dl/extractor/twitter.py +++ b/youtube_dl/extractor/twitter.py @@ -10,7 +10,6 @@ 
from ..utils import ( remove_end, int_or_none, ExtractorError, - sanitized_Request, ) @@ -22,7 +21,7 @@ class TwitterBaseIE(InfoExtractor): class TwitterCardIE(TwitterBaseIE): IE_NAME = 'twitter:card' - _VALID_URL = r'https?://(?:www\.)?twitter\.com/i/cards/tfw/v1/(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?twitter\.com/i/(?:cards/tfw/v1|videos/tweet)/(?P<id>\d+)' _TESTS = [ { 'url': 'https://twitter.com/i/cards/tfw/v1/560070183650213889', @@ -30,7 +29,7 @@ class TwitterCardIE(TwitterBaseIE): 'info_dict': { 'id': '560070183650213889', 'ext': 'mp4', - 'title': 'TwitterCard', + 'title': 'Twitter Card', 'thumbnail': 're:^https?://.*\.jpg$', 'duration': 30.033, } @@ -41,7 +40,7 @@ class TwitterCardIE(TwitterBaseIE): 'info_dict': { 'id': '623160978427936768', 'ext': 'mp4', - 'title': 'TwitterCard', + 'title': 'Twitter Card', 'thumbnail': 're:^https?://.*\.jpg', 'duration': 80.155, }, @@ -72,63 +71,102 @@ class TwitterCardIE(TwitterBaseIE): 'title': 'Vine by ArsenalTerje', }, 'add_ie': ['Vine'], - } + }, { + 'url': 'https://twitter.com/i/videos/tweet/705235433198714880', + 'md5': '3846d0a07109b5ab622425449b59049d', + 'info_dict': { + 'id': '705235433198714880', + 'ext': 'mp4', + 'title': 'Twitter web player', + 'thumbnail': 're:^https?://.*\.jpg', + }, + }, ] def _real_extract(self, url): video_id = self._match_id(url) - # Different formats served for different User-Agents - USER_AGENTS = [ - 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/20.0 (Chrome)', # mp4 - 'Mozilla/5.0 (Windows NT 5.2; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0', # webm - ] - config = None formats = [] - for user_agent in USER_AGENTS: - request = sanitized_Request(url) - request.add_header('User-Agent', user_agent) - webpage = self._download_webpage(request, video_id) + duration = None - iframe_url = self._html_search_regex( - r'<iframe[^>]+src="((?:https?:)?//(?:www.youtube.com/embed/[^"]+|(?:www\.)?vine\.co/v/\w+/card))"', - webpage, 'video iframe', default=None) - if 
iframe_url: - return self.url_result(iframe_url) + webpage = self._download_webpage(url, video_id) - config = self._parse_json(self._html_search_regex( - r'data-player-config="([^"]+)"', webpage, 'data player config'), - video_id) - if 'playlist' not in config: - if 'vmapUrl' in config: - formats.append({ - 'url': self._get_vmap_video_url(config['vmapUrl'], video_id), - }) - break # same video regardless of UA - continue + iframe_url = self._html_search_regex( + r'<iframe[^>]+src="((?:https?:)?//(?:www.youtube.com/embed/[^"]+|(?:www\.)?vine\.co/v/\w+/card))"', + webpage, 'video iframe', default=None) + if iframe_url: + return self.url_result(iframe_url) - video_url = config['playlist'][0]['source'] + config = self._parse_json(self._html_search_regex( + r'data-(?:player-)?config="([^"]+)"', webpage, 'data player config'), + video_id) + + def _search_dimensions_in_video_url(a_format, video_url): + m = re.search(r'/(?P<width>\d+)x(?P<height>\d+)/', video_url) + if m: + a_format.update({ + 'width': int(m.group('width')), + 'height': int(m.group('height')), + }) + + playlist = config.get('playlist') + if playlist: + video_url = playlist[0]['source'] f = { 'url': video_url, } - m = re.search(r'/(?P<width>\d+)x(?P<height>\d+)/', video_url) - if m: - f.update({ - 'width': int(m.group('width')), - 'height': int(m.group('height')), - }) + _search_dimensions_in_video_url(f, video_url) + formats.append(f) + + vmap_url = config.get('vmapUrl') or config.get('vmap_url') + if vmap_url: + formats.append({ + 'url': self._get_vmap_video_url(vmap_url, video_id), + }) + + media_info = None + + for entity in config.get('status', {}).get('entities', []): + if 'mediaInfo' in entity: + media_info = entity['mediaInfo'] + + if media_info: + for media_variant in media_info['variants']: + media_url = media_variant['url'] + if media_url.endswith('.m3u8'): + formats.extend(self._extract_m3u8_formats(media_url, video_id, ext='mp4', m3u8_id='hls')) + elif media_url.endswith('.mpd'): + 
formats.extend(self._extract_mpd_formats(media_url, video_id, mpd_id='dash')) + else: + vbr = int_or_none(media_variant.get('bitRate'), scale=1000) + a_format = { + 'url': media_url, + 'format_id': 'http-%d' % vbr if vbr else 'http', + 'vbr': vbr, + } + # Reported bitRate may be zero + if not a_format['vbr']: + del a_format['vbr'] + + _search_dimensions_in_video_url(a_format, media_url) + + formats.append(a_format) + + duration = float_or_none(media_info.get('duration', {}).get('nanos'), scale=1e9) + self._sort_formats(formats) - thumbnail = config.get('posterImageUrl') - duration = float_or_none(config.get('duration')) + title = self._search_regex(r'<title>([^<]+)', webpage, 'title') + thumbnail = config.get('posterImageUrl') or config.get('image_src') + duration = float_or_none(config.get('duration')) or duration return { 'id': video_id, - 'title': 'TwitterCard', + 'title': title, 'thumbnail': thumbnail, 'duration': duration, 'formats': formats, @@ -142,7 +180,6 @@ class TwitterIE(InfoExtractor): _TESTS = [{ 'url': 'https://twitter.com/freethenipple/status/643211948184596480', - # MD5 checksums are different in different places 'info_dict': { 'id': '643211948184596480', 'ext': 'mp4', @@ -153,6 +190,9 @@ class TwitterIE(InfoExtractor): 'uploader': 'FREE THE NIPPLE', 'uploader_id': 'freethenipple', }, + 'params': { + 'skip_download': True, # requires ffmpeg + }, }, { 'url': 'https://twitter.com/giphz/status/657991469417025536/photo/1', 'md5': 'f36dcd5fb92bf7057f155e7d927eeb42', @@ -177,6 +217,36 @@ class TwitterIE(InfoExtractor): 'uploader_id': 'starwars', 'uploader': 'Star Wars', }, + }, { + 'url': 'https://twitter.com/BTNBrentYarina/status/705235433198714880', + 'info_dict': { + 'id': '705235433198714880', + 'ext': 'mp4', + 'title': 'Brent Yarina - Khalil Iverson\'s missed highlight dunk. And made highlight dunk. In one highlight.', + 'description': 'Brent Yarina on Twitter: "Khalil Iverson\'s missed highlight dunk. And made highlight dunk. 
In one highlight."', + 'uploader_id': 'BTNBrentYarina', + 'uploader': 'Brent Yarina', + }, + 'params': { + # The same video as https://twitter.com/i/videos/tweet/705235433198714880 + # Test case of TwitterCardIE + 'skip_download': True, + }, + }, { + 'url': 'https://twitter.com/jaydingeer/status/700207533655363584', + 'md5': '', + 'info_dict': { + 'id': '700207533655363584', + 'ext': 'mp4', + 'title': 'jay - BEAT PROD: @suhmeduh #Damndaniel', + 'description': 'jay on Twitter: "BEAT PROD: @suhmeduh https://t.co/HBrQ4AfpvZ #Damndaniel https://t.co/byBooq2ejZ"', + 'thumbnail': 're:^https?://.*\.jpg', + 'uploader': 'jay', + 'uploader_id': 'jaydingeer', + }, + 'params': { + 'skip_download': True, # requires ffmpeg + }, }] def _real_extract(self, url): @@ -234,6 +304,15 @@ class TwitterIE(InfoExtractor): }) return info + if 'class="PlayableMedia' in webpage: + info.update({ + '_type': 'url_transparent', + 'ie_key': 'TwitterCard', + 'url': '%s//twitter.com/i/videos/tweet/%s' % (self.http_scheme(), twid), + }) + + return info + raise ExtractorError('There\'s no video in this tweet.') diff --git a/youtube_dl/extractor/ubu.py b/youtube_dl/extractor/ubu.py index d50237758..1d52cbc98 100644 --- a/youtube_dl/extractor/ubu.py +++ b/youtube_dl/extractor/ubu.py @@ -10,7 +10,7 @@ from ..utils import ( class UbuIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?ubu\.com/film/(?P[\da-z_-]+)\.html' + _VALID_URL = r'https?://(?:www\.)?ubu\.com/film/(?P[\da-z_-]+)\.html' _TEST = { 'url': 'http://ubu.com/film/her_noise.html', 'md5': '138d5652618bf0f03878978db9bef1ee', diff --git a/youtube_dl/extractor/udemy.py b/youtube_dl/extractor/udemy.py index f5b5e7fd6..a9046b865 100644 --- a/youtube_dl/extractor/udemy.py +++ b/youtube_dl/extractor/udemy.py @@ -5,6 +5,7 @@ from ..compat import ( compat_HTTPError, compat_urllib_parse, compat_urllib_request, + compat_urlparse, ) from ..utils import ( ExtractorError, @@ -17,7 +18,16 @@ from ..utils import ( class UdemyIE(InfoExtractor): IE_NAME = 
'udemy' - _VALID_URL = r'https?://www\.udemy\.com/(?:[^#]+#/lecture/|lecture/view/?\?lectureId=)(?P\d+)' + _VALID_URL = r'''(?x) + https?:// + www\.udemy\.com/ + (?: + [^#]+\#/lecture/| + lecture/view/?\?lectureId=| + [^/]+/learn/v4/t/lecture/ + ) + (?P\d+) + ''' _LOGIN_URL = 'https://www.udemy.com/join/login-popup/?displayType=ajax&showSkipButton=1' _ORIGIN_URL = 'https://www.udemy.com' _NETRC_MACHINE = 'udemy' @@ -33,9 +43,13 @@ class UdemyIE(InfoExtractor): 'duration': 579.29, }, 'skip': 'Requires udemy account credentials', + }, { + # new URL schema + 'url': 'https://www.udemy.com/electric-bass-right-from-the-start/learn/v4/t/lecture/4580906', + 'only_matching': True, }] - def _enroll_course(self, webpage, course_id): + def _enroll_course(self, base_url, webpage, course_id): checkout_url = unescapeHTML(self._search_regex( r'href=(["\'])(?Phttps?://(?:www\.)?udemy\.com/payment/checkout/.+?)\1', webpage, 'checkout url', group='url', default=None)) @@ -45,9 +59,11 @@ class UdemyIE(InfoExtractor): 'Use this URL to confirm purchase: %s' % (course_id, checkout_url), expected=True) enroll_url = unescapeHTML(self._search_regex( - r'href=(["\'])(?Phttps?://(?:www\.)?udemy\.com/course/subscribe/.+?)\1', + r'href=(["\'])(?P(?:https?://(?:www\.)?udemy\.com)?/course/subscribe/.+?)\1', webpage, 'enroll url', group='url', default=None)) if enroll_url: + if not enroll_url.startswith('http'): + enroll_url = compat_urlparse.urljoin(base_url, enroll_url) webpage = self._download_webpage(enroll_url, course_id, 'Enrolling in the course') if '>You have enrolled in' in webpage: self.to_screen('%s: Successfully enrolled in the course' % course_id) @@ -144,14 +160,15 @@ class UdemyIE(InfoExtractor): webpage = self._download_webpage(url, lecture_id) course_id = self._search_regex( - r'data-course-id=["\'](\d+)', webpage, 'course id') + (r'data-course-id=["\'](\d+)', r'"id"\s*:\s*(\d+)'), + webpage, 'course id') try: lecture = self._download_lecture(course_id, lecture_id) except 
ExtractorError as e: # Error could possibly mean we are not enrolled in the course if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: - self._enroll_course(webpage, course_id) + self._enroll_course(url, webpage, course_id) lecture = self._download_lecture(course_id, lecture_id) else: raise @@ -176,39 +193,57 @@ class UdemyIE(InfoExtractor): video_id = asset['id'] thumbnail = asset.get('thumbnailUrl') or asset.get('thumbnail_url') duration = float_or_none(asset.get('data', {}).get('duration')) - outputs = asset.get('data', {}).get('outputs', {}) formats = [] - for format_ in asset.get('download_urls', {}).get('Video', []): - video_url = format_.get('file') - if not video_url: - continue - format_id = format_.get('label') - f = { - 'url': format_['file'], - 'height': int_or_none(format_id), + + def extract_output_format(src): + return { + 'url': src['url'], + 'format_id': '%sp' % (src.get('label') or format_id), + 'width': int_or_none(src.get('width')), + 'height': int_or_none(src.get('height')), + 'vbr': int_or_none(src.get('video_bitrate_in_kbps')), + 'vcodec': src.get('video_codec'), + 'fps': int_or_none(src.get('frame_rate')), + 'abr': int_or_none(src.get('audio_bitrate_in_kbps')), + 'acodec': src.get('audio_codec'), + 'asr': int_or_none(src.get('audio_sample_rate')), + 'tbr': int_or_none(src.get('total_bitrate_in_kbps')), + 'filesize': int_or_none(src.get('file_size_in_bytes')), } - if format_id: - # Some videos contain additional metadata (e.g. 
- # https://www.udemy.com/ios9-swift/learn/#/lecture/3383208) - output = outputs.get(format_id) - if isinstance(output, dict): - f.update({ - 'format_id': '%sp' % (output.get('label') or format_id), - 'width': int_or_none(output.get('width')), - 'height': int_or_none(output.get('height')), - 'vbr': int_or_none(output.get('video_bitrate_in_kbps')), - 'vcodec': output.get('video_codec'), - 'fps': int_or_none(output.get('frame_rate')), - 'abr': int_or_none(output.get('audio_bitrate_in_kbps')), - 'acodec': output.get('audio_codec'), - 'asr': int_or_none(output.get('audio_sample_rate')), - 'tbr': int_or_none(output.get('total_bitrate_in_kbps')), - 'filesize': int_or_none(output.get('file_size_in_bytes')), - }) - else: - f['format_id'] = '%sp' % format_id - formats.append(f) + + outputs = asset.get('data', {}).get('outputs') + if not isinstance(outputs, dict): + outputs = {} + + for format_id, output in outputs.items(): + if isinstance(output, dict) and output.get('url'): + formats.append(extract_output_format(output)) + + download_urls = asset.get('download_urls') + if isinstance(download_urls, dict): + video = download_urls.get('Video') + if isinstance(video, list): + for format_ in video: + video_url = format_.get('file') + if not video_url: + continue + format_id = format_.get('label') + f = { + 'url': format_['file'], + 'height': int_or_none(format_id), + } + if format_id: + # Some videos contain additional metadata (e.g. 
+ # https://www.udemy.com/ios9-swift/learn/#/lecture/3383208) + output = outputs.get(format_id) + if isinstance(output, dict): + output_format = extract_output_format(output) + output_format.update(f) + f = output_format + else: + f['format_id'] = '%sp' % format_id + formats.append(f) self._sort_formats(formats) @@ -243,7 +278,7 @@ class UdemyCourseIE(UdemyIE): course_id = response['id'] course_title = response.get('title') - self._enroll_course(webpage, course_id) + self._enroll_course(url, webpage, course_id) response = self._download_json( 'https://www.udemy.com/api-1.1/courses/%s/curriculum' % course_id, diff --git a/youtube_dl/extractor/unistra.py b/youtube_dl/extractor/unistra.py index 594bee4f9..66d9f1bf3 100644 --- a/youtube_dl/extractor/unistra.py +++ b/youtube_dl/extractor/unistra.py @@ -7,7 +7,7 @@ from ..utils import qualities class UnistraIE(InfoExtractor): - _VALID_URL = r'http://utv\.unistra\.fr/(?:index|video)\.php\?id_video\=(?P\d+)' + _VALID_URL = r'https?://utv\.unistra\.fr/(?:index|video)\.php\?id_video\=(?P\d+)' _TESTS = [ { diff --git a/youtube_dl/extractor/usatoday.py b/youtube_dl/extractor/usatoday.py new file mode 100644 index 000000000..e5678dc78 --- /dev/null +++ b/youtube_dl/extractor/usatoday.py @@ -0,0 +1,48 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + get_element_by_attribute, + parse_duration, + update_url_query, + ExtractorError, +) +from ..compat import compat_str + + +class USATodayIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?usatoday\.com/(?:[^/]+/)*(?P[^?/#]+)' + _TEST = { + 'url': 'http://www.usatoday.com/media/cinematic/video/81729424/us-france-warn-syrian-regime-ahead-of-new-peace-talks/', + 'md5': '4d40974481fa3475f8bccfd20c5361f8', + 'info_dict': { + 'id': '81729424', + 'ext': 'mp4', + 'title': 'US, France warn Syrian regime ahead of new peace talks', + 'timestamp': 1457891045, + 'description': 'md5:7e50464fdf2126b0f533748d3c78d58f', + 
'uploader_id': '29906170001', + 'upload_date': '20160313', + } + } + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/29906170001/38a9eecc-bdd8-42a3-ba14-95397e48b3f8_default/index.html?videoId=%s' + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(update_url_query(url, {'ajax': 'true'}), display_id) + ui_video_data = get_element_by_attribute('class', 'ui-video-data', webpage) + if not ui_video_data: + raise ExtractorError('no video on the webpage', expected=True) + video_data = self._parse_json(ui_video_data, display_id) + + return { + '_type': 'url_transparent', + 'url': self.BRIGHTCOVE_URL_TEMPLATE % video_data['brightcove_id'], + 'id': compat_str(video_data['id']), + 'title': video_data['title'], + 'thumbnail': video_data.get('thumbnail'), + 'description': video_data.get('description'), + 'duration': parse_duration(video_data.get('length')), + 'ie_key': 'BrightcoveNew', + } diff --git a/youtube_dl/extractor/vbox7.py b/youtube_dl/extractor/vbox7.py index 3794bcded..b755dda90 100644 --- a/youtube_dl/extractor/vbox7.py +++ b/youtube_dl/extractor/vbox7.py @@ -13,7 +13,7 @@ from ..utils import ( class Vbox7IE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?vbox7\.com/play:(?P[^/]+)' + _VALID_URL = r'https?://(?:www\.)?vbox7\.com/play:(?P[^/]+)' _TEST = { 'url': 'http://vbox7.com/play:249bb972c2', 'md5': '99f65c0c9ef9b682b97313e052734c3f', diff --git a/youtube_dl/extractor/veoh.py b/youtube_dl/extractor/veoh.py index 9633f7ffe..23ce0a0d1 100644 --- a/youtube_dl/extractor/veoh.py +++ b/youtube_dl/extractor/veoh.py @@ -12,7 +12,7 @@ from ..utils import ( class VeohIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?veoh\.com/(?:watch|iphone/#_Watch)/(?P(?:v|yapi-)[\da-zA-Z]+)' + _VALID_URL = r'https?://(?:www\.)?veoh\.com/(?:watch|iphone/#_Watch)/(?P(?:v|yapi-)[\da-zA-Z]+)' _TESTS = [ { diff --git a/youtube_dl/extractor/vesti.py b/youtube_dl/extractor/vesti.py index a0c59a2e0..cb64ae0bd 100644 --- 
a/youtube_dl/extractor/vesti.py +++ b/youtube_dl/extractor/vesti.py @@ -10,7 +10,7 @@ from .rutv import RUTVIE class VestiIE(InfoExtractor): IE_DESC = 'Вести.Ru' - _VALID_URL = r'http://(?:.+?\.)?vesti\.ru/(?P.+)' + _VALID_URL = r'https?://(?:.+?\.)?vesti\.ru/(?P.+)' _TESTS = [ { diff --git a/youtube_dl/extractor/vgtv.py b/youtube_dl/extractor/vgtv.py index 14e945d49..b11cd254c 100644 --- a/youtube_dl/extractor/vgtv.py +++ b/youtube_dl/extractor/vgtv.py @@ -20,6 +20,7 @@ class VGTVIE(XstreamIE): 'aftenbladet.no/tv': 'satv', 'fvn.no/fvntv': 'fvntv', 'aftenposten.no/webtv': 'aptv', + 'ap.vgtv.no/webtv': 'aptv', } _APP_NAME_TO_VENDOR = { @@ -35,7 +36,7 @@ class VGTVIE(XstreamIE): (?P %s ) - / + /? (?: \#!/(?:video|live)/| embed?.*id= @@ -107,19 +108,27 @@ class VGTVIE(XstreamIE): 'md5': 'fd828cd29774a729bf4d4425fe192972', 'info_dict': { 'id': '21039', - 'ext': 'mov', + 'ext': 'mp4', 'title': 'TRAILER: «SWEATSHOP» - I can´t take any more', 'description': 'md5:21891f2b0dd7ec2f78d84a50e54f8238', 'duration': 66, 'timestamp': 1417002452, 'upload_date': '20141126', 'view_count': int, - } + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, }, { 'url': 'http://www.bt.no/tv/#!/video/100250/norling-dette-er-forskjellen-paa-1-divisjon-og-eliteserien', 'only_matching': True, }, + { + 'url': 'http://ap.vgtv.no/webtv#!/video/111084/de-nye-bysyklene-lettere-bedre-gir-stoerre-hjul-og-feste-til-mobil', + 'only_matching': True, + }, ] def _real_extract(self, url): @@ -144,8 +153,6 @@ class VGTVIE(XstreamIE): if len(video_id) == 5: if appname == 'bttv': info = self._extract_video_info('btno', video_id) - elif appname == 'aptv': - info = self._extract_video_info('ap', video_id) streams = data['streamUrls'] stream_type = data.get('streamType') @@ -207,7 +214,7 @@ class VGTVIE(XstreamIE): class BTArticleIE(InfoExtractor): IE_NAME = 'bt:article' IE_DESC = 'Bergens Tidende Articles' - _VALID_URL = 'http://(?:www\.)?bt\.no/(?:[^/]+/)+(?P[^/]+)-\d+\.html' + _VALID_URL = 
r'https?://(?:www\.)?bt\.no/(?:[^/]+/)+(?P[^/]+)-\d+\.html' _TEST = { 'url': 'http://www.bt.no/nyheter/lokalt/Kjemper-for-internatet-1788214.html', 'md5': '2acbe8ad129b3469d5ae51b1158878df', @@ -234,7 +241,7 @@ class BTArticleIE(InfoExtractor): class BTVestlendingenIE(InfoExtractor): IE_NAME = 'bt:vestlendingen' IE_DESC = 'Bergens Tidende - Vestlendingen' - _VALID_URL = 'http://(?:www\.)?bt\.no/spesial/vestlendingen/#!/(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?bt\.no/spesial/vestlendingen/#!/(?P\d+)' _TESTS = [{ 'url': 'http://www.bt.no/spesial/vestlendingen/#!/86588', 'md5': 'd7d17e3337dc80de6d3a540aefbe441b', diff --git a/youtube_dl/extractor/vice.py b/youtube_dl/extractor/vice.py index 3db6286e4..46c785ae1 100644 --- a/youtube_dl/extractor/vice.py +++ b/youtube_dl/extractor/vice.py @@ -1,31 +1,37 @@ from __future__ import unicode_literals +import re + from .common import InfoExtractor from .ooyala import OoyalaIE from ..utils import ExtractorError class ViceIE(InfoExtractor): - _VALID_URL = r'https?://(?:.+?\.)?vice\.com/(?:[^/]+/)+(?P.+)' + _VALID_URL = r'https?://(?:.+?\.)?vice\.com/(?:[^/]+/)?videos?/(?P[^/?#&]+)' - _TESTS = [ - { - 'url': 'http://www.vice.com/Fringes/cowboy-capitalists-part-1', - 'info_dict': { - 'id': '43cW1mYzpia9IlestBjVpd23Yu3afAfp', - 'ext': 'mp4', - 'title': 'VICE_COWBOYCAPITALISTS_PART01_v1_VICE_WM_1080p.mov', - 'duration': 725.983, - }, - 'params': { - # Requires ffmpeg (m3u8 manifest) - 'skip_download': True, - }, - }, { - 'url': 'https://news.vice.com/video/experimenting-on-animals-inside-the-monkey-lab', - 'only_matching': True, - } - ] + _TESTS = [{ + 'url': 'http://www.vice.com/video/cowboy-capitalists-part-1', + 'info_dict': { + 'id': '43cW1mYzpia9IlestBjVpd23Yu3afAfp', + 'ext': 'mp4', + 'title': 'VICE_COWBOYCAPITALISTS_PART01_v1_VICE_WM_1080p.mov', + 'duration': 725.983, + }, + 'params': { + # Requires ffmpeg (m3u8 manifest) + 'skip_download': True, + }, + }, { + 'url': 
'https://news.vice.com/video/experimenting-on-animals-inside-the-monkey-lab', + 'only_matching': True, + }, { + 'url': 'http://www.vice.com/ru/video/big-night-out-ibiza-clive-martin-229', + 'only_matching': True, + }, { + 'url': 'https://munchies.vice.com/en/videos/watch-the-trailer-for-our-new-series-the-pizza-show', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) @@ -38,3 +44,35 @@ class ViceIE(InfoExtractor): except ExtractorError: raise ExtractorError('The page doesn\'t contain a video', expected=True) return self.url_result(ooyala_url, ie='Ooyala') + + +class ViceShowIE(InfoExtractor): + _VALID_URL = r'https?://(?:.+?\.)?vice\.com/(?:[^/]+/)?show/(?P[^/?#&]+)' + + _TEST = { + 'url': 'https://munchies.vice.com/en/show/fuck-thats-delicious-2', + 'info_dict': { + 'id': 'fuck-thats-delicious-2', + 'title': "Fuck, That's Delicious", + 'description': 'Follow the culinary adventures of rapper Action Bronson during his ongoing world tour.', + }, + 'playlist_count': 17, + } + + def _real_extract(self, url): + show_id = self._match_id(url) + webpage = self._download_webpage(url, show_id) + + entries = [ + self.url_result(video_url, ViceIE.ie_key()) + for video_url, _ in re.findall( + r']+class="article-title"[^>]+data-id="\d+"[^>]*>\s*]+href="(%s.*?)"' + % ViceIE._VALID_URL, webpage)] + + title = self._search_regex( + r'(.+?)', webpage, 'title', default=None) + if title: + title = re.sub(r'(.+)\s*\|\s*.+$', r'\1', title).strip() + description = self._html_search_meta('description', webpage, 'description') + + return self.playlist_result(entries, show_id, title, description) diff --git a/youtube_dl/extractor/videott.py b/youtube_dl/extractor/videott.py index 2cd36508a..0f798711b 100644 --- a/youtube_dl/extractor/videott.py +++ b/youtube_dl/extractor/videott.py @@ -14,7 +14,7 @@ class VideoTtIE(InfoExtractor): _WORKING = False ID_NAME = 'video.tt' IE_DESC = 'video.tt - Your True Tube' - _VALID_URL = 
r'http://(?:www\.)?video\.tt/(?:(?:video|embed)/|watch_video\.php\?v=)(?P[\da-zA-Z]{9})' + _VALID_URL = r'https?://(?:www\.)?video\.tt/(?:(?:video|embed)/|watch_video\.php\?v=)(?P[\da-zA-Z]{9})' _TESTS = [{ 'url': 'http://www.video.tt/watch_video.php?v=amd5YujV8', diff --git a/youtube_dl/extractor/viidea.py b/youtube_dl/extractor/viidea.py index 315984bf9..03b9f1353 100644 --- a/youtube_dl/extractor/viidea.py +++ b/youtube_dl/extractor/viidea.py @@ -15,7 +15,7 @@ from ..utils import ( class ViideaIE(InfoExtractor): - _VALID_URL = r'''(?x)http://(?:www\.)?(?: + _VALID_URL = r'''(?x)https?://(?:www\.)?(?: videolectures\.net| flexilearn\.viidea\.net| presentations\.ocwconsortium\.org| diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py index 433fc9914..e04b814c8 100644 --- a/youtube_dl/extractor/viki.py +++ b/youtube_dl/extractor/viki.py @@ -176,13 +176,13 @@ class VikiIE(VikiBaseIE): }, { # youtube external 'url': 'http://www.viki.com/videos/50562v-poor-nastya-complete-episode-1', - 'md5': '216d1afdc0c64d1febc1e9f2bd4b864b', + 'md5': '63f8600c1da6f01b7640eee7eca4f1da', 'info_dict': { 'id': '50562v', - 'ext': 'mp4', + 'ext': 'webm', 'title': 'Poor Nastya [COMPLETE] - Episode 1', 'description': '', - 'duration': 607, + 'duration': 606, 'timestamp': 1274949505, 'upload_date': '20101213', 'uploader': 'ad14065n', diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 9f282a1da..71c30d2cd 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -73,15 +73,26 @@ class VimeoIE(VimeoBaseInfoExtractor): # _VALID_URL matches Vimeo URLs _VALID_URL = r'''(?x) - https?:// - (?:(?:www|(?Pplayer))\.)? - vimeo(?Ppro)?\.com/ - (?!channels/[^/?#]+/?(?:$|[?#])|album/) - (?:.*?/)? - (?:(?:play_redirect_hls|moogaloop\.swf)\?clip_id=)? - (?:videos?/)? - (?P[0-9]+) - /?(?:[?&].*)?(?:[#].*)?$''' + https?:// + (?: + (?: + www| + (?Pplayer) + ) + \. + )? 
+ vimeo(?Ppro)?\.com/ + (?!channels/[^/?#]+/?(?:$|[?#])|(?:album|ondemand)/) + (?:.*?/)? + (?: + (?: + play_redirect_hls| + moogaloop\.swf)\?clip_id= + )? + (?:videos?/)? + (?P[0-9]+) + /?(?:[?&].*)?(?:[#].*)?$ + ''' IE_NAME = 'vimeo' _TESTS = [ { @@ -277,9 +288,8 @@ class VimeoIE(VimeoBaseInfoExtractor): def _real_extract(self, url): url, data = unsmuggle_url(url, {}) - headers = std_headers + headers = std_headers.copy() if 'http_headers' in data: - headers = headers.copy() headers.update(data['http_headers']) if 'Referer' not in headers: headers['Referer'] = url @@ -294,7 +304,7 @@ class VimeoIE(VimeoBaseInfoExtractor): url = 'https://vimeo.com/' + video_id # Retrieve video webpage to extract further information - request = sanitized_Request(url, None, headers) + request = sanitized_Request(url, headers=headers) try: webpage = self._download_webpage(request, video_id) except ExtractorError as ee: @@ -498,6 +508,38 @@ class VimeoIE(VimeoBaseInfoExtractor): } +class VimeoOndemandIE(VimeoBaseInfoExtractor): + IE_NAME = 'vimeo:ondemand' + _VALID_URL = r'https?://(?:www\.)?vimeo\.com/ondemand/(?P[^/?#&]+)' + _TESTS = [{ + # ondemand video not available via https://vimeo.com/id + 'url': 'https://vimeo.com/ondemand/20704', + 'md5': 'c424deda8c7f73c1dfb3edd7630e2f35', + 'info_dict': { + 'id': '105442900', + 'ext': 'mp4', + 'title': 'המעבדה - במאי יותם פלדמן', + 'uploader': 'גם סרטים', + 'uploader_url': 're:https?://(?:www\.)?vimeo\.com/gumfilms', + 'uploader_id': 'gumfilms', + }, + }, { + 'url': 'https://vimeo.com/ondemand/nazmaalik', + 'only_matching': True, + }, { + 'url': 'https://vimeo.com/ondemand/141692381', + 'only_matching': True, + }, { + 'url': 'https://vimeo.com/ondemand/thelastcolony/150274832', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + return self.url_result(self._og_search_video_url(webpage), VimeoIE.ie_key()) + + class 
VimeoChannelIE(VimeoBaseInfoExtractor): IE_NAME = 'vimeo:channel' _VALID_URL = r'https://vimeo\.com/channels/(?P[^/?#]+)/?(?:$|[?#])' diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index 670a438af..d560a4b5e 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -142,10 +142,10 @@ class VKIE(InfoExtractor): 'url': 'https://vk.com/video276849682_170681728', 'info_dict': { 'id': 'V3K4mi0SYkc', - 'ext': 'mp4', + 'ext': 'webm', 'title': "DSWD Awards 'Children's Joy Foundation, Inc.' Certificate of Registration and License to Operate", 'description': 'md5:bf9c26cfa4acdfb146362682edd3827a', - 'duration': 179, + 'duration': 178, 'upload_date': '20130116', 'uploader': "Children's Joy Foundation", 'uploader_id': 'thecjf', diff --git a/youtube_dl/extractor/vlive.py b/youtube_dl/extractor/vlive.py index 9e2aa58bd..bd5545173 100644 --- a/youtube_dl/extractor/vlive.py +++ b/youtube_dl/extractor/vlive.py @@ -64,7 +64,7 @@ class VLiveIE(InfoExtractor): thumbnail = self._og_search_thumbnail(webpage) creator = self._html_search_regex( - r']+class="info_area"[^>]*>\s*]+class="name"[^>]*>([^<]+)', + r']+class="info_area"[^>]*>\s*]*>([^<]+)', webpage, 'creator', fatal=False) view_count = int_or_none(playinfo.get('meta', {}).get('count')) diff --git a/youtube_dl/extractor/vube.py b/youtube_dl/extractor/vube.py index 149e36467..10ca6acb1 100644 --- a/youtube_dl/extractor/vube.py +++ b/youtube_dl/extractor/vube.py @@ -15,7 +15,7 @@ from ..utils import ( class VubeIE(InfoExtractor): IE_NAME = 'vube' IE_DESC = 'Vube.com' - _VALID_URL = r'http://vube\.com/(?:[^/]+/)+(?P[\da-zA-Z]{10})\b' + _VALID_URL = r'https?://vube\.com/(?:[^/]+/)+(?P[\da-zA-Z]{10})\b' _TESTS = [ { diff --git a/youtube_dl/extractor/vuclip.py b/youtube_dl/extractor/vuclip.py index a6d9b5fee..eaa888f00 100644 --- a/youtube_dl/extractor/vuclip.py +++ b/youtube_dl/extractor/vuclip.py @@ -14,7 +14,7 @@ from ..utils import ( class VuClipIE(InfoExtractor): - _VALID_URL = 
r'http://(?:m\.)?vuclip\.com/w\?.*?cid=(?P[0-9]+)' + _VALID_URL = r'https?://(?:m\.)?vuclip\.com/w\?.*?cid=(?P[0-9]+)' _TEST = { 'url': 'http://m.vuclip.com/w?cid=922692425&fid=70295&z=1010&nvar&frm=index.html', diff --git a/youtube_dl/extractor/walla.py b/youtube_dl/extractor/walla.py index 24efbd6e6..8b9488340 100644 --- a/youtube_dl/extractor/walla.py +++ b/youtube_dl/extractor/walla.py @@ -11,7 +11,7 @@ from ..utils import ( class WallaIE(InfoExtractor): - _VALID_URL = r'http://vod\.walla\.co\.il/[^/]+/(?P\d+)/(?P.+)' + _VALID_URL = r'https?://vod\.walla\.co\.il/[^/]+/(?P\d+)/(?P.+)' _TEST = { 'url': 'http://vod.walla.co.il/movie/2642630/one-direction-all-for-one', 'info_dict': { diff --git a/youtube_dl/extractor/wat.py b/youtube_dl/extractor/wat.py index 37cf3d309..5227bb5ad 100644 --- a/youtube_dl/extractor/wat.py +++ b/youtube_dl/extractor/wat.py @@ -12,7 +12,7 @@ from ..utils import ( class WatIE(InfoExtractor): - _VALID_URL = r'(?:wat:(?P\d{8})|http://www\.wat\.tv/video/(?P.*)-(?P.*?)_.*?\.html)' + _VALID_URL = r'(?:wat:(?P\d{8})|https?://www\.wat\.tv/video/(?P.*)-(?P.*?)_.*?\.html)' IE_NAME = 'wat.tv' _TESTS = [ { diff --git a/youtube_dl/extractor/wdr.py b/youtube_dl/extractor/wdr.py index a851578e0..31c904303 100644 --- a/youtube_dl/extractor/wdr.py +++ b/youtube_dl/extractor/wdr.py @@ -244,7 +244,7 @@ class WDRMobileIE(InfoExtractor): class WDRMausIE(InfoExtractor): - _VALID_URL = 'http://(?:www\.)?wdrmaus\.de/(?:[^/]+/){,2}(?P[^/?#]+)(?:/index\.php5|(?[^/?#]+)(?:/index\.php5|(?[A-Za-z0-9]+)' + _VALID_URL = r'https?://www\.weiqitv\.com/index/video_play\?videoId=(?P[A-Za-z0-9]+)' _TESTS = [{ 'url': 'http://www.weiqitv.com/index/video_play?videoId=53c744f09874f0e76a8b46f3', diff --git a/youtube_dl/extractor/wimp.py b/youtube_dl/extractor/wimp.py index 041ff6c55..828c03dc3 100644 --- a/youtube_dl/extractor/wimp.py +++ b/youtube_dl/extractor/wimp.py @@ -5,7 +5,7 @@ from .youtube import YoutubeIE class WimpIE(InfoExtractor): - _VALID_URL = 
r'http://(?:www\.)?wimp\.com/(?P[^/]+)' + _VALID_URL = r'https?://(?:www\.)?wimp\.com/(?P[^/]+)' _TESTS = [{ 'url': 'http://www.wimp.com/maruexhausted/', 'md5': 'ee21217ffd66d058e8b16be340b74883', @@ -20,7 +20,7 @@ class WimpIE(InfoExtractor): 'md5': '4e2986c793694b55b37cf92521d12bb4', 'info_dict': { 'id': 'clowncar', - 'ext': 'mp4', + 'ext': 'webm', 'title': 'It\'s like a clown car.', 'description': 'md5:0e56db1370a6e49c5c1d19124c0d2fb2', }, diff --git a/youtube_dl/extractor/wistia.py b/youtube_dl/extractor/wistia.py index 41061dd31..8b14840a2 100644 --- a/youtube_dl/extractor/wistia.py +++ b/youtube_dl/extractor/wistia.py @@ -4,6 +4,7 @@ from .common import InfoExtractor from ..utils import ( ExtractorError, sanitized_Request, + int_or_none, ) @@ -18,6 +19,9 @@ class WistiaIE(InfoExtractor): 'id': 'sh7fpupwlt', 'ext': 'mov', 'title': 'Being Resourceful', + 'description': 'a Clients From Hell Video Series video from worldwidewebhosting', + 'upload_date': '20131204', + 'timestamp': 1386185018, 'duration': 117, }, } @@ -32,35 +36,43 @@ class WistiaIE(InfoExtractor): raise ExtractorError('Error while getting the playlist', expected=True) data = data_json['media'] + title = data['name'] formats = [] thumbnails = [] for a in data['assets']: + astatus = a.get('status') atype = a.get('type') - if atype == 'still': + if (astatus is not None and astatus != 2) or atype == 'preview': + continue + elif atype in ('still', 'still_image'): thumbnails.append({ 'url': a['url'], 'resolution': '%dx%d' % (a['width'], a['height']), }) - continue - if atype == 'preview': - continue - formats.append({ - 'format_id': atype, - 'url': a['url'], - 'width': a['width'], - 'height': a['height'], - 'filesize': a['size'], - 'ext': a['ext'], - 'preference': 1 if atype == 'original' else None, - }) + else: + formats.append({ + 'format_id': atype, + 'url': a['url'], + 'tbr': int_or_none(a.get('bitrate')), + 'vbr': int_or_none(a.get('opt_vbitrate')), + 'width': int_or_none(a.get('width')), + 
'height': int_or_none(a.get('height')), + 'filesize': int_or_none(a.get('size')), + 'vcodec': a.get('codec'), + 'container': a.get('container'), + 'ext': a.get('ext'), + 'preference': 1 if atype == 'original' else None, + }) self._sort_formats(formats) return { 'id': video_id, - 'title': data['name'], + 'title': title, + 'description': data.get('seoDescription'), 'formats': formats, 'thumbnails': thumbnails, - 'duration': data.get('duration'), + 'duration': int_or_none(data.get('duration')), + 'timestamp': int_or_none(data.get('createdAt')), } diff --git a/youtube_dl/extractor/xbef.py b/youtube_dl/extractor/xbef.py index 4ff99e5ca..e4a2baad2 100644 --- a/youtube_dl/extractor/xbef.py +++ b/youtube_dl/extractor/xbef.py @@ -5,7 +5,7 @@ from ..compat import compat_urllib_parse_unquote class XBefIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?xbef\.com/video/(?P[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?xbef\.com/video/(?P[0-9]+)' _TEST = { 'url': 'http://xbef.com/video/5119-glamourous-lesbians-smoking-drinking-and-fucking', 'md5': 'a478b565baff61634a98f5e5338be995', diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py index fd43e8854..b3547174d 100644 --- a/youtube_dl/extractor/xhamster.py +++ b/youtube_dl/extractor/xhamster.py @@ -4,6 +4,7 @@ import re from .common import InfoExtractor from ..utils import ( + dict_get, float_or_none, int_or_none, unified_strdate, @@ -170,6 +171,12 @@ class XHamsterEmbedIE(InfoExtractor): video_url = self._search_regex( r'href="(https?://xhamster\.com/movies/%s/[^"]+\.html[^"]*)"' % video_id, - webpage, 'xhamster url') + webpage, 'xhamster url', default=None) + + if not video_url: + vars = self._parse_json( + self._search_regex(r'vars\s*:\s*({.+?})\s*,\s*\n', webpage, 'vars'), + video_id) + video_url = dict_get(vars, ('downloadLink', 'homepageLink', 'commentsLink', 'shareUrl')) return self.url_result(video_url, 'XHamster') diff --git a/youtube_dl/extractor/yam.py b/youtube_dl/extractor/yam.py index 
001ee17b6..63bbc0634 100644 --- a/youtube_dl/extractor/yam.py +++ b/youtube_dl/extractor/yam.py @@ -15,7 +15,7 @@ from ..utils import ( class YamIE(InfoExtractor): IE_DESC = '蕃薯藤yam天空部落' - _VALID_URL = r'http://mymedia.yam.com/m/(?P\d+)' + _VALID_URL = r'https?://mymedia.yam.com/m/(?P\d+)' _TESTS = [{ # An audio hosted on Yam diff --git a/youtube_dl/extractor/yandexmusic.py b/youtube_dl/extractor/yandexmusic.py index d3cc1a29f..e699e663f 100644 --- a/youtube_dl/extractor/yandexmusic.py +++ b/youtube_dl/extractor/yandexmusic.py @@ -10,13 +10,27 @@ from ..compat import ( compat_urllib_parse, ) from ..utils import ( + ExtractorError, int_or_none, float_or_none, sanitized_Request, ) -class YandexMusicTrackIE(InfoExtractor): +class YandexMusicBaseIE(InfoExtractor): + @staticmethod + def _handle_error(response): + error = response.get('error') + if error: + raise ExtractorError(error, expected=True) + + def _download_json(self, *args, **kwargs): + response = super(YandexMusicBaseIE, self)._download_json(*args, **kwargs) + self._handle_error(response) + return response + + +class YandexMusicTrackIE(YandexMusicBaseIE): IE_NAME = 'yandexmusic:track' IE_DESC = 'Яндекс.Музыка - Трек' _VALID_URL = r'https?://music\.yandex\.(?:ru|kz|ua|by)/album/(?P\d+)/track/(?P\d+)' @@ -73,7 +87,7 @@ class YandexMusicTrackIE(InfoExtractor): return self._get_track_info(track) -class YandexMusicPlaylistBaseIE(InfoExtractor): +class YandexMusicPlaylistBaseIE(YandexMusicBaseIE): def _build_playlist(self, tracks): return [ self.url_result( diff --git a/youtube_dl/extractor/ynet.py b/youtube_dl/extractor/ynet.py index 869f3e819..2522551dc 100644 --- a/youtube_dl/extractor/ynet.py +++ b/youtube_dl/extractor/ynet.py @@ -9,7 +9,7 @@ from ..compat import compat_urllib_parse_unquote_plus class YnetIE(InfoExtractor): - _VALID_URL = r'http://(?:.+?\.)?ynet\.co\.il/(?:.+?/)?0,7340,(?PL(?:-[0-9]+)+),00\.html' + _VALID_URL = r'https?://(?:.+?\.)?ynet\.co\.il/(?:.+?/)?0,7340,(?PL(?:-[0-9]+)+),00\.html' _TESTS 
= [ { 'url': 'http://hot.ynet.co.il/home/0,7340,L-11659-99244,00.html', diff --git a/youtube_dl/extractor/youporn.py b/youtube_dl/extractor/youporn.py index b29baafc4..1124fe6c2 100644 --- a/youtube_dl/extractor/youporn.py +++ b/youtube_dl/extractor/youporn.py @@ -75,7 +75,7 @@ class YouPornIE(InfoExtractor): links = [] sources = self._search_regex( - r'sources\s*:\s*({.+?})', webpage, 'sources', default=None) + r'(?s)sources\s*:\s*({.+?})', webpage, 'sources', default=None) if sources: for _, link in re.findall(r'[^:]+\s*:\s*(["\'])(http.+?)\1', sources): links.append(link) @@ -101,8 +101,9 @@ class YouPornIE(InfoExtractor): } # Video URL's path looks like this: # /201012/17/505835/720p_1500k_505835/YouPorn%20-%20Sex%20Ed%20Is%20It%20Safe%20To%20Masturbate%20Daily.mp4 + # /201012/17/505835/vl_240p_240k_505835/YouPorn%20-%20Sex%20Ed%20Is%20It%20Safe%20To%20Masturbate%20Daily.mp4 # We will benefit from it by extracting some metadata - mobj = re.search(r'/(?P\d{3,4})[pP]_(?P\d+)[kK]_\d+/', video_url) + mobj = re.search(r'(?P\d{3,4})[pP]_(?P\d+)[kK]_\d+/', video_url) if mobj: height = int(mobj.group('height')) bitrate = int(mobj.group('bitrate')) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 27e67feb4..96fa3b5aa 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -309,6 +309,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20}, # Apple HTTP Live Streaming + '91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10}, '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10}, '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10}, '94': {'ext': 'mp4', 'height': 480, 'format_note': 
'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10}, @@ -1910,7 +1911,8 @@ class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor): @classmethod def suitable(cls, url): - return False if YoutubePlaylistsIE.suitable(url) else super(YoutubeChannelIE, cls).suitable(url) + return (False if YoutubePlaylistsIE.suitable(url) or YoutubeLiveIE.suitable(url) + else super(YoutubeChannelIE, cls).suitable(url)) def _real_extract(self, url): channel_id = self._match_id(url) @@ -1985,6 +1987,51 @@ class YoutubeUserIE(YoutubeChannelIE): return super(YoutubeUserIE, cls).suitable(url) +class YoutubeLiveIE(YoutubeBaseInfoExtractor): + IE_DESC = 'YouTube.com live streams' + _VALID_URL = r'(?Phttps?://(?:\w+\.)?youtube\.com/(?:user|channel)/(?P[^/]+))/live' + IE_NAME = 'youtube:live' + + _TESTS = [{ + 'url': 'http://www.youtube.com/user/TheYoungTurks/live', + 'info_dict': { + 'id': 'a48o2S1cPoo', + 'ext': 'mp4', + 'title': 'The Young Turks - Live Main Show', + 'uploader': 'The Young Turks', + 'uploader_id': 'TheYoungTurks', + 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks', + 'upload_date': '20150715', + 'license': 'Standard YouTube License', + 'description': 'md5:438179573adcdff3c97ebb1ee632b891', + 'categories': ['News & Politics'], + 'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'], + 'like_count': int, + 'dislike_count': int, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + channel_id = mobj.group('id') + base_url = mobj.group('base_url') + webpage = self._download_webpage(url, channel_id, fatal=False) + if webpage: + page_type = self._og_search_property( + 'type', webpage, 'page type', default=None) + video_id = self._html_search_meta( + 'videoId', webpage, 'video id', default=None) + if 
page_type == 'video' and video_id and re.match(r'^[0-9A-Za-z_-]{11}$', video_id): + return self.url_result(video_id, YoutubeIE.ie_key()) + return self.url_result(base_url) + + class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor): IE_DESC = 'YouTube.com user/channel playlists' _VALID_URL = r'https?://(?:\w+\.)?youtube\.com/(?:user|channel)/(?P[^/]+)/playlists' diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 9dd7a8034..7819f14ab 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -399,6 +399,10 @@ def parseOpts(overrideArguments=None): '-R', '--retries', dest='retries', metavar='RETRIES', default=10, help='Number of retries (default is %default), or "infinite".') + downloader.add_option( + '--fragment-retries', + dest='fragment_retries', metavar='RETRIES', default=10, + help='Number of retries for a fragment (default is %default), or "infinite" (DASH only)') downloader.add_option( '--buffer-size', dest='buffersize', metavar='SIZE', default='1024', @@ -720,7 +724,7 @@ def parseOpts(overrideArguments=None): postproc.add_option( '--embed-subs', action='store_true', dest='embedsubtitles', default=False, - help='Embed subtitles in the video (only for mkv and mp4 videos)') + help='Embed subtitles in the video (only for mp4, webm and mkv videos)') postproc.add_option( '--embed-thumbnail', action='store_true', dest='embedthumbnail', default=False, diff --git a/youtube_dl/postprocessor/__init__.py b/youtube_dl/postprocessor/__init__.py index 0d8ef6ca2..3ea518399 100644 --- a/youtube_dl/postprocessor/__init__.py +++ b/youtube_dl/postprocessor/__init__.py @@ -6,6 +6,7 @@ from .ffmpeg import ( FFmpegEmbedSubtitlePP, FFmpegExtractAudioPP, FFmpegFixupStretchedPP, + FFmpegFixupM3u8PP, FFmpegFixupM4aPP, FFmpegMergerPP, FFmpegMetadataPP, @@ -26,6 +27,7 @@ __all__ = [ 'ExecAfterDownloadPP', 'FFmpegEmbedSubtitlePP', 'FFmpegExtractAudioPP', + 'FFmpegFixupM3u8PP', 'FFmpegFixupM4aPP', 'FFmpegFixupStretchedPP', 'FFmpegMergerPP', diff --git 
a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py index 380bc6f29..06b8c0548 100644 --- a/youtube_dl/postprocessor/ffmpeg.py +++ b/youtube_dl/postprocessor/ffmpeg.py @@ -25,6 +25,19 @@ from ..utils import ( ) +EXT_TO_OUT_FORMATS = { + "aac": "adts", + "m4a": "ipod", + "mka": "matroska", + "mkv": "matroska", + "mpg": "mpeg", + "ogv": "ogg", + "ts": "mpegts", + "wma": "asf", + "wmv": "asf", +} + + class FFmpegPostProcessorError(PostProcessingError): pass @@ -318,17 +331,34 @@ class FFmpegVideoConvertorPP(FFmpegPostProcessor): class FFmpegEmbedSubtitlePP(FFmpegPostProcessor): def run(self, information): - if information['ext'] not in ['mp4', 'mkv']: - self._downloader.to_screen('[ffmpeg] Subtitles can only be embedded in mp4 or mkv files') + if information['ext'] not in ('mp4', 'webm', 'mkv'): + self._downloader.to_screen('[ffmpeg] Subtitles can only be embedded in mp4, webm or mkv files') return [], information subtitles = information.get('requested_subtitles') if not subtitles: self._downloader.to_screen('[ffmpeg] There aren\'t any subtitles to embed') return [], information - sub_langs = list(subtitles.keys()) filename = information['filepath'] - sub_filenames = [subtitles_filename(filename, lang, sub_info['ext']) for lang, sub_info in subtitles.items()] + + ext = information['ext'] + sub_langs = [] + sub_filenames = [] + webm_vtt_warn = False + + for lang, sub_info in subtitles.items(): + sub_ext = sub_info['ext'] + if ext != 'webm' or ext == 'webm' and sub_ext == 'vtt': + sub_langs.append(lang) + sub_filenames.append(subtitles_filename(filename, lang, sub_ext)) + else: + if not webm_vtt_warn and ext == 'webm' and sub_ext != 'vtt': + webm_vtt_warn = True + self._downloader.to_screen('[ffmpeg] Only WebVTT subtitles can be embedded in webm files') + + if not sub_langs: + return [], information + input_files = [filename] + sub_filenames opts = [ @@ -391,10 +421,6 @@ class FFmpegMetadataPP(FFmpegPostProcessor): for (name, value) in 
metadata.items(): options.extend(['-metadata', '%s=%s' % (name, value)]) - # https://github.com/rg3/youtube-dl/issues/8350 - if info.get('protocol') == 'm3u8_native' or info.get('protocol') == 'm3u8' and self._downloader.params.get('hls_prefer_native', False): - options.extend(['-bsf:a', 'aac_adtstoasc']) - self._downloader.to_screen('[ffmpeg] Adding metadata to \'%s\'' % filename) self.run_ffmpeg(filename, temp_filename, options) os.remove(encodeFilename(filename)) @@ -467,6 +493,21 @@ class FFmpegFixupM4aPP(FFmpegPostProcessor): return [], info +class FFmpegFixupM3u8PP(FFmpegPostProcessor): + def run(self, info): + filename = info['filepath'] + temp_filename = prepend_extension(filename, 'temp') + + options = ['-c', 'copy', '-f', 'mp4', '-bsf:a', 'aac_adtstoasc'] + self._downloader.to_screen('[ffmpeg] Fixing malformated aac bitstream in "%s"' % filename) + self.run_ffmpeg(filename, temp_filename, options) + + os.remove(encodeFilename(filename)) + os.rename(encodeFilename(temp_filename), encodeFilename(filename)) + + return [], info + + class FFmpegSubtitlesConvertorPP(FFmpegPostProcessor): def __init__(self, downloader=None, format=None): super(FFmpegSubtitlesConvertorPP, self).__init__(downloader) diff --git a/youtube_dl/postprocessor/xattrpp.py b/youtube_dl/postprocessor/xattrpp.py index 480d48d05..e39ca60aa 100644 --- a/youtube_dl/postprocessor/xattrpp.py +++ b/youtube_dl/postprocessor/xattrpp.py @@ -6,6 +6,7 @@ import sys import errno from .common import PostProcessor +from ..compat import compat_os_name from ..utils import ( check_executable, hyphenate_date, @@ -73,7 +74,7 @@ class XAttrMetadataPP(PostProcessor): raise XAttrMetadataError(e.errno, e.strerror) except ImportError: - if os.name == 'nt': + if compat_os_name == 'nt': # Write xattrs to NTFS Alternate Data Streams: # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29 def write_xattr(path, key, value): @@ -168,7 +169,7 @@ class XAttrMetadataPP(PostProcessor): 'Unable to write extended 
attributes due to too long values.') else: msg = 'This filesystem doesn\'t support extended attributes. ' - if os.name == 'nt': + if compat_os_name == 'nt': msg += 'You need to use NTFS.' else: msg += '(You may have to enable them in your /etc/fstab)' diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 517c4aee1..e87b174b3 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -35,6 +35,7 @@ import xml.etree.ElementTree import zlib from .compat import ( + compat_HTMLParser, compat_basestring, compat_chr, compat_etree_fromstring, @@ -49,6 +50,7 @@ from .compat import ( compat_urllib_parse_urlparse, compat_urllib_request, compat_urlparse, + compat_xpath, shlex_quote, ) @@ -160,18 +162,11 @@ if sys.version_info >= (2, 7): def find_xpath_attr(node, xpath, key, val=None): """ Find the xpath xpath[@key=val] """ assert re.match(r'^[a-zA-Z_-]+$', key) - if val: - assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val) expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val)) return node.find(expr) else: def find_xpath_attr(node, xpath, key, val=None): - # Here comes the crazy part: In 2.6, if the xpath is a unicode, - # .//node does not match if a node is a direct child of . ! 
- if isinstance(xpath, compat_str): - xpath = xpath.encode('ascii') - - for f in node.findall(xpath): + for f in node.findall(compat_xpath(xpath)): if key not in f.attrib: continue if val is None or f.attrib.get(key) == val: @@ -196,9 +191,7 @@ def xpath_with_ns(path, ns_map): def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT): def _find_xpath(xpath): - if sys.version_info < (2, 7): # Crazy 2.6 - xpath = xpath.encode('ascii') - return node.find(xpath) + return node.find(compat_xpath(xpath)) if isinstance(xpath, (str, compat_str)): n = _find_xpath(xpath) @@ -275,6 +268,38 @@ def get_element_by_attribute(attribute, value, html): return unescapeHTML(res) +class HTMLAttributeParser(compat_HTMLParser): + """Trivial HTML parser to gather the attributes for a single element""" + def __init__(self): + self.attrs = {} + compat_HTMLParser.__init__(self) + + def handle_starttag(self, tag, attrs): + self.attrs = dict(attrs) + + +def extract_attributes(html_element): + """Given a string for an HTML element such as + + Decode and return a dictionary of attributes. + { + 'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz', + 'empty': '', 'noval': None, 'entity': '&', + 'sq': '"', 'dq': '\'' + }. + NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions, + but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5. 
+ """ + parser = HTMLAttributeParser() + parser.feed(html_element) + parser.close() + return parser.attrs + + def clean_html(html): """Clean an HTML snippet into a readable string""" @@ -467,6 +492,10 @@ def encodeFilename(s, for_subprocess=False): if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5: return s + # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible + if sys.platform.startswith('java'): + return s + return s.encode(get_subprocess_encoding(), 'ignore') @@ -1217,13 +1246,23 @@ if sys.platform == 'win32': raise OSError('Unlocking file failed: %r' % ctypes.FormatError()) else: - import fcntl + # Some platforms, such as Jython, is missing fcntl + try: + import fcntl - def _lock_file(f, exclusive): - fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH) + def _lock_file(f, exclusive): + fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH) - def _unlock_file(f): - fcntl.flock(f, fcntl.LOCK_UN) + def _unlock_file(f): + fcntl.flock(f, fcntl.LOCK_UN) + except ImportError: + UNSUPPORTED_MSG = 'file locking is not supported on this platform' + + def _lock_file(f, exclusive): + raise IOError(UNSUPPORTED_MSG) + + def _unlock_file(f): + raise IOError(UNSUPPORTED_MSG) class locked_file(object): @@ -1304,6 +1343,17 @@ def format_bytes(bytes): return '%.2f%s' % (converted, suffix) +def lookup_unit_table(unit_table, s): + units_re = '|'.join(re.escape(u) for u in unit_table) + m = re.match( + r'(?P[0-9]+(?:[,.][0-9]*)?)\s*(?P%s)\b' % units_re, s) + if not m: + return None + num_str = m.group('num').replace(',', '.') + mult = unit_table[m.group('unit')] + return int(float(num_str) * mult) + + def parse_filesize(s): if s is None: return None @@ -1347,15 +1397,28 @@ def parse_filesize(s): 'Yb': 1000 ** 8, } - units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE) - m = re.match( - r'(?P[0-9]+(?:[,.][0-9]*)?)\s*(?P%s)' % units_re, s) - if not m: + return lookup_unit_table(_UNIT_TABLE, s) + 
+ +def parse_count(s): + if s is None: return None - num_str = m.group('num').replace(',', '.') - mult = _UNIT_TABLE[m.group('unit')] - return int(float(num_str) * mult) + s = s.strip() + + if re.match(r'^[\d,.]+$', s): + return str_to_int(s) + + _UNIT_TABLE = { + 'k': 1000, + 'K': 1000, + 'm': 1000 ** 2, + 'M': 1000 ** 2, + 'kk': 1000 ** 2, + 'KK': 1000 ** 2, + } + + return lookup_unit_table(_UNIT_TABLE, s) def month_by_name(name): @@ -1387,6 +1450,12 @@ def fix_xml_ampersands(xml_str): def setproctitle(title): assert isinstance(title, compat_str) + + # ctypes in Jython is not complete + # http://bugs.jython.org/issue2148 + if sys.platform.startswith('java'): + return + try: libc = ctypes.cdll.LoadLibrary('libc.so.6') except OSError: @@ -1677,6 +1746,7 @@ def escape_url(url): """Escape URL as suggested by RFC 3986""" url_parsed = compat_urllib_parse_urlparse(url) return url_parsed._replace( + netloc=url_parsed.netloc.encode('idna').decode('ascii'), path=escape_rfc3986(url_parsed.path), params=escape_rfc3986(url_parsed.params), query=escape_rfc3986(url_parsed.query), @@ -1686,7 +1756,8 @@ def escape_url(url): try: struct.pack('!I', 0) except TypeError: - # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument + # In Python 2.6 and 2.7.x < 2.7.7, struct requires a bytes argument + # See https://bugs.python.org/issue19099 def struct_pack(spec, *args): if isinstance(spec, compat_str): spec = spec.encode('ascii') @@ -1721,6 +1792,15 @@ def urlencode_postdata(*args, **kargs): return compat_urllib_parse.urlencode(*args, **kargs).encode('ascii') +def update_url_query(url, query): + parsed_url = compat_urlparse.urlparse(url) + qs = compat_parse_qs(parsed_url.query) + qs.update(query) + qs = encode_dict(qs) + return compat_urlparse.urlunparse(parsed_url._replace( + query=compat_urllib_parse.urlencode(qs, True))) + + def encode_dict(d, encoding='utf-8'): def encode(v): return v.encode(encoding) if isinstance(v, compat_basestring) else v diff --git 
a/youtube_dl/version.py b/youtube_dl/version.py index adafd601b..2291ed783 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2016.03.01' +__version__ = '2016.03.25'