diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index c208eb689..a26ff1de4 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.04.24*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.04.24** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.05.01*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.05.01** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ $ youtube-dl -v [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2016.04.24 +[debug] youtube-dl version 2016.05.01 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/AUTHORS b/AUTHORS index 07cade723..5ca71ace7 100644 --- a/AUTHORS +++ b/AUTHORS @@ -168,3 +168,7 @@ José Joaquín Atria Viťas Strádal Kagami Hiiragi Philip Huppert +blahgeek +Kevin Deldycke +inondle +Tomáš Čech diff --git a/Makefile b/Makefile index 06cffcb71..c9ce216d1 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ all: youtube-dl README.md CONTRIBUTING.md README.txt youtube-dl.1 youtube-dl.bash-completion youtube-dl.zsh youtube-dl.fish supportedsites clean: - rm -rf youtube-dl.1.temp.md youtube-dl.1 youtube-dl.bash-completion README.txt MANIFEST build/ dist/ .coverage cover/ youtube-dl.tar.gz youtube-dl.zsh youtube-dl.fish youtube_dl/extractor/lazy_extractors.py *.dump *.part *.info.json *.mp4 *.flv *.mp3 *.avi CONTRIBUTING.md.tmp ISSUE_TEMPLATE.md.tmp youtube-dl youtube-dl.exe + rm -rf youtube-dl.1.temp.md youtube-dl.1 youtube-dl.bash-completion README.txt MANIFEST build/ dist/ .coverage cover/ youtube-dl.tar.gz youtube-dl.zsh youtube-dl.fish youtube_dl/extractor/lazy_extractors.py *.dump *.part *.info.json *.mp4 *.flv *.mp3 *.avi *.mkv *.webm CONTRIBUTING.md.tmp ISSUE_TEMPLATE.md.tmp youtube-dl youtube-dl.exe find . -name "*.pyc" -delete find . -name "*.class" -delete diff --git a/README.md b/README.md index ecf737047..50acb26a0 100644 --- a/README.md +++ b/README.md @@ -465,7 +465,7 @@ The basic usage is not to set any template arguments when downloading a single f - `display_id`: An alternative identifier for the video - `uploader`: Full name of the video uploader - `license`: License name the video is licensed under - - `creator`: The main artist who created the video + - `creator`: The creator of the video - `release_date`: The date (YYYYMMDD) when the video was released - `timestamp`: UNIX timestamp of the moment the video became available - `upload_date`: Video upload date (YYYYMMDD) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 03875b8db..9fb43671f 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -338,7 +338,6 @@ - **mailru**: Видео@Mail.Ru - **MakersChannel** - **MakerTV** - - **Malemotion** - **MatchTV** - **MDR**: MDR.DE and KiKA - **media.ccc.de** @@ -375,8 +374,8 @@ - **mtvservices:embedded** - **MuenchenTV**: münchen.tv - **MusicPlayOn** - - **muzu.tv** - **Mwave** + - **MwaveMeetGreet** - **MySpace** - **MySpace:album** - **MySpass** @@ -554,7 +553,6 @@ - **SenateISVP** - **ServingSys** - **Sexu** - - **SexyKarma**: Sexy Karma and Watch Indian Porn - **Shahid** - **Shared**: shared.sx and vivo.sx - **ShareSix** @@ -567,8 +565,6 @@ - **smotri:broadcast**: Smotri.com broadcasts - **smotri:community**: Smotri.com community videos - **smotri:user**: Smotri.com user videos - - **SnagFilms** - - **SnagFilmsEmbed** - **Snotr** - **Sohu** - **soundcloud** @@ -610,6 +606,7 @@ - **Syfy** - **SztvHu** - **Tagesschau** + - **tagesschau:player** - **Tapely** - **Tass** - **TDSLifeway** @@ -725,6 +722,8 @@ - **Vidzi** - **vier** - **vier:videos** + - **ViewLift** + - **ViewLiftEmbed** - **Viewster** - **Viidea** - **viki** @@ -756,6 +755,7 @@ - **Walla** - **WashingtonPost** - **wat.tv** + - **WatchIndianPorn**: Watch Indian Porn - **WDR** - **wdr:mobile** - **WDRMaus**: Sendung mit der Maus @@ -775,6 +775,10 @@ - **XFileShare**: XFileShare based sites: GorillaVid.in, daclips.in, movpod.in, fastvideo.in, realvid.net, filehoot.com and vidto.me - **XHamster** - **XHamsterEmbed** + - **xiami:album**: 虾米音乐 - 专辑 + - **xiami:artist**: 虾米音乐 - 歌手 + - **xiami:collection**: 虾米音乐 - 精选集 + - **xiami:song**: 虾米音乐 - **XMinus** - **XNXX** - **Xstream** diff --git a/test/test_compat.py b/test/test_compat.py index 618668210..9adf75763 100644 --- a/test/test_compat.py +++ b/test/test_compat.py @@ -10,9 +10,9 @@ import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from youtube_dl.utils import get_filesystem_encoding from youtube_dl.compat import ( compat_getenv, + compat_setenv, compat_etree_fromstring, compat_expanduser, compat_shlex_split, @@ -26,19 +26,22 @@ from youtube_dl.compat import ( class TestCompat(unittest.TestCase): def test_compat_getenv(self): test_str = 'тест' - os.environ['YOUTUBE-DL-TEST'] = ( - test_str if sys.version_info >= (3, 0) - else test_str.encode(get_filesystem_encoding())) + compat_setenv('YOUTUBE-DL-TEST', test_str) self.assertEqual(compat_getenv('YOUTUBE-DL-TEST'), test_str) + def test_compat_setenv(self): + test_var = 'YOUTUBE-DL-TEST' + test_str = 'тест' + compat_setenv(test_var, test_str) + compat_getenv(test_var) + self.assertEqual(compat_getenv(test_var), test_str) + def test_compat_expanduser(self): old_home = os.environ.get('HOME') test_str = 'C:\Documents and Settings\тест\Application Data' - os.environ['HOME'] = ( - test_str if sys.version_info >= (3, 0) - else test_str.encode(get_filesystem_encoding())) + compat_setenv('HOME', test_str) self.assertEqual(compat_expanduser('~'), test_str) - os.environ['HOME'] = old_home + compat_setenv('HOME', old_home or '') def test_all_present(self): import youtube_dl.compat diff --git a/test/test_utils.py b/test/test_utils.py index e16a6761b..00ada95ec 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -139,8 +139,8 @@ class TestUtil(unittest.TestCase): self.assertEqual('yes_no', sanitize_filename('yes? no', restricted=True)) self.assertEqual('this_-_that', sanitize_filename('this: that', restricted=True)) - tests = 'a\xe4b\u4e2d\u56fd\u7684c' - self.assertEqual(sanitize_filename(tests, restricted=True), 'a_b_c') + tests = 'aäb\u4e2d\u56fd\u7684c' + self.assertEqual(sanitize_filename(tests, restricted=True), 'aab_c') self.assertTrue(sanitize_filename('\xf6', restricted=True) != '') # No empty filename forbidden = '"\0\\/&!: \'\t\n()[]{}$;`^,#' @@ -155,6 +155,10 @@ class TestUtil(unittest.TestCase): self.assertTrue(sanitize_filename('-', restricted=True) != '') self.assertTrue(sanitize_filename(':', restricted=True) != '') + self.assertEqual(sanitize_filename( + 'ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ', restricted=True), + 'AAAAAAAECEEEEIIIIDNOOOOOOUUUUYPssaaaaaaaeceeeeiiiionoooooouuuuypy') + def test_sanitize_ids(self): self.assertEqual(sanitize_filename('_n_cd26wFpw', is_id=True), '_n_cd26wFpw') self.assertEqual(sanitize_filename('_BD_eEpuzXw', is_id=True), '_BD_eEpuzXw') diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 055433362..a96482e68 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -580,7 +580,7 @@ class YoutubeDL(object): is_id=(k == 'id')) template_dict = dict((k, sanitize(k, v)) for k, v in template_dict.items() - if v is not None) + if v is not None and not isinstance(v, (list, tuple, dict))) template_dict = collections.defaultdict(lambda: 'NA', template_dict) outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL) @@ -1639,7 +1639,7 @@ class YoutubeDL(object): # Just a single file success = dl(filename, info_dict) except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self.report_error('unable to download video data: %s' % str(err)) + self.report_error('unable to download video data: %s' % error_to_compat_str(err)) return except (OSError, IOError) as err: raise UnavailableVideoError(err) @@ -2018,6 +2018,7 @@ class YoutubeDL(object): if opts_cookiefile is None: self.cookiejar = compat_cookiejar.CookieJar() else: + opts_cookiefile = compat_expanduser(opts_cookiefile) self.cookiejar = compat_cookiejar.MozillaCookieJar( opts_cookiefile) if os.access(opts_cookiefile, os.R_OK): diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 737f6545d..cbd84c3af 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -86,7 +86,9 @@ def _real_main(argv=None): if opts.batchfile == '-': batchfd = sys.stdin else: - batchfd = io.open(opts.batchfile, 'r', encoding='utf-8', errors='ignore') + batchfd = io.open( + compat_expanduser(opts.batchfile), + 'r', encoding='utf-8', errors='ignore') batch_urls = read_batch_urls(batchfd) if opts.verbose: write_string('[debug] Batch file urls: ' + repr(batch_urls) + '\n') @@ -404,7 +406,7 @@ def _real_main(argv=None): try: if opts.load_info_filename is not None: - retcode = ydl.download_with_info_file(opts.load_info_filename) + retcode = ydl.download_with_info_file(compat_expanduser(opts.load_info_filename)) else: retcode = ydl.download(all_urls) except MaxDownloadsReached: diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index 0b6c5ca7a..12b53cdc8 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -373,6 +373,9 @@ compat_os_name = os._name if os.name == 'java' else os.name if sys.version_info >= (3, 0): compat_getenv = os.getenv compat_expanduser = os.path.expanduser + + def compat_setenv(key, value, env=os.environ): + env[key] = value else: # Environment variables should be decoded with filesystem encoding. # Otherwise it will fail if any non-ASCII characters present (see #3854 #3217 #2918) @@ -384,6 +387,12 @@ else: env = env.decode(get_filesystem_encoding()) return env + def compat_setenv(key, value, env=os.environ): + def encode(v): + from .utils import get_filesystem_encoding + return v.encode(get_filesystem_encoding()) if isinstance(v, compat_str) else v + env[encode(key)] = encode(value) + # HACK: The default implementations of os.path.expanduser from cpython do not decode # environment variables with filesystem encoding. We will work around this by # providing adjusted implementations. @@ -604,6 +613,7 @@ __all__ = [ 'compat_os_name', 'compat_parse_qs', 'compat_print', + 'compat_setenv', 'compat_shlex_split', 'compat_socket_create_connection', 'compat_str', diff --git a/youtube_dl/downloader/external.py b/youtube_dl/downloader/external.py index 8d642fc3e..3a73cee1c 100644 --- a/youtube_dl/downloader/external.py +++ b/youtube_dl/downloader/external.py @@ -6,6 +6,7 @@ import sys import re from .common import FileDownloader +from ..compat import compat_setenv from ..postprocessor.ffmpeg import FFmpegPostProcessor, EXT_TO_OUT_FORMATS from ..utils import ( cli_option, @@ -198,6 +199,18 @@ class FFmpegFD(ExternalFD): '-headers', ''.join('%s: %s\r\n' % (key, val) for key, val in headers.items())] + env = None + proxy = self.params.get('proxy') + if proxy: + if not re.match(r'^[\da-zA-Z]+://', proxy): + proxy = 'http://%s' % proxy + # Since December 2015 ffmpeg supports -http_proxy option (see + # http://git.videolan.org/?p=ffmpeg.git;a=commit;h=b4eb1f29ebddd60c41a2eb39f5af701e38e0d3fd) + # We could switch to the following code if we are able to detect version properly + # args += ['-http_proxy', proxy] + env = os.environ.copy() + compat_setenv('HTTP_PROXY', proxy, env=env) + protocol = info_dict.get('protocol') if protocol == 'rtmp': @@ -224,7 +237,7 @@ class FFmpegFD(ExternalFD): args += ['-rtmp_live', 'live'] args += ['-i', url, '-c', 'copy'] - if protocol == 'm3u8': + if protocol in ('m3u8', 'm3u8_native'): if self.params.get('hls_use_mpegts', False) or tmpfilename == '-': args += ['-f', 'mpegts'] else: @@ -239,7 +252,7 @@ class FFmpegFD(ExternalFD): self._debug_cmd(args) - proc = subprocess.Popen(args, stdin=subprocess.PIPE) + proc = subprocess.Popen(args, stdin=subprocess.PIPE, env=env) try: retval = proc.wait() except KeyboardInterrupt: diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py index a01dac031..62136ee54 100644 --- a/youtube_dl/downloader/hls.py +++ b/youtube_dl/downloader/hls.py @@ -4,6 +4,7 @@ import os.path import re from .fragment import FragmentFD +from .external import FFmpegFD from ..compat import compat_urlparse from ..utils import ( @@ -17,12 +18,39 @@ class HlsFD(FragmentFD): FD_NAME = 'hlsnative' + @staticmethod + def can_download(manifest): + UNSUPPORTED_FEATURES = ( + r'#EXT-X-KEY:METHOD=(?!NONE)', # encrypted streams [1] + r'#EXT-X-BYTERANGE', # playlists composed of byte ranges of media files [2] + # Live streams heuristic does not always work (e.g. geo restricted to Germany + # http://hls-geo.daserste.de/i/videoportal/Film/c_620000/622873/format,716451,716457,716450,716458,716459,.mp4.csmil/index_4_av.m3u8?null=0) + # r'#EXT-X-MEDIA-SEQUENCE:(?!0$)', # live streams [3] + r'#EXT-X-PLAYLIST-TYPE:EVENT', # media segments may be appended to the end of + # event media playlists [4] + # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.2.4 + # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.2.2 + # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.2 + # 4. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.5 + ) + return all(not re.search(feature, manifest) for feature in UNSUPPORTED_FEATURES) + def real_download(self, filename, info_dict): man_url = info_dict['url'] self.to_screen('[%s] Downloading m3u8 manifest' % self.FD_NAME) manifest = self.ydl.urlopen(man_url).read() s = manifest.decode('utf-8', 'ignore') + + if not self.can_download(s): + self.report_warning( + 'hlsnative has detected features it does not support, ' + 'extraction will be delegated to ffmpeg') + fd = FFmpegFD(self.ydl, self.params) + for ph in self._progress_hooks: + fd.add_progress_hook(ph) + return fd.real_download(filename, info_dict) + fragment_urls = [] for line in s.splitlines(): line = line.strip() diff --git a/youtube_dl/extractor/aol.py b/youtube_dl/extractor/aol.py index 24df8fe93..42c21bf41 100644 --- a/youtube_dl/extractor/aol.py +++ b/youtube_dl/extractor/aol.py @@ -12,7 +12,7 @@ from ..utils import ( class AolIE(InfoExtractor): IE_NAME = 'on.aol.com' - _VALID_URL = r'(?:aol-video:|https?://on\.aol\.com/.*-)(?P[^/?-]+)' + _VALID_URL = r'(?:aol-video:|https?://on\.aol\.com/(?:[^/]+/)*(?:[^/?#&]+-)?)(?P[^/?#&]+)' _TESTS = [{ # video with 5min ID @@ -53,6 +53,12 @@ class AolIE(InfoExtractor): }, { 'url': 'http://on.aol.com/shows/park-bench-shw518173474-559a1b9be4b0c3bfad3357a7?context=SH:SHW518173474:PL4327:1460619712763', 'only_matching': True, + }, { + 'url': 'http://on.aol.com/video/519442220', + 'only_matching': True, + }, { + 'url': 'aol-video:5707d6b8e4b090497b04f706', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index 881cacfab..e37fdae13 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -161,24 +161,53 @@ class ArteTVPlus7IE(InfoExtractor): 'es': 'E[ESP]', } + langcode = LANGS.get(lang, lang) + formats = [] for format_id, format_dict in player_info['VSR'].items(): f = dict(format_dict) versionCode = f.get('versionCode') - langcode = LANGS.get(lang, lang) - lang_rexs = [r'VO?%s-' % re.escape(langcode), r'VO?.-ST%s$' % re.escape(langcode)] - lang_pref = None - if versionCode: - matched_lang_rexs = [r for r in lang_rexs if re.match(r, versionCode)] - lang_pref = -10 if not matched_lang_rexs else 10 * len(matched_lang_rexs) - source_pref = 0 - if versionCode is not None: - # The original version with subtitles has lower relevance - if re.match(r'VO-ST(F|A|E)', versionCode): - source_pref -= 10 - # The version with sourds/mal subtitles has also lower relevance - elif re.match(r'VO?(F|A|E)-STM\1', versionCode): - source_pref -= 9 + l = re.escape(langcode) + + # Language preference from most to least priority + # Reference: section 5.6.3 of + # http://www.arte.tv/sites/en/corporate/files/complete-technical-guidelines-arte-geie-v1-05.pdf + PREFERENCES = ( + # original version in requested language, without subtitles + r'VO{0}$'.format(l), + # original version in requested language, with partial subtitles in requested language + r'VO{0}-ST{0}$'.format(l), + # original version in requested language, with subtitles for the deaf and hard-of-hearing in requested language + r'VO{0}-STM{0}$'.format(l), + # non-original (dubbed) version in requested language, without subtitles + r'V{0}$'.format(l), + # non-original (dubbed) version in requested language, with subtitles partial subtitles in requested language + r'V{0}-ST{0}$'.format(l), + # non-original (dubbed) version in requested language, with subtitles for the deaf and hard-of-hearing in requested language + r'V{0}-STM{0}$'.format(l), + # original version in requested language, with partial subtitles in different language + r'VO{0}-ST(?!{0}).+?$'.format(l), + # original version in requested language, with subtitles for the deaf and hard-of-hearing in different language + r'VO{0}-STM(?!{0}).+?$'.format(l), + # original version in different language, with partial subtitles in requested language + r'VO(?:(?!{0}).+?)?-ST{0}$'.format(l), + # original version in different language, with subtitles for the deaf and hard-of-hearing in requested language + r'VO(?:(?!{0}).+?)?-STM{0}$'.format(l), + # original version in different language, without subtitles + r'VO(?:(?!{0}))?$'.format(l), + # original version in different language, with partial subtitles in different language + r'VO(?:(?!{0}).+?)?-ST(?!{0}).+?$'.format(l), + # original version in different language, with subtitles for the deaf and hard-of-hearing in different language + r'VO(?:(?!{0}).+?)?-STM(?!{0}).+?$'.format(l), + ) + + for pref, p in enumerate(PREFERENCES): + if re.match(p, versionCode): + lang_pref = len(PREFERENCES) - pref + break + else: + lang_pref = -1 + format = { 'format_id': format_id, 'preference': -10 if f.get('videoFormat') == 'M3U8' else None, @@ -188,7 +217,6 @@ class ArteTVPlus7IE(InfoExtractor): 'height': int_or_none(f.get('height')), 'tbr': int_or_none(f.get('bitrate')), 'quality': qfunc(f.get('quality')), - 'source_preference': source_pref, } if f.get('mediaType') == 'rtmp': diff --git a/youtube_dl/extractor/biqle.py b/youtube_dl/extractor/biqle.py new file mode 100644 index 000000000..ae4579b33 --- /dev/null +++ b/youtube_dl/extractor/biqle.py @@ -0,0 +1,39 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class BIQLEIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?biqle\.(?:com|org|ru)/watch/(?P-?\d+_\d+)' + _TESTS = [{ + 'url': 'http://www.biqle.ru/watch/847655_160197695', + 'md5': 'ad5f746a874ccded7b8f211aeea96637', + 'info_dict': { + 'id': '160197695', + 'ext': 'mp4', + 'title': 'Foo Fighters - The Pretender (Live at Wembley Stadium)', + 'uploader': 'Andrey Rogozin', + 'upload_date': '20110605', + } + }, { + 'url': 'https://biqle.org/watch/-44781847_168547604', + 'md5': '7f24e72af1db0edf7c1aaba513174f97', + 'info_dict': { + 'id': '168547604', + 'ext': 'mp4', + 'title': 'Ребенок в шоке от автоматической мойки', + 'uploader': 'Dmitry Kotov', + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + embed_url = self._proto_relative_url(self._search_regex( + r'', webpage, 'embed url')) + + return { + '_type': 'url_transparent', + 'url': embed_url, + } diff --git a/youtube_dl/extractor/ccc.py b/youtube_dl/extractor/ccc.py index dda2c0959..8f7f09e22 100644 --- a/youtube_dl/extractor/ccc.py +++ b/youtube_dl/extractor/ccc.py @@ -1,13 +1,9 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..utils import ( int_or_none, - parse_duration, - qualities, - unified_strdate, + parse_iso8601, ) @@ -19,14 +15,14 @@ class CCCIE(InfoExtractor): 'url': 'https://media.ccc.de/v/30C3_-_5443_-_en_-_saal_g_-_201312281830_-_introduction_to_processor_design_-_byterazor#video', 'md5': '3a1eda8f3a29515d27f5adb967d7e740', 'info_dict': { - 'id': '30C3_-_5443_-_en_-_saal_g_-_201312281830_-_introduction_to_processor_design_-_byterazor', + 'id': '1839', 'ext': 'mp4', 'title': 'Introduction to Processor Design', - 'description': 'md5:80be298773966f66d56cb11260b879af', + 'description': 'md5:df55f6d073d4ceae55aae6f2fd98a0ac', 'thumbnail': 're:^https?://.*\.jpg$', - 'view_count': int, 'upload_date': '20131228', - 'duration': 3660, + 'timestamp': 1388188800, + 'duration': 3710, } }, { 'url': 'https://media.ccc.de/v/32c3-7368-shopshifting#download', @@ -34,79 +30,48 @@ class CCCIE(InfoExtractor): }] def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + event_id = self._search_regex("data-id='(\d+)'", webpage, 'event id') + event_data = self._download_json('https://media.ccc.de/public/events/%s' % event_id, event_id) - if self._downloader.params.get('prefer_free_formats'): - preference = qualities(['mp3', 'opus', 'mp4-lq', 'webm-lq', 'h264-sd', 'mp4-sd', 'webm-sd', 'mp4', 'webm', 'mp4-hd', 'h264-hd', 'webm-hd']) - else: - preference = qualities(['opus', 'mp3', 'webm-lq', 'mp4-lq', 'webm-sd', 'h264-sd', 'mp4-sd', 'webm', 'mp4', 'webm-hd', 'mp4-hd', 'h264-hd']) - - title = self._html_search_regex( - r'(?s)

(.*?)

', webpage, 'title') - description = self._html_search_regex( - r'(?s)

About

(.+?)

', - webpage, 'description', fatal=False) - upload_date = unified_strdate(self._html_search_regex( - r"(?s)]+class='[^']*fa-calendar-o'[^>]*>(.+?)", - webpage, 'upload date', fatal=False)) - view_count = int_or_none(self._html_search_regex( - r"(?s)(.*?)", - webpage, 'view count', fatal=False)) - duration = parse_duration(self._html_search_regex( - r'(?s)]+class=(["\']).*?fa-clock-o.*?\1[^>]*>(?P.+?)(?P[^<]*)\s* - <(?:span|div)\s+class='label\s+filetype'>(?P[^<]*)\s* - [^']+)'>\s* - (?: - .*? - [^']+\.torrent)' - )?''', webpage) formats = [] - for m in matches: - format = m.group('format') - format_id = self._search_regex( - r'.*/([a-z0-9_-]+)/[^/]*$', - m.group('http_url'), 'format id', default=None) - if format_id: - format_id = m.group('lang') + '-' + format_id - vcodec = 'h264' if 'h264' in format_id else ( - 'none' if format_id in ('mp3', 'opus') else None + for recording in event_data.get('recordings', []): + recording_url = recording.get('recording_url') + if not recording_url: + continue + language = recording.get('language') + folder = recording.get('folder') + format_id = None + if language: + format_id = language + if folder: + if language: + format_id += '-' + folder + else: + format_id = folder + vcodec = 'h264' if 'h264' in folder else ( + 'none' if folder in ('mp3', 'opus') else None ) formats.append({ 'format_id': format_id, - 'format': format, - 'language': m.group('lang'), - 'url': m.group('http_url'), + 'url': recording_url, + 'width': int_or_none(recording.get('width')), + 'height': int_or_none(recording.get('height')), + 'filesize': int_or_none(recording.get('size'), invscale=1024 * 1024), + 'language': language, 'vcodec': vcodec, - 'preference': preference(format_id), }) - - if m.group('torrent_url'): - formats.append({ - 'format_id': 'torrent-%s' % (format if format_id is None else format_id), - 'format': '%s (torrent)' % format, - 'proto': 'torrent', - 'format_note': '(unsupported; will just download the .torrent file)', - 'vcodec': vcodec, - 'preference': -100 + preference(format_id), - 'url': m.group('torrent_url'), - }) self._sort_formats(formats) - thumbnail = self._html_search_regex( - r"[0-9]+)' + _TEST = { + 'url': 'http://www.dailymail.co.uk/video/sciencetech/video-1288527/Turn-video-impressionist-masterpiece.html', + 'md5': '2f639d446394f53f3a33658b518b6615', + 'info_dict': { + 'id': '1288527', + 'ext': 'mp4', + 'title': 'Turn any video into an impressionist masterpiece', + 'description': 'md5:88ddbcb504367987b2708bb38677c9d2', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + video_data = self._parse_json(self._search_regex( + r"data-opts='({.+?})'", webpage, 'video data'), video_id) + title = video_data['title'] + video_sources = self._download_json(video_data.get( + 'sources', {}).get('url') or 'http://www.dailymail.co.uk/api/player/%s/video-sources.json' % video_id, video_id) + + formats = [] + for rendition in video_sources['renditions']: + rendition_url = rendition.get('url') + if not rendition_url: + continue + tbr = int_or_none(rendition.get('encodingRate'), 1000) + container = rendition.get('videoContainer') + is_hls = container == 'M2TS' + protocol = 'm3u8_native' if is_hls else determine_protocol({'url': rendition_url}) + formats.append({ + 'format_id': ('hls' if is_hls else protocol) + ('-%d' % tbr if tbr else ''), + 'url': rendition_url, + 'width': int_or_none(rendition.get('frameWidth')), + 'height': int_or_none(rendition.get('frameHeight')), + 'tbr': tbr, + 'vcodec': rendition.get('videoCodec'), + 'container': container, + 'protocol': protocol, + 'ext': 'mp4' if is_hls else None, + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': video_data.get('descr'), + 'thumbnail': video_data.get('poster') or video_data.get('thumbnail'), + 'formats': formats, + } diff --git a/youtube_dl/extractor/discovery.py b/youtube_dl/extractor/discovery.py index 7c554ec14..55853f76f 100644 --- a/youtube_dl/extractor/discovery.py +++ b/youtube_dl/extractor/discovery.py @@ -71,7 +71,7 @@ class DiscoveryIE(InfoExtractor): entries = [] for idx, video_info in enumerate(info['playlist']): - subtitles = [] + subtitles = {} caption_url = video_info.get('captionsUrl') if caption_url: subtitles = { diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 737960a01..a0bb3d4c2 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -75,6 +75,7 @@ from .bigflix import BigflixIE from .bild import BildIE from .bilibili import BiliBiliIE from .biobiochiletv import BioBioChileTVIE +from .biqle import BIQLEIE from .bleacherreport import ( BleacherReportIE, BleacherReportCMSIE, @@ -157,6 +158,7 @@ from .cspan import CSpanIE from .ctsnews import CtsNewsIE from .cultureunplugged import CultureUnpluggedIE from .cwtv import CWTVIE +from .dailymail import DailyMailIE from .dailymotion import ( DailymotionIE, DailymotionPlaylistIE, @@ -382,6 +384,7 @@ from .limelight import ( LimelightChannelIE, LimelightChannelListIE, ) +from .litv import LiTVIE from .liveleak import LiveLeakIE from .livestream import ( LivestreamIE, @@ -406,6 +409,10 @@ from .metacafe import MetacafeIE from .metacritic import MetacriticIE from .mgoon import MgoonIE from .mgtv import MGTVIE +from .microsoftvirtualacademy import ( + MicrosoftVirtualAcademyIE, + MicrosoftVirtualAcademyCourseIE, +) from .minhateca import MinhatecaIE from .ministrygrid import MinistryGridIE from .minoto import MinotoIE @@ -560,7 +567,10 @@ from .parliamentliveuk import ParliamentLiveUKIE from .patreon import PatreonIE from .pbs import PBSIE from .people import PeopleIE -from .periscope import PeriscopeIE +from .periscope import ( + PeriscopeIE, + PeriscopeUserIE, +) from .philharmoniedeparis import PhilharmonieDeParisIE from .phoenix import PhoenixIE from .photobucket import PhotobucketIE @@ -724,7 +734,10 @@ from .svt import ( from .swrmediathek import SWRMediathekIE from .syfy import SyfyIE from .sztvhu import SztvHuIE -from .tagesschau import TagesschauIE +from .tagesschau import ( + TagesschauPlayerIE, + TagesschauIE, +) from .tapely import TapelyIE from .tass import TassIE from .tdslifeway import TDSLifewayIE @@ -846,7 +859,10 @@ from .veehd import VeeHDIE from .veoh import VeohIE from .vessel import VesselIE from .vesti import VestiIE -from .vevo import VevoIE +from .vevo import ( + VevoIE, + VevoPlaylistIE, +) from .vgtv import ( BTArticleIE, BTVestlendingenIE, diff --git a/youtube_dl/extractor/fczenit.py b/youtube_dl/extractor/fczenit.py index f1f150ef2..8d1010b88 100644 --- a/youtube_dl/extractor/fczenit.py +++ b/youtube_dl/extractor/fczenit.py @@ -1,20 +1,19 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor +from ..compat import compat_urlparse class FczenitIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?fc-zenit\.ru/video/gl(?P[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?fc-zenit\.ru/video/(?P[0-9]+)' _TEST = { - 'url': 'http://fc-zenit.ru/video/gl6785/', - 'md5': '458bacc24549173fe5a5aa29174a5606', + 'url': 'http://fc-zenit.ru/video/41044/', + 'md5': '0e3fab421b455e970fa1aa3891e57df0', 'info_dict': { - 'id': '6785', + 'id': '41044', 'ext': 'mp4', - 'title': '«Зенит-ТВ»: как Олег Шатов играл против «Урала»', + 'title': 'Так пишется история: казанский разгром ЦСКА на «Зенит-ТВ»', }, } @@ -22,15 +21,23 @@ class FczenitIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - video_title = self._html_search_regex(r'
([^<]+)', webpage, 'title') + video_title = self._html_search_regex( + r'<[^>]+class=\"photoalbum__title\">([^<]+)', webpage, 'title') - bitrates_raw = self._html_search_regex(r'bitrates:.*\n(.*)\]', webpage, 'video URL') - bitrates = re.findall(r'url:.?\'(.+?)\'.*?bitrate:.?([0-9]{3}?)', bitrates_raw) + video_items = self._parse_json(self._search_regex( + r'arrPath\s*=\s*JSON\.parse\(\'(.+)\'\)', webpage, 'video items'), + video_id) + + def merge_dicts(*dicts): + ret = {} + for a_dict in dicts: + ret.update(a_dict) + return ret formats = [{ - 'url': furl, - 'tbr': tbr, - } for furl, tbr in bitrates] + 'url': compat_urlparse.urljoin(url, video_url), + 'tbr': int(tbr), + } for tbr, video_url in merge_dicts(*video_items).items()] self._sort_formats(formats) diff --git a/youtube_dl/extractor/flickr.py b/youtube_dl/extractor/flickr.py index 0a3de1498..a8e1bf42a 100644 --- a/youtube_dl/extractor/flickr.py +++ b/youtube_dl/extractor/flickr.py @@ -24,13 +24,28 @@ class FlickrIE(InfoExtractor): 'upload_date': '20110423', 'uploader_id': '10922353@N03', 'uploader': 'Forest Wander', + 'uploader_url': 'https://www.flickr.com/photos/forestwander-nature-pictures/', 'comment_count': int, 'view_count': int, 'tags': list, + 'license': 'Attribution-ShareAlike', } } - _API_BASE_URL = 'https://api.flickr.com/services/rest?' + # https://help.yahoo.com/kb/flickr/SLN25525.html + _LICENSES = { + '0': 'All Rights Reserved', + '1': 'Attribution-NonCommercial-ShareAlike', + '2': 'Attribution-NonCommercial', + '3': 'Attribution-NonCommercial-NoDerivs', + '4': 'Attribution', + '5': 'Attribution-ShareAlike', + '6': 'Attribution-NoDerivs', + '7': 'No known copyright restrictions', + '8': 'United States government work', + '9': 'Public Domain Dedication (CC0)', + '10': 'Public Domain Work', + } def _call_api(self, method, video_id, api_key, note, secret=None): query = { @@ -75,6 +90,9 @@ class FlickrIE(InfoExtractor): self._sort_formats(formats) owner = video_info.get('owner', {}) + uploader_id = owner.get('nsid') + uploader_path = owner.get('path_alias') or uploader_id + uploader_url = 'https://www.flickr.com/photos/%s/' % uploader_path if uploader_path else None return { 'id': video_id, @@ -83,11 +101,13 @@ class FlickrIE(InfoExtractor): 'formats': formats, 'timestamp': int_or_none(video_info.get('dateuploaded')), 'duration': int_or_none(video_info.get('video', {}).get('duration')), - 'uploader_id': owner.get('nsid'), + 'uploader_id': uploader_id, 'uploader': owner.get('realname'), + 'uploader_url': uploader_url, 'comment_count': int_or_none(video_info.get('comments', {}).get('_content')), 'view_count': int_or_none(video_info.get('views')), - 'tags': [tag.get('_content') for tag in video_info.get('tags', {}).get('tag', [])] + 'tags': [tag.get('_content') for tag in video_info.get('tags', {}).get('tag', [])], + 'license': self._LICENSES.get(video_info.get('license')), } else: raise ExtractorError('not a video', expected=True) diff --git a/youtube_dl/extractor/kuwo.py b/youtube_dl/extractor/kuwo.py index 3740869c7..11b31a699 100644 --- a/youtube_dl/extractor/kuwo.py +++ b/youtube_dl/extractor/kuwo.py @@ -283,6 +283,8 @@ class KuwoCategoryIE(InfoExtractor): category_desc = remove_start( get_element_by_id('intro', webpage).strip(), '%s简介:' % category_name) + if category_desc == '暂无': + category_desc = None jsonm = self._parse_json(self._html_search_regex( r'var\s+jsonm\s*=\s*([^;]+);', webpage, 'category songs'), category_id) diff --git a/youtube_dl/extractor/litv.py b/youtube_dl/extractor/litv.py new file mode 100644 index 000000000..3356d015d --- /dev/null +++ b/youtube_dl/extractor/litv.py @@ -0,0 +1,137 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + int_or_none, + smuggle_url, + unsmuggle_url, +) + + +class LiTVIE(InfoExtractor): + _VALID_URL = r'https?://www\.litv\.tv/vod/[^/]+/content\.do\?.*?\bid=(?P[^&]+)' + + _URL_TEMPLATE = 'https://www.litv.tv/vod/%s/content.do?id=%s' + + _TESTS = [{ + 'url': 'https://www.litv.tv/vod/drama/content.do?brc_id=root&id=VOD00041610&isUHEnabled=true&autoPlay=1', + 'info_dict': { + 'id': 'VOD00041606', + 'title': '花千骨', + }, + 'playlist_count': 50, + }, { + 'url': 'https://www.litv.tv/vod/drama/content.do?brc_id=root&id=VOD00041610&isUHEnabled=true&autoPlay=1', + 'info_dict': { + 'id': 'VOD00041610', + 'ext': 'mp4', + 'title': '花千骨第1集', + 'thumbnail': 're:https?://.*\.jpg$', + 'description': 'md5:c7017aa144c87467c4fb2909c4b05d6f', + 'episode_number': 1, + }, + 'params': { + 'noplaylist': True, + 'skip_download': True, # m3u8 download + }, + 'skip': 'Georestricted to Taiwan', + }] + + def _extract_playlist(self, season_list, video_id, vod_data, view_data, prompt=True): + episode_title = view_data['title'] + content_id = season_list['contentId'] + + if prompt: + self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (content_id, video_id)) + + all_episodes = [ + self.url_result(smuggle_url( + self._URL_TEMPLATE % (view_data['contentType'], episode['contentId']), + {'force_noplaylist': True})) # To prevent infinite recursion + for episode in season_list['episode']] + + return self.playlist_result(all_episodes, content_id, episode_title) + + def _real_extract(self, url): + url, data = unsmuggle_url(url, {}) + + video_id = self._match_id(url) + + noplaylist = self._downloader.params.get('noplaylist') + noplaylist_prompt = True + if 'force_noplaylist' in data: + noplaylist = data['force_noplaylist'] + noplaylist_prompt = False + + webpage = self._download_webpage(url, video_id) + + view_data = dict(map(lambda t: (t[0], t[2]), re.findall( + r'viewData\.([a-zA-Z]+)\s*=\s*(["\'])([^"\']+)\2', + webpage))) + + vod_data = self._parse_json(self._search_regex( + 'var\s+vod\s*=\s*([^;]+)', webpage, 'VOD data', default='{}'), + video_id) + + season_list = list(vod_data.get('seasonList', {}).values()) + if season_list: + if not noplaylist: + return self._extract_playlist( + season_list[0], video_id, vod_data, view_data, + prompt=noplaylist_prompt) + + if noplaylist_prompt: + self.to_screen('Downloading just video %s because of --no-playlist' % video_id) + + # In browsers `getMainUrl` request is always issued. Usually this + # endpoint gives the same result as the data embedded in the webpage. + # If georestricted, there are no embedded data, so an extra request is + # necessary to get the error code + video_data = self._parse_json(self._search_regex( + r'uiHlsUrl\s*=\s*testBackendData\(([^;]+)\);', + webpage, 'video data', default='{}'), video_id) + if not video_data: + payload = { + 'assetId': view_data['assetId'], + 'watchDevices': vod_data['watchDevices'], + 'contentType': view_data['contentType'], + } + video_data = self._download_json( + 'https://www.litv.tv/vod/getMainUrl', video_id, + data=json.dumps(payload).encode('utf-8'), + headers={'Content-Type': 'application/json'}) + + if not video_data.get('fullpath'): + error_msg = video_data.get('errorMessage') + if error_msg == 'vod.error.outsideregionerror': + self.raise_geo_restricted('This video is available in Taiwan only') + if error_msg: + raise ExtractorError('%s said: %s' % (self.IE_NAME, error_msg), expected=True) + raise ExtractorError('Unexpected result from %s' % self.IE_NAME) + + formats = self._extract_m3u8_formats( + video_data['fullpath'], video_id, ext='mp4', m3u8_id='hls') + for a_format in formats: + # LiTV HLS segments doesn't like compressions + a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = True + + title = view_data['title'] + view_data.get('secondaryMark', '') + description = view_data.get('description') + thumbnail = view_data.get('imageFile') + categories = [item['name'] for item in vod_data.get('category', [])] + episode = int_or_none(view_data.get('episode')) + + return { + 'id': video_id, + 'formats': formats, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'categories': categories, + 'episode_number': episode, + } diff --git a/youtube_dl/extractor/liveleak.py b/youtube_dl/extractor/liveleak.py index 29fba5f30..ea0565ac0 100644 --- a/youtube_dl/extractor/liveleak.py +++ b/youtube_dl/extractor/liveleak.py @@ -17,7 +17,8 @@ class LiveLeakIE(InfoExtractor): 'ext': 'flv', 'description': 'extremely bad day for this guy..!', 'uploader': 'ljfriel2', - 'title': 'Most unlucky car accident' + 'title': 'Most unlucky car accident', + 'thumbnail': 're:^https?://.*\.jpg$' } }, { 'url': 'http://www.liveleak.com/view?i=f93_1390833151', @@ -28,6 +29,7 @@ class LiveLeakIE(InfoExtractor): 'description': 'German Television Channel NDR does an exclusive interview with Edward Snowden.\r\nUploaded on LiveLeak cause German Television thinks the rest of the world isn\'t intereseted in Edward Snowden.', 'uploader': 'ARD_Stinkt', 'title': 'German Television does first Edward Snowden Interview (ENGLISH)', + 'thumbnail': 're:^https?://.*\.jpg$' } }, { 'url': 'http://www.liveleak.com/view?i=4f7_1392687779', @@ -49,7 +51,8 @@ class LiveLeakIE(InfoExtractor): 'ext': 'mp4', 'description': 'Happened on 27.7.2014. \r\nAt 0:53 you can see people still swimming at near beach.', 'uploader': 'bony333', - 'title': 'Crazy Hungarian tourist films close call waterspout in Croatia' + 'title': 'Crazy Hungarian tourist films close call waterspout in Croatia', + 'thumbnail': 're:^https?://.*\.jpg$' } }] @@ -72,6 +75,7 @@ class LiveLeakIE(InfoExtractor): age_limit = int_or_none(self._search_regex( r'you confirm that you are ([0-9]+) years and over.', webpage, 'age limit', default=None)) + video_thumbnail = self._og_search_thumbnail(webpage) sources_raw = self._search_regex( r'(?s)sources:\s*(\[.*?\]),', webpage, 'video URLs', default=None) @@ -124,4 +128,5 @@ class LiveLeakIE(InfoExtractor): 'uploader': video_uploader, 'formats': formats, 'age_limit': age_limit, + 'thumbnail': video_thumbnail, } diff --git a/youtube_dl/extractor/microsoftvirtualacademy.py b/youtube_dl/extractor/microsoftvirtualacademy.py new file mode 100644 index 000000000..afd3e98ec --- /dev/null +++ b/youtube_dl/extractor/microsoftvirtualacademy.py @@ -0,0 +1,192 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import ( + compat_xpath, +) +from ..utils import ( + int_or_none, + parse_duration, + smuggle_url, + unsmuggle_url, + xpath_text, +) + + +class MicrosoftVirtualAcademyBaseIE(InfoExtractor): + def _extract_base_url(self, course_id, display_id): + return self._download_json( + 'https://api-mlxprod.microsoft.com/services/products/anonymous/%s' % course_id, + display_id, 'Downloading course base URL') + + def _extract_chapter_and_title(self, title): + if not title: + return None, None + m = re.search(r'(?P\d+)\s*\|\s*(?P.+)', title) + return (int(m.group('chapter')), m.group('title')) if m else (None, title) + + +class MicrosoftVirtualAcademyIE(MicrosoftVirtualAcademyBaseIE): + IE_NAME = 'mva' + IE_DESC = 'Microsoft Virtual Academy videos' + _VALID_URL = r'(?:%s:|https?://(?:mva\.microsoft|(?:www\.)?microsoftvirtualacademy)\.com/[^/]+/training-courses/[^/?#&]+-)(?P<course_id>\d+)(?::|\?l=)(?P<id>[\da-zA-Z]+_\d+)' % IE_NAME + + _TESTS = [{ + 'url': 'https://mva.microsoft.com/en-US/training-courses/microsoft-azure-fundamentals-virtual-machines-11788?l=gfVXISmEB_6804984382', + 'md5': '7826c44fc31678b12ad8db11f6b5abb9', + 'info_dict': { + 'id': 'gfVXISmEB_6804984382', + 'ext': 'mp4', + 'title': 'Course Introduction', + 'formats': 'mincount:3', + 'subtitles': { + 'en': [{ + 'ext': 'ttml', + }], + }, + } + }, { + 'url': 'mva:11788:gfVXISmEB_6804984382', + 'only_matching': True, + }] + + def _real_extract(self, url): + url, smuggled_data = unsmuggle_url(url, {}) + + mobj = re.match(self._VALID_URL, url) + course_id = mobj.group('course_id') + video_id = mobj.group('id') + + base_url = smuggled_data.get('base_url') or self._extract_base_url(course_id, video_id) + + settings = self._download_xml( + '%s/content/content_%s/videosettings.xml?v=1' % (base_url, video_id), + video_id, 'Downloading video settings XML') + + _, title = self._extract_chapter_and_title(xpath_text( + settings, './/Title', 'title', fatal=True)) + + formats = [] + + for sources in settings.findall(compat_xpath('.//MediaSources')): + if sources.get('videoType') == 'smoothstreaming': + continue + for source in sources.findall(compat_xpath('./MediaSource')): + video_url = source.text + if not video_url or not video_url.startswith('http'): + continue + video_mode = source.get('videoMode') + height = int_or_none(self._search_regex( + r'^(\d+)[pP]$', video_mode or '', 'height', default=None)) + codec = source.get('codec') + acodec, vcodec = [None] * 2 + if codec: + codecs = codec.split(',') + if len(codecs) == 2: + acodec, vcodec = codecs + elif len(codecs) == 1: + vcodec = codecs[0] + formats.append({ + 'url': video_url, + 'format_id': video_mode, + 'height': height, + 'acodec': acodec, + 'vcodec': vcodec, + }) + self._sort_formats(formats) + + subtitles = {} + for source in settings.findall(compat_xpath('.//MarkerResourceSource')): + subtitle_url = source.text + if not subtitle_url: + continue + subtitles.setdefault('en', []).append({ + 'url': '%s/%s' % (base_url, subtitle_url), + 'ext': source.get('type'), + }) + + return { + 'id': video_id, + 'title': title, + 'subtitles': subtitles, + 'formats': formats + } + + +class MicrosoftVirtualAcademyCourseIE(MicrosoftVirtualAcademyBaseIE): + IE_NAME = 'mva:course' + IE_DESC = 'Microsoft Virtual Academy courses' + _VALID_URL = r'(?:%s:|https?://(?:mva\.microsoft|(?:www\.)?microsoftvirtualacademy)\.com/[^/]+/training-courses/(?P<display_id>[^/?#&]+)-)(?P<id>\d+)' % IE_NAME + + _TESTS = [{ + 'url': 'https://mva.microsoft.com/en-US/training-courses/microsoft-azure-fundamentals-virtual-machines-11788', + 'info_dict': { + 'id': '11788', + 'title': 'Microsoft Azure Fundamentals: Virtual Machines', + }, + 'playlist_count': 36, + }, { + # with emphasized chapters + 'url': 'https://mva.microsoft.com/en-US/training-courses/developing-windows-10-games-with-construct-2-16335', + 'info_dict': { + 'id': '16335', + 'title': 'Developing Windows 10 Games with Construct 2', + }, + 'playlist_count': 10, + }, { + 'url': 'https://www.microsoftvirtualacademy.com/en-US/training-courses/microsoft-azure-fundamentals-virtual-machines-11788', + 'only_matching': True, + }, { + 'url': 'mva:course:11788', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if MicrosoftVirtualAcademyIE.suitable(url) else super( + MicrosoftVirtualAcademyCourseIE, cls).suitable(url) + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + course_id = mobj.group('id') + display_id = mobj.group('display_id') + + base_url = self._extract_base_url(course_id, display_id) + + manifest = self._download_json( + '%s/imsmanifestlite.json' % base_url, + display_id, 'Downloading course manifest JSON')['manifest'] + + organization = manifest['organizations']['organization'][0] + + entries = [] + for chapter in organization['item']: + chapter_number, chapter_title = self._extract_chapter_and_title(chapter.get('title')) + chapter_id = chapter.get('@identifier') + for item in chapter.get('item', []): + item_id = item.get('@identifier') + if not item_id: + continue + metadata = item.get('resource', {}).get('metadata') or {} + if metadata.get('learningresourcetype') != 'Video': + continue + _, title = self._extract_chapter_and_title(item.get('title')) + duration = parse_duration(metadata.get('duration')) + description = metadata.get('description') + entries.append({ + '_type': 'url_transparent', + 'url': smuggle_url( + 'mva:%s:%s' % (course_id, item_id), {'base_url': base_url}), + 'title': title, + 'description': description, + 'duration': duration, + 'chapter': chapter_title, + 'chapter_number': chapter_number, + 'chapter_id': chapter_id, + }) + + title = organization.get('title') or manifest.get('metadata', {}).get('title') + + return self.playlist_result(entries, course_id, title) diff --git a/youtube_dl/extractor/periscope.py b/youtube_dl/extractor/periscope.py index 514e9b433..0a4bc761d 100644 --- a/youtube_dl/extractor/periscope.py +++ b/youtube_dl/extractor/periscope.py @@ -7,6 +7,7 @@ from ..utils import parse_iso8601 class PeriscopeIE(InfoExtractor): IE_DESC = 'Periscope' + IE_NAME = 'periscope' _VALID_URL = r'https?://(?:www\.)?periscope\.tv/[^/]+/(?P<id>[^/?#]+)' # Alive example URLs can be found here http://onperiscope.com/ _TESTS = [{ @@ -79,3 +80,39 @@ class PeriscopeIE(InfoExtractor): 'thumbnails': thumbnails, 'formats': formats, } + + +class PeriscopeUserIE(InfoExtractor): + _VALID_URL = r'https?://www\.periscope\.tv/(?P<id>[^/]+)/?$' + IE_DESC = 'Periscope user videos' + IE_NAME = 'periscope:user' + + _TEST = { + 'url': 'https://www.periscope.tv/LularoeHusbandMike/', + 'info_dict': { + 'id': 'LularoeHusbandMike', + 'title': 'LULAROE HUSBAND MIKE', + }, + # Periscope only shows videos in the last 24 hours, so it's possible to + # get 0 videos + 'playlist_mincount': 0, + } + + def _real_extract(self, url): + user_id = self._match_id(url) + + webpage = self._download_webpage(url, user_id) + + broadcast_data = self._parse_json(self._html_search_meta( + 'broadcast-data', webpage, default='{}'), user_id) + username = broadcast_data.get('user', {}).get('display_name') + user_broadcasts = self._parse_json( + self._html_search_meta('user-broadcasts', webpage, default='{}'), + user_id) + + entries = [ + self.url_result( + 'https://www.periscope.tv/%s/%s' % (user_id, broadcast['id'])) + for broadcast in user_broadcasts.get('broadcasts', [])] + + return self.playlist_result(entries, user_id, username) diff --git a/youtube_dl/extractor/redtube.py b/youtube_dl/extractor/redtube.py index 7ba41ba59..721fc3a9e 100644 --- a/youtube_dl/extractor/redtube.py +++ b/youtube_dl/extractor/redtube.py @@ -1,7 +1,12 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import ExtractorError +from ..utils import ( + ExtractorError, + int_or_none, + str_to_int, + unified_strdate, +) class RedTubeIE(InfoExtractor): @@ -13,6 +18,9 @@ class RedTubeIE(InfoExtractor): 'id': '66418', 'ext': 'mp4', 'title': 'Sucked on a toilet', + 'upload_date': '20120831', + 'duration': 596, + 'view_count': int, 'age_limit': 18, } } @@ -24,12 +32,39 @@ class RedTubeIE(InfoExtractor): if any(s in webpage for s in ['video-deleted-info', '>This video has been removed']): raise ExtractorError('Video %s has been removed' % video_id, expected=True) - video_url = self._html_search_regex( - r'<source src="(.+?)" type="video/mp4">', webpage, 'video URL') - video_title = self._html_search_regex( - r'<h1 class="videoTitle[^"]*">(.+?)</h1>', - webpage, 'title') - video_thumbnail = self._og_search_thumbnail(webpage) + title = self._html_search_regex( + (r'<h1 class="videoTitle[^"]*">(?P<title>.+?)</h1>', + r'videoTitle\s*:\s*(["\'])(?P<title>)\1'), + webpage, 'title', group='title') + + formats = [] + sources = self._parse_json( + self._search_regex( + r'sources\s*:\s*({.+?})', webpage, 'source', default='{}'), + video_id, fatal=False) + if sources and isinstance(sources, dict): + for format_id, format_url in sources.items(): + if format_url: + formats.append({ + 'url': format_url, + 'format_id': format_id, + 'height': int_or_none(format_id), + }) + else: + video_url = self._html_search_regex( + r'<source src="(.+?)" type="video/mp4">', webpage, 'video URL') + formats.append({'url': video_url}) + self._sort_formats(formats) + + thumbnail = self._og_search_thumbnail(webpage) + upload_date = unified_strdate(self._search_regex( + r'<span[^>]+class="added-time"[^>]*>ADDED ([^<]+)<', + webpage, 'upload date', fatal=False)) + duration = int_or_none(self._search_regex( + r'videoDuration\s*:\s*(\d+)', webpage, 'duration', fatal=False)) + view_count = str_to_int(self._search_regex( + r'<span[^>]*>VIEWS</span></td>\s*<td>([\d,.]+)', + webpage, 'view count', fatal=False)) # No self-labeling, but they describe themselves as # "Home of Videos Porno" @@ -37,9 +72,12 @@ class RedTubeIE(InfoExtractor): return { 'id': video_id, - 'url': video_url, 'ext': 'mp4', - 'title': video_title, - 'thumbnail': video_thumbnail, + 'title': title, + 'thumbnail': thumbnail, + 'upload_date': upload_date, + 'duration': duration, + 'view_count': view_count, 'age_limit': age_limit, + 'formats': formats, } diff --git a/youtube_dl/extractor/tagesschau.py b/youtube_dl/extractor/tagesschau.py index fcccb230c..136e18f96 100644 --- a/youtube_dl/extractor/tagesschau.py +++ b/youtube_dl/extractor/tagesschau.py @@ -4,42 +4,178 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import parse_filesize +from ..utils import ( + determine_ext, + js_to_json, + parse_iso8601, + parse_filesize, +) + + +class TagesschauPlayerIE(InfoExtractor): + IE_NAME = 'tagesschau:player' + _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/multimedia/(?P<kind>audio|video)/(?P=kind)-(?P<id>\d+)~player(?:_[^/?#&]+)?\.html' + + _TESTS = [{ + 'url': 'http://www.tagesschau.de/multimedia/video/video-179517~player.html', + 'md5': '8d09548d5c15debad38bee3a4d15ca21', + 'info_dict': { + 'id': '179517', + 'ext': 'mp4', + 'title': 'Marie Kristin Boese, ARD Berlin, über den zukünftigen Kurs der AfD', + 'thumbnail': 're:^https?:.*\.jpg$', + 'formats': 'mincount:6', + }, + }, { + 'url': 'https://www.tagesschau.de/multimedia/audio/audio-29417~player.html', + 'md5': '76e6eec6ebd40740671cf0a2c88617e5', + 'info_dict': { + 'id': '29417', + 'ext': 'mp3', + 'title': 'Trabi - Bye, bye Rennpappe', + 'thumbnail': 're:^https?:.*\.jpg$', + 'formats': 'mincount:2', + }, + }, { + 'url': 'http://www.tagesschau.de/multimedia/audio/audio-29417~player_autoplay-true.html', + 'only_matching': True, + }] + + _FORMATS = { + 'xs': {'quality': 0}, + 's': {'width': 320, 'height': 180, 'quality': 1}, + 'm': {'width': 512, 'height': 288, 'quality': 2}, + 'l': {'width': 960, 'height': 540, 'quality': 3}, + 'xl': {'width': 1280, 'height': 720, 'quality': 4}, + 'xxl': {'quality': 5}, + } + + def _extract_via_api(self, kind, video_id): + info = self._download_json( + 'https://www.tagesschau.de/api/multimedia/{0}/{0}-{1}.json'.format(kind, video_id), + video_id) + title = info['headline'] + formats = [] + for media in info['mediadata']: + for format_id, format_url in media.items(): + if determine_ext(format_url) == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls')) + else: + formats.append({ + 'url': format_url, + 'format_id': format_id, + 'vcodec': 'none' if kind == 'audio' else None, + }) + self._sort_formats(formats) + timestamp = parse_iso8601(info.get('date')) + return { + 'id': video_id, + 'title': title, + 'timestamp': timestamp, + 'formats': formats, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + # kind = mobj.group('kind').lower() + # if kind == 'video': + # return self._extract_via_api(kind, video_id) + + # JSON api does not provide some audio formats (e.g. ogg) thus + # extractiong audio via webpage + + webpage = self._download_webpage(url, video_id) + + title = self._og_search_title(webpage).strip() + formats = [] + + for media_json in re.findall(r'({src\s*:\s*["\']http[^}]+type\s*:[^}]+})', webpage): + media = self._parse_json(js_to_json(media_json), video_id, fatal=False) + if not media: + continue + src = media.get('src') + if not src: + return + quality = media.get('quality') + kind = media.get('type', '').split('/')[0] + ext = determine_ext(src) + f = { + 'url': src, + 'format_id': '%s_%s' % (quality, ext) if quality else ext, + 'ext': ext, + 'vcodec': 'none' if kind == 'audio' else None, + } + f.update(self._FORMATS.get(quality, {})) + formats.append(f) + + self._sort_formats(formats) + + thumbnail = self._og_search_thumbnail(webpage) + + return { + 'id': video_id, + 'title': title, + 'thumbnail': thumbnail, + 'formats': formats, + } class TagesschauIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/multimedia/(?:[^/]+/)*?[^/#?]+?(?P<id>-?[0-9]+)(?:~_?[^/#?]+?)?\.html' + _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/(?P<path>[^/]+/(?:[^/]+/)*?(?P<id>[^/#?]+?(?:-?[0-9]+)?))(?:~_?[^/#?]+?)?\.html' _TESTS = [{ 'url': 'http://www.tagesschau.de/multimedia/video/video-102143.html', - 'md5': '917a228bc7df7850783bc47979673a09', + 'md5': 'f7c27a0eff3bfe8c7727e65f8fe1b1e6', 'info_dict': { - 'id': '102143', + 'id': 'video-102143', 'ext': 'mp4', 'title': 'Regierungsumbildung in Athen: Neue Minister in Griechenland vereidigt', - 'description': 'md5:171feccd9d9b3dd54d05d501568f6359', + 'description': '18.07.2015 20:10 Uhr', 'thumbnail': 're:^https?:.*\.jpg$', }, }, { 'url': 'http://www.tagesschau.de/multimedia/sendung/ts-5727.html', 'md5': '3c54c1f6243d279b706bde660ceec633', 'info_dict': { - 'id': '5727', + 'id': 'ts-5727', 'ext': 'mp4', - 'description': 'md5:695c01bfd98b7e313c501386327aea59', 'title': 'Sendung: tagesschau \t04.12.2014 20:00 Uhr', + 'description': 'md5:695c01bfd98b7e313c501386327aea59', 'thumbnail': 're:^https?:.*\.jpg$', }, }, { - 'url': 'http://www.tagesschau.de/multimedia/politikimradio/audio-18407.html', - 'md5': 'aef45de271c4bf0a5db834aa40bf774c', + # exclusive audio + 'url': 'http://www.tagesschau.de/multimedia/audio/audio-29417.html', + 'md5': '76e6eec6ebd40740671cf0a2c88617e5', 'info_dict': { - 'id': '18407', + 'id': 'audio-29417', 'ext': 'mp3', - 'title': 'Flüchtlingsdebatte: Hitzig, aber wenig hilfreich', - 'description': 'Flüchtlingsdebatte: Hitzig, aber wenig hilfreich', + 'title': 'Trabi - Bye, bye Rennpappe', + 'description': 'md5:8687dda862cbbe2cfb2df09b56341317', 'thumbnail': 're:^https?:.*\.jpg$', }, + }, { + # audio in article + 'url': 'http://www.tagesschau.de/inland/bnd-303.html', + 'md5': 'e0916c623e85fc1d2b26b78f299d3958', + 'info_dict': { + 'id': 'bnd-303', + 'ext': 'mp3', + 'title': 'Viele Baustellen für neuen BND-Chef', + 'description': 'md5:1e69a54be3e1255b2b07cdbce5bcd8b4', + 'thumbnail': 're:^https?:.*\.jpg$', + }, + }, { + 'url': 'http://www.tagesschau.de/inland/afd-parteitag-135.html', + 'info_dict': { + 'id': 'afd-parteitag-135', + 'title': 'Möchtegern-Underdog mit Machtanspruch', + }, + 'playlist_count': 2, }, { 'url': 'http://www.tagesschau.de/multimedia/sendung/tsg-3771.html', 'only_matching': True, @@ -62,93 +198,107 @@ class TagesschauIE(InfoExtractor): 'url': 'http://www.tagesschau.de/multimedia/video/video-102303~_bab-sendung-211.html', 'only_matching': True, }, { - 'url': 'http://www.tagesschau.de/multimedia/video/video-179517~player.html', + 'url': 'http://www.tagesschau.de/100sekunden/index.html', + 'only_matching': True, + }, { + # playlist article with collapsing sections + 'url': 'http://www.tagesschau.de/wirtschaft/faq-freihandelszone-eu-usa-101.html', 'only_matching': True, }] - _FORMATS = { - 'xs': {'quality': 0}, - 's': {'width': 320, 'height': 180, 'quality': 1}, - 'm': {'width': 512, 'height': 288, 'quality': 2}, - 'l': {'width': 960, 'height': 540, 'quality': 3}, - 'xl': {'width': 1280, 'height': 720, 'quality': 4}, - 'xxl': {'quality': 5}, - } + @classmethod + def suitable(cls, url): + return False if TagesschauPlayerIE.suitable(url) else super(TagesschauIE, cls).suitable(url) + + def _extract_formats(self, download_text, media_kind): + links = re.finditer( + r'<div class="button" title="(?P<title>[^"]*)"><a href="(?P<url>[^"]+)">(?P<name>.+?)</a></div>', + download_text) + formats = [] + for l in links: + link_url = l.group('url') + if not link_url: + continue + format_id = self._search_regex( + r'.*/[^/.]+\.([^/]+)\.[^/.]+$', link_url, 'format ID', + default=determine_ext(link_url)) + format = { + 'format_id': format_id, + 'url': l.group('url'), + 'format_name': l.group('name'), + } + title = l.group('title') + if title: + if media_kind.lower() == 'video': + m = re.match( + r'''(?x) + Video:\s*(?P<vcodec>[a-zA-Z0-9/._-]+)\s*&\#10; + (?P<width>[0-9]+)x(?P<height>[0-9]+)px&\#10; + (?P<vbr>[0-9]+)kbps&\#10; + Audio:\s*(?P<abr>[0-9]+)kbps,\s*(?P<audio_desc>[A-Za-z\.0-9]+)&\#10; + Größe:\s*(?P<filesize_approx>[0-9.,]+\s+[a-zA-Z]*B)''', + title) + if m: + format.update({ + 'format_note': m.group('audio_desc'), + 'vcodec': m.group('vcodec'), + 'width': int(m.group('width')), + 'height': int(m.group('height')), + 'abr': int(m.group('abr')), + 'vbr': int(m.group('vbr')), + 'filesize_approx': parse_filesize(m.group('filesize_approx')), + }) + else: + m = re.match( + r'(?P<format>.+?)-Format\s*:\s*(?P<abr>\d+)kbps\s*,\s*(?P<note>.+)', + title) + if m: + format.update({ + 'format_note': '%s, %s' % (m.group('format'), m.group('note')), + 'vcodec': 'none', + 'abr': int(m.group('abr')), + }) + formats.append(format) + self._sort_formats(formats) + return formats def _real_extract(self, url): - video_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') or mobj.group('path') display_id = video_id.lstrip('-') + webpage = self._download_webpage(url, display_id) - player_url = self._html_search_meta( - 'twitter:player', webpage, 'player URL', default=None) - if player_url: - playerpage = self._download_webpage( - player_url, display_id, 'Downloading player page') + title = self._html_search_regex( + r'<span[^>]*class="headline"[^>]*>(.+?)</span>', + webpage, 'title', default=None) or self._og_search_title(webpage) - formats = [] - for media in re.finditer( - r'''(?x) - (?P<q_url>["\'])(?P<url>http://media.+?)(?P=q_url) - ,\s*type:(?P<q_type>["\'])(?P<type>video|audio)/(?P<ext>.+?)(?P=q_type) - (?:,\s*quality:(?P<q_quality>["\'])(?P<quality>.+?)(?P=q_quality))? - ''', playerpage): - url = media.group('url') - type_ = media.group('type') - ext = media.group('ext') - res = media.group('quality') - f = { - 'format_id': '%s_%s' % (res, ext) if res else ext, - 'url': url, - 'ext': ext, - 'vcodec': 'none' if type_ == 'audio' else None, - } - f.update(self._FORMATS.get(res, {})) - formats.append(f) - thumbnail = self._og_search_thumbnail(playerpage) - title = self._og_search_title(webpage).strip() - description = self._og_search_description(webpage).strip() - else: + DOWNLOAD_REGEX = r'(?s)<p>Wir bieten dieses (?P<kind>Video|Audio) in folgenden Formaten zum Download an:</p>\s*<div class="controls">(?P<links>.*?)</div>\s*<p>' + + webpage_type = self._og_search_property('type', webpage, default=None) + if webpage_type == 'website': # Article + entries = [] + for num, (entry_title, media_kind, download_text) in enumerate(re.findall( + r'(?s)<p[^>]+class="infotext"[^>]*>\s*(?:<a[^>]+>)?\s*<strong>(.+?)</strong>.*?</p>.*?%s' % DOWNLOAD_REGEX, + webpage), 1): + entries.append({ + 'id': '%s-%d' % (display_id, num), + 'title': '%s' % entry_title, + 'formats': self._extract_formats(download_text, media_kind), + }) + if len(entries) > 1: + return self.playlist_result(entries, display_id, title) + formats = entries[0]['formats'] + else: # Assume single video download_text = self._search_regex( - r'(?s)<p>Wir bieten dieses Video in folgenden Formaten zum Download an:</p>\s*<div class="controls">(.*?)</div>\s*<p>', - webpage, 'download links') - links = re.finditer( - r'<div class="button" title="(?P<title>[^"]*)"><a href="(?P<url>[^"]+)">(?P<name>.+?)</a></div>', - download_text) - formats = [] - for l in links: - format_id = self._search_regex( - r'.*/[^/.]+\.([^/]+)\.[^/.]+', l.group('url'), 'format ID') - format = { - 'format_id': format_id, - 'url': l.group('url'), - 'format_name': l.group('name'), - } - m = re.match( - r'''(?x) - Video:\s*(?P<vcodec>[a-zA-Z0-9/._-]+)\s*&\#10; - (?P<width>[0-9]+)x(?P<height>[0-9]+)px&\#10; - (?P<vbr>[0-9]+)kbps&\#10; - Audio:\s*(?P<abr>[0-9]+)kbps,\s*(?P<audio_desc>[A-Za-z\.0-9]+)&\#10; - Größe:\s*(?P<filesize_approx>[0-9.,]+\s+[a-zA-Z]*B)''', - l.group('title')) - if m: - format.update({ - 'format_note': m.group('audio_desc'), - 'vcodec': m.group('vcodec'), - 'width': int(m.group('width')), - 'height': int(m.group('height')), - 'abr': int(m.group('abr')), - 'vbr': int(m.group('vbr')), - 'filesize_approx': parse_filesize(m.group('filesize_approx')), - }) - formats.append(format) - thumbnail = self._og_search_thumbnail(webpage) - description = self._html_search_regex( - r'(?s)<p class="teasertext">(.*?)</p>', - webpage, 'description', default=None) - title = self._html_search_regex( - r'<span class="headline".*?>(.*?)</span>', webpage, 'title') + DOWNLOAD_REGEX, webpage, 'download links', group='links') + media_kind = self._search_regex( + DOWNLOAD_REGEX, webpage, 'media kind', default='Video', group='kind') + formats = self._extract_formats(download_text, media_kind) + thumbnail = self._og_search_thumbnail(webpage) + description = self._html_search_regex( + r'(?s)<p class="teasertext">(.*?)</p>', + webpage, 'description', default=None) self._sort_formats(formats) diff --git a/youtube_dl/extractor/telegraaf.py b/youtube_dl/extractor/telegraaf.py index 6f8333cfc..9092e9b85 100644 --- a/youtube_dl/extractor/telegraaf.py +++ b/youtube_dl/extractor/telegraaf.py @@ -2,14 +2,16 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import remove_end +from ..utils import ( + determine_ext, + remove_end, +) class TelegraafIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?telegraaf\.nl/tv/(?:[^/]+/)+(?P<id>\d+)/[^/]+\.html' _TEST = { 'url': 'http://www.telegraaf.nl/tv/nieuws/binnenland/24353229/__Tikibad_ontruimd_wegens_brand__.html', - 'md5': '83245a9779bcc4a24454bfd53c65b6dc', 'info_dict': { 'id': '24353229', 'ext': 'mp4', @@ -18,18 +20,60 @@ class TelegraafIE(InfoExtractor): 'thumbnail': 're:^https?://.*\.jpg$', 'duration': 33, }, + 'params': { + # m3u8 download + 'skip_download': True, + }, } def _real_extract(self, url): - playlist_id = self._match_id(url) + video_id = self._match_id(url) - webpage = self._download_webpage(url, playlist_id) + webpage = self._download_webpage(url, video_id) + player_url = self._html_search_regex( + r'<iframe[^>]+src="([^"]+")', webpage, 'player URL') + player_page = self._download_webpage( + player_url, video_id, note='Download player webpage') playlist_url = self._search_regex( - r"iframe\.loadPlayer\('([^']+)'", webpage, 'player') + r'playlist\s*:\s*"([^"]+)"', player_page, 'playlist URL') + playlist_data = self._download_json(playlist_url, video_id) + + item = playlist_data['items'][0] + formats = [] + locations = item['locations'] + for location in locations.get('adaptive', []): + manifest_url = location['src'] + ext = determine_ext(manifest_url) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + manifest_url, video_id, ext='mp4', m3u8_id='hls')) + elif ext == 'mpd': + # TODO: Current DASH formats are broken - $Time$ pattern in + # <SegmentTemplate> not implemented yet + continue + else: + self.report_warning('Unknown adaptive format %s' % ext) + for location in locations.get('progressive', []): + formats.append({ + 'url': location['sources'][0]['src'], + 'width': location.get('width'), + 'height': location.get('height'), + 'format_id': 'http-%s' % location['label'], + }) + + self._sort_formats(formats) - entries = self._extract_xspf_playlist(playlist_url, playlist_id) title = remove_end(self._og_search_title(webpage), ' - VIDEO') description = self._og_search_description(webpage) + duration = item.get('duration') + thumbnail = item.get('poster') - return self.playlist_result(entries, playlist_id, title, description) + return { + 'id': video_id, + 'title': title, + 'description': description, + 'formats': formats, + 'duration': duration, + 'thumbnail': thumbnail, + } diff --git a/youtube_dl/extractor/udemy.py b/youtube_dl/extractor/udemy.py index d1e6f2703..13e0cd237 100644 --- a/youtube_dl/extractor/udemy.py +++ b/youtube_dl/extractor/udemy.py @@ -5,7 +5,6 @@ import re from .common import InfoExtractor from ..compat import ( compat_HTTPError, - compat_urllib_parse_urlencode, compat_urllib_request, compat_urlparse, ) @@ -84,18 +83,19 @@ class UdemyIE(InfoExtractor): if enroll_url: webpage = self._download_webpage( combine_url(base_url, enroll_url), - course_id, 'Enrolling in the course') + course_id, 'Enrolling in the course', + headers={'Referer': base_url}) if '>You have enrolled in' in webpage: self.to_screen('%s: Successfully enrolled in the course' % course_id) def _download_lecture(self, course_id, lecture_id): return self._download_json( - 'https://www.udemy.com/api-2.0/users/me/subscribed-courses/%s/lectures/%s?%s' % ( - course_id, lecture_id, compat_urllib_parse_urlencode({ - 'fields[lecture]': 'title,description,view_html,asset', - 'fields[asset]': 'asset_type,stream_url,thumbnail_url,download_urls,data', - })), - lecture_id, 'Downloading lecture JSON') + 'https://www.udemy.com/api-2.0/users/me/subscribed-courses/%s/lectures/%s?' + % (course_id, lecture_id), + lecture_id, 'Downloading lecture JSON', query={ + 'fields[lecture]': 'title,description,view_html,asset', + 'fields[asset]': 'asset_type,stream_url,thumbnail_url,download_urls,data', + }) def _handle_error(self, response): if not isinstance(response, dict): @@ -155,13 +155,13 @@ class UdemyIE(InfoExtractor): 'password': password, }) - request = sanitized_Request( - self._LOGIN_URL, urlencode_postdata(login_form)) - request.add_header('Referer', self._ORIGIN_URL) - request.add_header('Origin', self._ORIGIN_URL) - response = self._download_webpage( - request, None, 'Logging in as %s' % username) + self._LOGIN_URL, None, 'Logging in as %s' % username, + data=urlencode_postdata(login_form), + headers={ + 'Referer': self._ORIGIN_URL, + 'Origin': self._ORIGIN_URL, + }) if not is_logged(response): error = self._html_search_regex( diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py index 147480f64..c0632cd6a 100644 --- a/youtube_dl/extractor/vevo.py +++ b/youtube_dl/extractor/vevo.py @@ -3,7 +3,11 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_etree_fromstring +from ..compat import ( + compat_etree_fromstring, + compat_str, + compat_urlparse, +) from ..utils import ( ExtractorError, int_or_none, @@ -12,13 +16,22 @@ from ..utils import ( ) -class VevoIE(InfoExtractor): +class VevoBaseIE(InfoExtractor): + def _extract_json(self, webpage, video_id, item): + return self._parse_json( + self._search_regex( + r'window\.__INITIAL_STORE__\s*=\s*({.+?});\s*</script>', + webpage, 'initial store'), + video_id)['default'][item] + + +class VevoIE(VevoBaseIE): ''' Accepts urls from vevo.com or in the format 'vevo:{id}' (currently used by MTVIE and MySpaceIE) ''' _VALID_URL = r'''(?x) - (?:https?://www\.vevo\.com/watch/(?:[^/]+/(?:[^/]+/)?)?| + (?:https?://www\.vevo\.com/watch/(?!playlist|genre)(?:[^/]+/(?:[^/]+/)?)?| https?://cache\.vevo\.com/m/html/embed\.html\?video=| https?://videoplayer\.vevo\.com/embed/embedded\?videoId=| vevo:) @@ -30,11 +43,15 @@ class VevoIE(InfoExtractor): 'info_dict': { 'id': 'GB1101300280', 'ext': 'mp4', - 'title': 'Somebody to Die For', + 'title': 'Hurts - Somebody to Die For', + 'timestamp': 1372057200, 'upload_date': '20130624', 'uploader': 'Hurts', - 'timestamp': 1372057200, + 'track': 'Somebody to Die For', + 'artist': 'Hurts', + 'genre': 'Pop', }, + 'expected_warnings': ['Unable to download SMIL file'], }, { 'note': 'v3 SMIL format', 'url': 'http://www.vevo.com/watch/cassadee-pope/i-wish-i-could-break-your-heart/USUV71302923', @@ -42,23 +59,31 @@ class VevoIE(InfoExtractor): 'info_dict': { 'id': 'USUV71302923', 'ext': 'mp4', - 'title': 'I Wish I Could Break Your Heart', + 'title': 'Cassadee Pope - I Wish I Could Break Your Heart', + 'timestamp': 1392796919, 'upload_date': '20140219', 'uploader': 'Cassadee Pope', - 'timestamp': 1392796919, + 'track': 'I Wish I Could Break Your Heart', + 'artist': 'Cassadee Pope', + 'genre': 'Country', }, + 'expected_warnings': ['Unable to download SMIL file'], }, { 'note': 'Age-limited video', 'url': 'https://www.vevo.com/watch/justin-timberlake/tunnel-vision-explicit/USRV81300282', 'info_dict': { 'id': 'USRV81300282', 'ext': 'mp4', - 'title': 'Tunnel Vision (Explicit)', - 'upload_date': '20130703', + 'title': 'Justin Timberlake - Tunnel Vision (Explicit)', 'age_limit': 18, - 'uploader': 'Justin Timberlake', 'timestamp': 1372888800, + 'upload_date': '20130703', + 'uploader': 'Justin Timberlake', + 'track': 'Tunnel Vision (Explicit)', + 'artist': 'Justin Timberlake', + 'genre': 'Pop', }, + 'expected_warnings': ['Unable to download SMIL file'], }, { 'note': 'No video_info', 'url': 'http://www.vevo.com/watch/k-camp-1/Till-I-Die/USUV71503000', @@ -66,12 +91,36 @@ class VevoIE(InfoExtractor): 'info_dict': { 'id': 'USUV71503000', 'ext': 'mp4', - 'title': 'Till I Die', - 'upload_date': '20151207', + 'title': 'K Camp - Till I Die', 'age_limit': 18, - 'uploader': 'K Camp', 'timestamp': 1449468000, + 'upload_date': '20151207', + 'uploader': 'K Camp', + 'track': 'Till I Die', + 'artist': 'K Camp', + 'genre': 'Rap/Hip-Hop', }, + }, { + 'note': 'Only available via webpage', + 'url': 'http://www.vevo.com/watch/GBUV71600656', + 'md5': '67e79210613865b66a47c33baa5e37fe', + 'info_dict': { + 'id': 'GBUV71600656', + 'ext': 'mp4', + 'title': 'ABC - Viva Love', + 'age_limit': 0, + 'timestamp': 1461830400, + 'upload_date': '20160428', + 'uploader': 'ABC', + 'track': 'Viva Love', + 'artist': 'ABC', + 'genre': 'Pop', + }, + 'expected_warnings': ['Failed to download video versions info'], + }, { + # no genres available + 'url': 'http://www.vevo.com/watch/INS171400764', + 'only_matching': True, }] _SMIL_BASE_URL = 'http://smil.lvl3.vevo.com' _SOURCE_TYPES = { @@ -140,30 +189,31 @@ class VevoIE(InfoExtractor): errnote='Unable to retrieve oauth token') if 'THIS PAGE IS CURRENTLY UNAVAILABLE IN YOUR REGION' in webpage: - raise ExtractorError( - '%s said: This page is currently unavailable in your region.' % self.IE_NAME, expected=True) + self.raise_geo_restricted( + '%s said: This page is currently unavailable in your region' % self.IE_NAME) auth_info = self._parse_json(webpage, video_id) self._api_url_template = self.http_scheme() + '//apiv2.vevo.com/%s?token=' + auth_info['access_token'] - def _call_api(self, path, video_id, note, errnote, fatal=True): - return self._download_json(self._api_url_template % path, video_id, note, errnote) + def _call_api(self, path, *args, **kwargs): + return self._download_json(self._api_url_template % path, *args, **kwargs) def _real_extract(self, url): video_id = self._match_id(url) json_url = 'http://api.vevo.com/VideoService/AuthenticateVideo?isrc=%s' % video_id response = self._download_json( - json_url, video_id, 'Downloading video info', 'Unable to download info') + json_url, video_id, 'Downloading video info', + 'Unable to download info', fatal=False) or {} video_info = response.get('video') or {} - video_versions = video_info.get('videoVersions') + artist = None + featured_artist = None uploader = None - timestamp = None view_count = None formats = [] if not video_info: - if response.get('statusCode') != 909: + if response and response.get('statusCode') != 909: ytid = response.get('errorInfo', {}).get('ytid') if ytid: self.report_warning( @@ -183,12 +233,19 @@ class VevoIE(InfoExtractor): video_versions = self._call_api( 'video/%s/streams' % video_id, video_id, 'Downloading video versions info', - 'Failed to download video versions info') + 'Failed to download video versions info', + fatal=False) + + # Some videos are only available via webpage (e.g. + # https://github.com/rg3/youtube-dl/issues/9366) + if not video_versions: + webpage = self._download_webpage(url, video_id) + video_versions = self._extract_json(webpage, video_id, 'streams')[video_id][0] timestamp = parse_iso8601(video_info.get('releaseDate')) artists = video_info.get('artists') if artists: - uploader = artists[0]['name'] + artist = uploader = artists[0]['name'] view_count = int_or_none(video_info.get('views', {}).get('total')) for video_version in video_versions: @@ -241,7 +298,11 @@ class VevoIE(InfoExtractor): scale=1000) artists = video_info.get('mainArtists') if artists: - uploader = artists[0]['artistName'] + artist = uploader = artists[0]['artistName'] + + featured_artists = video_info.get('featuredArtists') + if featured_artists: + featured_artist = featured_artists[0]['artistName'] smil_parsed = False for video_version in video_info['videoVersions']: @@ -278,7 +339,15 @@ class VevoIE(InfoExtractor): smil_parsed = True self._sort_formats(formats) - title = video_info['title'] + track = video_info['title'] + if featured_artist: + artist = '%s ft. %s' % (artist, featured_artist) + title = '%s - %s' % (artist, track) if artist else track + + genres = video_info.get('genres') + genre = ( + genres[0] if genres and isinstance(genres, list) and + isinstance(genres[0], compat_str) else None) is_explicit = video_info.get('isExplicit') if is_explicit is True: @@ -300,4 +369,75 @@ class VevoIE(InfoExtractor): 'duration': duration, 'view_count': view_count, 'age_limit': age_limit, + 'track': track, + 'artist': uploader, + 'genre': genre, } + + +class VevoPlaylistIE(VevoBaseIE): + _VALID_URL = r'https?://www\.vevo\.com/watch/(?P<kind>playlist|genre)/(?P<id>[^/?#&]+)' + + _TESTS = [{ + 'url': 'http://www.vevo.com/watch/playlist/dadbf4e7-b99f-4184-9670-6f0e547b6a29', + 'info_dict': { + 'id': 'dadbf4e7-b99f-4184-9670-6f0e547b6a29', + 'title': 'Best-Of: Birdman', + }, + 'playlist_count': 10, + }, { + 'url': 'http://www.vevo.com/watch/genre/rock', + 'info_dict': { + 'id': 'rock', + 'title': 'Rock', + }, + 'playlist_count': 20, + }, { + 'url': 'http://www.vevo.com/watch/playlist/dadbf4e7-b99f-4184-9670-6f0e547b6a29?index=0', + 'md5': '32dcdfddddf9ec6917fc88ca26d36282', + 'info_dict': { + 'id': 'USCMV1100073', + 'ext': 'mp4', + 'title': 'Birdman - Y.U. MAD', + 'timestamp': 1323417600, + 'upload_date': '20111209', + 'uploader': 'Birdman', + 'track': 'Y.U. MAD', + 'artist': 'Birdman', + 'genre': 'Rap/Hip-Hop', + }, + 'expected_warnings': ['Unable to download SMIL file'], + }, { + 'url': 'http://www.vevo.com/watch/genre/rock?index=0', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + playlist_id = mobj.group('id') + playlist_kind = mobj.group('kind') + + webpage = self._download_webpage(url, playlist_id) + + qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) + index = qs.get('index', [None])[0] + + if index: + video_id = self._search_regex( + r'<meta[^>]+content=(["\'])vevo://video/(?P<id>.+?)\1[^>]*>', + webpage, 'video id', default=None, group='id') + if video_id: + return self.url_result('vevo:%s' % video_id, VevoIE.ie_key()) + + playlists = self._extract_json(webpage, playlist_id, '%ss' % playlist_kind) + + playlist = (list(playlists.values())[0] + if playlist_kind == 'playlist' else playlists[playlist_id]) + + entries = [ + self.url_result('vevo:%s' % src, VevoIE.ie_key()) + for src in playlist['isrcs']] + + return self.playlist_result( + entries, playlist.get('playlistId') or playlist_id, + playlist.get('name'), playlist.get('description')) diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index 67220f1b7..041d93629 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -26,12 +26,16 @@ class VKIE(InfoExtractor): _VALID_URL = r'''(?x) https?:// (?: - (?:m\.)?vk\.com/video_ext\.php\?.*?\boid=(?P<oid>-?\d+).*?\bid=(?P<id>\d+)| + (?: + (?:m\.)?vk\.com/video_| + (?:www\.)?daxab.com/ + ) + ext\.php\?(?P<embed_query>.*?\boid=(?P<oid>-?\d+).*?\bid=(?P<id>\d+).*)| (?: (?:m\.)?vk\.com/(?:.+?\?.*?z=)?video| - (?:www\.)?biqle\.ru/watch/ + (?:www\.)?daxab.com/embed/ ) - (?P<videoid>[^s].*?)(?:\?(?:.*\blist=(?P<list_id>[\da-f]+))?|%2F|$) + (?P<videoid>-?\d+_\d+)(?:.*\blist=(?P<list_id>[\da-f]+))? ) ''' _NETRC_MACHINE = 'vk' @@ -75,7 +79,8 @@ class VKIE(InfoExtractor): 'duration': 101, 'upload_date': '20120730', 'view_count': int, - } + }, + 'skip': 'This video has been removed from public access.', }, { # VIDEO NOW REMOVED @@ -142,7 +147,7 @@ class VKIE(InfoExtractor): 'id': 'V3K4mi0SYkc', 'ext': 'webm', 'title': "DSWD Awards 'Children's Joy Foundation, Inc.' Certificate of Registration and License to Operate", - 'description': 'md5:bf9c26cfa4acdfb146362682edd3827a', + 'description': 'md5:d9903938abdc74c738af77f527ca0596', 'duration': 178, 'upload_date': '20130116', 'uploader': "Children's Joy Foundation", @@ -173,11 +178,6 @@ class VKIE(InfoExtractor): 'url': 'https://vk.com/video205387401_164765225', 'only_matching': True, }, - { - # vk wrapper - 'url': 'http://www.biqle.ru/watch/847655_160197695', - 'only_matching': True, - }, { # pladform embed 'url': 'https://vk.com/video-76116461_171554880', @@ -217,20 +217,22 @@ class VKIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('videoid') - if not video_id: + info_url = url + if video_id: + info_url = 'https://vk.com/al_video.php?act=show&al=1&module=video&video=%s' % video_id + # Some videos (removed?) can only be downloaded with list id specified + list_id = mobj.group('list_id') + if list_id: + info_url += '&list=%s' % list_id + else: + info_url = 'http://vk.com/video_ext.php?' + mobj.group('embed_query') video_id = '%s_%s' % (mobj.group('oid'), mobj.group('id')) - info_url = 'https://vk.com/al_video.php?act=show&al=1&module=video&video=%s' % video_id - - # Some videos (removed?) can only be downloaded with list id specified - list_id = mobj.group('list_id') - if list_id: - info_url += '&list=%s' % list_id - info_page = self._download_webpage(info_url, video_id) error_message = self._html_search_regex( - r'(?s)<!><div[^>]+class="video_layer_message"[^>]*>(.+?)</div>', + [r'(?s)<!><div[^>]+class="video_layer_message"[^>]*>(.+?)</div>', + r'(?s)<div[^>]+id="video_ext_msg"[^>]*>(.+?)</div>'], info_page, 'error message', default=None) if error_message: raise ExtractorError(error_message, expected=True) @@ -305,17 +307,17 @@ class VKIE(InfoExtractor): view_count = None views = self._html_search_regex( r'"mv_views_count_number"[^>]*>(.+?\bviews?)<', - info_page, 'view count', fatal=False) + info_page, 'view count', default=None) if views: view_count = str_to_int(self._search_regex( r'([\d,.]+)', views, 'view count', fatal=False)) formats = [] for k, v in data.items(): - if not k.startswith('url') and k != 'extra_data' or not v: + if not k.startswith('url') and not k.startswith('cache') and k != 'extra_data' or not v: continue height = int_or_none(self._search_regex( - r'^url(\d+)', k, 'height', default=None)) + r'^(?:url|cache)(\d+)', k, 'height', default=None)) formats.append({ 'format_id': k, 'url': v, diff --git a/youtube_dl/extractor/xfileshare.py b/youtube_dl/extractor/xfileshare.py index 2d1504eaa..769003735 100644 --- a/youtube_dl/extractor/xfileshare.py +++ b/youtube_dl/extractor/xfileshare.py @@ -13,12 +13,21 @@ from ..utils import ( class XFileShareIE(InfoExtractor): - IE_DESC = 'XFileShare based sites: GorillaVid.in, daclips.in, movpod.in, fastvideo.in, realvid.net, filehoot.com and vidto.me' - _VALID_URL = r'''(?x) - https?://(?P<host>(?:www\.)? - (?:daclips\.in|gorillavid\.in|movpod\.in|fastvideo\.in|realvid\.net|filehoot\.com|vidto\.me|powerwatch\.pw))/ - (?:embed-)?(?P<id>[0-9a-zA-Z]+)(?:-[0-9]+x[0-9]+\.html)? - ''' + _SITES = ( + ('daclips.in', 'DaClips'), + ('filehoot.com', 'FileHoot'), + ('gorillavid.in', 'GorillaVid'), + ('movpod.in', 'MovPod'), + ('powerwatch.pw', 'PowerWatch'), + ('rapidvideo.ws', 'Rapidvideo.ws'), + ('thevideobee.to', 'TheVideoBee'), + ('vidto.me', 'Vidto'), + ('streamin.to', 'Streamin.To'), + ) + + IE_DESC = 'XFileShare based sites: %s' % ', '.join(list(zip(*_SITES))[1]) + _VALID_URL = (r'https?://(?P<host>(?:www\.)?(?:%s))/(?:embed-)?(?P<id>[0-9a-zA-Z]+)' + % '|'.join(re.escape(site) for site in list(zip(*_SITES))[0])) _FILE_NOT_FOUND_REGEX = r'>(?:404 - )?File Not Found<' @@ -43,25 +52,6 @@ class XFileShareIE(InfoExtractor): 'title': 'Micro Pig piglets ready on 16th July 2009-bG0PdrCdxUc', 'thumbnail': 're:http://.*\.jpg', } - }, { - # video with countdown timeout - 'url': 'http://fastvideo.in/1qmdn1lmsmbw', - 'md5': '8b87ec3f6564a3108a0e8e66594842ba', - 'info_dict': { - 'id': '1qmdn1lmsmbw', - 'ext': 'mp4', - 'title': 'Man of Steel - Trailer', - 'thumbnail': 're:http://.*\.jpg', - }, - }, { - 'url': 'http://realvid.net/ctn2y6p2eviw', - 'md5': 'b2166d2cf192efd6b6d764c18fd3710e', - 'info_dict': { - 'id': 'ctn2y6p2eviw', - 'ext': 'flv', - 'title': 'rdx 1955', - 'thumbnail': 're:http://.*\.jpg', - }, }, { 'url': 'http://movpod.in/0wguyyxi1yca', 'only_matching': True, diff --git a/youtube_dl/extractor/xiami.py b/youtube_dl/extractor/xiami.py index e4ed306b4..a6dfc4af9 100644 --- a/youtube_dl/extractor/xiami.py +++ b/youtube_dl/extractor/xiami.py @@ -9,6 +9,11 @@ from ..utils import int_or_none class XiamiBaseIE(InfoExtractor): _API_BASE_URL = 'http://www.xiami.com/song/playlist/cat/json/id' + def _download_webpage(self, *args, **kwargs): + webpage = super(XiamiBaseIE, self)._download_webpage(*args, **kwargs) + if '>Xiami is currently not available in your country.<' in webpage: + self.raise_geo_restricted('Xiami is currently not available in your country') + def _extract_track(self, track, track_id=None): title = track['title'] track_url = self._decrypt(track['location']) @@ -81,7 +86,8 @@ class XiamiSongIE(XiamiBaseIE): 'ext': 'lrc', }], }, - } + }, + 'skip': 'Georestricted', }, { 'url': 'http://www.xiami.com/song/1775256504', 'md5': '932a3abd45c6aa2b1fdbe028fcb4c4fc', @@ -100,7 +106,8 @@ class XiamiSongIE(XiamiBaseIE): 'ext': 'lrc', }], }, - } + }, + 'skip': 'Georestricted', }] def _real_extract(self, url): @@ -124,6 +131,7 @@ class XiamiAlbumIE(XiamiPlaylistBaseIE): 'id': '2100300444', }, 'playlist_count': 10, + 'skip': 'Georestricted', }, { 'url': 'http://www.xiami.com/album/512288?spm=a1z1s.6843761.1110925389.6.hhE9p9', 'only_matching': True, @@ -141,6 +149,7 @@ class XiamiArtistIE(XiamiPlaylistBaseIE): 'id': '2132', }, 'playlist_count': 20, + 'skip': 'Georestricted', } @@ -155,4 +164,5 @@ class XiamiCollectionIE(XiamiPlaylistBaseIE): 'id': '156527391', }, 'playlist_mincount': 29, + 'skip': 'Georestricted', } diff --git a/youtube_dl/extractor/yandexmusic.py b/youtube_dl/extractor/yandexmusic.py index ce3723b55..0f78466e6 100644 --- a/youtube_dl/extractor/yandexmusic.py +++ b/youtube_dl/extractor/yandexmusic.py @@ -10,8 +10,6 @@ from ..utils import ( ExtractorError, int_or_none, float_or_none, - sanitized_Request, - urlencode_postdata, ) @@ -177,7 +175,7 @@ class YandexMusicAlbumIE(YandexMusicPlaylistBaseIE): class YandexMusicPlaylistIE(YandexMusicPlaylistBaseIE): IE_NAME = 'yandexmusic:playlist' IE_DESC = 'Яндекс.Музыка - Плейлист' - _VALID_URL = r'https?://music\.yandex\.(?:ru|kz|ua|by)/users/[^/]+/playlists/(?P<id>\d+)' + _VALID_URL = r'https?://music\.yandex\.(?P<tld>ru|kz|ua|by)/users/(?P<user>[^/]+)/playlists/(?P<id>\d+)' _TESTS = [{ 'url': 'http://music.yandex.ru/users/music.partners/playlists/1245', @@ -196,47 +194,64 @@ class YandexMusicPlaylistIE(YandexMusicPlaylistBaseIE): 'id': '1036', 'title': 'Музыка 90-х', }, - 'playlist_count': 310, + 'playlist_mincount': 300, 'skip': 'Travis CI servers blocked by YandexMusic', }] def _real_extract(self, url): - playlist_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + tld = mobj.group('tld') + user = mobj.group('user') + playlist_id = mobj.group('id') - webpage = self._download_webpage(url, playlist_id) + playlist = self._download_json( + 'https://music.yandex.%s/handlers/playlist.jsx' % tld, + playlist_id, 'Downloading missing tracks JSON', + fatal=False, + headers={ + 'Referer': url, + 'X-Requested-With': 'XMLHttpRequest', + 'X-Retpath-Y': url, + }, + query={ + 'owner': user, + 'kinds': playlist_id, + 'light': 'true', + 'lang': tld, + 'external-domain': 'music.yandex.%s' % tld, + 'overembed': 'false', + })['playlist'] - mu = self._parse_json( - self._search_regex( - r'var\s+Mu\s*=\s*({.+?});\s*</script>', webpage, 'player'), - playlist_id) + tracks, track_ids = playlist['tracks'], map(compat_str, playlist['trackIds']) - playlist = mu['pageData']['playlist'] - tracks, track_ids = playlist['tracks'], playlist['trackIds'] - - # tracks dictionary shipped with webpage is limited to 150 tracks, + # tracks dictionary shipped with playlist.jsx API is limited to 150 tracks, # missing tracks should be retrieved manually. if len(tracks) < len(track_ids): - present_track_ids = set([compat_str(track['id']) for track in tracks if track.get('id')]) - missing_track_ids = set(map(compat_str, track_ids)) - set(present_track_ids) - request = sanitized_Request( - 'https://music.yandex.ru/handlers/track-entries.jsx', - urlencode_postdata({ - 'entries': ','.join(missing_track_ids), - 'lang': mu.get('settings', {}).get('lang', 'en'), - 'external-domain': 'music.yandex.ru', - 'overembed': 'false', - 'sign': mu.get('authData', {}).get('user', {}).get('sign'), - 'strict': 'true', - })) - request.add_header('Referer', url) - request.add_header('X-Requested-With', 'XMLHttpRequest') - + present_track_ids = set([ + compat_str(track['id']) + for track in tracks if track.get('id')]) + missing_track_ids = [ + track_id for track_id in track_ids + if track_id not in present_track_ids] missing_tracks = self._download_json( - request, playlist_id, 'Downloading missing tracks JSON', fatal=False) + 'https://music.yandex.%s/handlers/track-entries.jsx' % tld, + playlist_id, 'Downloading missing tracks JSON', + fatal=False, + headers={ + 'Referer': url, + 'X-Requested-With': 'XMLHttpRequest', + }, + query={ + 'entries': ','.join(missing_track_ids), + 'lang': tld, + 'external-domain': 'music.yandex.%s' % tld, + 'overembed': 'false', + 'strict': 'true', + }) if missing_tracks: tracks.extend(missing_tracks) return self.playlist_result( self._build_playlist(tracks), compat_str(playlist_id), - playlist['title'], playlist.get('description')) + playlist.get('title'), playlist.get('description')) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index b7c3cb63f..f3f102c30 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1326,9 +1326,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if video_description: video_description = re.sub(r'''(?x) <a\s+ - (?:[a-zA-Z-]+="[^"]+"\s+)*? + (?:[a-zA-Z-]+="[^"]*"\s+)*? (?:title|href)="([^"]+)"\s+ - (?:[a-zA-Z-]+="[^"]+"\s+)*? + (?:[a-zA-Z-]+="[^"]*"\s+)*? class="(?:yt-uix-redirect-link|yt-uix-sessionlink[^"]*)"[^>]*> [^<]+\.{3}\s* </a> diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py index 1793a878c..fa99b0c2a 100644 --- a/youtube_dl/postprocessor/ffmpeg.py +++ b/youtube_dl/postprocessor/ffmpeg.py @@ -389,23 +389,30 @@ class FFmpegEmbedSubtitlePP(FFmpegPostProcessor): class FFmpegMetadataPP(FFmpegPostProcessor): def run(self, info): metadata = {} - if info.get('title') is not None: - metadata['title'] = info['title'] - if info.get('upload_date') is not None: - metadata['date'] = info['upload_date'] - if info.get('artist') is not None: - metadata['artist'] = info['artist'] - elif info.get('uploader') is not None: - metadata['artist'] = info['uploader'] - elif info.get('uploader_id') is not None: - metadata['artist'] = info['uploader_id'] - if info.get('description') is not None: - metadata['description'] = info['description'] - metadata['comment'] = info['description'] - if info.get('webpage_url') is not None: - metadata['purl'] = info['webpage_url'] - if info.get('album') is not None: - metadata['album'] = info['album'] + + def add(meta_list, info_list=None): + if not info_list: + info_list = meta_list + if not isinstance(meta_list, (list, tuple)): + meta_list = (meta_list,) + if not isinstance(info_list, (list, tuple)): + info_list = (info_list,) + for info_f in info_list: + if info.get(info_f) is not None: + for meta_f in meta_list: + metadata[meta_f] = info[info_f] + break + + add('title', ('track', 'title')) + add('date', 'upload_date') + add(('description', 'comment'), 'description') + add('purl', 'webpage_url') + add('track', 'track_number') + add('artist', ('artist', 'creator', 'uploader', 'uploader_id')) + add('genre') + add('album') + add('album_artist') + add('disc', 'disc_number') if not metadata: self._downloader.to_screen('[ffmpeg] There isn\'t any metadata to add') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 7bcc85e2b..6e4573784 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -14,8 +14,8 @@ import email.utils import errno import functools import gzip -import itertools import io +import itertools import json import locale import math @@ -24,8 +24,8 @@ import os import pipes import platform import re -import ssl import socket +import ssl import struct import subprocess import sys @@ -89,6 +89,11 @@ KNOWN_EXTENSIONS = ( 'wav', 'f4f', 'f4m', 'm3u8', 'smil') +# needed for sanitizing filenames in restricted mode +ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ', + itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOUUUUYP', ['ss'], + 'aaaaaa', ['ae'], 'ceeeeiiiionoooooouuuuypy'))) + def preferredencoding(): """Get preferred encoding. @@ -251,9 +256,9 @@ def get_element_by_attribute(attribute, value, html): m = re.search(r'''(?xs) <([a-zA-Z0-9:._-]+) - (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*? + (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*? \s+%s=['"]?%s['"]? - (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*? + (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*? \s*> (?P<content>.*?) </\1> @@ -365,6 +370,8 @@ def sanitize_filename(s, restricted=False, is_id=False): Set is_id if this is not an arbitrary string, but an ID that should be kept if possible """ def replace_insane(char): + if restricted and char in ACCENT_CHARS: + return ACCENT_CHARS[char] if char == '?' or ord(char) < 32 or ord(char) == 127: return '' elif char == '"': diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 8befd9607..551160897 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2016.04.24' +__version__ = '2016.05.01'