Anisse Astier e235270df4 [travis] Add new regression detection script as default test suite
It runs tests and parses nosetests output to detect failures and test
them for regressions against a reference version. If it finds a
regression, it is automatically bisected.

Unstable or flaky tests are detected and ignored automatically by
running them multiple times in a row.

We keep the original test suite around, but mark it as allowed to fail.
It serves as a dashboard of current test statuses, but since the tests
can fail for reasons outside our control (this is the essence of this
project), we don't want it to be blocking.

Using the regression detection as the failure criterion means that any
failing build needs to be examined. Even if the next build is "fixed", it
does not mean that the regression has been fixed. This is a change in
semantics when analyzing build history.

We map/reduce by splitting the test suite into 7 parts by abusing travis'
matrix feature. Reduce is done by hand, by analyzing the travis
dashboard, any failing test being critical.

Because nosetests --processes just doesn't work with the youtube-dl test
suite (yet), we route around it by first enumerating tests. This is a
bit long because nosetests needs to find and run all test files in
order to enumerate them, but it should be at most 30 seconds, while the
test suite can take more than 2 hours on travis' infrastructure.

By doing this, we ensure we'll be able to run the tests faster, since
they are mostly I/O (network) bound due to the nature of the project.

Closes #8496
2016-09-27 14:00:23 +02:00

239 lines
7.9 KiB
Python
Executable File

#!/usr/bin/env python3
from __future__ import print_function, unicode_literals
import subprocess
import sys
import os
import time
# Name of the nose executable: launch_nose() runs it directly and bisect()
# hands it to "git bisect run".
NOSECOMMAND="nosetests"
def process(test):
    """Parse one nose result entry into a (test handle, status) pair.

    Expected format of *test* (the error message may span several lines):
        test_<test_name> (<test_path>) ... [multi-line-error-message?] {ok,ERROR,FAIL}

    For example this nose output:
        test_opengraph (test.test_InfoExtractor.TestInfoExtractor) ... ok
    is converted into this test handle:
        test.test_InfoExtractor:TestInfoExtractor.test_opengraph

    Returns a tuple (fulltestname, status) where status is one of "ok",
    "WARNING", "FAIL" or "ERROR". Returns (None, None), after printing a
    diagnostic, when the entry is empty or cannot be parsed.
    """
    if not test:
        print("Received empty line")
        return (None, None)
    s = test.split()
    if len(s) < 4:
        print("Bad line passed, not enough elements:", test)
        # Stop here: indexing s[1] below would raise IndexError on a
        # truncated line instead of reporting it as unparsable.
        return (None, None)
    # s[1] is "(<module path>.<class>)": strip the parentheses and split it.
    testpath = s[1][1:-1].split('.')
    if len(testpath) != 3:
        print("Bad testpath passed, not enough elements:", testpath)
        return (None, None)
    fulltestname = "%s.%s:%s.%s" % (testpath[0], testpath[1], testpath[2], s[0])
    status = s[-1]
    if status not in ("ok", "FAIL", "ERROR"):
        print("Unknown test status", status)
        return (None, None)
    # we cannot assume that a test failing with a warning is ok (network error)
    if status == "ok" and "WARNING" in test:
        status = "WARNING"
    return (fulltestname, status)
def fill_results(res, results):
    """Store a parsed (test handle, status) pair in the *results* dict.

    Pairs in which either element is None (the "unparsable" marker returned
    by process()) are ignored. An existing entry for the same handle is
    overwritten.
    """
    if res[0] is None or res[1] is None:
        return
    results[res[0]] = res[1]
def process_stream(f, verbose):
    """Parse verbose nose output read from the iterable of lines *f*.

    Lines starting with "test_" open a new entry; any other line is appended
    to the entry currently being collected (some tests have multi-line
    output). Parsing stops at the first separator line made of "=" or "-"
    characters. When *verbose* is true every line read is echoed, including
    whatever follows the separator (it might contain tracebacks).

    Returns a dict mapping test handles to statuses, as produced by
    process().
    """
    results = {}
    pending = None  # text of the entry being accumulated, None before the first test
    for line in f:
        if verbose:
            print(line, end='')
        if line.startswith(("===========", "--------------")):
            # this is the end of the per-test section
            break
        if not line.startswith("test_"):
            # continuation of the current entry; ignored before the first test
            if pending:
                pending += line
            continue
        # a new test begins: the previous entry (if any) is now complete
        if pending is not None:
            fill_results(process(pending), results)
        pending = line
    if verbose:
        # print the end of the stream as well
        for trailing in f:
            print(trailing, end='')
    # process the last collected entry
    fill_results(process(pending), results)
    return results
def launch_nose(args=None, verbose=True):
    """Run nose in verbose mode and return the parsed results.

    args:    optional sequence of extra command-line arguments (typically
             test handles) appended after "-v"; None or empty adds nothing.
    verbose: forwarded to process_stream(), which echoes nose's output
             when true.

    Returns the dict built by process_stream() from nose's stderr.
    """
    command = [NOSECOMMAND, "-v"]
    if args:
        command.extend(args)
    nose = subprocess.Popen(command, stderr=subprocess.PIPE, universal_newlines=True)
    try:
        return process_stream(nose.stderr, verbose)
    finally:
        # Always release the pipe and reap the child, even if parsing raised.
        nose.stderr.close()
        nose.wait()
def filter_bad(results):
    """Return the list of test handles in *results* whose status is not "ok"."""
    return [name for name in results if results[name] != "ok"]
def test_stability(refcommit, testcommit, failed_tests):
    """Re-run *failed_tests* at *refcommit* and keep those that pass there.

    Checks out *refcommit*, runs the given tests, then checks *testcommit*
    out again. A test that is not "ok" at the reference commit is reported
    as unreliable and dropped.

    Returns the list of tests that were "ok" at *refcommit*.
    """
    git_checkout(refcommit)
    print("Testing at commit " + refcommit)
    reference = launch_nose(failed_tests)
    stable_tests = []
    for name, status in reference.items():
        if status == "ok":
            stable_tests.append(name)
            continue
        print("Test %s is unreliable !"%(name))
    git_checkout(testcommit)
    print("Back to commit " + testcommit)
    return stable_tests
def iterate_tests(refcommit, testcommit, testlist=None, iterations=9, cooldown=60):
    """Repeatedly re-run failing tests to weed out temporary failures.

    Runs the tests in *testlist* (None or empty means run all tests), then
    keeps re-running only the non-ok ones, up to *iterations* times, to make
    sure a failure wasn't temporary (bad connection, site error, ...). From
    the fifth run on, once fewer than five tests remain, they are first
    checked against *refcommit* with test_stability() and unreliable ones
    are dropped.

    refcommit, testcommit: commits passed to test_stability().
    iterations: maximum number of runs; with 0 or less nothing is run.
    cooldown:   seconds to sleep between two consecutive runs.

    Returns the results dict of the last run, or {} when nothing was run or
    no stable test remained. This is a partial result list; it does not
    matter since ok-tests aren't that interesting.
    """
    failed_tests = testlist if testlist is not None else []
    results = {}  # defined up front so that iterations <= 0 returns {} instead of crashing
    for i in range(iterations):
        if i > 3 and len(failed_tests) < 5:
            # We have reduced the number of tests, we now test them for stability
            print("We only have %d tests at iteration %d, testing for reliablity"%(len(failed_tests), i))
            failed_tests = test_stability(refcommit, testcommit, failed_tests)
            if not failed_tests:  # no more stable tests
                return {}
            time.sleep(cooldown)
        results = launch_nose(failed_tests)
        failed_tests = filter_bad(results)
        print("Run %d done. Has %d out of %d non-ok tests"%(i, len(failed_tests), len(results)))
        if not failed_tests:  # no failure. Awesome !
            break
        if i + 1 < iterations:
            # Only cool down when another run follows.
            time.sleep(cooldown)
    return results
def git_checkout(arg):
    """Quietly check out the commit-ish *arg* with git.

    Raises RuntimeError when git exits with a non-zero status.
    """
    command = ["git", "checkout", "--quiet", arg]
    exit_status = subprocess.call(command)
    if exit_status == 0:
        return
    raise RuntimeError("git checkout failed")
def regressive_tests(refresults, testresults):
    """Return the tests that are "ok" in *refresults* but not in *testresults*.

    Every test of *refresults* must also be present in *testresults*; this
    is checked with an assertion. Any non-ok status counts as a regression
    (FAIL and ERROR are treated alike).
    """
    for name in refresults:
        assert name in testresults, "New unknown test case"
    return [
        name for name in refresults
        if refresults[name] == "ok" and testresults[name] != "ok"
    ]
def list_nose_tests():
    """Return the sorted list of test handles reported by nose's --collect-only mode."""
    collected = launch_nose(["--collect-only"], verbose=False)
    names = list(collected.keys())
    names.sort()
    return names
def sub_tests():
    """Return the slice of the test list selected by the TESTS variable.

    See if we need to slice the work and do only one part. The second
    "_"-separated field of the TESTS environment variable is read as
    "<k>-of-<n>"; the enumerated test list is cut into n consecutive parts
    and the k-th one (counting from 1) is returned.

    Returns None when TESTS is not set.
    """
    spec = os.getenv("TESTS")
    if spec is None:
        return None
    bounds = spec.split('_')[1].split('-of-')
    current_slice = int(bounds[0])
    nr_slices = int(bounds[1])
    every_test = list_nose_tests()
    total = len(every_test)
    start = (current_slice - 1) * total // nr_slices
    stop = current_slice * total // nr_slices
    chunk = every_test[start:stop]
    print("Running slice %d of %d; it has %d out of %d tests"%(current_slice,
          nr_slices, len(chunk), total))
    return chunk
def bisect(good, bad, test):
    """Bisect the history between *good* and *bad* using the nose test *test*.

    Starts a git bisect session, lets "git bisect run" drive nose on the
    given test, and resets the session afterwards, also when the run step
    fails, so the repository is not left in bisect state.

    Raises RuntimeError when any git bisect step exits with a non-zero
    status.
    """
    def git_bisect(args):
        # Run one "git bisect" sub-command and fail loudly on error.
        ret = subprocess.call(["git", "bisect"] + args)
        if ret != 0:
            raise RuntimeError("git bisect failed with " + " ".join(args))

    print("Bisecting %s between %s and %s"%(test, good, bad))
    git_bisect(["start", bad, good])
    try:
        git_bisect(["run", NOSECOMMAND, "--verbose", "--detailed-errors", test])
    finally:
        # Leave the bisect session no matter how the run ended.
        git_bisect(["reset"])
    print("Bisect done")
def main():
if len(sys.argv) < 3:
commit_range = os.getenv("TRAVIS_COMMIT_RANGE")
if commit_range != None:
commits = commit_range.split("...")
refcommit, testcommit = commits[0], commits[1]
else:
testcommit="master"
refcommit="master^"
else:
testcommit=sys.argv[1]
refcommit=sys.argv[2]
print("Testing if commit-ish %s introduced regressions compared to %s"%(testcommit, refcommit))
git_checkout(testcommit)
sub_test_list = sub_tests()
if sub_test_list != None:
args = sub_test_list
else:
args = sys.argv[3:] # use remaining args to limit test selection (if there are any)
results = launch_nose(args)
failed_tests = filter_bad(results)
if len(failed_tests) == 0:
print("No failure, exiting")
sys.exit(0)
print("%d tests are failing at %s, now testing if they are regression from %s" %
(len(failed_tests), testcommit, refcommit))
git_checkout(refcommit)
results_ref = launch_nose(failed_tests)
print("Second run of %d tests done."%len(failed_tests))
regressive = regressive_tests(results_ref, results)
git_checkout(testcommit)
if len(regressive) == 0:
print("There was no detected regression")
sys.exit(0)
print("%d test(s) have a potential regression. Retrying them a few times to be sure"%len(regressive))
results_retry = iterate_tests(refcommit, testcommit, regressive)
failed_retry = filter_bad(results_retry)
if len(failed_retry) == 0:
print("All false alarms, exiting")
sys.exit(0)
print("We have %d regressions"%len(failed_retry))
for k in failed_retry:
print("Test %s was %s in %s, is now %s at %s"%(k, results_ref[k],
refcommit, results_retry[k], testcommit))
bisect(refcommit, testcommit, k)
git_checkout(testcommit)
sys.exit(-1)
# Run the regression detection only when executed as a script, not on import.
if __name__ == "__main__":
    main()