Skip to content

Commit 7d83d51

Browse files
craig[bot] and srosenberg committed
Merge #142788
142788: roachtest: deflake `gossip/restart-node-one` r=herkolategan a=srosenberg When running in _local_ mode, e.g., as part of the acceptance suite during PR check, the rate of flakiness of `gossip/restart-node-one` has notably increased. In the majority of cases, it fails during post-test validations, namely the replica consistency check. It's surmised that a recent switch to "leader leases", coupled with other implementation details, may have resulted in the uptick (of flakes). The other details are that n1 is stripped of all replicas, owing to the specifics of the test, and that the replica consistency check always preferred n1. Consequently, n1 may be temporarily unavailable upon restart, causing a timeout. This change forces `WaitForReplication` upon restart of n1. Further, it shuffles node health statuses so that every node in the cluster is equally likely to be chosen for the replica consistency check. Epic: none Release note: None Co-authored-by: Stan Rosenberg <[email protected]>
2 parents 5c17046 + 46a31d1 commit 7d83d51

File tree

3 files changed

+9
-2
lines changed

3 files changed

+9
-2
lines changed

pkg/cmd/roachtest/roachtestutil/validation_check.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ func CheckReplicaDivergenceOnDB(ctx context.Context, l *logger.Logger, db *gosql
4444
// Speed up consistency checks. The test is done, so let's go full throttle.
4545
_, err := db.ExecContext(ctx, "SET CLUSTER SETTING server.consistency_check.max_rate = '1GB'")
4646
if err != nil {
47-
return err
47+
return errors.Wrap(err, "unable to set 'server.consistency_check.max_rate'")
4848
}
4949

5050
// NB: we set a statement_timeout since context cancellation won't work here.

pkg/cmd/roachtest/test_runner.go

+5
Original file line numberDiff line numberDiff line change
@@ -1524,6 +1524,11 @@ func (r *testRunner) postTestAssertions(
15241524
}
15251525

15261526
validationNode := 0
1527+
// Shuffle node statuses so that we don't always pick the same node for validation checks.
1528+
prng.Shuffle(len(statuses), func(i, j int) {
1529+
statuses[i], statuses[j] = statuses[j], statuses[i]
1530+
})
1531+
15271532
for _, s := range statuses {
15281533
if s.Err != nil {
15291534
t.L().Printf("n%d: %s error=%s", s.Node, s.URL, s.Err)

pkg/cmd/roachtest/tests/gossip.go

+3-1
Original file line numberDiff line numberDiff line change
@@ -538,7 +538,9 @@ SELECT count(replicas)
538538
// Stop our special snowflake process which won't be recognized by the test
539539
// harness, and start it again on the regular.
540540
c.Stop(ctx, t.L(), option.DefaultStopOpts(), c.Node(1))
541-
c.Start(ctx, t.L(), option.DefaultStartOpts(), install.MakeClusterSettings(), c.Node(1))
541+
// N.B. Since n1 was initially stripped of all the replicas, we must wait for full replication. Otherwise, the
542+
// replica consistency checks may time out.
543+
c.Start(ctx, t.L(), option.NewStartOpts(option.WaitForReplication()), install.MakeClusterSettings(), c.Node(1))
542544
}
543545

544546
func runCheckLocalityIPAddress(ctx context.Context, t test.Test, c cluster.Cluster) {

0 commit comments

Comments
 (0)