Skip to content

Commit 3ac5841

Browse files
authored
neonvm-controller: replace failing reconciliation with per-VM failure interval (#949)
The old method had frequent false positives, because there might be a lot of intermittent failures, but overall the system does progress, and every particular VM is getting reconciled.
1 parent c78e4bc commit 3ac5841

File tree

9 files changed

+343
-38
lines changed

9 files changed

+343
-38
lines changed

neonvm/config/controller/deployment.yaml

+2
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,8 @@ spec:
5959
# * cache.direct=on - use O_DIRECT (don't abuse host's page cache!)
6060
# * cache.no-flush=on - ignores disk flush operations (not needed; our disks are ephemeral)
6161
- "--qemu-disk-cache-settings=cache.writeback=on,cache.direct=on,cache.no-flush=on"
62+
- "--failure-pending-period=1m"
63+
- "--failing-refresh-interval=15s"
6264
env:
6365
- name: NAD_IPAM_NAME
6466
value: $(NAD_IPAM_NAME)

neonvm/controllers/config.go

+10
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
package controllers
22

3+
import "time"
4+
35
// ReconcilerConfig stores shared configuration for VirtualMachineReconciler and
46
// VirtualMachineMigrationReconciler.
57
type ReconcilerConfig struct {
@@ -22,6 +24,14 @@ type ReconcilerConfig struct {
2224
// This field is passed to neonvm-runner as the `-qemu-disk-cache-settings` arg, and is directly
2325
// used in setting up the VM disks via QEMU's `-drive` flag.
2426
QEMUDiskCacheSettings string
27+
28+
// FailurePendingPeriod is the period for the propagation of
29+
// reconciliation failures to the observability instruments
30+
FailurePendingPeriod time.Duration
31+
32+
// FailingRefreshInterval is the interval between consecutive
33+
// updates of metrics and logs, related to failing reconciliations
34+
FailingRefreshInterval time.Duration
2535
}
2636

2737
func (c *ReconcilerConfig) criEndpointSocketPath() string {
+108
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
package failurelag
2+
3+
import (
4+
"sync"
5+
"time"
6+
)
7+
8+
// Tracker accumulates failure events for a given key and determines if
9+
// the key is degraded. The key becomes degraded if it receives only failures
10+
// over a configurable pending period. Once the success event is received, the key
11+
// is no longer considered degraded, and the pending period is reset.
12+
type Tracker[T comparable] struct {
13+
period time.Duration
14+
15+
pendingSince map[T]time.Time
16+
degraded map[T]struct{}
17+
degradeAt []degradeAt[T]
18+
19+
lock sync.Mutex
20+
Now func() time.Time
21+
}
22+
23+
type degradeAt[T comparable] struct {
24+
ts time.Time
25+
key T
26+
}
27+
28+
func NewTracker[T comparable](period time.Duration) *Tracker[T] {
29+
return &Tracker[T]{
30+
period: period,
31+
pendingSince: make(map[T]time.Time),
32+
degraded: make(map[T]struct{}),
33+
degradeAt: []degradeAt[T]{},
34+
lock: sync.Mutex{},
35+
Now: time.Now,
36+
}
37+
}
38+
39+
// forward processes all the fireAt events that are now in the past.
40+
func (t *Tracker[T]) forward(now time.Time) {
41+
i := 0
42+
for ; i < len(t.degradeAt); i++ {
43+
event := t.degradeAt[i]
44+
if event.ts.After(now) {
45+
break
46+
}
47+
pendingSince, ok := t.pendingSince[event.key]
48+
if !ok {
49+
// There was a success event in between
50+
continue
51+
}
52+
53+
if event.ts.Sub(pendingSince) < t.period {
54+
// There was a success, and another failure in between
55+
// We will have another fireAt event for this key in the future
56+
continue
57+
}
58+
t.degraded[event.key] = struct{}{}
59+
}
60+
t.degradeAt = t.degradeAt[i:]
61+
}
62+
63+
func (t *Tracker[T]) RecordSuccess(key T) {
64+
t.lock.Lock()
65+
defer t.lock.Unlock()
66+
67+
delete(t.degraded, key)
68+
delete(t.pendingSince, key)
69+
t.forward(t.Now())
70+
}
71+
72+
func (t *Tracker[T]) RecordFailure(key T) {
73+
t.lock.Lock()
74+
defer t.lock.Unlock()
75+
76+
now := t.Now()
77+
78+
if _, ok := t.pendingSince[key]; !ok {
79+
t.pendingSince[key] = now
80+
}
81+
82+
t.degradeAt = append(t.degradeAt, degradeAt[T]{
83+
ts: now.Add(t.period),
84+
key: key,
85+
})
86+
87+
t.forward(now)
88+
}
89+
90+
func (t *Tracker[T]) DegradedCount() int {
91+
t.lock.Lock()
92+
defer t.lock.Unlock()
93+
94+
t.forward(t.Now())
95+
return len(t.degraded)
96+
}
97+
98+
func (t *Tracker[T]) Degraded() []T {
99+
t.lock.Lock()
100+
defer t.lock.Unlock()
101+
102+
t.forward(t.Now())
103+
keys := make([]T, 0, len(t.degraded))
104+
for k := range t.degraded {
105+
keys = append(keys, k)
106+
}
107+
return keys
108+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
package failurelag_test
2+
3+
import (
4+
"testing"
5+
"time"
6+
7+
"github.com/stretchr/testify/assert"
8+
9+
"github.com/neondatabase/autoscaling/neonvm/controllers/failurelag"
10+
)
11+
12+
type nowMock struct {
13+
ts time.Time
14+
}
15+
16+
func (n *nowMock) Now() time.Time {
17+
return n.ts
18+
}
19+
20+
func (n *nowMock) Add(d time.Duration) {
21+
n.ts = n.ts.Add(d)
22+
}
23+
24+
func newNowMock() *nowMock {
25+
ts, _ := time.Parse("2006-01-02", "2024-01-01")
26+
return &nowMock{ts: ts}
27+
}
28+
29+
func TestTracker(t *testing.T) {
30+
now := newNowMock()
31+
tracker := failurelag.NewTracker[string](10 * time.Minute)
32+
tracker.Now = now.Now
33+
34+
// Alert fires after 15 minutes
35+
tracker.RecordFailure("key1")
36+
assert.Equal(t, tracker.DegradedCount(), 0)
37+
now.Add(15 * time.Minute)
38+
assert.Equal(t, tracker.DegradedCount(), 1)
39+
40+
// Alert no longer fires
41+
tracker.RecordSuccess("key1")
42+
assert.Equal(t, tracker.DegradedCount(), 0)
43+
}
44+
45+
func TestFailureSuccess(t *testing.T) {
46+
now := newNowMock()
47+
tracker := failurelag.NewTracker[string](10 * time.Minute)
48+
tracker.Now = now.Now
49+
50+
// Alert doesn't fire if there was a success in the interval
51+
tracker.RecordFailure("key1")
52+
53+
now.Add(5 * time.Minute)
54+
tracker.RecordSuccess("key1")
55+
56+
now.Add(10 * time.Minute)
57+
assert.Equal(t, tracker.DegradedCount(), 0)
58+
}
59+
60+
func TestFailureSuccessFailure(t *testing.T) {
61+
now := newNowMock()
62+
tracker := failurelag.NewTracker[string](10 * time.Minute)
63+
tracker.Now = now.Now
64+
65+
// Alert doesn't fire if there was success + failure in the interval
66+
tracker.RecordFailure("key1")
67+
68+
now.Add(5 * time.Minute)
69+
tracker.RecordSuccess("key1")
70+
71+
now.Add(1 * time.Minute)
72+
tracker.RecordFailure("key1")
73+
74+
now.Add(5 * time.Minute)
75+
assert.Equal(t, tracker.DegradedCount(), 0)
76+
77+
// But after 7 more minutes it does
78+
now.Add(7 * time.Minute)
79+
assert.Equal(t, tracker.DegradedCount(), 1)
80+
}
81+
82+
func TestMultipleKeys(t *testing.T) {
83+
now := newNowMock()
84+
tracker := failurelag.NewTracker[string](10 * time.Minute)
85+
tracker.Now = now.Now
86+
87+
// A combination of TestFailureSuccess and TestFailureSuccessFailure
88+
tracker.RecordFailure("key1")
89+
tracker.RecordFailure("key2")
90+
91+
now.Add(5 * time.Minute)
92+
tracker.RecordSuccess("key1")
93+
tracker.RecordSuccess("key2")
94+
95+
now.Add(1 * time.Minute)
96+
tracker.RecordFailure("key1")
97+
98+
now.Add(5 * time.Minute)
99+
assert.Equal(t, tracker.DegradedCount(), 0)
100+
101+
now.Add(7 * time.Minute)
102+
assert.Equal(t, tracker.DegradedCount(), 1)
103+
assert.Equal(t, tracker.Degraded(), []string{"key1"})
104+
105+
tracker.RecordFailure("key2")
106+
now.Add(15 * time.Minute)
107+
assert.Equal(t, tracker.DegradedCount(), 2)
108+
assert.Contains(t, tracker.Degraded(), "key1")
109+
assert.Contains(t, tracker.Degraded(), "key2")
110+
}

0 commit comments

Comments
 (0)