Skip to content

Commit 44b5e31

Browse files
committed
Handle interrupted helm releases in applier
Introduces a workaround for 'interrupted' helm releases which enter into a 'pending' (-install/uninstall/rollback) state. If that happens, for example because of immediate application exit with one of those operations being in flight, helm is not able to resolve it automatically which means we end up in a permanent reconcile error state. One of the workarounds for this that has been repeated in the community is to remove metadata of the pending release, which is the solution chosen here. For full context see: #1776 helm/helm#5595
1 parent 9b888eb commit 44b5e31

File tree

7 files changed

+163
-27
lines changed

7 files changed

+163
-27
lines changed

cmd/operator-controller/main.go

+4-3
Original file line numberDiff line numberDiff line change
@@ -407,9 +407,10 @@ func run() error {
407407
crdupgradesafety.NewPreflight(aeClient.CustomResourceDefinitions()),
408408
}
409409

410-
helmApplier := &applier.Helm{
411-
ActionClientGetter: acg,
412-
Preflights: preflights,
410+
helmApplier, err := applier.NewHelm(acg, preflights, cfg.systemNamespace)
411+
if err != nil {
412+
setupLog.Error(err, "unable to create helm applier")
413+
return err
413414
}
414415

415416
cm := contentmanager.NewManager(clientRestConfigMapper, mgr.GetConfig(), mgr.GetRESTMapper())

go.mod

+1-1
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ require (
1818
github.com/opencontainers/go-digest v1.0.0
1919
github.com/opencontainers/image-spec v1.1.0
2020
github.com/operator-framework/api v0.29.0
21-
github.com/operator-framework/helm-operator-plugins v0.8.0
21+
github.com/operator-framework/helm-operator-plugins v0.8.1-0.20250222123620-6261f2574b3b
2222
github.com/operator-framework/operator-registry v1.50.0
2323
github.com/prometheus/client_golang v1.21.0
2424
github.com/sirupsen/logrus v1.9.3

go.sum

+2-2
Original file line numberDiff line numberDiff line change
@@ -536,8 +536,8 @@ github.com/openshift/crd-schema-checker v0.0.0-20240404194209-35a9033b1d11 h1:eT
536536
github.com/openshift/crd-schema-checker v0.0.0-20240404194209-35a9033b1d11/go.mod h1:EmVJt97N+pfWFsli/ipXTBZqSG5F5KGQhm3c3IsGq1o=
537537
github.com/operator-framework/api v0.29.0 h1:TxAR8RCO+I4FjRrY4PSMgnlmbxNWeD8pzHXp7xwHNmw=
538538
github.com/operator-framework/api v0.29.0/go.mod h1:0whQE4mpMDd2zyHkQe+bFa3DLoRs6oGWCbu8dY/3pyc=
539-
github.com/operator-framework/helm-operator-plugins v0.8.0 h1:0f6HOQC5likkf0b/OvGvw7nhDb6h8Cj5twdCNjwNzMc=
540-
github.com/operator-framework/helm-operator-plugins v0.8.0/go.mod h1:Sc+8bE38xTCgCChBUvtq/PxatEg9fAypr7S5iAw8nlA=
539+
github.com/operator-framework/helm-operator-plugins v0.8.1-0.20250222123620-6261f2574b3b h1:andoOL7lpEafTXdjPGDuNLJlJQh0UmRzgj0+D31K8HU=
540+
github.com/operator-framework/helm-operator-plugins v0.8.1-0.20250222123620-6261f2574b3b/go.mod h1:qioyxwOSI89RAtTWhccc+RyaPQdKTTJRc6sFkT6kqys=
541541
github.com/operator-framework/operator-lib v0.17.0 h1:cbz51wZ9+GpWR1ZYP4CSKSSBxDlWxmmnseaHVZZjZt4=
542542
github.com/operator-framework/operator-lib v0.17.0/go.mod h1:TGopBxIE8L6E/Cojzo26R3NFp1eNlqhQNmzqhOblaLw=
543543
github.com/operator-framework/operator-registry v1.50.0 h1:kMAwsKAEDjuSx5dGchMX+CD3SMHWwOAC/xyK3LQweB4=

internal/operator-controller/action/helm_test.go

+6
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import (
99
"github.com/stretchr/testify/assert"
1010
"github.com/stretchr/testify/mock"
1111
"github.com/stretchr/testify/require"
12+
"helm.sh/helm/v3/pkg/action"
1213
"helm.sh/helm/v3/pkg/chart"
1314
"helm.sh/helm/v3/pkg/release"
1415
"sigs.k8s.io/controller-runtime/pkg/client"
@@ -70,6 +71,11 @@ func (m *mockActionClient) Reconcile(rel *release.Release) error {
7071
return args.Error(0)
7172
}
7273

74+
func (m *mockActionClient) Config() *action.Configuration {
75+
args := m.Called()
76+
return args.Get(0).(*action.Configuration)
77+
}
78+
7379
var _ actionclient.ActionClientGetter = &mockActionClientGetter{}
7480

7581
type mockActionClientGetter struct {

internal/operator-controller/applier/helm.go

+41-6
Original file line numberDiff line numberDiff line change
@@ -54,8 +54,21 @@ type Preflight interface {
5454
}
5555

5656
type Helm struct {
57-
ActionClientGetter helmclient.ActionClientGetter
58-
Preflights []Preflight
57+
actionClientGetter helmclient.ActionClientGetter
58+
preflights []Preflight
59+
systemNamespace string
60+
}
61+
62+
func NewHelm(acg helmclient.ActionClientGetter, preflights []Preflight, systemNamespace string) (*Helm, error) {
63+
if acg == nil {
64+
return nil, fmt.Errorf("action client getter is nil")
65+
}
66+
67+
return &Helm{
68+
actionClientGetter: acg,
69+
preflights: preflights,
70+
systemNamespace: systemNamespace,
71+
}, nil
5972
}
6073

6174
// shouldSkipPreflight is a helper to determine if the preflight check is CRDUpgradeSafety AND
@@ -85,7 +98,7 @@ func (h *Helm) Apply(ctx context.Context, contentFS fs.FS, ext *ocv1.ClusterExte
8598
}
8699
values := chartutil.Values{}
87100

88-
ac, err := h.ActionClientGetter.ActionClientFor(ctx, ext)
101+
ac, err := h.actionClientGetter.ActionClientFor(ctx, ext)
89102
if err != nil {
90103
return nil, "", err
91104
}
@@ -94,12 +107,12 @@ func (h *Helm) Apply(ctx context.Context, contentFS fs.FS, ext *ocv1.ClusterExte
94107
labels: objectLabels,
95108
}
96109

97-
rel, desiredRel, state, err := h.getReleaseState(ac, ext, chrt, values, post)
110+
rel, desiredRel, state, err := h.getReleaseState(ctx, ac, ext, chrt, values, post)
98111
if err != nil {
99112
return nil, "", err
100113
}
101114

102-
for _, preflight := range h.Preflights {
115+
for _, preflight := range h.preflights {
103116
if shouldSkipPreflight(ctx, preflight, ext, state) {
104117
continue
105118
}
@@ -152,9 +165,30 @@ func (h *Helm) Apply(ctx context.Context, contentFS fs.FS, ext *ocv1.ClusterExte
152165
return relObjects, state, nil
153166
}
154167

155-
func (h *Helm) getReleaseState(cl helmclient.ActionInterface, ext *ocv1.ClusterExtension, chrt *chart.Chart, values chartutil.Values, post postrender.PostRenderer) (*release.Release, *release.Release, string, error) {
168+
func (h *Helm) getReleaseState(ctx context.Context, cl helmclient.ActionInterface, ext *ocv1.ClusterExtension, chrt *chart.Chart, values chartutil.Values, post postrender.PostRenderer) (*release.Release, *release.Release, string, error) {
169+
logger := log.FromContext(ctx)
156170
currentRelease, err := cl.Get(ext.GetName())
171+
172+
// if a release is pending at this point, that means that a helm action
173+
// (installation/upgrade) we were attempting was likely interrupted in-flight.
174+
// Pending release would leave us in reconciliation error loop because helm
175+
// wouldn't be able to progress automatically from it.
176+
//
177+
// one of the workarounds is to try and remove helm metadata relating to
178+
// that pending release which should 'reset' its state communicated to helm
179+
// and the next reconciliation should be able to successfully pick up from here
180+
// for context see: https://github.com/helm/helm/issues/5595 and https://github.com/helm/helm/issues/7476
181+
// and the discussion in https://github.com/operator-framework/operator-controller/pull/1776
182+
if err == nil && currentRelease.Info.Status.IsPending() {
183+
if _, err = cl.Config().Releases.Delete(currentRelease.Name, currentRelease.Version); err != nil {
184+
return nil, nil, StateError, fmt.Errorf("failed removing interrupted release %q version %d metadata: %w", currentRelease.Name, currentRelease.Version, err)
185+
}
186+
// return error to try to detect proper state (installation/upgrade) at next reconciliation
187+
return nil, nil, StateError, fmt.Errorf("removed interrupted release %q version %d metadata", currentRelease.Name, currentRelease.Version)
188+
}
189+
157190
if errors.Is(err, driver.ErrReleaseNotFound) {
191+
logger.V(4).Info("ClusterExtension dry-run install", "extension", ext.GetName())
158192
desiredRelease, err := cl.Install(ext.GetName(), ext.Spec.Namespace, chrt, values, func(i *action.Install) error {
159193
i.DryRun = true
160194
i.DryRunOption = "server"
@@ -174,6 +208,7 @@ func (h *Helm) getReleaseState(cl helmclient.ActionInterface, ext *ocv1.ClusterE
174208
}
175209

176210
desiredRelease, err := cl.Upgrade(ext.GetName(), ext.Spec.Namespace, chrt, values, func(upgrade *action.Upgrade) error {
211+
logger.V(4).Info("ClusterExtension dry-run upgrade", "extension", ext.GetName())
177212
upgrade.MaxHistory = maxHelmReleaseHistory
178213
upgrade.DryRun = true
179214
upgrade.DryRunOption = "server"

internal/operator-controller/applier/helm_test.go

+104-15
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ import (
1212
"helm.sh/helm/v3/pkg/action"
1313
"helm.sh/helm/v3/pkg/chart"
1414
"helm.sh/helm/v3/pkg/release"
15+
"helm.sh/helm/v3/pkg/storage"
1516
"helm.sh/helm/v3/pkg/storage/driver"
1617
featuregatetesting "k8s.io/component-base/featuregate/testing"
1718
"sigs.k8s.io/controller-runtime/pkg/client"
@@ -46,6 +47,7 @@ type mockActionGetter struct {
4647
reconcileErr error
4748
desiredRel *release.Release
4849
currentRel *release.Release
50+
config *action.Configuration
4951
}
5052

5153
func (mag *mockActionGetter) ActionClientFor(ctx context.Context, obj client.Object) (helmclient.ActionInterface, error) {
@@ -94,6 +96,12 @@ func (mag *mockActionGetter) Reconcile(rel *release.Release) error {
9496
return mag.reconcileErr
9597
}
9698

99+
func (mag *mockActionGetter) Config() *action.Configuration {
100+
// TODO
101+
// storage.Init(driver.NewMemory()).
102+
return mag.config
103+
}
104+
97105
var (
98106
// required for unmockable call to convert.RegistryV1ToHelmChart
99107
validFS = fstest.MapFS{
@@ -144,7 +152,8 @@ func TestApply_Base(t *testing.T) {
144152

145153
t.Run("fails trying to obtain an action client", func(t *testing.T) {
146154
mockAcg := &mockActionGetter{actionClientForErr: errors.New("failed getting action client")}
147-
helmApplier := applier.Helm{ActionClientGetter: mockAcg}
155+
helmApplier, err := applier.NewHelm(mockAcg, nil, "")
156+
require.NoError(t, err)
148157

149158
objs, state, err := helmApplier.Apply(context.TODO(), validFS, testCE, testObjectLabels, testStorageLabels)
150159
require.Error(t, err)
@@ -155,7 +164,8 @@ func TestApply_Base(t *testing.T) {
155164

156165
t.Run("fails getting current release and !driver.ErrReleaseNotFound", func(t *testing.T) {
157166
mockAcg := &mockActionGetter{getClientErr: errors.New("failed getting current release")}
158-
helmApplier := applier.Helm{ActionClientGetter: mockAcg}
167+
helmApplier, err := applier.NewHelm(mockAcg, nil, "")
168+
require.NoError(t, err)
159169

160170
objs, state, err := helmApplier.Apply(context.TODO(), validFS, testCE, testObjectLabels, testStorageLabels)
161171
require.Error(t, err)
@@ -165,13 +175,80 @@ func TestApply_Base(t *testing.T) {
165175
})
166176
}
167177

178+
func TestApply_InterruptedRelease(t *testing.T) {
179+
t.Run("fails removing an interrupted install release", func(t *testing.T) {
180+
testRel := &release.Release{Name: "testrel", Version: 0, Info: &release.Info{Status: release.StatusPendingInstall}}
181+
testStorage := storage.Init(driver.NewMemory())
182+
183+
mockAcg := &mockActionGetter{currentRel: testRel, config: &action.Configuration{Releases: testStorage}}
184+
helmApplier, err := applier.NewHelm(mockAcg, nil, "")
185+
require.NoError(t, err)
186+
187+
objs, state, err := helmApplier.Apply(context.TODO(), validFS, testCE, testObjectLabels, testStorageLabels)
188+
require.Error(t, err)
189+
require.ErrorContains(t, err, "removing interrupted release")
190+
require.Nil(t, objs)
191+
require.Empty(t, state)
192+
})
193+
194+
t.Run("fails removing an interrupted upgrade release", func(t *testing.T) {
195+
testRel := &release.Release{Name: "testrel", Version: 0, Info: &release.Info{Status: release.StatusPendingUpgrade}}
196+
testStorage := storage.Init(driver.NewMemory())
197+
198+
mockAcg := &mockActionGetter{currentRel: testRel, config: &action.Configuration{Releases: testStorage}}
199+
helmApplier, err := applier.NewHelm(mockAcg, nil, "")
200+
require.NoError(t, err)
201+
202+
objs, state, err := helmApplier.Apply(context.TODO(), validFS, testCE, testObjectLabels, testStorageLabels)
203+
require.Error(t, err)
204+
require.ErrorContains(t, err, "removing interrupted release")
205+
require.Nil(t, objs)
206+
require.Empty(t, state)
207+
})
208+
209+
t.Run("successfully removes an interrupted install release", func(t *testing.T) {
210+
testRel := &release.Release{Name: "testrel", Version: 0, Info: &release.Info{Status: release.StatusPendingInstall}}
211+
testStorage := storage.Init(driver.NewMemory())
212+
err := testStorage.Create(testRel)
213+
require.NoError(t, err)
214+
215+
mockAcg := &mockActionGetter{currentRel: testRel, config: &action.Configuration{Releases: testStorage}}
216+
helmApplier, err := applier.NewHelm(mockAcg, nil, "")
217+
require.NoError(t, err)
218+
219+
objs, state, err := helmApplier.Apply(context.TODO(), validFS, testCE, testObjectLabels, testStorageLabels)
220+
require.Error(t, err)
221+
require.ErrorContains(t, err, "removed interrupted release")
222+
require.Nil(t, objs)
223+
require.Empty(t, state)
224+
})
225+
226+
t.Run("successfully removes an interrupted upgrade release", func(t *testing.T) {
227+
testRel := &release.Release{Name: "testrel", Version: 0, Info: &release.Info{Status: release.StatusPendingUpgrade}}
228+
testStorage := storage.Init(driver.NewMemory())
229+
err := testStorage.Create(testRel)
230+
require.NoError(t, err)
231+
232+
mockAcg := &mockActionGetter{currentRel: testRel, config: &action.Configuration{Releases: testStorage}}
233+
helmApplier, err := applier.NewHelm(mockAcg, nil, "")
234+
require.NoError(t, err)
235+
236+
objs, state, err := helmApplier.Apply(context.TODO(), validFS, testCE, testObjectLabels, testStorageLabels)
237+
require.Error(t, err)
238+
require.ErrorContains(t, err, "removed interrupted release")
239+
require.Nil(t, objs)
240+
require.Empty(t, state)
241+
})
242+
}
243+
168244
func TestApply_Installation(t *testing.T) {
169245
t.Run("fails during dry-run installation", func(t *testing.T) {
170246
mockAcg := &mockActionGetter{
171247
getClientErr: driver.ErrReleaseNotFound,
172248
dryRunInstallErr: errors.New("failed attempting to dry-run install chart"),
173249
}
174-
helmApplier := applier.Helm{ActionClientGetter: mockAcg}
250+
helmApplier, err := applier.NewHelm(mockAcg, nil, "")
251+
require.NoError(t, err)
175252

176253
objs, state, err := helmApplier.Apply(context.TODO(), validFS, testCE, testObjectLabels, testStorageLabels)
177254
require.Error(t, err)
@@ -186,7 +263,8 @@ func TestApply_Installation(t *testing.T) {
186263
installErr: errors.New("failed installing chart"),
187264
}
188265
mockPf := &mockPreflight{installErr: errors.New("failed during install pre-flight check")}
189-
helmApplier := applier.Helm{ActionClientGetter: mockAcg, Preflights: []applier.Preflight{mockPf}}
266+
helmApplier, err := applier.NewHelm(mockAcg, []applier.Preflight{mockPf}, "")
267+
require.NoError(t, err)
190268

191269
objs, state, err := helmApplier.Apply(context.TODO(), validFS, testCE, testObjectLabels, testStorageLabels)
192270
require.Error(t, err)
@@ -200,7 +278,8 @@ func TestApply_Installation(t *testing.T) {
200278
getClientErr: driver.ErrReleaseNotFound,
201279
installErr: errors.New("failed installing chart"),
202280
}
203-
helmApplier := applier.Helm{ActionClientGetter: mockAcg}
281+
helmApplier, err := applier.NewHelm(mockAcg, nil, "")
282+
require.NoError(t, err)
204283

205284
objs, state, err := helmApplier.Apply(context.TODO(), validFS, testCE, testObjectLabels, testStorageLabels)
206285
require.Error(t, err)
@@ -217,7 +296,8 @@ func TestApply_Installation(t *testing.T) {
217296
Manifest: validManifest,
218297
},
219298
}
220-
helmApplier := applier.Helm{ActionClientGetter: mockAcg}
299+
helmApplier, err := applier.NewHelm(mockAcg, nil, "")
300+
require.NoError(t, err)
221301

222302
objs, state, err := helmApplier.Apply(context.TODO(), validFS, testCE, testObjectLabels, testStorageLabels)
223303
require.NoError(t, err)
@@ -236,7 +316,8 @@ func TestApply_InstallationWithPreflightPermissionsEnabled(t *testing.T) {
236316
getClientErr: driver.ErrReleaseNotFound,
237317
dryRunInstallErr: errors.New("failed attempting to dry-run install chart"),
238318
}
239-
helmApplier := applier.Helm{ActionClientGetter: mockAcg}
319+
helmApplier, err := applier.NewHelm(mockAcg, nil, "")
320+
require.NoError(t, err)
240321

241322
objs, state, err := helmApplier.Apply(context.TODO(), validFS, testCE, testObjectLabels, testStorageLabels)
242323
require.Error(t, err)
@@ -251,7 +332,8 @@ func TestApply_InstallationWithPreflightPermissionsEnabled(t *testing.T) {
251332
installErr: errors.New("failed installing chart"),
252333
}
253334
mockPf := &mockPreflight{installErr: errors.New("failed during install pre-flight check")}
254-
helmApplier := applier.Helm{ActionClientGetter: mockAcg, Preflights: []applier.Preflight{mockPf}}
335+
helmApplier, err := applier.NewHelm(mockAcg, []applier.Preflight{mockPf}, "")
336+
require.NoError(t, err)
255337

256338
objs, state, err := helmApplier.Apply(context.TODO(), validFS, testCE, testObjectLabels, testStorageLabels)
257339
require.Error(t, err)
@@ -265,7 +347,8 @@ func TestApply_InstallationWithPreflightPermissionsEnabled(t *testing.T) {
265347
getClientErr: driver.ErrReleaseNotFound,
266348
installErr: errors.New("failed installing chart"),
267349
}
268-
helmApplier := applier.Helm{ActionClientGetter: mockAcg}
350+
helmApplier, err := applier.NewHelm(mockAcg, nil, "")
351+
require.NoError(t, err)
269352

270353
objs, state, err := helmApplier.Apply(context.TODO(), validFS, testCE, testObjectLabels, testStorageLabels)
271354
require.Error(t, err)
@@ -282,7 +365,8 @@ func TestApply_InstallationWithPreflightPermissionsEnabled(t *testing.T) {
282365
Manifest: validManifest,
283366
},
284367
}
285-
helmApplier := applier.Helm{ActionClientGetter: mockAcg}
368+
helmApplier, err := applier.NewHelm(mockAcg, nil, "")
369+
require.NoError(t, err)
286370

287371
objs, state, err := helmApplier.Apply(context.TODO(), validFS, testCE, testObjectLabels, testStorageLabels)
288372
require.NoError(t, err)
@@ -302,7 +386,8 @@ func TestApply_Upgrade(t *testing.T) {
302386
mockAcg := &mockActionGetter{
303387
dryRunUpgradeErr: errors.New("failed attempting to dry-run upgrade chart"),
304388
}
305-
helmApplier := applier.Helm{ActionClientGetter: mockAcg}
389+
helmApplier, err := applier.NewHelm(mockAcg, nil, "")
390+
require.NoError(t, err)
306391

307392
objs, state, err := helmApplier.Apply(context.TODO(), validFS, testCE, testObjectLabels, testStorageLabels)
308393
require.Error(t, err)
@@ -321,7 +406,8 @@ func TestApply_Upgrade(t *testing.T) {
321406
desiredRel: &testDesiredRelease,
322407
}
323408
mockPf := &mockPreflight{upgradeErr: errors.New("failed during upgrade pre-flight check")}
324-
helmApplier := applier.Helm{ActionClientGetter: mockAcg, Preflights: []applier.Preflight{mockPf}}
409+
helmApplier, err := applier.NewHelm(mockAcg, []applier.Preflight{mockPf}, "")
410+
require.NoError(t, err)
325411

326412
objs, state, err := helmApplier.Apply(context.TODO(), validFS, testCE, testObjectLabels, testStorageLabels)
327413
require.Error(t, err)
@@ -340,7 +426,8 @@ func TestApply_Upgrade(t *testing.T) {
340426
desiredRel: &testDesiredRelease,
341427
}
342428
mockPf := &mockPreflight{}
343-
helmApplier := applier.Helm{ActionClientGetter: mockAcg, Preflights: []applier.Preflight{mockPf}}
429+
helmApplier, err := applier.NewHelm(mockAcg, []applier.Preflight{mockPf}, "")
430+
require.NoError(t, err)
344431

345432
objs, state, err := helmApplier.Apply(context.TODO(), validFS, testCE, testObjectLabels, testStorageLabels)
346433
require.Error(t, err)
@@ -359,7 +446,8 @@ func TestApply_Upgrade(t *testing.T) {
359446
desiredRel: &testDesiredRelease,
360447
}
361448
mockPf := &mockPreflight{}
362-
helmApplier := applier.Helm{ActionClientGetter: mockAcg, Preflights: []applier.Preflight{mockPf}}
449+
helmApplier, err := applier.NewHelm(mockAcg, []applier.Preflight{mockPf}, "")
450+
require.NoError(t, err)
363451

364452
objs, state, err := helmApplier.Apply(context.TODO(), validFS, testCE, testObjectLabels, testStorageLabels)
365453
require.Error(t, err)
@@ -376,7 +464,8 @@ func TestApply_Upgrade(t *testing.T) {
376464
currentRel: testCurrentRelease,
377465
desiredRel: &testDesiredRelease,
378466
}
379-
helmApplier := applier.Helm{ActionClientGetter: mockAcg}
467+
helmApplier, err := applier.NewHelm(mockAcg, nil, "")
468+
require.NoError(t, err)
380469

381470
objs, state, err := helmApplier.Apply(context.TODO(), validFS, testCE, testObjectLabels, testStorageLabels)
382471
require.NoError(t, err)

0 commit comments

Comments
 (0)