Skip to content

Commit eb11fd5

Browse files
committed
Handle interrupted helm releases in applier
Introduces a workaround for 'interrupted' helm releases which enter into a 'pending' (-install/uninstall/rollback) state. If that happens, for example because of immediate application exit with one of those operations being in flight, helm is not able to resolve it automatically which means we end up in a permanent reconcile error state. One of the workarounds for this that has been repeated in the community is to remove metadata of the pending release, which is the solution chosen here. For full context see: #1776 helm/helm#5595
1 parent e69f1ec commit eb11fd5

File tree

7 files changed

+121
-5
lines changed

7 files changed

+121
-5
lines changed

cmd/operator-controller/main.go

+1
Original file line numberDiff line numberDiff line change
@@ -411,6 +411,7 @@ func run() error {
411411
ActionClientGetter: acg,
412412
Preflights: preflights,
413413
BundleToHelmChartFn: convert.RegistryV1ToHelmChart,
414+
SystemNamespace: cfg.systemNamespace,
414415
}
415416

416417
cm := contentmanager.NewManager(clientRestConfigMapper, mgr.GetConfig(), mgr.GetRESTMapper())

go.mod

+1-1
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ require (
1818
github.com/opencontainers/go-digest v1.0.0
1919
github.com/opencontainers/image-spec v1.1.1
2020
github.com/operator-framework/api v0.30.0
21-
github.com/operator-framework/helm-operator-plugins v0.8.0
21+
github.com/operator-framework/helm-operator-plugins v0.8.1-0.20250222123620-6261f2574b3b
2222
github.com/operator-framework/operator-registry v1.50.0
2323
github.com/prometheus/client_golang v1.21.1
2424
github.com/sirupsen/logrus v1.9.3

go.sum

+2-2
Original file line numberDiff line numberDiff line change
@@ -536,8 +536,8 @@ github.com/openshift/crd-schema-checker v0.0.0-20240404194209-35a9033b1d11 h1:eT
536536
github.com/openshift/crd-schema-checker v0.0.0-20240404194209-35a9033b1d11/go.mod h1:EmVJt97N+pfWFsli/ipXTBZqSG5F5KGQhm3c3IsGq1o=
537537
github.com/operator-framework/api v0.30.0 h1:44hCmGnEnZk/Miol5o44dhSldNH0EToQUG7vZTl29kk=
538538
github.com/operator-framework/api v0.30.0/go.mod h1:FYxAPhjtlXSAty/fbn5YJnFagt6SpJZJgFNNbvDe5W0=
539-
github.com/operator-framework/helm-operator-plugins v0.8.0 h1:0f6HOQC5likkf0b/OvGvw7nhDb6h8Cj5twdCNjwNzMc=
540-
github.com/operator-framework/helm-operator-plugins v0.8.0/go.mod h1:Sc+8bE38xTCgCChBUvtq/PxatEg9fAypr7S5iAw8nlA=
539+
github.com/operator-framework/helm-operator-plugins v0.8.1-0.20250222123620-6261f2574b3b h1:andoOL7lpEafTXdjPGDuNLJlJQh0UmRzgj0+D31K8HU=
540+
github.com/operator-framework/helm-operator-plugins v0.8.1-0.20250222123620-6261f2574b3b/go.mod h1:qioyxwOSI89RAtTWhccc+RyaPQdKTTJRc6sFkT6kqys=
541541
github.com/operator-framework/operator-lib v0.17.0 h1:cbz51wZ9+GpWR1ZYP4CSKSSBxDlWxmmnseaHVZZjZt4=
542542
github.com/operator-framework/operator-lib v0.17.0/go.mod h1:TGopBxIE8L6E/Cojzo26R3NFp1eNlqhQNmzqhOblaLw=
543543
github.com/operator-framework/operator-registry v1.50.0 h1:kMAwsKAEDjuSx5dGchMX+CD3SMHWwOAC/xyK3LQweB4=

internal/operator-controller/action/helm_test.go

+6
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import (
99
"github.com/stretchr/testify/assert"
1010
"github.com/stretchr/testify/mock"
1111
"github.com/stretchr/testify/require"
12+
"helm.sh/helm/v3/pkg/action"
1213
"helm.sh/helm/v3/pkg/chart"
1314
"helm.sh/helm/v3/pkg/release"
1415
"sigs.k8s.io/controller-runtime/pkg/client"
@@ -70,6 +71,11 @@ func (m *mockActionClient) Reconcile(rel *release.Release) error {
7071
return args.Error(0)
7172
}
7273

74+
func (m *mockActionClient) Config() *action.Configuration {
75+
args := m.Called()
76+
return args.Get(0).(*action.Configuration)
77+
}
78+
7379
var _ actionclient.ActionClientGetter = &mockActionClientGetter{}
7480

7581
type mockActionClientGetter struct {

internal/operator-controller/applier/helm.go

+25-2
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ type BundleToHelmChartFn func(rv1 fs.FS, installNamespace string, watchNamespace
5656
type Helm struct {
5757
ActionClientGetter helmclient.ActionClientGetter
5858
Preflights []Preflight
59+
SystemNamespace string
5960
BundleToHelmChartFn BundleToHelmChartFn
6061
}
6162

@@ -95,7 +96,7 @@ func (h *Helm) Apply(ctx context.Context, contentFS fs.FS, ext *ocv1.ClusterExte
9596
labels: objectLabels,
9697
}
9798

98-
rel, desiredRel, state, err := h.getReleaseState(ac, ext, chrt, values, post)
99+
rel, desiredRel, state, err := h.getReleaseState(ctx, ac, ext, chrt, values, post)
99100
if err != nil {
100101
return nil, "", err
101102
}
@@ -164,9 +165,30 @@ func (h *Helm) buildHelmChart(bundleFS fs.FS, ext *ocv1.ClusterExtension) (*char
164165
return h.BundleToHelmChartFn(bundleFS, ext.Spec.Namespace, watchNamespace)
165166
}
166167

167-
func (h *Helm) getReleaseState(cl helmclient.ActionInterface, ext *ocv1.ClusterExtension, chrt *chart.Chart, values chartutil.Values, post postrender.PostRenderer) (*release.Release, *release.Release, string, error) {
168+
func (h *Helm) getReleaseState(ctx context.Context, cl helmclient.ActionInterface, ext *ocv1.ClusterExtension, chrt *chart.Chart, values chartutil.Values, post postrender.PostRenderer) (*release.Release, *release.Release, string, error) {
169+
logger := log.FromContext(ctx)
168170
currentRelease, err := cl.Get(ext.GetName())
171+
172+
// if a release is pending at this point, that means that a helm action
173+
// (installation/upgrade) we were attempting was likely interrupted in-flight.
174+
// Pending release would leave us in reconciliation error loop because helm
175+
// wouldn't be able to progress automatically from it.
176+
//
177+
// one of the workarounds is to try and remove helm metadata relating to
178+
// that pending release which should 'reset' its state communicated to helm
179+
// and the next reconciliation should be able to successfully pick up from here
180+
// for context see: https://github.com/helm/helm/issues/5595 and https://github.com/helm/helm/issues/7476
181+
// and the discussion in https://github.com/operator-framework/operator-controller/pull/1776
182+
if err == nil && currentRelease.Info.Status.IsPending() {
183+
if _, err = cl.Config().Releases.Delete(currentRelease.Name, currentRelease.Version); err != nil {
184+
return nil, nil, StateError, fmt.Errorf("failed removing interrupted release %q version %d metadata: %w", currentRelease.Name, currentRelease.Version, err)
185+
}
186+
// return error to try to detect proper state (installation/upgrade) at next reconciliation
187+
return nil, nil, StateError, fmt.Errorf("removed interrupted release %q version %d metadata", currentRelease.Name, currentRelease.Version)
188+
}
189+
169190
if errors.Is(err, driver.ErrReleaseNotFound) {
191+
logger.V(4).Info("ClusterExtension dry-run install", "extension", ext.GetName())
170192
desiredRelease, err := cl.Install(ext.GetName(), ext.Spec.Namespace, chrt, values, func(i *action.Install) error {
171193
i.DryRun = true
172194
i.DryRunOption = "server"
@@ -186,6 +208,7 @@ func (h *Helm) getReleaseState(cl helmclient.ActionInterface, ext *ocv1.ClusterE
186208
}
187209

188210
desiredRelease, err := cl.Upgrade(ext.GetName(), ext.Spec.Namespace, chrt, values, func(upgrade *action.Upgrade) error {
211+
logger.V(4).Info("ClusterExtension dry-run upgrade", "extension", ext.GetName())
189212
upgrade.MaxHistory = maxHelmReleaseHistory
190213
upgrade.DryRun = true
191214
upgrade.DryRunOption = "server"

internal/operator-controller/applier/helm_test.go

+81
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ import (
1313
"helm.sh/helm/v3/pkg/action"
1414
"helm.sh/helm/v3/pkg/chart"
1515
"helm.sh/helm/v3/pkg/release"
16+
"helm.sh/helm/v3/pkg/storage"
1617
"helm.sh/helm/v3/pkg/storage/driver"
1718
corev1 "k8s.io/api/core/v1"
1819
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -50,6 +51,7 @@ type mockActionGetter struct {
5051
reconcileErr error
5152
desiredRel *release.Release
5253
currentRel *release.Release
54+
config *action.Configuration
5355
}
5456

5557
func (mag *mockActionGetter) ActionClientFor(ctx context.Context, obj client.Object) (helmclient.ActionInterface, error) {
@@ -98,6 +100,10 @@ func (mag *mockActionGetter) Reconcile(rel *release.Release) error {
98100
return mag.reconcileErr
99101
}
100102

103+
func (mag *mockActionGetter) Config() *action.Configuration {
104+
return mag.config
105+
}
106+
101107
var (
102108
// required for unmockable call to convert.RegistryV1ToHelmChart
103109
validFS = fstest.MapFS{
@@ -179,6 +185,80 @@ func TestApply_Base(t *testing.T) {
179185
})
180186
}
181187

188+
func TestApply_InterruptedRelease(t *testing.T) {
189+
t.Run("fails removing an interrupted install release", func(t *testing.T) {
190+
testRel := &release.Release{Name: "testrel", Version: 0, Info: &release.Info{Status: release.StatusPendingInstall}}
191+
testStorage := storage.Init(driver.NewMemory())
192+
193+
mockAcg := &mockActionGetter{currentRel: testRel, config: &action.Configuration{Releases: testStorage}}
194+
helmApplier := applier.Helm{
195+
ActionClientGetter: mockAcg,
196+
BundleToHelmChartFn: convert.RegistryV1ToHelmChart,
197+
}
198+
199+
objs, state, err := helmApplier.Apply(context.TODO(), validFS, testCE, testObjectLabels, testStorageLabels)
200+
require.Error(t, err)
201+
require.ErrorContains(t, err, "removing interrupted release")
202+
require.Nil(t, objs)
203+
require.Empty(t, state)
204+
})
205+
206+
t.Run("fails removing an interrupted upgrade release", func(t *testing.T) {
207+
testRel := &release.Release{Name: "testrel", Version: 0, Info: &release.Info{Status: release.StatusPendingUpgrade}}
208+
testStorage := storage.Init(driver.NewMemory())
209+
210+
mockAcg := &mockActionGetter{currentRel: testRel, config: &action.Configuration{Releases: testStorage}}
211+
helmApplier := applier.Helm{
212+
ActionClientGetter: mockAcg,
213+
BundleToHelmChartFn: convert.RegistryV1ToHelmChart,
214+
}
215+
216+
objs, state, err := helmApplier.Apply(context.TODO(), validFS, testCE, testObjectLabels, testStorageLabels)
217+
require.Error(t, err)
218+
require.ErrorContains(t, err, "removing interrupted release")
219+
require.Nil(t, objs)
220+
require.Empty(t, state)
221+
})
222+
223+
t.Run("successfully removes an interrupted install release", func(t *testing.T) {
224+
testRel := &release.Release{Name: "testrel", Version: 0, Info: &release.Info{Status: release.StatusPendingInstall}}
225+
testStorage := storage.Init(driver.NewMemory())
226+
err := testStorage.Create(testRel)
227+
require.NoError(t, err)
228+
229+
mockAcg := &mockActionGetter{currentRel: testRel, config: &action.Configuration{Releases: testStorage}}
230+
helmApplier := applier.Helm{
231+
ActionClientGetter: mockAcg,
232+
BundleToHelmChartFn: convert.RegistryV1ToHelmChart,
233+
}
234+
235+
objs, state, err := helmApplier.Apply(context.TODO(), validFS, testCE, testObjectLabels, testStorageLabels)
236+
require.Error(t, err)
237+
require.ErrorContains(t, err, "removed interrupted release")
238+
require.Nil(t, objs)
239+
require.Empty(t, state)
240+
})
241+
242+
t.Run("successfully removes an interrupted upgrade release", func(t *testing.T) {
243+
testRel := &release.Release{Name: "testrel", Version: 0, Info: &release.Info{Status: release.StatusPendingUpgrade}}
244+
testStorage := storage.Init(driver.NewMemory())
245+
err := testStorage.Create(testRel)
246+
require.NoError(t, err)
247+
248+
mockAcg := &mockActionGetter{currentRel: testRel, config: &action.Configuration{Releases: testStorage}}
249+
helmApplier := applier.Helm{
250+
ActionClientGetter: mockAcg,
251+
BundleToHelmChartFn: convert.RegistryV1ToHelmChart,
252+
}
253+
254+
objs, state, err := helmApplier.Apply(context.TODO(), validFS, testCE, testObjectLabels, testStorageLabels)
255+
require.Error(t, err)
256+
require.ErrorContains(t, err, "removed interrupted release")
257+
require.Nil(t, objs)
258+
require.Empty(t, state)
259+
})
260+
}
261+
182262
func TestApply_Installation(t *testing.T) {
183263
t.Run("fails during dry-run installation", func(t *testing.T) {
184264
mockAcg := &mockActionGetter{
@@ -340,6 +420,7 @@ func TestApply_Upgrade(t *testing.T) {
340420

341421
t.Run("fails during dry-run upgrade", func(t *testing.T) {
342422
mockAcg := &mockActionGetter{
423+
currentRel: testCurrentRelease,
343424
dryRunUpgradeErr: errors.New("failed attempting to dry-run upgrade chart"),
344425
}
345426
helmApplier := applier.Helm{

internal/operator-controller/controllers/clusterextension_controller_test.go

+5
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ import (
1212
"github.com/google/go-cmp/cmp/cmpopts"
1313
"github.com/stretchr/testify/assert"
1414
"github.com/stretchr/testify/require"
15+
"helm.sh/helm/v3/pkg/action"
1516
"helm.sh/helm/v3/pkg/chart"
1617
"helm.sh/helm/v3/pkg/release"
1718
"helm.sh/helm/v3/pkg/storage/driver"
@@ -1411,6 +1412,10 @@ func (mag *MockActionGetter) Reconcile(rel *release.Release) error {
14111412
return nil
14121413
}
14131414

1415+
func (mag *MockActionGetter) Config() *action.Configuration {
1416+
return nil
1417+
}
1418+
14141419
func TestGetInstalledBundleHistory(t *testing.T) {
14151420
getter := controllers.DefaultInstalledBundleGetter{}
14161421

0 commit comments

Comments
 (0)