Skip to content

Commit c073dfc

Browse files
committed
Handle interrupted helm releases in applier
1 parent 7f00b13 commit c073dfc

File tree

2 files changed

+69
-9
lines changed
  • cmd/operator-controller
  • internal/operator-controller/applier

2 files changed

+69
-9
lines changed

cmd/operator-controller/main.go

+4-3
Original file line numberDiff line numberDiff line change
@@ -407,9 +407,10 @@ func run() error {
407407
crdupgradesafety.NewPreflight(aeClient.CustomResourceDefinitions()),
408408
}
409409

410-
helmApplier := &applier.Helm{
411-
ActionClientGetter: acg,
412-
Preflights: preflights,
410+
helmApplier, err := applier.NewHelm(acg, coreClient, preflights, systemNamespace)
411+
if err != nil {
412+
setupLog.Error(err, "unable to create helm applier")
413+
os.Exit(1)
413414
}
414415

415416
cm := contentmanager.NewManager(clientRestConfigMapper, mgr.GetConfig(), mgr.GetRESTMapper())

internal/operator-controller/applier/helm.go

+65-6
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,10 @@ import (
1616
"helm.sh/helm/v3/pkg/release"
1717
"helm.sh/helm/v3/pkg/storage/driver"
1818
corev1 "k8s.io/api/core/v1"
19+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
1920
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
2021
apimachyaml "k8s.io/apimachinery/pkg/util/yaml"
22+
corev1client "k8s.io/client-go/kubernetes/typed/core/v1"
2123
"sigs.k8s.io/controller-runtime/pkg/client"
2224
"sigs.k8s.io/controller-runtime/pkg/log"
2325

@@ -36,6 +38,8 @@ const (
3638
StateUnchanged string = "Unchanged"
3739
StateError string = "Error"
3840
maxHelmReleaseHistory = 10
41+
42+
secretTypeIndexV1 = "type=operatorframework.io/index.v1"
3943
)
4044

4145
// Preflight is a check that should be run before making any changes to the cluster
@@ -54,8 +58,26 @@ type Preflight interface {
5458
}
5559

5660
// Helm groups the collaborators used by the helm-based applier: an action
// client getter, a secrets client getter, the preflight checks, and the
// system namespace. In this diff the previously exported fields
// (ActionClientGetter, Preflights) are replaced by unexported ones, and
// instances are created through NewHelm.
type Helm struct {
57-
ActionClientGetter helmclient.ActionClientGetter
58-
Preflights []Preflight
61+
actionClientGetter helmclient.ActionClientGetter
62+
secretsClientGetter corev1client.SecretsGetter
63+
preflights []Preflight
64+
systemNamespace string
65+
}
66+
67+
// NewHelm builds a Helm applier from its dependencies.
//
// acg is stored as the action client getter and scg as the secrets client
// getter; both are required, and an error is returned when either is nil.
// preflights and systemNamespace are stored as given, without validation.
func NewHelm(acg helmclient.ActionClientGetter, scg corev1client.SecretsGetter, preflights []Preflight, systemNamespace string) (*Helm, error) {
68+
if acg == nil {
69+
return nil, fmt.Errorf("action client getter is nil")
70+
}
71+
if scg == nil {
72+
return nil, fmt.Errorf("secrets client getter is nil")
73+
}
74+
75+
return &Helm{
76+
actionClientGetter: acg,
77+
secretsClientGetter: scg,
78+
preflights: preflights,
79+
systemNamespace: systemNamespace,
80+
}, nil
5981
}
6082

6183
// shouldSkipPreflight is a helper to determine if the preflight check is CRDUpgradeSafety AND
@@ -85,7 +107,7 @@ func (h *Helm) Apply(ctx context.Context, contentFS fs.FS, ext *ocv1.ClusterExte
85107
}
86108
values := chartutil.Values{}
87109

88-
ac, err := h.ActionClientGetter.ActionClientFor(ctx, ext)
110+
ac, err := h.actionClientGetter.ActionClientFor(ctx, ext)
89111
if err != nil {
90112
return nil, "", err
91113
}
@@ -94,12 +116,12 @@ func (h *Helm) Apply(ctx context.Context, contentFS fs.FS, ext *ocv1.ClusterExte
94116
labels: objectLabels,
95117
}
96118

97-
rel, desiredRel, state, err := h.getReleaseState(ac, ext, chrt, values, post)
119+
rel, desiredRel, state, err := h.getReleaseState(ctx, ac, ext, chrt, values, post)
98120
if err != nil {
99121
return nil, "", err
100122
}
101123

102-
for _, preflight := range h.Preflights {
124+
for _, preflight := range h.preflights {
103125
if shouldSkipPreflight(ctx, preflight, ext, state) {
104126
continue
105127
}
@@ -152,9 +174,28 @@ func (h *Helm) Apply(ctx context.Context, contentFS fs.FS, ext *ocv1.ClusterExte
152174
return relObjects, state, nil
153175
}
154176

155-
func (h *Helm) getReleaseState(cl helmclient.ActionInterface, ext *ocv1.ClusterExtension, chrt *chart.Chart, values chartutil.Values, post postrender.PostRenderer) (*release.Release, *release.Release, string, error) {
177+
func (h *Helm) getReleaseState(ctx context.Context, cl helmclient.ActionInterface, ext *ocv1.ClusterExtension, chrt *chart.Chart, values chartutil.Values, post postrender.PostRenderer) (*release.Release, *release.Release, string, error) {
178+
logger := log.FromContext(ctx)
156179
currentRelease, err := cl.Get(ext.GetName())
180+
181+
// if a release is pending at this point, that means that a helm action
182+
// (installation/upgrade) we were attempting was likely interrupted in-flight.
183+
// A pending release would leave us in a reconciliation error loop because helm
184+
// wouldn't be able to progress automatically from it.
185+
//
186+
// one of the workarounds is to try and remove all helm secrets relating to
187+
// that pending release which should 'reset' its state communicated to helm
188+
// and the next reconciliation should be able to successfully pick up from here
189+
// for context see: https://github.com/helm/helm/issues/5595 and https://github.com/helm/helm/issues/7476
190+
if err == nil && currentRelease.Info.Status.IsPending() {
191+
logger.V(4).Info("ClusterExtension release pending", "extension", ext.GetName(), "release", currentRelease.Name)
192+
if err = h.deleteReleaseSecrets(ctx, currentRelease.Name); err != nil {
193+
return nil, nil, StateError, fmt.Errorf("failed deleting secrets for pending release %q: %w", currentRelease.Name, err)
194+
}
195+
}
196+
157197
if errors.Is(err, driver.ErrReleaseNotFound) {
198+
logger.V(4).Info("ClusterExtension dry-run install", "extension", ext.GetName())
158199
desiredRelease, err := cl.Install(ext.GetName(), ext.Spec.Namespace, chrt, values, func(i *action.Install) error {
159200
i.DryRun = true
160201
i.DryRunOption = "server"
@@ -174,6 +215,7 @@ func (h *Helm) getReleaseState(cl helmclient.ActionInterface, ext *ocv1.ClusterE
174215
}
175216

176217
desiredRelease, err := cl.Upgrade(ext.GetName(), ext.Spec.Namespace, chrt, values, func(upgrade *action.Upgrade) error {
218+
logger.V(4).Info("ClusterExtension dry-run upgrade", "extension", ext.GetName())
177219
upgrade.MaxHistory = maxHelmReleaseHistory
178220
upgrade.DryRun = true
179221
upgrade.DryRunOption = "server"
@@ -220,3 +262,20 @@ func (p *postrenderer) Run(renderedManifests *bytes.Buffer) (*bytes.Buffer, erro
220262
}
221263
return &buf, nil
222264
}
265+
266+
// deleteReleaseSecrets issues a single DeleteCollection call against the
// secrets in h.systemNamespace and returns its error unchanged.
//
// The list options narrow the deletion to secrets that match the
// secretTypeIndexV1 field selector ("type=operatorframework.io/index.v1")
// and whose labels satisfy both: "name" is releaseName, and "status" is one
// of the pending-install, pending-upgrade, or pending-rollback statuses.
func (h *Helm) deleteReleaseSecrets(ctx context.Context, releaseName string) error {
267+
return h.secretsClientGetter.Secrets(h.systemNamespace).DeleteCollection(
268+
ctx,
269+
metav1.DeleteOptions{},
270+
metav1.ListOptions{
271+
FieldSelector: secretTypeIndexV1,
272+
LabelSelector: fmt.Sprintf(
273+
"name in (%s),status in(%s, %s, %s)",
274+
releaseName,
275+
release.StatusPendingInstall,
276+
release.StatusPendingUpgrade,
277+
release.StatusPendingRollback,
278+
),
279+
},
280+
)
281+
}

0 commit comments

Comments
 (0)