Skip to content

Commit da5beab

Browse files
Drop Node events when EC2 instance does not exist and node is not new
1 parent 08ac6f0 commit da5beab

File tree

4 files changed

+56
-7
lines changed

4 files changed

+56
-7
lines changed

pkg/controllers/tagging/tagging_controller.go

+19
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,9 @@ const (
6363

6464
// The label for depicting total number of errors a work item encounter and fail
6565
errorsAfterRetriesExhaustedWorkItemErrorMetric = "errors_after_retries_exhausted"
66+
67+
// The period of time after Node creation to retry tagging due to eventual consistency of the CreateTags API.
68+
newNodeEventualConsistencyGracePeriod = time.Minute * 5
6669
)
6770

6871
// Controller is the controller implementation for tagging cluster resources.
@@ -292,6 +295,18 @@ func (tc *Controller) tagEc2Instance(node *v1.Node) error {
292295
err := tc.cloud.TagResource(string(instanceID), tc.tags)
293296

294297
if err != nil {
298+
if awsv1.IsAWSErrorInstanceNotFound(err) {
299+
// This can happen for two reasons.
300+
// 1. The CreateTags API is eventually consistent. In rare cases, a newly-created instance may not be taggable for a short period.
301+
// We will re-queue the event and retry.
302+
if isNodeWithinEventualConsistencyGracePeriod(node) {
303+
return fmt.Errorf("EC2 instance %s for node %s does not exist, but node is within eventual consistency grace period", instanceID, node.GetName())
304+
}
305+
// 2. The event in our workQueue is stale, and the instance no longer exists.
306+
// Tagging will never succeed, and the event should not be re-queued.
307+
klog.Infof("Skip tagging since EC2 instance %s for node %s does not exist", instanceID, node.GetName())
308+
return nil
309+
}
295310
klog.Errorf("Error in tagging EC2 instance %s for node %s, error: %v", instanceID, node.GetName(), err)
296311
return err
297312
}
@@ -380,3 +395,7 @@ func (tc *Controller) getChecksumOfTags() string {
380395
sort.Strings(tags)
381396
return fmt.Sprintf("%x", md5.Sum([]byte(strings.Join(tags, ","))))
382397
}
398+
399+
func isNodeWithinEventualConsistencyGracePeriod(node *v1.Node) bool {
400+
return time.Since(node.CreationTimestamp.Time) < newNodeEventualConsistencyGracePeriod
401+
}

pkg/controllers/tagging/tagging_controller_test.go

+33-4
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,11 @@ import (
1717
"bytes"
1818
"context"
1919
"flag"
20+
"os"
21+
"strings"
22+
"testing"
23+
"time"
24+
2025
v1 "k8s.io/api/core/v1"
2126
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2227
"k8s.io/client-go/informers"
@@ -25,10 +30,6 @@ import (
2530
"k8s.io/client-go/util/workqueue"
2631
awsv1 "k8s.io/cloud-provider-aws/pkg/providers/v1"
2732
"k8s.io/klog/v2"
28-
"os"
29-
"strings"
30-
"testing"
31-
"time"
3233
)
3334

3435
const TestClusterID = "clusterid.test"
@@ -162,6 +163,34 @@ func Test_NodesJoiningAndLeaving(t *testing.T) {
162163
toBeTagged: false,
163164
expectedMessages: []string{"Successfully untagged i-0001"},
164165
},
166+
{
167+
name: "node0 is recently created and the instance is not found the first 3 CreateTags attempts",
168+
currNode: &v1.Node{
169+
ObjectMeta: metav1.ObjectMeta{
170+
Name: "node0",
171+
CreationTimestamp: metav1.Now(),
172+
},
173+
Spec: v1.NodeSpec{
174+
ProviderID: "i-not-found-count-3-0001",
175+
},
176+
},
177+
toBeTagged: true,
178+
expectedMessages: []string{"Successfully tagged i-not-found-count-3", "node is within eventual consistency grace period"},
179+
},
180+
{
181+
name: "node0 is not recently created and the instance is not found",
182+
currNode: &v1.Node{
183+
ObjectMeta: metav1.ObjectMeta{
184+
Name: "node0",
185+
CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
186+
},
187+
Spec: v1.NodeSpec{
188+
ProviderID: "i-not-found",
189+
},
190+
},
191+
toBeTagged: true,
192+
expectedMessages: []string{"Skip tagging since EC2 instance i-not-found for node node0 does not exist"},
193+
},
165194
}
166195

167196
awsServices := awsv1.NewFakeAWSServices(TestClusterID)

pkg/providers/v1/aws.go

+3-2
Original file line numberDiff line numberDiff line change
@@ -1707,7 +1707,7 @@ func (c *Cloud) InstanceExistsByProviderID(ctx context.Context, providerID strin
17071707
instances, err := c.ec2.DescribeInstances(request)
17081708
if err != nil {
17091709
// if err is InstanceNotFound, return false with no error
1710-
if isAWSErrorInstanceNotFound(err) {
1710+
if IsAWSErrorInstanceNotFound(err) {
17111711
return false, nil
17121712
}
17131713
return false, err
@@ -1946,7 +1946,8 @@ func (c *Cloud) GetZoneByNodeName(ctx context.Context, nodeName types.NodeName)
19461946

19471947
}
19481948

1949-
func isAWSErrorInstanceNotFound(err error) bool {
1949+
// IsAWSErrorInstanceNotFound returns true if the specified error is an awserr.Error with the code `InvalidInstanceId.NotFound`.
1950+
func IsAWSErrorInstanceNotFound(err error) bool {
19501951
if err == nil {
19511952
return false
19521953
}

pkg/providers/v1/tags.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -344,7 +344,7 @@ func (c *Cloud) UntagResource(resourceID string, tags map[string]string) error {
344344
if err != nil {
345345
// An instance not found should not fail the untagging workflow as it
346346
// would for tagging, since the target state is already reached.
347-
if isAWSErrorInstanceNotFound(err) {
347+
if IsAWSErrorInstanceNotFound(err) {
348348
klog.Infof("Couldn't find resource when trying to untag it hence skipping it, %v", err)
349349
return nil
350350
}

0 commit comments

Comments
 (0)