From 1ccb35feedf7fa0e271fa09dcc4bfee1d7da9660 Mon Sep 17 00:00:00 2001 From: Carter McKinnon Date: Mon, 27 Nov 2023 21:12:19 +0000 Subject: [PATCH] Drop Node events when EC2 instance does not exist and node is not new --- pkg/controllers/tagging/tagging_controller.go | 19 +++++++++++++++++++ pkg/providers/v1/aws.go | 5 +++-- pkg/providers/v1/tags.go | 2 +- 3 files changed, 23 insertions(+), 3 deletions(-) diff --git a/pkg/controllers/tagging/tagging_controller.go b/pkg/controllers/tagging/tagging_controller.go index 3620212406d..89affd4d77d 100644 --- a/pkg/controllers/tagging/tagging_controller.go +++ b/pkg/controllers/tagging/tagging_controller.go @@ -63,6 +63,9 @@ const ( // The label for depicting total number of errors a work item encounter and fail errorsAfterRetriesExhaustedWorkItemErrorMetric = "errors_after_retries_exhausted" + + // The period of time after Node creation to retry tagging due to eventual consistency of the CreateTags API. + newNodeEventualConsistencyGracePeriod = time.Minute * 5 ) // Controller is the controller implementation for tagging cluster resources. @@ -292,6 +295,18 @@ func (tc *Controller) tagEc2Instance(node *v1.Node) error { err := tc.cloud.TagResource(string(instanceID), tc.tags) if err != nil { + if awsv1.IsAWSErrorInstanceNotFound(err) { + // This can happen for two reasons. + // 1. The CreateTags API is eventually consistent. In rare cases, a newly-created instance may not be taggable for a short period. + // We will re-queue the event and retry. + if isNodeWithinEventualConsistencyGracePeriod(node) { + return fmt.Errorf("EC2 instance %s for node %s does not exist, but node is within eventual consistency grace period", instanceID, node.GetName()) + } + // 2. The event in our workQueue is stale, and the instance no longer exists. + // Tagging will never succeed, and the event should not be re-queued. + klog.Infof("Skip tagging since EC2 instance %s for node %s does not exist", instanceID, node.GetName()) + return nil + } klog.Errorf("Error in tagging EC2 instance %s for node %s, error: %v", instanceID, node.GetName(), err) return err } @@ -380,3 +395,7 @@ func (tc *Controller) getChecksumOfTags() string { sort.Strings(tags) return fmt.Sprintf("%x", md5.Sum([]byte(strings.Join(tags, ",")))) } + +func isNodeWithinEventualConsistencyGracePeriod(node *v1.Node) bool { + return time.Since(node.CreationTimestamp.Time) < newNodeEventualConsistencyGracePeriod +} diff --git a/pkg/providers/v1/aws.go b/pkg/providers/v1/aws.go index e2212cd1054..2adc93ce9b4 100644 --- a/pkg/providers/v1/aws.go +++ b/pkg/providers/v1/aws.go @@ -1707,7 +1707,7 @@ func (c *Cloud) InstanceExistsByProviderID(ctx context.Context, providerID strin instances, err := c.ec2.DescribeInstances(request) if err != nil { // if err is InstanceNotFound, return false with no error - if isAWSErrorInstanceNotFound(err) { + if IsAWSErrorInstanceNotFound(err) { return false, nil } return false, err @@ -1946,7 +1946,8 @@ func (c *Cloud) GetZoneByNodeName(ctx context.Context, nodeName types.NodeName) } -func isAWSErrorInstanceNotFound(err error) bool { +// IsAWSErrorInstanceNotFound returns true if the specified error is an awserr.Error with the code `InvalidInstanceId.NotFound`. +func IsAWSErrorInstanceNotFound(err error) bool { if err == nil { return false } diff --git a/pkg/providers/v1/tags.go b/pkg/providers/v1/tags.go index beaaf0ca7df..9b824360970 100644 --- a/pkg/providers/v1/tags.go +++ b/pkg/providers/v1/tags.go @@ -344,7 +344,7 @@ func (c *Cloud) UntagResource(resourceID string, tags map[string]string) error { if err != nil { // An instance not found should not fail the untagging workflow as it // would for tagging, since the target state is already reached. - if isAWSErrorInstanceNotFound(err) { + if IsAWSErrorInstanceNotFound(err) { klog.Infof("Couldn't find resource when trying to untag it hence skipping it, %v", err) return nil }