@@ -22,6 +22,7 @@ import (
22
22
23
23
"golang.org/x/time/rate"
24
24
v1 "k8s.io/api/core/v1"
25
+ apierrors "k8s.io/apimachinery/pkg/api/errors"
25
26
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
26
27
"k8s.io/apimachinery/pkg/util/wait"
27
28
coreinformers "k8s.io/client-go/informers/core/v1"
@@ -42,16 +43,21 @@ func init() {
42
43
registerMetrics ()
43
44
}
44
45
45
- // workItem contains the node and an action for that node
46
// taggingControllerNode contains the node details required for tag/untag of node resources.
//
// It is the argument handed to tagNodesResources and untagNodeResources in place of a
// full *v1.Node.
type taggingControllerNode struct {
	// providerID is the node's Spec.ProviderID; it is mapped to an AWS instance ID
	// before untagging.
	providerID string
	// name is the Kubernetes node name, used for informer lookups and log messages.
	name string
}
51
+
52
// workItem contains the node name, provider id and an action for that node.
//
// Every field is a string, so workItem values are comparable; enqueueNode relies on
// that so repeated adds of the same node/action collapse into one queue entry.
type workItem struct {
	// name is the Kubernetes node name.
	name string
	// providerID is the node's Spec.ProviderID.
	providerID string
	// action is addTag or deleteTag; process treats any value other than addTag
	// as an untag request.
	action string
}

// String renders the work item for log and error messages; providerID is omitted.
func (w workItem) String() string {
	return fmt.Sprintf("[Node: %s, Action : %s]", w.name, w.action)
}
56
62
57
63
const (
@@ -62,17 +68,15 @@ const (
62
68
// The label for the total number of errors encountered by work items that are requeued for retry
63
69
totalErrorsWorkItemErrorMetric = "total_errors"
64
70
65
- // The label for depicting total time when work item gets queued to processed
66
- workItemProcessingTimeWorkItemMetric = "work_item_processing_time"
67
-
68
- // The label for depicting total time when work item gets queued to dequeued
69
- workItemDequeuingTimeWorkItemMetric = "work_item_dequeuing_time"
70
-
71
71
// The label for the total number of errors encountered by work items whose retries are exhausted
72
72
errorsAfterRetriesExhaustedWorkItemErrorMetric = "errors_after_retries_exhausted"
73
73
74
74
// The period of time after Node creation to retry tagging due to eventual consistency of the CreateTags API.
75
75
newNodeEventualConsistencyGracePeriod = time .Minute * 5
76
+
77
+ addTag = "ADD"
78
+
79
+ deleteTag = "DELETE"
76
80
)
77
81
78
82
// Controller is the controller implementation for tagging cluster resources.
@@ -152,7 +156,7 @@ func NewTaggingController(
152
156
tc .nodeInformer .Informer ().AddEventHandler (cache.ResourceEventHandlerFuncs {
153
157
AddFunc : func (obj interface {}) {
154
158
node := obj .(* v1.Node )
155
- tc .enqueueNode (node , tc . tagNodesResources )
159
+ tc .enqueueNode (node , addTag )
156
160
},
157
161
UpdateFunc : func (oldObj , newObj interface {}) {
158
162
node := newObj .(* v1.Node )
@@ -165,11 +169,11 @@ func NewTaggingController(
165
169
return
166
170
}
167
171
168
- tc .enqueueNode (node , tc . tagNodesResources )
172
+ tc .enqueueNode (node , addTag )
169
173
},
170
174
DeleteFunc : func (obj interface {}) {
171
175
node := obj .(* v1.Node )
172
- tc .enqueueNode (node , tc . untagNodeResources )
176
+ tc .enqueueNode (node , deleteTag )
173
177
},
174
178
})
175
179
@@ -215,21 +219,17 @@ func (tc *Controller) process() bool {
215
219
err := func (obj interface {}) error {
216
220
defer tc .workqueue .Done (obj )
217
221
218
- workItem , ok := obj .(* workItem )
222
+ workItem , ok := obj .(workItem )
219
223
if ! ok {
220
224
tc .workqueue .Forget (obj )
221
225
err := fmt .Errorf ("expected workItem in workqueue but got %s" , obj )
222
226
utilruntime .HandleError (err )
223
227
return nil
224
228
}
225
229
226
- timeTaken := time .Since (workItem .enqueueTime ).Seconds ()
227
- recordWorkItemLatencyMetrics (workItemDequeuingTimeWorkItemMetric , timeTaken )
228
- klog .Infof ("Dequeuing latency %f seconds" , timeTaken )
229
-
230
- instanceID , err := awsv1 .KubernetesInstanceID (workItem .node .Spec .ProviderID ).MapToAWSInstanceID ()
230
+ instanceID , err := awsv1 .KubernetesInstanceID (workItem .providerID ).MapToAWSInstanceID ()
231
231
if err != nil {
232
- err = fmt .Errorf ("Error in getting instanceID for node %s, error: %v" , workItem .node . GetName () , err )
232
+ err = fmt .Errorf ("error in getting instanceID for node %s, error: %v" , workItem .name , err )
233
233
utilruntime .HandleError (err )
234
234
return nil
235
235
}
@@ -241,26 +241,31 @@ func (tc *Controller) process() bool {
241
241
tc .workqueue .Forget (obj )
242
242
return nil
243
243
}
244
-
245
- err = workItem .action (workItem .node )
246
-
244
+ if workItem .action == addTag {
245
+ err = tc .tagNodesResources (& taggingControllerNode {
246
+ name : workItem .name ,
247
+ providerID : workItem .providerID ,
248
+ })
249
+ } else {
250
+ err = tc .untagNodeResources (& taggingControllerNode {
251
+ name : workItem .name ,
252
+ providerID : workItem .providerID ,
253
+ })
254
+ }
247
255
if err != nil {
248
- if workItem .requeuingCount < maxRequeuingCount {
256
+ numRetries := tc .workqueue .NumRequeues (workItem )
257
+ if numRetries < maxRequeuingCount {
249
258
// Put the item back on the workqueue to handle any transient errors.
250
- workItem .requeuingCount ++
251
259
tc .workqueue .AddRateLimited (workItem )
252
260
253
261
recordWorkItemErrorMetrics (totalErrorsWorkItemErrorMetric , string (instanceID ))
254
- return fmt .Errorf ("error processing work item '%v': %s, requeuing count %d" , workItem , err .Error (), workItem . requeuingCount )
262
+ return fmt .Errorf ("error processing work item '%v': %s, requeuing count %d" , workItem , err .Error (), numRetries )
255
263
}
256
264
257
265
klog .Errorf ("error processing work item %s: %s, requeuing count exceeded" , workItem , err .Error ())
258
266
recordWorkItemErrorMetrics (errorsAfterRetriesExhaustedWorkItemErrorMetric , string (instanceID ))
259
267
} else {
260
268
klog .Infof ("Finished processing %s" , workItem )
261
- timeTaken = time .Since (workItem .enqueueTime ).Seconds ()
262
- recordWorkItemLatencyMetrics (workItemProcessingTimeWorkItemMetric , timeTaken )
263
- klog .Infof ("Processing latency %f seconds" , timeTaken )
264
269
}
265
270
266
271
tc .workqueue .Forget (obj )
@@ -277,11 +282,19 @@ func (tc *Controller) process() bool {
277
282
278
283
// tagNodesResources tags the node's resources.
279
284
// If we want to tag more resources, modify this function appropriately
280
- func (tc * Controller ) tagNodesResources (node * v1. Node ) error {
285
+ func (tc * Controller ) tagNodesResources (node * taggingControllerNode ) error {
281
286
for _ , resource := range tc .resources {
282
287
switch resource {
283
288
case opt .Instance :
284
- err := tc .tagEc2Instance (node )
289
+ v1node , err := tc .nodeInformer .Lister ().Get (node .name )
290
+ if err != nil {
291
+ // If the node is not found, ignore it: it's okay not to add tags when the node object has been deleted.
292
+ if apierrors .IsNotFound (err ) {
293
+ return nil
294
+ }
295
+ return err
296
+ }
297
+ err = tc .tagEc2Instance (v1node )
285
298
if err != nil {
286
299
return err
287
300
}
@@ -334,7 +347,7 @@ func (tc *Controller) tagEc2Instance(node *v1.Node) error {
334
347
335
348
// untagNodeResources untags the node's resources.
336
349
// If we want to untag more resources, modify this function appropriately
337
- func (tc * Controller ) untagNodeResources (node * v1. Node ) error {
350
+ func (tc * Controller ) untagNodeResources (node * taggingControllerNode ) error {
338
351
for _ , resource := range tc .resources {
339
352
switch resource {
340
353
case opt .Instance :
@@ -350,13 +363,13 @@ func (tc *Controller) untagNodeResources(node *v1.Node) error {
350
363
351
364
// untagEc2Instance deletes the controller's configured tags from the EC2
352
365
// instance identified by the node's provider ID.
353
- func (tc * Controller ) untagEc2Instance (node * v1. Node ) error {
354
- instanceID , _ := awsv1 .KubernetesInstanceID (node .Spec . ProviderID ).MapToAWSInstanceID ()
366
+ func (tc * Controller ) untagEc2Instance (node * taggingControllerNode ) error {
367
+ instanceID , _ := awsv1 .KubernetesInstanceID (node .providerID ).MapToAWSInstanceID ()
355
368
356
369
err := tc .cloud .UntagResource (string (instanceID ), tc .tags )
357
370
358
371
if err != nil {
359
- klog .Errorf ("Error in untagging EC2 instance %s for node %s, error: %v" , instanceID , node .GetName () , err )
372
+ klog .Errorf ("Error in untagging EC2 instance %s for node %s, error: %v" , instanceID , node .name , err )
360
373
return err
361
374
}
362
375
@@ -367,12 +380,13 @@ func (tc *Controller) untagEc2Instance(node *v1.Node) error {
367
380
368
381
// enqueueNode builds a workItem from the node and the requested
369
382
// action and adds it to the workqueue.
370
- func (tc * Controller ) enqueueNode (node * v1.Node , action func (node * v1.Node ) error ) {
371
- item := & workItem {
372
- node : node ,
373
- action : action ,
374
- requeuingCount : 0 ,
375
- enqueueTime : time .Now (),
383
+ func (tc * Controller ) enqueueNode (node * v1.Node , action string ) {
384
+ // If all of the struct's fields are comparable, the workqueue deduplicates on Add, so multiple adds of the same object
385
+ // result in only one item in the workqueue.
386
+ item := workItem {
387
+ name : node .GetName (),
388
+ providerID : node .Spec .ProviderID ,
389
+ action : action ,
376
390
}
377
391
378
392
if tc .rateLimitEnabled {
0 commit comments