@@ -22,6 +22,7 @@ import (
22
22
23
23
"golang.org/x/time/rate"
24
24
v1 "k8s.io/api/core/v1"
25
+ apierrors "k8s.io/apimachinery/pkg/api/errors"
25
26
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
26
27
"k8s.io/apimachinery/pkg/util/wait"
27
28
coreinformers "k8s.io/client-go/informers/core/v1"
@@ -42,16 +43,21 @@ func init() {
42
43
registerMetrics ()
43
44
}
44
45
45
- // workItem contains the node and an action for that node
46
+ // taggingControllerNode contains the node details required for tag/untag of node resources.
47
+ type taggingControllerNode struct {
48
+ providerID string
49
+ name string
50
+ }
51
+
52
+ // workItem contains the node name, provider id and an action for that node.
46
53
type workItem struct {
47
- node * v1.Node
48
- action func (node * v1.Node ) error
49
- requeuingCount int
50
- enqueueTime time.Time
54
+ name string
55
+ providerID string
56
+ action string
51
57
}
52
58
53
59
func (w workItem ) String () string {
54
- return fmt .Sprintf ("[Node: %s, RequeuingCount : %d, EnqueueTime: % s]" , w .node . GetName () , w .requeuingCount , w . enqueueTime )
60
+ return fmt .Sprintf ("[Node: %s, Action : %s]" , w .name , w .action )
55
61
}
56
62
57
63
const (
@@ -62,17 +68,15 @@ const (
62
68
// The label for depicting the total number of errors a work item encounters while it can still be retried
63
69
totalErrorsWorkItemErrorMetric = "total_errors"
64
70
65
- // The label for depicting total time when work item gets queued to processed
66
- workItemProcessingTimeWorkItemMetric = "work_item_processing_time"
67
-
68
- // The label for depicting total time when work item gets queued to dequeued
69
- workItemDequeuingTimeWorkItemMetric = "work_item_dequeuing_time"
70
-
71
71
// The label for depicting the total number of errors a work item encounters after its retries are exhausted
72
72
errorsAfterRetriesExhaustedWorkItemErrorMetric = "errors_after_retries_exhausted"
73
73
74
74
// The period of time after Node creation to retry tagging due to eventual consistency of the CreateTags API.
75
75
newNodeEventualConsistencyGracePeriod = time .Minute * 5
76
+
77
+ addTag = "ADD"
78
+
79
+ deleteTag = "DELETE"
76
80
)
77
81
78
82
// Controller is the controller implementation for tagging cluster resources.
@@ -150,7 +154,7 @@ func NewTaggingController(
150
154
tc .nodeInformer .Informer ().AddEventHandler (cache.ResourceEventHandlerFuncs {
151
155
AddFunc : func (obj interface {}) {
152
156
node := obj .(* v1.Node )
153
- tc .enqueueNode (node , tc . tagNodesResources )
157
+ tc .enqueueNode (node , addTag )
154
158
},
155
159
UpdateFunc : func (oldObj , newObj interface {}) {
156
160
node := newObj .(* v1.Node )
@@ -163,11 +167,11 @@ func NewTaggingController(
163
167
return
164
168
}
165
169
166
- tc .enqueueNode (node , tc . tagNodesResources )
170
+ tc .enqueueNode (node , addTag )
167
171
},
168
172
DeleteFunc : func (obj interface {}) {
169
173
node := obj .(* v1.Node )
170
- tc .enqueueNode (node , tc . untagNodeResources )
174
+ tc .enqueueNode (node , deleteTag )
171
175
},
172
176
})
173
177
@@ -213,21 +217,17 @@ func (tc *Controller) process() bool {
213
217
err := func (obj interface {}) error {
214
218
defer tc .workqueue .Done (obj )
215
219
216
- workItem , ok := obj .(* workItem )
220
+ workItem , ok := obj .(workItem )
217
221
if ! ok {
218
222
tc .workqueue .Forget (obj )
219
223
err := fmt .Errorf ("expected workItem in workqueue but got %s" , obj )
220
224
utilruntime .HandleError (err )
221
225
return nil
222
226
}
223
227
224
- timeTaken := time .Since (workItem .enqueueTime ).Seconds ()
225
- recordWorkItemLatencyMetrics (workItemDequeuingTimeWorkItemMetric , timeTaken )
226
- klog .Infof ("Dequeuing latency %f seconds" , timeTaken )
227
-
228
- instanceID , err := awsv1 .KubernetesInstanceID (workItem .node .Spec .ProviderID ).MapToAWSInstanceID ()
228
+ instanceID , err := awsv1 .KubernetesInstanceID (workItem .providerID ).MapToAWSInstanceID ()
229
229
if err != nil {
230
- err = fmt .Errorf ("Error in getting instanceID for node %s, error: %v" , workItem .node . GetName () , err )
230
+ err = fmt .Errorf ("error in getting instanceID for node %s, error: %v" , workItem .name , err )
231
231
utilruntime .HandleError (err )
232
232
return nil
233
233
}
@@ -239,26 +239,31 @@ func (tc *Controller) process() bool {
239
239
tc .workqueue .Forget (obj )
240
240
return nil
241
241
}
242
-
243
- err = workItem .action (workItem .node )
244
-
242
+ if workItem .action == addTag {
243
+ err = tc .tagNodesResources (& taggingControllerNode {
244
+ name : workItem .name ,
245
+ providerID : workItem .providerID ,
246
+ })
247
+ } else {
248
+ err = tc .untagNodeResources (& taggingControllerNode {
249
+ name : workItem .name ,
250
+ providerID : workItem .providerID ,
251
+ })
252
+ }
245
253
if err != nil {
246
- if workItem .requeuingCount < maxRequeuingCount {
254
+ numRetries := tc .workqueue .NumRequeues (workItem )
255
+ if numRetries < maxRequeuingCount {
247
256
// Put the item back on the workqueue to handle any transient errors.
248
- workItem .requeuingCount ++
249
257
tc .workqueue .AddRateLimited (workItem )
250
258
251
259
recordWorkItemErrorMetrics (totalErrorsWorkItemErrorMetric , string (instanceID ))
252
- return fmt .Errorf ("error processing work item '%v': %s, requeuing count %d" , workItem , err .Error (), workItem . requeuingCount )
260
+ return fmt .Errorf ("error processing work item '%v': %s, requeuing count %d" , workItem , err .Error (), numRetries )
253
261
}
254
262
255
263
klog .Errorf ("error processing work item %s: %s, requeuing count exceeded" , workItem , err .Error ())
256
264
recordWorkItemErrorMetrics (errorsAfterRetriesExhaustedWorkItemErrorMetric , string (instanceID ))
257
265
} else {
258
266
klog .Infof ("Finished processing %s" , workItem )
259
- timeTaken = time .Since (workItem .enqueueTime ).Seconds ()
260
- recordWorkItemLatencyMetrics (workItemProcessingTimeWorkItemMetric , timeTaken )
261
- klog .Infof ("Processing latency %f seconds" , timeTaken )
262
267
}
263
268
264
269
tc .workqueue .Forget (obj )
@@ -275,11 +280,19 @@ func (tc *Controller) process() bool {
275
280
276
281
// tagNodesResources tags node resources
277
282
// If we want to tag more resources, modify this function appropriately
278
- func (tc * Controller ) tagNodesResources (node * v1. Node ) error {
283
+ func (tc * Controller ) tagNodesResources (node * taggingControllerNode ) error {
279
284
for _ , resource := range tc .resources {
280
285
switch resource {
281
286
case opt .Instance :
282
- err := tc .tagEc2Instance (node )
287
+ v1node , err := tc .nodeInformer .Lister ().Get (node .name )
288
+ if err != nil {
289
+ // If the node is not found, ignore the error: it's okay to skip tagging when the node object has been deleted.
290
+ if apierrors .IsNotFound (err ) {
291
+ return nil
292
+ }
293
+ return err
294
+ }
295
+ err = tc .tagEc2Instance (v1node )
283
296
if err != nil {
284
297
return err
285
298
}
@@ -332,7 +345,7 @@ func (tc *Controller) tagEc2Instance(node *v1.Node) error {
332
345
333
346
// untagNodeResources untags node resources
334
347
// If we want to untag more resources, modify this function appropriately
335
- func (tc * Controller ) untagNodeResources (node * v1. Node ) error {
348
+ func (tc * Controller ) untagNodeResources (node * taggingControllerNode ) error {
336
349
for _ , resource := range tc .resources {
337
350
switch resource {
338
351
case opt .Instance :
@@ -348,13 +361,13 @@ func (tc *Controller) untagNodeResources(node *v1.Node) error {
348
361
349
362
// untagEc2Instance deletes the controller's configured tags from the EC2 instance
350
363
// identified by the node's provider ID.
351
- func (tc * Controller ) untagEc2Instance (node * v1. Node ) error {
352
- instanceID , _ := awsv1 .KubernetesInstanceID (node .Spec . ProviderID ).MapToAWSInstanceID ()
364
+ func (tc * Controller ) untagEc2Instance (node * taggingControllerNode ) error {
365
+ instanceID , _ := awsv1 .KubernetesInstanceID (node .providerID ).MapToAWSInstanceID ()
353
366
354
367
err := tc .cloud .UntagResource (string (instanceID ), tc .tags )
355
368
356
369
if err != nil {
357
- klog .Errorf ("Error in untagging EC2 instance %s for node %s, error: %v" , instanceID , node .GetName () , err )
370
+ klog .Errorf ("Error in untagging EC2 instance %s for node %s, error: %v" , instanceID , node .name , err )
358
371
return err
359
372
}
360
373
@@ -365,12 +378,13 @@ func (tc *Controller) untagEc2Instance(node *v1.Node) error {
365
378
366
379
// enqueueNode builds a work item from the node and the given
367
380
// action, and enqueues that work item to the workqueue
368
- func (tc * Controller ) enqueueNode (node * v1.Node , action func (node * v1.Node ) error ) {
369
- item := & workItem {
370
- node : node ,
371
- action : action ,
372
- requeuingCount : 0 ,
373
- enqueueTime : time .Now (),
381
+ func (tc * Controller ) enqueueNode (node * v1.Node , action string ) {
382
+ // If all fields of the struct are comparable, the workqueue's Add makes sure that multiple adds of the same object
383
+ // result in only one item in the workqueue.
384
+ item := workItem {
385
+ name : node .GetName (),
386
+ providerID : node .Spec .ProviderID ,
387
+ action : action ,
374
388
}
375
389
376
390
if tc .rateLimitEnabled {
0 commit comments