From ddad5aa9f54b4b3753ae755372106345242638e6 Mon Sep 17 00:00:00 2001 From: dougbtv Date: Tue, 21 Jan 2025 11:00:48 -0500 Subject: [PATCH] DHCP lease maintenance should terminate when interface no longer exists. Due to oberservations that threads can grow and the dhcp daemon uses an increasing amount of memory. This situation can happen organically when using say, bridge CNI, and the bridge has been removed outside of the bridge CNI lifecycle, and an interface no longer exists on a pod. Does so on a retry loop using the `backoffRetry()` method. Signed-off-by: dougbtv --- plugins/ipam/dhcp/lease.go | 59 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) diff --git a/plugins/ipam/dhcp/lease.go b/plugins/ipam/dhcp/lease.go index ffc379570..e24ea7cac 100644 --- a/plugins/ipam/dhcp/lease.go +++ b/plugins/ipam/dhcp/lease.go @@ -55,6 +55,17 @@ const ( leaseStateRebinding ) +// Timing for retrying link existence check +const ( + linkCheckDelay0 = 1 * time.Second + linkCheckRetryMax = 10 * time.Second + linkCheckTotalTimeout = 30 * time.Second +) + +const ( + netlinkLinkNotFoundErrorString = "Link not found" +) + // This implementation uses 1 OS thread per lease. This is because // all the network operations have to be done in network namespace // of the interface. This can be improved by switching to the proper @@ -65,6 +76,7 @@ type DHCPLease struct { clientID string latestLease *nclient4.Lease link netlink.Link + linkexists bool renewalTime time.Time rebindingTime time.Time expireTime time.Time @@ -292,6 +304,14 @@ func (l *DHCPLease) maintain() { for { var sleepDur time.Duration + linkCheckCtx, cancel := context.WithTimeoutCause(l.ctx, l.resendTimeout, errNoMoreTries) + defer cancel() + linkExists, _ := l.checkLinkExistsWithBackoff(linkCheckCtx) + if !linkExists { + log.Printf("%v: interface %s no longer exists or link check failed, terminating lease maintenance", l.clientID, l.link.Attrs().Name) + return + } + switch state { case leaseStateBound: sleepDur = time.Until(l.renewalTime) @@ -344,6 +364,45 @@ func (l *DHCPLease) maintain() { } } +func (l *DHCPLease) checkLinkExistsWithBackoff(ctx context.Context) (bool, error) { + checkFunc := func() (bool, error) { + _, err := netlink.LinkByName(l.link.Attrs().Name) + if err != nil { + // Capture if it contains the netlink link not found error string, otherwise, retry. + if err != nil && strings.Contains(err.Error(), netlinkLinkNotFoundErrorString) { + return false, nil + } + return false, err + } + return true, nil + } + + ctx, cancel := context.WithTimeout(ctx, linkCheckTotalTimeout) + defer cancel() + + exists, err := backoffRetryLinkExists(ctx, linkCheckRetryMax, checkFunc) + return exists, err +} + +func backoffRetryLinkExists(ctx context.Context, maxDelay time.Duration, f func() (bool, error)) (bool, error) { + var baseDelay time.Duration = linkCheckDelay0 + for { + exists, err := f() + if err == nil { + return exists, nil + } + + select { + case <-ctx.Done(): + return false, ctx.Err() // Context's done, return with its error + case <-time.After(baseDelay): + if baseDelay < maxDelay { + baseDelay *= 2 + } + } + } +} + func (l *DHCPLease) downIface() { if err := netlink.LinkSetDown(l.link); err != nil { log.Printf("%v: failed to bring %v interface DOWN: %v", l.clientID, l.link.Attrs().Name, err)