Skip to content

Commit

Permalink
DHCP lease maintenance should terminate when interface no longer exists.
Browse files Browse the repository at this point in the history
Due to oberservations that threads can grow and the dhcp daemon uses an increasing amount of memory.

This situation can happen organically when using say, bridge CNI, and the bridge has been removed outside of the bridge CNI lifecycle, and an interface no longer exists on a pod.

Does so on a retry loop using the `backoffRetry()` method.

Signed-off-by: dougbtv <[email protected]>
  • Loading branch information
dougbtv committed Jan 22, 2025
1 parent e4ca66b commit ddad5aa
Showing 1 changed file with 59 additions and 0 deletions.
59 changes: 59 additions & 0 deletions plugins/ipam/dhcp/lease.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,17 @@ const (
leaseStateRebinding
)

// Timing for retrying link existence check
const (
linkCheckDelay0 = 1 * time.Second
linkCheckRetryMax = 10 * time.Second
linkCheckTotalTimeout = 30 * time.Second
)

const (
netlinkLinkNotFoundErrorString = "Link not found"
)

// This implementation uses 1 OS thread per lease. This is because
// all the network operations have to be done in network namespace
// of the interface. This can be improved by switching to the proper
Expand All @@ -65,6 +76,7 @@ type DHCPLease struct {
clientID string
latestLease *nclient4.Lease
link netlink.Link
linkexists bool

Check failure on line 79 in plugins/ipam/dhcp/lease.go

View workflow job for this annotation

GitHub Actions / Lint

field `linkexists` is unused (unused)
renewalTime time.Time
rebindingTime time.Time
expireTime time.Time
Expand Down Expand Up @@ -292,6 +304,14 @@ func (l *DHCPLease) maintain() {
for {
var sleepDur time.Duration

linkCheckCtx, cancel := context.WithTimeoutCause(l.ctx, l.resendTimeout, errNoMoreTries)
defer cancel()
linkExists, _ := l.checkLinkExistsWithBackoff(linkCheckCtx)
if !linkExists {
log.Printf("%v: interface %s no longer exists or link check failed, terminating lease maintenance", l.clientID, l.link.Attrs().Name)
return
}

switch state {
case leaseStateBound:
sleepDur = time.Until(l.renewalTime)
Expand Down Expand Up @@ -344,6 +364,45 @@ func (l *DHCPLease) maintain() {
}
}

func (l *DHCPLease) checkLinkExistsWithBackoff(ctx context.Context) (bool, error) {
checkFunc := func() (bool, error) {
_, err := netlink.LinkByName(l.link.Attrs().Name)
if err != nil {
// Capture if it contains the netlink link not found error string, otherwise, retry.
if err != nil && strings.Contains(err.Error(), netlinkLinkNotFoundErrorString) {
return false, nil
}
return false, err
}
return true, nil
}

ctx, cancel := context.WithTimeout(ctx, linkCheckTotalTimeout)
defer cancel()

exists, err := backoffRetryLinkExists(ctx, linkCheckRetryMax, checkFunc)
return exists, err
}

func backoffRetryLinkExists(ctx context.Context, maxDelay time.Duration, f func() (bool, error)) (bool, error) {
var baseDelay time.Duration = linkCheckDelay0

Check failure on line 388 in plugins/ipam/dhcp/lease.go

View workflow job for this annotation

GitHub Actions / Lint

var-declaration: should omit type time.Duration from declaration of var baseDelay; it will be inferred from the right-hand side (revive)
for {
exists, err := f()
if err == nil {
return exists, nil
}

select {
case <-ctx.Done():
return false, ctx.Err() // Context's done, return with its error
case <-time.After(baseDelay):
if baseDelay < maxDelay {
baseDelay *= 2
}
}
}
}

func (l *DHCPLease) downIface() {
if err := netlink.LinkSetDown(l.link); err != nil {
log.Printf("%v: failed to bring %v interface DOWN: %v", l.clientID, l.link.Attrs().Name, err)
Expand Down

0 comments on commit ddad5aa

Please sign in to comment.