Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: Ignore pods awaiting garbage collection during topology calculat… #642

Merged
merged 2 commits into from
Oct 27, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 0 additions & 4 deletions .github/workflows/release.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,6 @@ jobs:
- uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Save tag as an environment variable
run: |
tag=$(git describe --tags --exact-match)
echo "TAG=$tag" >> $GITHUB_ENV
- name: Create Github Release
uses: "marvinpinto/action-automatic-releases@latest"
with:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,21 @@ var _ = Describe("Topology", func() {
ExpectSkew(ctx, env.Client, "default", &topology[0]).To(ConsistOf(2))
})

It("should ignore pods if node does not exist", func() {
topology := []v1.TopologySpreadConstraint{{
TopologyKey: v1.LabelTopologyZone,
WhenUnsatisfiable: v1.DoNotSchedule,
LabelSelector: &metav1.LabelSelector{MatchLabels: labels},
MaxSkew: 1,
}}
podAwaitingGC := test.Pod(test.PodOptions{ObjectMeta: metav1.ObjectMeta{Labels: labels}, TopologySpreadConstraints: topology, NodeName: "does-not-exist"})
ExpectApplied(ctx, env.Client, provisioner, podAwaitingGC)
ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, prov,
test.UnschedulablePods(test.PodOptions{ObjectMeta: metav1.ObjectMeta{Labels: labels}, TopologySpreadConstraints: topology}, 4)...,
)
ExpectSkew(ctx, env.Client, "default", &topology[0]).To(ConsistOf(1, 1, 2))
})

Context("Zonal", func() {
It("should balance pods across zones (match labels)", func() {
topology := []v1.TopologySpreadConstraint{{
Expand Down
12 changes: 12 additions & 0 deletions pkg/controllers/provisioning/scheduling/topology.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ import (
"github.com/aws/karpenter-core/pkg/scheduling"
"github.com/aws/karpenter-core/pkg/utils/functional"

"k8s.io/apimachinery/pkg/api/errors"

"go.uber.org/multierr"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
Expand Down Expand Up @@ -229,6 +231,8 @@ func (t *Topology) updateInverseAntiAffinity(ctx context.Context, pod *v1.Pod, d

// countDomains initializes the topology group by registereding any well known domains and performing pod counts
// against the cluster for any existing pods.
//
//nolint:gocyclo
func (t *Topology) countDomains(ctx context.Context, tg *TopologyGroup) error {
podList := &v1.PodList{}

Expand All @@ -252,6 +256,14 @@ func (t *Topology) countDomains(ctx context.Context, tg *TopologyGroup) error {
}
node := &v1.Node{}
if err := t.kubeClient.Get(ctx, types.NamespacedName{Name: p.Spec.NodeName}, node); err != nil {
// Pods that cannot be evicted can be leaked in the API Server after
// a Node is removed. Since pod bindings are immutable, these pods
// cannot be recovered, and will be deleted by the pod lifecycle
// garbage collector. These pods are not running, and should not
// impact future topology calculations.
if errors.IsNotFound(err) {
continue
}
return fmt.Errorf("getting node %s, %w", p.Spec.NodeName, err)
}
domain, ok := node.Labels[tg.Key]
Expand Down