diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml
index 3c1ea52b6c..561d9def7c 100644
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -9,10 +9,6 @@ jobs:
       - uses: actions/checkout@v4
         with:
          fetch-depth: 0
-      - name: Save tag as an environment variable
-        run: |
-          tag=$(git describe --tags --exact-match)
-          echo "TAG=$tag" >> $GITHUB_ENV
      - name: Create Github Release
        uses: "marvinpinto/action-automatic-releases@latest"
        with:
diff --git a/pkg/controllers/provisioning/scheduling/provisioner_topology_test.go b/pkg/controllers/provisioning/scheduling/provisioner_topology_test.go
index 831815508e..4466ac732f 100644
--- a/pkg/controllers/provisioning/scheduling/provisioner_topology_test.go
+++ b/pkg/controllers/provisioning/scheduling/provisioner_topology_test.go
@@ -78,6 +78,21 @@ var _ = Describe("Topology", func() {
 		ExpectSkew(ctx, env.Client, "default", &topology[0]).To(ConsistOf(2))
 	})
 
+	It("should ignore pods if node does not exist", func() {
+		topology := []v1.TopologySpreadConstraint{{
+			TopologyKey:       v1.LabelTopologyZone,
+			WhenUnsatisfiable: v1.DoNotSchedule,
+			LabelSelector:     &metav1.LabelSelector{MatchLabels: labels},
+			MaxSkew:           1,
+		}}
+		podAwaitingGC := test.Pod(test.PodOptions{ObjectMeta: metav1.ObjectMeta{Labels: labels}, TopologySpreadConstraints: topology, NodeName: "does-not-exist"})
+		ExpectApplied(ctx, env.Client, provisioner, podAwaitingGC)
+		ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, prov,
+			test.UnschedulablePods(test.PodOptions{ObjectMeta: metav1.ObjectMeta{Labels: labels}, TopologySpreadConstraints: topology}, 4)...,
+		)
+		ExpectSkew(ctx, env.Client, "default", &topology[0]).To(ConsistOf(1, 1, 2))
+	})
+
 	Context("Zonal", func() {
 		It("should balance pods across zones (match labels)", func() {
 			topology := []v1.TopologySpreadConstraint{{
diff --git a/pkg/controllers/provisioning/scheduling/topology.go b/pkg/controllers/provisioning/scheduling/topology.go
index c178d37ba1..32d3cc73c2 100644
--- a/pkg/controllers/provisioning/scheduling/topology.go
+++ b/pkg/controllers/provisioning/scheduling/topology.go
@@ -23,6 +23,8 @@ import (
 	"github.com/aws/karpenter-core/pkg/scheduling"
 	"github.com/aws/karpenter-core/pkg/utils/functional"
 
+	"k8s.io/apimachinery/pkg/api/errors"
+
 	"go.uber.org/multierr"
 	v1 "k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -229,6 +231,8 @@ func (t *Topology) updateInverseAntiAffinity(ctx context.Context, pod *v1.Pod, d
 
 // countDomains initializes the topology group by registereding any well known domains and performing pod counts
 // against the cluster for any existing pods.
+//
+//nolint:gocyclo
 func (t *Topology) countDomains(ctx context.Context, tg *TopologyGroup) error {
 	podList := &v1.PodList{}
 
@@ -252,6 +256,14 @@ func (t *Topology) countDomains(ctx context.Context, tg *TopologyGroup) error {
 		}
 		node := &v1.Node{}
 		if err := t.kubeClient.Get(ctx, types.NamespacedName{Name: p.Spec.NodeName}, node); err != nil {
+			// Pods that cannot be evicted can be leaked in the API Server after
+			// a Node is removed. Since pod bindings are immutable, these pods
+			// cannot be recovered, and will be deleted by the pod lifecycle
+			// garbage collector. These pods are not running, and should not
+			// impact future topology calculations.
+			if errors.IsNotFound(err) {
+				continue
+			}
 			return fmt.Errorf("getting node %s, %w", p.Spec.NodeName, err)
 		}
 		domain, ok := node.Labels[tg.Key]