Improved visibility on pod failing to schedule #927

Closed · wants to merge 7 commits
Changes from 5 commits
13 changes: 9 additions & 4 deletions pkg/cloudprovider/fake/cloudprovider.go
@@ -96,10 +96,14 @@ func (c *CloudProvider) Create(ctx context.Context, nodeClaim *v1beta1.NodeClaim
reqs := scheduling.NewNodeSelectorRequirements(nodeClaim.Spec.Requirements...)
np := &v1beta1.NodePool{ObjectMeta: metav1.ObjectMeta{Name: nodeClaim.Labels[v1beta1.NodePoolLabelKey]}}
instanceTypes := lo.Filter(lo.Must(c.GetInstanceTypes(ctx, np)), func(i *cloudprovider.InstanceType, _ int) bool {
return reqs.Compatible(i.Requirements, scheduling.AllowUndefinedWellKnownLabels) == nil &&
len(i.Offerings.Requirements(reqs).Available()) > 0 &&
resources.Fits(nodeClaim.Spec.Resources.Requests, i.Allocatable())
compatible := reqs.Compatible(i.Requirements, scheduling.AllowUndefinedWellKnownLabels) == nil
availableOfferings := len(i.Offerings.Requirements(reqs).Available()) > 0
fits, _ := resources.Fits(nodeClaim.Spec.Resources.Requests, i.Allocatable())

// Keep only instance types that are compatible, have an available offering, and fit the requested resources
return compatible && availableOfferings && fits
})

// Order instance types so that we get the cheapest instance types of the available offerings
sort.Slice(instanceTypes, func(i, j int) bool {
iOfferings := instanceTypes[i].Offerings.Available().Requirements(reqs)
@@ -189,7 +193,8 @@ func (c *CloudProvider) GetInstanceTypes(_ context.Context, np *v1beta1.NodePool
Name: "gpu-vendor-instance-type",
Resources: map[v1.ResourceName]resource.Quantity{
ResourceGPUVendorA: resource.MustParse("2"),
}}),
},
}),
NewInstanceType(InstanceTypeOptions{
Name: "gpu-vendor-b-instance-type",
Resources: map[v1.ResourceName]resource.Quantity{
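The first hunk in this file splits the compound filter predicate into named booleans, which also makes the discarded fit reason easy to surface while debugging the fake provider. A minimal sketch of that idea, assuming the two-value resources.Fits proposed in this diff (the handling of unfitReason is illustrative and not part of the change):

```go
instanceTypes := lo.Filter(lo.Must(c.GetInstanceTypes(ctx, np)), func(i *cloudprovider.InstanceType, _ int) bool {
	compatible := reqs.Compatible(i.Requirements, scheduling.AllowUndefinedWellKnownLabels) == nil
	availableOfferings := len(i.Offerings.Requirements(reqs).Available()) > 0
	fits, unfitReason := resources.Fits(nodeClaim.Spec.Resources.Requests, i.Allocatable())
	if !fits {
		// unfitReason names the resource whose request exceeds the allocatable amount;
		// it could be logged here when debugging test failures.
		_ = unfitReason
	}
	return compatible && availableOfferings && fits
})
```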
3 changes: 2 additions & 1 deletion pkg/controllers/provisioning/scheduling/existingnode.go
@@ -84,7 +84,8 @@ func (n *ExistingNode) Add(ctx context.Context, kubeClient client.Client, pod *v
// node, which at this point can't be increased in size
requests := resources.Merge(n.requests, resources.RequestsForPods(pod))

if !resources.Fits(requests, n.Available()) {
fits, _ := resources.Fits(requests, n.Available())
if !fits {
return fmt.Errorf("exceeds node resources")
}

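Here the reason returned by the new resources.Fits is discarded with a blank identifier. A small sketch, offered only as a suggestion in the spirit of this PR and not part of the diff, of how it could be folded into the error instead:

```go
// Suggestion: propagate the fit reason rather than dropping it.
fits, reason := resources.Fits(requests, n.Available())
if !fits {
	return fmt.Errorf("exceeds node resources: %s", reason)
}
```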
107 changes: 29 additions & 78 deletions pkg/controllers/provisioning/scheduling/nodeclaim.go
@@ -22,7 +22,6 @@ import (
"sync/atomic"

v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"

"sigs.k8s.io/karpenter/pkg/apis/v1beta1"
"sigs.k8s.io/karpenter/pkg/cloudprovider"
@@ -103,9 +102,7 @@ func (n *NodeClaim) Add(pod *v1.Pod) error {
requests := resources.Merge(n.Spec.Resources.Requests, resources.RequestsForPods(pod))
filtered := filterInstanceTypesByRequirements(n.InstanceTypeOptions, nodeClaimRequirements, requests)
if len(filtered.remaining) == 0 {
// log the total resources being requested (daemonset + the pod)
cumulativeResources := resources.Merge(n.daemonResources, resources.RequestsForPods(pod))
return fmt.Errorf("no instance type satisfied resources %s and requirements %s (%s)", resources.String(cumulativeResources), nodeClaimRequirements, filtered.FailureReason())
return fmt.Errorf(filtered.reason)
}

// Update node
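Note that fmt.Errorf(filtered.reason) passes a non-constant string as the format argument, so a reason that happens to contain a percent sign would be misrendered, and recent go vet releases flag this pattern. A safer equivalent for that single line, sketched only as a suggestion:

```go
// Either form avoids treating the reason as a format string
// (the first needs the standard library "errors" import):
return errors.New(filtered.reason)
// return fmt.Errorf("%s", filtered.reason)
```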
@@ -156,79 +153,17 @@ type filterResults struct {
fitsAndOffering bool

requests v1.ResourceList
}

// FailureReason returns a presentable string explaining why all instance types were filtered out
//
//nolint:gocyclo
func (r filterResults) FailureReason() string {
if len(r.remaining) > 0 {
return ""
}

// no instance type met any of the three criteria, meaning each criteria was enough to completely prevent
// this pod from scheduling
if !r.requirementsMet && !r.fits && !r.hasOffering {
return "no instance type met the scheduling requirements or had enough resources or had a required offering"
}

// check the other pairwise criteria
if !r.requirementsMet && !r.fits {
return "no instance type met the scheduling requirements or had enough resources"
}

if !r.requirementsMet && !r.hasOffering {
return "no instance type met the scheduling requirements or had a required offering"
}

if !r.fits && !r.hasOffering {
return "no instance type had enough resources or had a required offering"
}

// and then each individual criteria. These are sort of the same as above in that each one indicates that no
// instance type matched that criteria at all, so it was enough to exclude all instance types. I think it's
// helpful to have these separate, since we can report the multiple excluding criteria above.
if !r.requirementsMet {
return "no instance type met all requirements"
}

if !r.fits {
msg := "no instance type has enough resources"
// special case for a user typo I saw reported once
if r.requests.Cpu().Cmp(resource.MustParse("1M")) >= 0 {
msg += " (CPU request >= 1 Million, m vs M typo?)"
}
return msg
}

if !r.hasOffering {
return "no instance type has the required offering"
}

// see if any pair of criteria was enough to exclude all instances
if r.requirementsAndFits {
return "no instance type which met the scheduling requirements and had enough resources, had a required offering"
}
if r.fitsAndOffering {
return "no instance type which had enough resources and the required offering met the scheduling requirements"
}
if r.requirementsAndOffering {
return "no instance type which met the scheduling requirements and the required offering had the required resources"
}

// finally all instances were filtered out, but we had at least one instance that met each criteria, and met each
// pairwise set of criteria, so the only thing that remains is no instance which met all three criteria simultaneously
return "no instance type met the requirements/resources/offering tuple"
reason string
}

//nolint:gocyclo
func filterInstanceTypesByRequirements(instanceTypes []*cloudprovider.InstanceType, requirements scheduling.Requirements, requests v1.ResourceList) filterResults {
results := filterResults{
requests: requests,
requirementsMet: false,
fits: false,
hasOffering: false,

requests: requests,
requirementsMet: false,
fits: false,
hasOffering: false,
reason: "",
requirementsAndFits: false,
requirementsAndOffering: false,
fitsAndOffering: false,
@@ -237,8 +172,8 @@ func filterInstanceTypesByRequirements(instanceTypes []*cloudprovider.InstanceTy
// the tradeoff to not short circuiting on the filtering is that we can report much better error messages
// about why scheduling failed
itCompat := compatible(it, requirements)
itFits := fits(it, requests)
itHasOffering := hasOffering(it, requirements)
itFits, itUnfitReason := fits(it, requests)
itHasOffering, offeringReason := hasOffering(it, requirements)

// track if any single instance type met a single criteria
results.requirementsMet = results.requirementsMet || itCompat
@@ -254,6 +189,22 @@ func filterInstanceTypesByRequirements(instanceTypes []*cloudprovider.InstanceTy
// any errors.
if itCompat && itFits && itHasOffering {
results.remaining = append(results.remaining, it)
} else {
if !itCompat && !itFits && !itHasOffering {
results.reason = fmt.Sprintf("%s, %s, %s, %s", it.Name, itUnfitReason, offeringReason, it.Requirements.Intersects(requirements))
} else if !itFits && !itHasOffering {
results.reason = fmt.Sprintf("%s, %s, %s", it.Name, itUnfitReason, offeringReason)
} else if !itCompat && !itHasOffering {
results.reason = fmt.Sprintf("%s, %s, %s", it.Name, it.Requirements.Intersects(requirements), offeringReason)
} else if !itCompat && !itFits {
results.reason = fmt.Sprintf("%s, %s, %s", it.Name, it.Requirements.Intersects(requirements), itUnfitReason)
} else if !itCompat {
results.reason = fmt.Sprintf("%s, %s", it.Name, it.Requirements.Intersects(requirements))
} else if !itFits {
results.reason = fmt.Sprintf("%s, %s", it.Name, itUnfitReason)
} else if !itHasOffering {
results.reason = fmt.Sprintf("%s, %s", it.Name, offeringReason)
}
}
}
return results
@@ -263,16 +214,16 @@ func compatible(instanceType *cloudprovider.InstanceType, requirements schedulin
return instanceType.Requirements.Intersects(requirements) == nil
}

func fits(instanceType *cloudprovider.InstanceType, requests v1.ResourceList) bool {
func fits(instanceType *cloudprovider.InstanceType, requests v1.ResourceList) (bool, string) {
return resources.Fits(requests, instanceType.Allocatable())
}

func hasOffering(instanceType *cloudprovider.InstanceType, requirements scheduling.Requirements) bool {
func hasOffering(instanceType *cloudprovider.InstanceType, requirements scheduling.Requirements) (bool, string) {
for _, offering := range instanceType.Offerings.Available() {
if (!requirements.Has(v1.LabelTopologyZone) || requirements.Get(v1.LabelTopologyZone).Has(offering.Zone)) &&
(!requirements.Has(v1beta1.CapacityTypeLabelKey) || requirements.Get(v1beta1.CapacityTypeLabelKey).Has(offering.CapacityType)) {
return true
return true, ""
}
}
return false
return false, fmt.Sprintf("no offering found for requirements %s", requirements)
}
9 changes: 3 additions & 6 deletions pkg/controllers/provisioning/scheduling/scheduler.go
@@ -49,8 +49,8 @@ type SchedulerOptions struct {
func NewScheduler(ctx context.Context, kubeClient client.Client, nodeClaimTemplates []*NodeClaimTemplate,
nodePools []v1beta1.NodePool, cluster *state.Cluster, stateNodes []*state.StateNode, topology *Topology,
instanceTypes map[string][]*cloudprovider.InstanceType, daemonSetPods []*v1.Pod,
recorder events.Recorder, opts SchedulerOptions) *Scheduler {

recorder events.Recorder, opts SchedulerOptions,
) *Scheduler {
// if any of the nodePools add a taint with a prefer no schedule effect, we add a toleration for the taint
// during preference relaxation
toleratePreferNoSchedule := false
@@ -270,10 +270,7 @@ func (s *Scheduler) add(ctx context.Context, pod *v1.Pod) error {
}
nodeClaim := NewNodeClaim(nodeClaimTemplate, s.topology, s.daemonOverhead[nodeClaimTemplate], instanceTypes)
if err := nodeClaim.Add(pod); err != nil {
errs = multierr.Append(errs, fmt.Errorf("incompatible with nodepool %q, daemonset overhead=%s, %w",
nodeClaimTemplate.NodePoolName,
resources.String(s.daemonOverhead[nodeClaimTemplate]),
err))
errs = multierr.Append(errs, fmt.Errorf("incompatible with nodepool %q, %w", nodeClaimTemplate.NodePoolName, err))
continue
}
// we will launch this nodeClaim and need to track its maximum possible resource usage against our remaining resources
11 changes: 7 additions & 4 deletions pkg/utils/resources/resources.go
@@ -17,6 +17,8 @@ limitations under the License.
package resources

import (
"fmt"

v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"

@@ -159,19 +161,20 @@ func Cmp(lhs resource.Quantity, rhs resource.Quantity) int {
}

// Fits returns true if the candidate set of resources is less than or equal to the total set of resources, along with a reason string describing the first resource that does not fit when it returns false.
func Fits(candidate, total v1.ResourceList) bool {
func Fits(candidate, total v1.ResourceList) (bool, string) {
// If any of the total resource values are negative then the resource will never fit
for _, quantity := range total {
if Cmp(resource.MustParse("0"), quantity) > 0 {
return false
return false, ""
}
}
for resourceName, quantity := range candidate {
if Cmp(quantity, total[resourceName]) > 0 {
return false
totalQuantity := total[resourceName]
return false, fmt.Sprintf("required resource %s (%s) is greater than total resource %s (%s)",
resourceName, quantity.String(), resourceName.String(), totalQuantity.String())
}
}
return true
return true, ""
}

// String returns a string version of the resource list suitable for presenting in a log
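For completeness, a minimal self-contained sketch of how a caller would consume the two-value Fits, assuming the signature proposed in this diff:

```go
package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"

	"sigs.k8s.io/karpenter/pkg/utils/resources"
)

func main() {
	requested := v1.ResourceList{
		v1.ResourceCPU:    resource.MustParse("64"),
		v1.ResourceMemory: resource.MustParse("8Gi"),
	}
	allocatable := v1.ResourceList{
		v1.ResourceCPU:    resource.MustParse("32"),
		v1.ResourceMemory: resource.MustParse("128Gi"),
	}

	// With this change, Fits also reports why the candidate does not fit.
	if fits, reason := resources.Fits(requested, allocatable); !fits {
		fmt.Println(reason)
		// e.g. required resource cpu (64) is greater than total resource cpu (32)
	}
}
```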