Skip to content

Commit ed94802

Browse files
Extend WaitForPodsReady API with RecoveryTimeout field
1 parent 88d83ff commit ed94802

File tree

9 files changed

+60
-1
lines changed

9 files changed

+60
-1
lines changed

apis/config/v1beta1/configuration_types.go

+10
Original file line numberDiff line numberDiff line change
@@ -211,6 +211,16 @@ type WaitForPodsReady struct {
211211
// +optional
212212
Timeout *metav1.Duration `json:"timeout,omitempty"`
213213

214+
// RecoveryTimeout defines an optional timeout, measured since the
215+
// last transition to the PodsReady=false condition after a Workload is Admitted and running.
216+
// Such a transition may happen when a Pod failed and the replacement Pod
217+
// is still waiting to be scheduled.
218+
// After exceeding the timeout the corresponding job gets suspended again
219+
// and requeued after the backoff delay. The timeout is enforced only if waitForPodsReady.enable=true.
220+
// Defaults to 3 mins.
221+
// +optional
222+
RecoveryTimeout *metav1.Duration `json:"recoveryTimeout,omitempty"`
223+
214224
// BlockAdmission when true, cluster queue will block admissions for all
215225
// subsequent jobs until the jobs reach the PodsReady=true condition.
216226
// This setting is only honored when `Enable` is set to true.

apis/config/v1beta1/defaults.go

+4
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ const (
4141
DefaultClientConnectionQPS float32 = 20.0
4242
DefaultClientConnectionBurst int32 = 30
4343
defaultPodsReadyTimeout = 5 * time.Minute
44+
defaultPodsRecoveryTimeout = 3 * time.Minute
4445
DefaultQueueVisibilityUpdateIntervalSeconds int32 = 5
4546
DefaultClusterQueuesMaxCount int32 = 10
4647
defaultJobFrameworkName = "batch/job"
@@ -119,6 +120,9 @@ func SetDefaults_Configuration(cfg *Configuration) {
119120
if cfg.WaitForPodsReady.Timeout == nil {
120121
cfg.WaitForPodsReady.Timeout = &metav1.Duration{Duration: defaultPodsReadyTimeout}
121122
}
123+
if cfg.WaitForPodsReady.RecoveryTimeout == nil {
124+
// Use the dedicated recovery default (3 min), not the 5 min PodsReady timeout.
cfg.WaitForPodsReady.RecoveryTimeout = &metav1.Duration{Duration: defaultPodsRecoveryTimeout}
125+
}
122126
if cfg.WaitForPodsReady.BlockAdmission == nil {
123127
defaultBlockAdmission := true
124128
if !cfg.WaitForPodsReady.Enable {

apis/config/v1beta1/zz_generated.deepcopy.go

+5
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pkg/config/config_test.go

+1
Original file line numberDiff line numberDiff line change
@@ -629,6 +629,7 @@ webhook:
629629
Enable: true,
630630
BlockAdmission: ptr.To(false),
631631
Timeout: &metav1.Duration{Duration: 50 * time.Second},
632+
RecoveryTimeout: &metav1.Duration{Duration: 3 * time.Minute},
632633
RequeuingStrategy: &configapi.RequeuingStrategy{
633634
Timestamp: ptr.To(configapi.CreationTimestamp),
634635
BackoffLimitCount: ptr.To[int32](10),

pkg/config/validation.go

+4
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,10 @@ func validateWaitForPodsReady(c *configapi.Configuration) field.ErrorList {
123123
allErrs = append(allErrs, field.Invalid(waitForPodsReadyPath.Child("timeout"),
124124
c.WaitForPodsReady.Timeout, apimachineryvalidation.IsNegativeErrorMsg))
125125
}
126+
if c.WaitForPodsReady.RecoveryTimeout != nil && c.WaitForPodsReady.RecoveryTimeout.Duration < 0 {
127+
allErrs = append(allErrs, field.Invalid(waitForPodsReadyPath.Child("recoveryTimeout"),
128+
c.WaitForPodsReady.RecoveryTimeout, apimachineryvalidation.IsNegativeErrorMsg))
129+
}
126130
if strategy := c.WaitForPodsReady.RequeuingStrategy; strategy != nil {
127131
if strategy.Timestamp != nil &&
128132
*strategy.Timestamp != configapi.CreationTimestamp && *strategy.Timestamp != configapi.EvictionTimestamp {

pkg/config/validation_test.go

+20
Original file line numberDiff line numberDiff line change
@@ -429,6 +429,23 @@ func TestValidate(t *testing.T) {
429429
},
430430
},
431431
},
432+
"negative waitForPodsReady.recoveryTimeout": {
433+
cfg: &configapi.Configuration{
434+
Integrations: defaultIntegrations,
435+
WaitForPodsReady: &configapi.WaitForPodsReady{
436+
Enable: true,
437+
RecoveryTimeout: &metav1.Duration{
438+
Duration: -1,
439+
},
440+
},
441+
},
442+
wantErr: field.ErrorList{
443+
&field.Error{
444+
Type: field.ErrorTypeInvalid,
445+
Field: "waitForPodsReady.recoveryTimeout",
446+
},
447+
},
448+
},
432449
"valid waitForPodsReady": {
433450
cfg: &configapi.Configuration{
434451
Integrations: defaultIntegrations,
@@ -437,6 +454,9 @@ func TestValidate(t *testing.T) {
437454
Timeout: &metav1.Duration{
438455
Duration: 50,
439456
},
457+
RecoveryTimeout: &metav1.Duration{
458+
Duration: 5,
459+
},
440460
BlockAdmission: ptr.To(false),
441461
RequeuingStrategy: &configapi.RequeuingStrategy{
442462
Timestamp: ptr.To(configapi.CreationTimestamp),

pkg/controller/core/core.go

+2-1
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,8 @@ func waitForPodsReady(cfg *configapi.WaitForPodsReady) *waitForPodsReadyConfig {
9292
return nil
9393
}
9494
result := waitForPodsReadyConfig{
95-
timeout: cfg.Timeout.Duration,
95+
timeout: cfg.Timeout.Duration,
96+
recoveryTimeout: cfg.RecoveryTimeout.Duration,
9697
}
9798
if cfg.RequeuingStrategy != nil {
9899
result.requeuingBackoffBaseSeconds = *cfg.RequeuingStrategy.BackoffBaseSeconds

pkg/controller/core/workload_controller.go

+1
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ var (
6262

6363
type waitForPodsReadyConfig struct {
6464
timeout time.Duration
65+
recoveryTimeout time.Duration
6566
requeuingBackoffLimitCount *int32
6667
requeuingBackoffBaseSeconds int32
6768
requeuingBackoffMaxDuration time.Duration

site/content/en/docs/reference/kueue-config.v1beta1.md

+13
Original file line numberDiff line numberDiff line change
@@ -902,6 +902,19 @@ evicted and requeued in the same cluster queue.
902902
Defaults to 5min.</p>
903903
</td>
904904
</tr>
905+
<tr><td><code>recoveryTimeout</code><br/>
906+
<a href="https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#duration-v1-meta"><code>k8s.io/apimachinery/pkg/apis/meta/v1.Duration</code></a>
907+
</td>
908+
<td>
909+
<p>RecoveryTimeout defines an optional timeout, measured since the
910+
last transition to the PodsReady=false condition after a Workload is Admitted and running.
911+
Such a transition may happen when a Pod failed and the replacement Pod
912+
is still waiting to be scheduled.
913+
After exceeding the timeout the corresponding job gets suspended again
914+
and requeued after the backoff delay. The timeout is enforced only if waitForPodsReady.enable=true.
915+
Defaults to 3 mins.</p>
916+
</td>
917+
</tr>
905918
<tr><td><code>blockAdmission</code> <B>[Required]</B><br/>
906919
<code>bool</code>
907920
</td>

0 commit comments

Comments
 (0)