Skip to content

Commit

Permalink
Merge pull request #6049 from Martchus/fix-scheduler
Browse files Browse the repository at this point in the history
Avoid scheduling jobs if not all parallel jobs are ready
  • Loading branch information
mergify[bot] authored Nov 8, 2024
2 parents 8614bef + 6045c27 commit 2c4a234
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 23 deletions.
43 changes: 20 additions & 23 deletions lib/OpenQA/Scheduler/Model/Jobs.pm
Original file line number Diff line number Diff line change
Expand Up @@ -95,8 +95,11 @@ sub _allocate_jobs ($self, $free_workers) {
next if $checked_jobs{$j->{id}};
next unless @{$j->{matching_workers}};
my $tobescheduled = _to_be_scheduled($j, $scheduled_jobs);
if (!defined $tobescheduled) {
log_debug "Skipping job $j->{id} because dependent jobs are not ready";
next;
}
next if defined $allocated_jobs->{$j->{id}};
next unless $tobescheduled;
OpenQA::Scheduler::WorkerSlotPicker->new($tobescheduled)->pick_slots_with_common_worker_host;
my @tobescheduled = grep { $_->{id} } @$tobescheduled;
my $parallel_count = scalar(@tobescheduled);
Expand Down Expand Up @@ -388,30 +391,24 @@ sub _pick_siblings_of_running ($self, $allocated_jobs, $allocated_workers) {
}
}

sub _to_be_scheduled_recurse ($j, $scheduled, $taken) {
return undef unless $j;
return undef unless $j->{id};
return undef if $taken->{$j->{id}};
# if we were called with undef, this is a sign that
# the cluster is not fully scheduled (e.g. blocked_by), so
# take that as mark but return
$taken->{$j->{id}} = $j;

my $ci = $j->{cluster_jobs}->{$j->{id}};
return undef unless $ci;
for my $s (@{$ci->{parallel_children}}) {
_to_be_scheduled_recurse($scheduled->{$s}, $scheduled, $taken);
}
for my $s (@{$ci->{parallel_parents}}) {
_to_be_scheduled_recurse($scheduled->{$s}, $scheduled, $taken);
}
sub _to_be_scheduled_recurse ($job_info, $scheduled, $visited) {
# return falsy value if the cluster is not fully scheduled (e.g. blocked_by is set or there are Gru dependencies)
return 0 unless defined($job_info) && defined(my $job_id = $job_info->{id});

# keep track of visited jobs; return early if already visited
return 1 if $visited->{$job_id};
$visited->{$job_id} = $job_info;

# traverse further through the dependency tree
return 0 unless defined(my $ci = $job_info->{cluster_jobs}->{$job_id});
_to_be_scheduled_recurse($scheduled->{$_}, $scheduled, $visited) or return 0 for @{$ci->{parallel_children}};
_to_be_scheduled_recurse($scheduled->{$_}, $scheduled, $visited) or return 0 for @{$ci->{parallel_parents}};
return 1;
}

sub _to_be_scheduled ($j, $scheduled) {
my %taken;
_to_be_scheduled_recurse($j, $scheduled, \%taken);
return undef if defined $taken{undef};
return [values %taken];
sub _to_be_scheduled ($job_info, $scheduled) {
my %visited;
_to_be_scheduled_recurse($job_info, $scheduled, \%visited) ? [values %visited] : undef;
}

sub _update_scheduled_jobs ($self) {
Expand Down
14 changes: 14 additions & 0 deletions t/05-scheduler-dependencies.t
Original file line number Diff line number Diff line change
Expand Up @@ -1278,4 +1278,18 @@ subtest 'starvation of parallel jobs prevented' => sub {
ok $_ >= 1 && $_ <= 3, "worker held for expected job ($_)" for values %$allocated_workers;
};

subtest 'partially blocked clusters are not scheduled' => sub {
# assume parallel child with ID 2 is blocked by removing it from the set of mocked jobs
delete $mocked_jobs{2};

my ($allocated_workers, $allocated_jobs);
my $free_workers = OpenQA::Scheduler::Model::Jobs::determine_free_workers();
combined_like {
($allocated_workers, $allocated_jobs) = OpenQA::Scheduler::Model::Jobs->singleton->_allocate_jobs($free_workers)
}
qr/Skipping job .* because dependent jobs are not ready/, 'skipping job if dependent jobs not ready';
is_deeply $allocated_jobs, {}, 'no jobs allocated' or diag explain $allocated_jobs;
is_deeply $allocated_workers, {}, 'no workers allocated' or diag explain $allocated_workers;
};

done_testing();

0 comments on commit 2c4a234

Please sign in to comment.