Skip to content

Make live-repair part of invariants checks #1610

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Jan 21, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions tools/dtrace/upstairs_action.d
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ tick-1s
{
printf("%9s %9s %9s", "APPLY", "DOWN_S", "GUEST");
printf(" %9s %9s %9s", "DFR_BLK", "DFR_MSG", "LEAK_CHK");
printf(" %9s %9s %9s", "FLUSH_CHK", "STAT_CHK", "REPR_CHK");
printf(" %9s %9s", "FLUSH_CHK", "STAT_CHK");
printf(" %9s %9s", "CTRL_CHK", "NOOP");
printf("\n");
show = 0;
Expand All @@ -37,7 +37,6 @@ crucible_upstairs*:::up-status
printf(" %9s", json(copyinstr(arg1), "ok.up_counters.action_leak_check"));
printf(" %9s", json(copyinstr(arg1), "ok.up_counters.action_flush_check"));
printf(" %9s", json(copyinstr(arg1), "ok.up_counters.action_stat_check"));
printf(" %9s", json(copyinstr(arg1), "ok.up_counters.action_repair_check"));
printf(" %9s", json(copyinstr(arg1), "ok.up_counters.action_control_check"));
printf(" %9s", json(copyinstr(arg1), "ok.up_counters.action_noop"));
printf("\n");
Expand Down
12 changes: 7 additions & 5 deletions upstairs/src/client.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1280,14 +1280,16 @@ impl DownstairsClient {
}
}

/// Moves from `LiveRepairReady` to `LiveRepair`, a no-op otherwise
/// Moves from `LiveRepairReady` to `LiveRepair`
///
/// # Panics
/// If the state is not `Connecting { state: LiveRepairReady }`
pub(crate) fn start_live_repair(&mut self, up_state: &UpstairsState) {
let DsState::Connecting { state, .. } = self.state else {
return;
panic!("invalid state");
};
if state == NegotiationState::LiveRepairReady {
self.checked_state_transition(up_state, DsState::LiveRepair);
}
assert_eq!(state, NegotiationState::LiveRepairReady);
self.checked_state_transition(up_state, DsState::LiveRepair);
}

/// Continues the negotiation and initial reconciliation process
Expand Down
67 changes: 31 additions & 36 deletions upstairs/src/downstairs.rs
Original file line number Diff line number Diff line change
Expand Up @@ -989,58 +989,55 @@ impl Downstairs {

/// Tries to start live-repair
///
/// Returns true on success, false otherwise; the only time it returns
/// `false` is if there are no clients in `DsState::Active` to serve as
/// sources for live-repair.
///
/// # Panics
/// If `self.repair.is_some()` (because that means a repair is already in
/// progress), or if no clients are in `LiveRepairReady`
pub(crate) fn start_live_repair(
/// This function is idempotent; it returns without doing anything if
/// live-repair either can't be started or is already running.
pub(crate) fn check_live_repair_start(
&mut self,
up_state: &UpstairsState,
extent_count: u32,
) -> bool {
assert!(self.repair.is_none());

// Move the upstairs that were LiveRepairReady to LiveRepair
//
// After this point, we must call `abort_repair` if something goes wrong
// to abort the repair on the troublesome clients.
for c in self.clients.iter_mut() {
c.start_live_repair(up_state);
) {
// If we're already doing live-repair, then we can't start live-repair
if self.live_repair_in_progress() {
return;
}

// Begin setting up live-repair state
let mut repair_downstairs = vec![];
let mut source_downstairs = None;
for cid in ClientId::iter() {
match self.clients[cid].state() {
DsState::LiveRepair => {
DsState::Connecting {
state: NegotiationState::LiveRepairReady,
..
} => {
repair_downstairs.push(cid);
}
DsState::Active => {
source_downstairs = Some(cid);
}
state => {
warn!(
self.log,
"Unknown repair action for ds:{} in state {}",
cid,
state,
);
// TODO, what other states are okay?
_state => {
// ignore Downstairs in irrelevant states
}
}
}

// Can't start live-repair if no one is LiveRepairReady
if repair_downstairs.is_empty() {
return;
}

// Can't start live-repair if we don't have a source downstairs
let Some(source_downstairs) = source_downstairs else {
error!(self.log, "failed to find source downstairs for repair");
self.abort_repair(up_state);
return false;
return;
};

assert!(!repair_downstairs.is_empty());
// Move the upstairs that were LiveRepairReady to LiveRepair
//
// After this point, we must call `abort_repair` if something goes wrong
// to abort the repair on the troublesome clients.
for &cid in &repair_downstairs {
self.clients[cid].start_live_repair(up_state);
}

// Submit the initial repair jobs, which kicks everything off
self.begin_repair_for(
Expand All @@ -1054,15 +1051,12 @@ impl Downstairs {

info!(
self.log,
"starting repair {}",
"started repair {}",
self.repair.as_ref().unwrap().id
);

let repair = self.repair.as_ref().unwrap();
self.notify_live_repair_start(repair);

// We'll be back in on_live_repair once the initial job finishes
true
}

/// Checks whether live-repair can continue
Expand Down Expand Up @@ -7925,7 +7919,7 @@ pub(crate) mod test {
};
Some(LiveRepairData {
id: Uuid::new_v4(),
extent_count: extent_count,
extent_count,
repair_downstairs: vec![ClientId::new(1)],
source_downstairs: ClientId::new(0),
aborting_repair: false,
Expand Down Expand Up @@ -9707,7 +9701,8 @@ pub(crate) mod test {

// Start the repair normally. This enqueues the close & reopen jobs, and
// reserves Job IDs for the repair/noop
assert!(ds.start_live_repair(&UpstairsState::Active, 3));
ds.check_live_repair_start(&UpstairsState::Active, 3);
assert!(ds.live_repair_in_progress());

// Submit a write.
ds.submit_test_write_block(ExtentId(0), BlockOffset(1), false);
Expand Down
1 change: 0 additions & 1 deletion upstairs/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -332,7 +332,6 @@ mod cdt {
fn up__action_leak_check(_: u64) {}
fn up__action_flush_check(_: u64) {}
fn up__action_stat_check(_: u64) {}
fn up__action_repair_check(_: u64) {}
fn up__action_control_check(_: u64) {}
fn up__action_noop(_: u64) {}
fn volume__read__start(_: u32, _: Uuid) {}
Expand Down
2 changes: 1 addition & 1 deletion upstairs/src/live_repair.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1129,7 +1129,7 @@ pub mod repair_test {
DsState::Connecting { state, mode },
);
}
up.on_repair_check();
up.check_live_repair_start();
assert!(up.downstairs.live_repair_in_progress());
assert_eq!(up.downstairs.last_repair_extent(), Some(ExtentId(0)));

Expand Down
Loading
Loading