
Commit 3828380

leftwo (Alan Hanson) and Alan Hanson authored
More multiple region support. (#1484)
Even more updates to support Volume layer activities. Added a new test to CI that will run "test_up.sh encrypted" with two region sets (6 downstairs).

Crutest changes:
- Added some comments to crutest tests that were missing them.
- Crutest now determines the total number of downstairs it expects to be using, and compares against that value when checking for the expected number of downstairs in an active state.
- Updated the replay, replace, replace-before-active, and replace-while-reconcile tests to support more than one region set.
- Fixed a bug in the replace-before-active test where previous tests could leave things broken in a way the test could not repair.

Removed the no longer valid NEW columns from the DTrace `single_up_info.d` script.

test_up.sh changes:
- Added support for running all the tests with a user-supplied number of region sets instead of defaulting to one (three downstairs).
- Some other misc cleanup of test_up.sh.

Fixed a few other bugs in tests where we did not verify that the new state requested of dsc had been reached before continuing.

---------

Co-authored-by: Alan Hanson <[email protected]>
1 parent da58373 commit 3828380

File tree: 8 files changed, +237 −70 lines changed

New file: buildomat CI job "test-up-2region-encrypted" (+62)

@@ -0,0 +1,62 @@
+#!/bin/bash
+#:
+#: name = "test-up-2region-encrypted"
+#: variety = "basic"
+#: target = "helios-2.0"
+#: output_rules = [
+#: "%/tmp/test_up*/*.txt",
+#: "%/tmp/test_up*/dsc/*.txt",
+#: "%/tmp/debug/*",
+#: "/tmp/core.*",
+#: ]
+#: skip_clone = true
+#:
+#: [dependencies.build]
+#: job = "build"
+
+input="/input/build/work"
+
+set -o errexit
+set -o pipefail
+set -o xtrace
+
+banner cores
+pfexec coreadm -i /tmp/core.%f.%p \
+    -g /tmp/core.%f.%p \
+    -e global \
+    -e log \
+    -e proc-setid \
+    -e global-setid
+
+echo "input bins dir contains:"
+ls -ltr "$input"/bins || true
+
+banner unpack
+mkdir -p /var/tmp/bins
+for t in "$input/bins/"*.gz; do
+    b=$(basename "$t")
+    b=${b%.gz}
+    gunzip < "$t" > "/var/tmp/bins/$b"
+    chmod +x "/var/tmp/bins/$b"
+done
+
+export BINDIR=/var/tmp/bins
+
+# Give this test one hour to finish
+jobpid=$$; (sleep $(( 60 * 60 )); banner fail-timeout; ps -ef; zfs list;kill $jobpid) &
+
+echo "Setup debug logging"
+mkdir /tmp/debug
+psrinfo -v > /tmp/debug/psrinfo.txt
+df -h > /tmp/debug/df.txt || true
+prstat -d d -mLc 1 > /tmp/debug/prstat.txt 2>&1 &
+iostat -T d -xn 1 > /tmp/debug/iostat.txt 2>&1 &
+mpstat -T d 1 > /tmp/debug/mpstat.txt 2>&1 &
+vmstat -T d -p 1 < /dev/null > /tmp/debug/paging.txt 2>&1 &
+pfexec dtrace -Z -s $input/scripts/perf-downstairs-tick.d > /tmp/debug/dtrace.txt 2>&1 &
+pfexec dtrace -Z -s $input/scripts/upstairs_info.d > /tmp/debug/upstairs-info.txt 2>&1 &
+
+banner test_up_encrypted
+ptime -m bash "$input/scripts/test_up.sh" -r 2 encrypted
+
+echo "test-up-2region-encrypted ends"

.github/buildomat/jobs/test-up-encrypted.sh (+2 −2)

@@ -42,8 +42,8 @@ done
 
 export BINDIR=/var/tmp/bins
 
-# Give this test two hours to finish
-jobpid=$$; (sleep $(( 120 * 60 )); banner fail-timeout; ps -ef; zfs list;kill $jobpid) &
+# Give this test one hour to finish
+jobpid=$$; (sleep $(( 60 * 60 )); banner fail-timeout; ps -ef; zfs list;kill $jobpid) &
 
 echo "Setup debug logging"
 mkdir /tmp/debug

crutest/src/main.rs (+73 −23)

@@ -114,6 +114,10 @@ enum Workload {
         #[clap(flatten)]
         cfg: RandReadWriteWorkload,
     },
+    /// Run IO, and as soon as we get a final ACK, drop the volume to
+    /// see if we can leave IOs outstanding on one of the downstairs.
+    /// This test works best if one of the downstairs is running with
+    /// lossy option set, which will make it go slower than the others.
     Repair,
     /// Test the downstairs replay path.
     /// Stop a downstairs, then run some IO, then start that downstairs back
@@ -1006,6 +1010,9 @@ async fn main() -> Result<()> {
     let (volume, mut targets) =
         make_a_volume(&opt, volume_logger.clone(), &test_log, pr).await?;
 
+    let downstairs_in_volume = targets.len() - (targets.len() % 3);
+    info!(test_log, "Downstairs in volume = {downstairs_in_volume}");
+
     if let Workload::CliServer { listen, port } = opt.workload {
         cli::start_cli_server(
             &volume,
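
The `downstairs_in_volume` calculation above trims any trailing replacement target off the target list before that count is used as the expected number of active downstairs. A minimal sketch, not part of the commit, showing how the arithmetic behaves for one and two region sets (the helper name and the values are illustrative only):

// Sketch only: expected active downstairs from the number of volume
// targets, assuming three downstairs per region set plus an optional
// trailing replacement target (as used by the replace tests).
fn downstairs_in_volume(target_count: usize) -> usize {
    target_count - (target_count % 3)
}

fn main() {
    assert_eq!(downstairs_in_volume(3), 3); // one region set
    assert_eq!(downstairs_in_volume(6), 6); // two region sets
    assert_eq!(downstairs_in_volume(7), 6); // two region sets + a replacement target
    println!("all cases check out");
}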
@@ -1315,8 +1322,14 @@ async fn main() -> Result<()> {
                     bail!("Replay workload requires a dsc endpoint");
                 }
             };
-            replay_workload(&volume, &mut wtq, &mut region_info, dsc_client)
-                .await?;
+            replay_workload(
+                &volume,
+                &mut wtq,
+                &mut region_info,
+                dsc_client,
+                downstairs_in_volume as u32,
+            )
+            .await?;
         }
         Workload::Replace {
             fast_fill,
@@ -1367,6 +1380,18 @@ async fn main() -> Result<()> {
             // Add to the list of targets for our volume the replacement
             // target provided on the command line
             targets.push(replacement);
+
+            // Verify that the number of targets dsc has matches the number
+            // of targets we found.
+            let res = dsc_client.dsc_get_region_count().await.unwrap();
+            let region_count = res.into_inner();
+            if region_count != targets.len() as u32 {
+                bail!(
+                    "Downstairs targets:{} does not match dsc targets: {}",
+                    region_count,
+                    targets.len(),
+                );
+            }
             replace_before_active(
                 &volume,
                 wtq,
@@ -1401,6 +1426,18 @@ async fn main() -> Result<()> {
             // Add to the list of targets for our volume the replacement
             // target provided on the command line
             targets.push(replacement);
+
+            // Verify that the number of targets dsc has matches the number
+            // of targets we found.
+            let res = dsc_client.dsc_get_region_count().await.unwrap();
+            let region_count = res.into_inner();
+            if region_count != targets.len() as u32 {
+                bail!(
+                    "Downstairs targets:{} does not match dsc targets: {}",
+                    region_count,
+                    targets.len(),
+                );
+            }
             replace_while_reconcile(
                 &volume,
                 wtq,
@@ -1489,7 +1526,7 @@ async fn main() -> Result<()> {
             return Ok(());
         } else if opt.stable
             && wc.up_count + wc.ds_count == 0
-            && wc.active_count == 3
+            && wc.active_count == downstairs_in_volume
         {
             println!("CLIENT: All jobs finished, all DS active.");
             return Ok(());
@@ -2211,13 +2248,14 @@ async fn replay_workload(
     wtq: &mut WhenToQuit,
     ri: &mut RegionInfo,
     dsc_client: Client,
+    ds_count: u32,
 ) -> Result<()> {
     let mut rng = rand_chacha::ChaCha8Rng::from_entropy();
     let mut generic_wtq = WhenToQuit::Count { count: 300 };
 
     for c in 1.. {
         // Pick a DS at random
-        let stopped_ds = rng.gen_range(0..3);
+        let stopped_ds = rng.gen_range(0..ds_count);
         dsc_client.dsc_stop(stopped_ds).await.unwrap();
         loop {
             let res = dsc_client.dsc_get_ds_state(stopped_ds).await.unwrap();
@@ -2290,10 +2328,12 @@ async fn replace_while_reconcile(
     mut gen: u64,
     log: Logger,
 ) -> Result<()> {
-    assert!(targets.len() == 4);
+    assert!(targets.len() % 3 == 1);
 
+    // The total number of downstairs we have that are part of the Volume.
+    let ds_total = targets.len() - 1;
     let mut old_ds = 0;
-    let mut new_ds = 3;
+    let mut new_ds = targets.len() - 1;
     let mut c = 1;
     // How long we wait for reconcile to start before we replace
     let mut active_wait = 6;
@@ -2431,15 +2471,15 @@ async fn replace_while_reconcile(
                 wc.ds_count,
                 wc.active_count
             );
-            if wc.up_count + wc.ds_count == 0 && wc.active_count == 3 {
-                info!(log, "[{c}] Replay: All jobs finished, all DS active.");
+            if wc.up_count + wc.ds_count == 0 && wc.active_count == ds_total {
+                info!(log, "[{c}] All jobs finished, all DS active.");
                 break;
             }
             tokio::time::sleep(tokio::time::Duration::from_secs(4)).await;
         }
 
-        old_ds = (old_ds + 1) % 4;
-        new_ds = (new_ds + 1) % 4;
+        old_ds = (old_ds + 1) % (ds_total as u32 + 1);
+        new_ds = (new_ds + 1) % (ds_total + 1);
 
         c += 1;
         match wtq {
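
The rotation at the bottom of that loop walks `old_ds` and `new_ds` through all `ds_total + 1` target indices, so every downstairs (including the spare replacement target) eventually takes a turn being replaced. A minimal sketch of the index sequence, assuming two region sets (`ds_total` = 6, targets 0..=6); the values are illustrative and not from the commit:

// Sketch only: how the replace indices advance each pass when
// ds_total = 6 (two region sets) and index 6 is the spare target.
fn main() {
    let ds_total: usize = 6;
    let mut old_ds = 0;
    let mut new_ds = ds_total;
    for c in 1..=4 {
        println!("[{c}] replace target {old_ds} with target {new_ds}");
        old_ds = (old_ds + 1) % (ds_total + 1);
        new_ds = (new_ds + 1) % (ds_total + 1);
    }
    // [1] replace target 0 with target 6
    // [2] replace target 1 with target 0
    // [3] replace target 2 with target 1
    // [4] replace target 3 with target 2
}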
@@ -2485,11 +2525,18 @@ async fn replace_before_active(
     mut gen: u64,
     log: Logger,
 ) -> Result<()> {
-    assert!(targets.len() == 4);
+    assert!(targets.len() % 3 == 1);
 
     info!(log, "Begin replacement before activation test");
+    // We need to start from a known state and be sure that all three of the
+    // current downstairs are consistent with each other. To guarantee this
+    // we write to every block, then flush, then read. This way we know
+    // that the initial downstairs are all synced up on the same flush and
+    // generation numbers.
+    fill_workload(volume, ri, true).await?;
+    let ds_total = targets.len() - 1;
     let mut old_ds = 0;
-    let mut new_ds = 3;
+    let mut new_ds = targets.len() - 1;
     for c in 1.. {
         info!(log, "[{c}] Touch every extent");
         fill_sparse_workload(volume.as_ref(), ri).await?;
@@ -2586,15 +2633,15 @@ async fn replace_before_active(
                 wc.ds_count,
                 wc.active_count
            );
-            if wc.up_count + wc.ds_count == 0 && wc.active_count == 3 {
-                info!(log, "[{c}] Replay: All jobs finished, all DS active.");
+            if wc.up_count + wc.ds_count == 0 && wc.active_count == ds_total {
+                info!(log, "[{c}] All jobs finished, all DS active.");
                 break;
             }
             tokio::time::sleep(tokio::time::Duration::from_secs(4)).await;
         }
 
-        old_ds = (old_ds + 1) % 4;
-        new_ds = (new_ds + 1) % 4;
+        old_ds = (old_ds + 1) % (ds_total as u32 + 1);
+        new_ds = (new_ds + 1) % (ds_total + 1);
 
         match wtq {
             WhenToQuit::Count { count } => {
@@ -2637,7 +2684,10 @@ async fn replace_workload(
     targets: Vec<SocketAddr>,
     fill: bool,
 ) -> Result<()> {
-    assert!(targets.len() == 4);
+    assert!(targets.len() % 3 == 1);
+
+    // The total number of downstairs we have that are part of the Volume.
+    let ds_total = targets.len() - 1;
 
     if fill {
         fill_sparse_workload(volume.as_ref(), ri).await?;
@@ -2656,7 +2706,7 @@ async fn replace_workload(
     let volume_c = volume.clone();
     let handle = tokio::spawn(async move {
         let mut old_ds = 0;
-        let mut new_ds = 3;
+        let mut new_ds = ds_total;
         let mut c = 1;
         loop {
             println!(
@@ -2678,7 +2728,7 @@ async fn replace_workload(
             }
             // Wait for the replacement to be reflected in the downstairs status.
             let mut wc = volume_c.show_work().await?;
-            while wc.active_count == 3 {
+            while wc.active_count == ds_total {
                 // Wait for one of the DS to start repair
                 println!(
                     "[{c}] Waiting for replace to start: up:{} ds:{} act:{}",
@@ -2689,7 +2739,7 @@ async fn replace_workload(
             }
 
             // We have started live repair, now wait for it to finish.
-            while wc.active_count != 3 {
+            while wc.active_count != ds_total {
                 println!(
                     "[{c}] Waiting for replace to finish: up:{} ds:{} act:{}",
                     wc.up_count, wc.ds_count, wc.active_count
@@ -2714,8 +2764,8 @@ async fn replace_workload(
             }
 
             // No stopping yet, let's do another loop.
-            old_ds = (old_ds + 1) % 4;
-            new_ds = (new_ds + 1) % 4;
+            old_ds = (old_ds + 1) % (ds_total + 1);
+            new_ds = (new_ds + 1) % (ds_total + 1);
             c += 1;
         }
         println!("Replace tasks ends after {c} loops");
@@ -2771,7 +2821,7 @@ async fn replace_workload(
             "Replace test done: up:{} ds:{} act:{}",
             wc.up_count, wc.ds_count, wc.active_count
         );
-        if wc.up_count + wc.ds_count == 0 && wc.active_count == 3 {
+        if wc.up_count + wc.ds_count == 0 && wc.active_count == ds_total {
             println!("Replace: All jobs finished, all DS active.");
             break;
         }

tools/dtrace/single_up_info.d (+1 −1)

@@ -41,7 +41,7 @@ crucible_upstairs*:::up-status
  * I'm not very happy about this, but if we don't print it all on one
  * line, then multiple sessions will clobber each others output.
  */
-    printf("%8s %17s %17s %17s %5s %5s %9s %5s %10s %5s %5s %5s %5s %5s %5s %5s %5s %5s %5s %5s %5s %5s %5s %5s %5s %5s %5s\n",
+    printf("%8s %17s %17s %17s %5s %5s %9s %5s %10s %5s %5s %5s %5s %5s %5s %5s %5s %5s %5s %5s %5s %5s %5s %5s\n",
 
         substr(session_id, 0, 8),
 

tools/test_fail_live_repair.sh (+2 −2)

@@ -6,11 +6,11 @@
 # `pstop` or kill the downstairs process
 # wait for missing downstairs to be faulted
 # `pstart` or restart the downstairs
-# Let the repair start.
+# Let the live repair start.
 # `pstop` or kill the same downstairs again.
 # wait for missing downstairs to be faulted again
 # `pstart` or restart the downstairs again
-# Let the upstairs repair start over and finish.
+# Let the upstairs live repair start over and finish.
 # Stop crutest, then restart and verify the whole disk.
 
 err=0

tools/test_repair.sh (+1 −1)

@@ -1,6 +1,6 @@
 #!/bin/bash
 
-# A test to break, then Repair a downstairs region that is out of sync with
+# A test to break, then reconcile a downstairs region that is out of sync with
 # the other regions. We pick a downstairs at random and restart it with
 # the --lossy flag, meaning it will skip some IO requests (and have to
 # come back and do them later) and will introduce some delay in completing
