diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go
index f0825324e2..c9e51f1cc1 100644
--- a/runsc/container/container_test.go
+++ b/runsc/container/container_test.go
@@ -1106,6 +1106,14 @@ func testCheckpointRestore(t *testing.T, conf *config.Config, compression statef
 		t.Fatalf("error restoring container: %v", err)
 	}
 
+	if !cont2.Sandbox.Restored {
+		t.Fatalf("sandbox returned wrong value for Sandbox.Restored, got: false, want: true")
+	}
+
+	if cont2.Sandbox.Checkpointed {
+		t.Fatalf("sandbox returned wrong value for Sandbox.Checkpointed, got: true, want: false")
+	}
+
 	// Wait until application has ran.
 	if err := waitForFileNotEmpty(outputFile2); err != nil {
 		t.Fatalf("Failed to wait for output file: %v", err)
@@ -3968,3 +3976,69 @@ func TestSpecValidationIgnore(t *testing.T) {
 		t.Fatalf("spec validation was not ignored, got: %v, want: nil", err)
 	}
 }
+
+func TestCheckpointResume(t *testing.T) {
+	for name, conf := range configs(t, true /* noOverlay */) {
+		t.Run(name, func(t *testing.T) {
+			dir, err := os.MkdirTemp(testutil.TmpDir(), "checkpoint-test")
+			if err != nil {
+				t.Fatalf("os.MkdirTemp failed: %v", err)
+			}
+			defer os.RemoveAll(dir)
+			if err := os.Chmod(dir, 0777); err != nil {
+				t.Fatalf("error chmoding file: %q, %v", dir, err)
+			}
+
+			outputPath := filepath.Join(dir, "output")
+			outputFile, err := createWriteableOutputFile(outputPath)
+			if err != nil {
+				t.Fatalf("error creating output file: %v", err)
+			}
+			defer outputFile.Close()
+
+			script := fmt.Sprintf("i=0; while true; do echo $i >> %q; sleep 1; i=$((i+1)); done", outputPath)
+			spec := testutil.NewSpecWithArgs("bash", "-c", script)
+			_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
+			if err != nil {
+				t.Fatalf("error setting up container: %v", err)
+			}
+			defer cleanup()
+
+			// Create and start the container.
+			args := Args{
+				ID:        testutil.RandomContainerID(),
+				Spec:      spec,
+				BundleDir: bundleDir,
+			}
+			cont, err := New(conf, args)
+			if err != nil {
+				t.Fatalf("error creating container: %v", err)
+			}
+			if err := cont.Start(conf); err != nil {
+				t.Fatalf("error starting container: %v", err)
+			}
+
+			// Wait until application has ran.
+			if err := waitForFileNotEmpty(outputFile); err != nil {
+				t.Fatalf("Failed to wait for output file: %v", err)
+			}
+
+			sfOpts := statefile.Options{
+				Resume: true,
+			}
+			// Checkpoint running container; save state into new file.
+			if err := cont.Checkpoint(dir, false /* direct */, sfOpts, pgalloc.SaveOpts{}); err != nil {
+				t.Fatalf("error checkpointing container to empty file: %v", err)
+			}
+
+			if !cont.Sandbox.Checkpointed {
+				t.Fatalf("sandbox returned wrong value for Sandbox.Checkpointed, got: false, want: true")
+			}
+
+			if cont.Sandbox.Restored {
+				t.Fatalf("sandbox returned wrong value for Sandbox.Restored, got: true, want: false")
+			}
+			cont.Destroy()
+		})
+	}
+}
diff --git a/runsc/metricserver/metricserver.go b/runsc/metricserver/metricserver.go
index dd75fcb726..efd4db644e 100644
--- a/runsc/metricserver/metricserver.go
+++ b/runsc/metricserver/metricserver.go
@@ -531,9 +531,11 @@ func (m *metricServer) loadSandboxesLocked(ctx context.Context) []sandboxLoadRes
 // metric data (if querying metrics from the sandbox process succeeded).
 type sandboxMetricsResult struct {
 	sandboxLoadResult
-	isRunning bool
-	snapshot  *prometheus.Snapshot
-	err       error
+	isRunning      bool
+	isCheckpointed bool
+	isRestored     bool
+	snapshot       *prometheus.Snapshot
+	err            error
 }
 
 // queryMultiSandboxMetrics queries metric data from multiple loaded sandboxes.
@@ -570,6 +572,8 @@ func queryMultiSandboxMetrics(ctx context.Context, loadedSandboxes []sandboxLoad
 			defer wg.Done()
 			for s := range loadedSandboxCh {
 				isRunning := false
+				isCheckpointed := false
+				isRestored := false
 				var snapshot *prometheus.Snapshot
 				err := s.err
 				if err == nil {
@@ -577,10 +581,14 @@ func queryMultiSandboxMetrics(ctx context.Context, loadedSandboxes []sandboxLoad
 					snapshot, err = querySandboxMetrics(queryCtx, s.sandbox, s.verifier, metricsFilter)
 					queryCtxCancel()
 					isRunning = s.sandbox.IsRunning()
+					isCheckpointed = s.sandbox.Checkpointed
+					isRestored = s.sandbox.Restored
 				}
 				processSandbox(sandboxMetricsResult{
 					sandboxLoadResult: s,
 					isRunning:         isRunning,
+					isCheckpointed:    isCheckpointed,
+					isRestored:        isRestored,
 					snapshot:          snapshot,
 					err:               err,
 				})
@@ -660,6 +668,8 @@ func (m *metricServer) serveMetrics(w *httpResponseWriter, req *http.Request) ht
 	type metaMetrics struct {
 		numRunningSandboxes      int64
 		numCannotExportSandboxes int64
+		numCheckpointedSandboxes int64
+		numRestoredSandboxes     int64
 	}
 	meta := metaMetrics{}                   // Protected by metricsMu.
 	selfMetrics := prometheus.NewSnapshot() // Protected by metricsMu.
@@ -675,11 +685,23 @@ func (m *metricServer) serveMetrics(w *httpResponseWriter, req *http.Request) ht
 		defer metricsMu.Unlock()
 		selfMetrics.Add(prometheus.LabeledIntData(&SandboxPresenceMetric, nil, 1).SetExternalLabels(r.served.extraLabels))
 		sandboxRunning := int64(0)
+		sandboxCheckpointed := int64(0)
+		sandboxRestored := int64(0)
 		if r.isRunning {
 			sandboxRunning = 1
 			meta.numRunningSandboxes++
 		}
+		if r.isCheckpointed {
+			sandboxCheckpointed = 1
+			meta.numCheckpointedSandboxes++
+		}
+		if r.isRestored {
+			sandboxRestored = 1
+			meta.numRestoredSandboxes++
+		}
 		selfMetrics.Add(prometheus.LabeledIntData(&SandboxRunningMetric, nil, sandboxRunning).SetExternalLabels(r.served.extraLabels))
+		selfMetrics.Add(prometheus.LabeledIntData(&SandboxCheckpointedMetric, nil, sandboxCheckpointed).SetExternalLabels(r.served.extraLabels))
+		selfMetrics.Add(prometheus.LabeledIntData(&SandboxRestoredMetric, nil, sandboxRestored).SetExternalLabels(r.served.extraLabels))
 		if r.err == nil {
 			selfMetrics.Add(prometheus.LabeledIntData(&SandboxMetadataMetric, r.sandbox.MetricMetadata, 1).SetExternalLabels(r.served.extraLabels))
 			for _, cap := range r.served.capabilities {
@@ -732,6 +754,8 @@ func (m *metricServer) serveMetrics(w *httpResponseWriter, req *http.Request) ht
 	selfMetrics.Add(prometheus.NewIntData(&NumRunningSandboxesMetric, meta.numRunningSandboxes))
 	selfMetrics.Add(prometheus.NewIntData(&NumCannotExportSandboxesMetric, meta.numCannotExportSandboxes))
 	selfMetrics.Add(prometheus.NewIntData(&NumTotalSandboxesMetric, numSandboxesTotal))
+	selfMetrics.Add(prometheus.NewIntData(&NumCheckpointedSandboxesMetric, meta.numCheckpointedSandboxes))
+	selfMetrics.Add(prometheus.NewIntData(&NumRestoredSandboxesMetric, meta.numRestoredSandboxes))
 
 	// Write out all data.
 	lastMetricsWrittenSize := int(m.lastMetricsWrittenSize.Load())
diff --git a/runsc/metricserver/metricserver_metrics.go b/runsc/metricserver/metricserver_metrics.go
index a8d58c4eee..7dddaad026 100644
--- a/runsc/metricserver/metricserver_metrics.go
+++ b/runsc/metricserver/metricserver_metrics.go
@@ -30,6 +30,16 @@ var (
 		Type: prometheus.TypeGauge,
 		Help: "Boolean metric set to 1 for each running sandbox.",
 	}
+	SandboxCheckpointedMetric = prometheus.Metric{
+		Name: "sandbox_checkpointed",
+		Type: prometheus.TypeGauge,
+		Help: "Boolean metric set to 1 for each checkpointed sandbox.",
+	}
+	SandboxRestoredMetric = prometheus.Metric{
+		Name: "sandbox_restored",
+		Type: prometheus.TypeGauge,
+		Help: "Boolean metric set to 1 for each restored sandbox.",
+	}
 	SandboxMetadataMetric = prometheus.Metric{
 		Name: "sandbox_metadata",
 		Type: prometheus.TypeGauge,
@@ -66,6 +76,16 @@ var (
 		Type: prometheus.TypeCounter,
 		Help: "Counter of sandboxes that have ever been started.",
 	}
+	NumCheckpointedSandboxesMetric = prometheus.Metric{
+		Name: "num_sandboxes_checkpointed",
+		Type: prometheus.TypeCounter,
+		Help: "Counter of sandboxes that have been checkpointed.",
+	}
+	NumRestoredSandboxesMetric = prometheus.Metric{
+		Name: "num_sandboxes_restored",
+		Type: prometheus.TypeCounter,
+		Help: "Counter of sandboxes that have been restored.",
+	}
 )
 
 // Metrics is a list of metrics that the metric server generates.
@@ -79,5 +99,7 @@ var Metrics = []*prometheus.Metric{
 	&NumRunningSandboxesMetric,
 	&NumCannotExportSandboxesMetric,
 	&NumTotalSandboxesMetric,
+	&NumCheckpointedSandboxesMetric,
+	&NumRestoredSandboxesMetric,
 	&prometheus.ProcessStartTimeSeconds,
 }
diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go
index 05a58530b6..639a770064 100644
--- a/runsc/sandbox/sandbox.go
+++ b/runsc/sandbox/sandbox.go
@@ -219,6 +219,12 @@ type Sandbox struct {
 	// threads to wait on sandbox and get the exit code, since Linux will return
 	// WaitStatus to one of the waiters only.
 	status unix.WaitStatus `nojson:"true"`
+
+	// Checkpointed will be true when the sandbox has been checkpointed.
+	Checkpointed bool `json:"checkpointed"`
+
+	// Restored will be true when the sandbox has been restored.
+	Restored bool `json:"restored"`
 }
 
 // Getpid returns the process ID of the sandbox process.
@@ -537,7 +543,7 @@ func (s *Sandbox) Restore(conf *config.Config, cid string, imagePath string, dir
 	if err := conn.Call(boot.ContMgrRestore, &opt, nil); err != nil {
 		return fmt.Errorf("restoring container %q: %v", cid, err)
 	}
-
+	s.Restored = true
 	return nil
 }
 
@@ -1439,6 +1445,7 @@ func (s *Sandbox) Checkpoint(cid string, imagePath string, direct bool, sfOpts s
 	if err := s.call(boot.ContMgrCheckpoint, &opt, nil); err != nil {
 		return fmt.Errorf("checkpointing container %q: %w", cid, err)
 	}
+	s.Checkpointed = true
 	return nil
 }
 
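
Note (not part of the patch): a minimal sketch of how the newly exported Sandbox.Checkpointed and Sandbox.Restored fields could be consumed by tooling that has loaded a Sandbox. The sandboxState helper and the main function below are hypothetical and only illustrate the intended semantics under this change: Checkpointed is set after a successful Checkpoint call, Restored after a successful Restore call.

package main

import (
	"fmt"

	"gvisor.dev/gvisor/runsc/sandbox"
)

// sandboxState is a hypothetical helper that maps the two new boolean fields
// onto a human-readable state string.
func sandboxState(s *sandbox.Sandbox) string {
	switch {
	case s.Restored:
		// The sandbox was restored from a checkpoint image.
		return "restored"
	case s.Checkpointed:
		// The sandbox was checkpointed (and, with statefile resume, may still
		// be running).
		return "checkpointed"
	default:
		return "running"
	}
}

func main() {
	// Illustrative only: a real caller would load the sandbox state from disk
	// rather than constructing it directly.
	s := &sandbox.Sandbox{}
	s.Checkpointed = true
	fmt.Println(sandboxState(s)) // prints "checkpointed"
}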