Add checkpoint restore metrics.
- SandboxCheckpointedMetric and SandboxRestoredMetric are added as per-sandbox
metadata metrics that indicate whether a sandbox has been checkpointed or restored.
- NumCheckpointedSandboxesMetric and NumRestoredSandboxesMetric are added at
the process level to count the number of sandboxes that have been checkpointed
and restored (sketched below).

PiperOrigin-RevId: 721408284
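
To make the relationship between the per-sandbox gauges and the process-level counters concrete, here is a minimal standalone Go sketch. The sandboxState type, sandbox IDs, and output formatting are hypothetical illustrations and not the metric server's actual code or label set; only the metric names are taken from this commit.

package main

import "fmt"

// sandboxState is a hypothetical stand-in for the Checkpointed and Restored
// fields this commit adds to sandbox.Sandbox.
type sandboxState struct {
	ID           string
	Checkpointed bool
	Restored     bool
}

// boolToGauge turns a boolean flag into a 0/1 gauge sample, mirroring how
// sandbox_checkpointed and sandbox_restored are exported once per sandbox.
func boolToGauge(b bool) int64 {
	if b {
		return 1
	}
	return 0
}

func main() {
	sandboxes := []sandboxState{
		{ID: "sb-a", Checkpointed: true},
		{ID: "sb-b", Restored: true},
		{ID: "sb-c"},
	}
	var numCheckpointed, numRestored int64
	for _, s := range sandboxes {
		// Per-sandbox boolean gauges, one 0/1 sample per sandbox.
		fmt.Printf("sandbox_checkpointed{sandbox=%q} %d\n", s.ID, boolToGauge(s.Checkpointed))
		fmt.Printf("sandbox_restored{sandbox=%q} %d\n", s.ID, boolToGauge(s.Restored))
		// Process-level counters, aggregated across all sandboxes.
		if s.Checkpointed {
			numCheckpointed++
		}
		if s.Restored {
			numRestored++
		}
	}
	fmt.Printf("num_sandboxes_checkpointed %d\n", numCheckpointed)
	fmt.Printf("num_sandboxes_restored %d\n", numRestored)
}

Running the sketch prints a 0/1 sandbox_checkpointed and sandbox_restored sample for each sandbox, followed by the aggregated totals (1 and 1 for this example set); the real server attaches its own export labels to each sample.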
nybidari authored and gvisor-bot committed Feb 10, 2025
1 parent faac8f3 commit b9c9417
Showing 4 changed files with 131 additions and 4 deletions.
74 changes: 74 additions & 0 deletions runsc/container/container_test.go
@@ -1106,6 +1106,14 @@ func testCheckpointRestore(t *testing.T, conf *config.Config, compression statef
t.Fatalf("error restoring container: %v", err)
}

if !cont2.Sandbox.Restored {
t.Fatalf("sandbox returned wrong value for Sandbox.Restored, got: false, want: true")
}

if cont2.Sandbox.Checkpointed {
t.Fatalf("sandbox returned wrong value for Sandbox.Checkpointed, got: true, want: false")
}

// Wait until application has run.
if err := waitForFileNotEmpty(outputFile2); err != nil {
t.Fatalf("Failed to wait for output file: %v", err)
@@ -3968,3 +3976,69 @@ func TestSpecValidationIgnore(t *testing.T) {
t.Fatalf("spec validation was not ignored, got: %v, want: nil", err)
}
}

func TestCheckpointResume(t *testing.T) {
for name, conf := range configs(t, true /* noOverlay */) {
t.Run(name, func(t *testing.T) {
dir, err := os.MkdirTemp(testutil.TmpDir(), "checkpoint-test")
if err != nil {
t.Fatalf("os.MkdirTemp failed: %v", err)
}
defer os.RemoveAll(dir)
if err := os.Chmod(dir, 0777); err != nil {
t.Fatalf("error chmoding file: %q, %v", dir, err)
}

outputPath := filepath.Join(dir, "output")
outputFile, err := createWriteableOutputFile(outputPath)
if err != nil {
t.Fatalf("error creating output file: %v", err)
}
defer outputFile.Close()

script := fmt.Sprintf("i=0; while true; do echo $i >> %q; sleep 1; i=$((i+1)); done", outputPath)
spec := testutil.NewSpecWithArgs("bash", "-c", script)
_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
if err != nil {
t.Fatalf("error setting up container: %v", err)
}
defer cleanup()

// Create and start the container.
args := Args{
ID: testutil.RandomContainerID(),
Spec: spec,
BundleDir: bundleDir,
}
cont, err := New(conf, args)
if err != nil {
t.Fatalf("error creating container: %v", err)
}
if err := cont.Start(conf); err != nil {
t.Fatalf("error starting container: %v", err)
}

// Wait until application has run.
if err := waitForFileNotEmpty(outputFile); err != nil {
t.Fatalf("Failed to wait for output file: %v", err)
}

sfOpts := statefile.Options{
Resume: true,
}
// Checkpoint running container; save state into new file.
if err := cont.Checkpoint(dir, false /* direct */, sfOpts, pgalloc.SaveOpts{}); err != nil {
t.Fatalf("error checkpointing container to empty file: %v", err)
}

if !cont.Sandbox.Checkpointed {
t.Fatalf("sandbox returned wrong value for Sandbox.Checkpointed, got: false, want: true")
}

if cont.Sandbox.Restored {
t.Fatalf("sandbox returned wrong value for Sandbox.Restored, got: true, want: false")
}
cont.Destroy()
})
}
}
30 changes: 27 additions & 3 deletions runsc/metricserver/metricserver.go
@@ -531,9 +531,11 @@ func (m *metricServer) loadSandboxesLocked(ctx context.Context) []sandboxLoadRes
// metric data (if querying metrics from the sandbox process succeeded).
type sandboxMetricsResult struct {
sandboxLoadResult
isRunning bool
snapshot *prometheus.Snapshot
err error
isRunning bool
isCheckpointed bool
isRestored bool
snapshot *prometheus.Snapshot
err error
}

// queryMultiSandboxMetrics queries metric data from multiple loaded sandboxes.
@@ -570,17 +572,23 @@ func queryMultiSandboxMetrics(ctx context.Context, loadedSandboxes []sandboxLoad
defer wg.Done()
for s := range loadedSandboxCh {
isRunning := false
isCheckpointed := false
isRestored := false
var snapshot *prometheus.Snapshot
err := s.err
if err == nil {
queryCtx, queryCtxCancel := context.WithTimeout(ctx, perSandboxTime)
snapshot, err = querySandboxMetrics(queryCtx, s.sandbox, s.verifier, metricsFilter)
queryCtxCancel()
isRunning = s.sandbox.IsRunning()
isCheckpointed = s.sandbox.Checkpointed
isRestored = s.sandbox.Restored
}
processSandbox(sandboxMetricsResult{
sandboxLoadResult: s,
isRunning: isRunning,
isCheckpointed: isCheckpointed,
isRestored: isRestored,
snapshot: snapshot,
err: err,
})
@@ -660,6 +668,8 @@ func (m *metricServer) serveMetrics(w *httpResponseWriter, req *http.Request) ht
type metaMetrics struct {
numRunningSandboxes int64
numCannotExportSandboxes int64
numCheckpointedSandboxes int64
numRestoredSandboxes int64
}
meta := metaMetrics{} // Protected by metricsMu.
selfMetrics := prometheus.NewSnapshot() // Protected by metricsMu.
@@ -675,11 +685,23 @@ func (m *metricServer) serveMetrics(w *httpResponseWriter, req *http.Request) ht
defer metricsMu.Unlock()
selfMetrics.Add(prometheus.LabeledIntData(&SandboxPresenceMetric, nil, 1).SetExternalLabels(r.served.extraLabels))
sandboxRunning := int64(0)
sandboxCheckpointed := int64(0)
sandboxRestored := int64(0)
if r.isRunning {
sandboxRunning = 1
meta.numRunningSandboxes++
}
if r.isCheckpointed {
sandboxCheckpointed = 1
meta.numCheckpointedSandboxes++
}
if r.isRestored {
sandboxRestored = 1
meta.numRestoredSandboxes++
}
selfMetrics.Add(prometheus.LabeledIntData(&SandboxRunningMetric, nil, sandboxRunning).SetExternalLabels(r.served.extraLabels))
selfMetrics.Add(prometheus.LabeledIntData(&SandboxCheckpointedMetric, nil, sandboxCheckpointed).SetExternalLabels(r.served.extraLabels))
selfMetrics.Add(prometheus.LabeledIntData(&SandboxRestoredMetric, nil, sandboxRestored).SetExternalLabels(r.served.extraLabels))
if r.err == nil {
selfMetrics.Add(prometheus.LabeledIntData(&SandboxMetadataMetric, r.sandbox.MetricMetadata, 1).SetExternalLabels(r.served.extraLabels))
for _, cap := range r.served.capabilities {
@@ -732,6 +754,8 @@ func (m *metricServer) serveMetrics(w *httpResponseWriter, req *http.Request) ht
selfMetrics.Add(prometheus.NewIntData(&NumRunningSandboxesMetric, meta.numRunningSandboxes))
selfMetrics.Add(prometheus.NewIntData(&NumCannotExportSandboxesMetric, meta.numCannotExportSandboxes))
selfMetrics.Add(prometheus.NewIntData(&NumTotalSandboxesMetric, numSandboxesTotal))
selfMetrics.Add(prometheus.NewIntData(&NumCheckpointedSandboxesMetric, meta.numCheckpointedSandboxes))
selfMetrics.Add(prometheus.NewIntData(&NumRestoredSandboxesMetric, meta.numRestoredSandboxes))

// Write out all data.
lastMetricsWrittenSize := int(m.lastMetricsWrittenSize.Load())
22 changes: 22 additions & 0 deletions runsc/metricserver/metricserver_metrics.go
@@ -30,6 +30,16 @@ var (
Type: prometheus.TypeGauge,
Help: "Boolean metric set to 1 for each running sandbox.",
}
SandboxCheckpointedMetric = prometheus.Metric{
Name: "sandbox_checkpointed",
Type: prometheus.TypeGauge,
Help: "Boolean metric set to 1 for each checkpointed sandbox.",
}
SandboxRestoredMetric = prometheus.Metric{
Name: "sandbox_restored",
Type: prometheus.TypeGauge,
Help: "Boolean metric set to 1 for each restored sandbox.",
}
SandboxMetadataMetric = prometheus.Metric{
Name: "sandbox_metadata",
Type: prometheus.TypeGauge,
@@ -66,6 +76,16 @@ var (
Type: prometheus.TypeCounter,
Help: "Counter of sandboxes that have ever been started.",
}
NumCheckpointedSandboxesMetric = prometheus.Metric{
Name: "num_sandboxes_checkpointed",
Type: prometheus.TypeCounter,
Help: "Counter of sandboxes that have been checkpointed.",
}
NumRestoredSandboxesMetric = prometheus.Metric{
Name: "num_sandboxes_restored",
Type: prometheus.TypeCounter,
Help: "Counter of sandboxes that have been restored.",
}
)

// Metrics is a list of metrics that the metric server generates.
@@ -79,5 +99,7 @@ var Metrics = []*prometheus.Metric{
&NumRunningSandboxesMetric,
&NumCannotExportSandboxesMetric,
&NumTotalSandboxesMetric,
&NumCheckpointedSandboxesMetric,
&NumRestoredSandboxesMetric,
&prometheus.ProcessStartTimeSeconds,
}
9 changes: 8 additions & 1 deletion runsc/sandbox/sandbox.go
@@ -219,6 +219,12 @@ type Sandbox struct {
// threads to wait on sandbox and get the exit code, since Linux will return
// WaitStatus to one of the waiters only.
status unix.WaitStatus `nojson:"true"`

// Checkpointed will be true when the sandbox has been checkpointed.
Checkpointed bool `json:"checkpointed"`

// Restored will be true when the sandbox has been restored.
Restored bool `json:"restored"`
}

// Getpid returns the process ID of the sandbox process.
@@ -537,7 +543,7 @@ func (s *Sandbox) Restore(conf *config.Config, cid string, imagePath string, dir
if err := conn.Call(boot.ContMgrRestore, &opt, nil); err != nil {
return fmt.Errorf("restoring container %q: %v", cid, err)
}

s.Restored = true
return nil
}

@@ -1439,6 +1445,7 @@ func (s *Sandbox) Checkpoint(cid string, imagePath string, direct bool, sfOpts s
if err := s.call(boot.ContMgrCheckpoint, &opt, nil); err != nil {
return fmt.Errorf("checkpointing container %q: %w", cid, err)
}
s.Checkpointed = true
return nil
}

