Skip to content

Commit 9b26edb

Browse files
nybidari authored and gvisor-bot committed
Add checkpoint restore metrics.
- SandboxCheckpointedMetric and SandboxRestoredMetric are added as sandbox metadata metrics, which indicate whether a sandbox has been checkpointed or restored.
- NumCheckpointedSandboxesMetric and NumRestoredSandboxesMetric are added at the process level; they report the number of sandboxes that have been checkpointed and restored.

PiperOrigin-RevId: 721408284
1 parent d949e71 commit 9b26edb

File tree

4 files changed

+131
-4
lines changed

4 files changed

+131
-4
lines changed

runsc/container/container_test.go

+74
Original file line numberDiff line numberDiff line change
@@ -1106,6 +1106,14 @@ func testCheckpointRestore(t *testing.T, conf *config.Config, compression statef
11061106
t.Fatalf("error restoring container: %v", err)
11071107
}
11081108

1109+
if !cont2.Sandbox.Restored {
1110+
t.Fatalf("sandbox returned wrong value for Sandbox.Restored, got: false, want: true")
1111+
}
1112+
1113+
if cont2.Sandbox.Checkpointed {
1114+
t.Fatalf("sandbox returned wrong value for Sandbox.Checkpointed, got: true, want: false")
1115+
}
1116+
11091117
// Wait until application has ran.
11101118
if err := waitForFileNotEmpty(outputFile2); err != nil {
11111119
t.Fatalf("Failed to wait for output file: %v", err)
@@ -3968,3 +3976,69 @@ func TestSpecValidationIgnore(t *testing.T) {
39683976
t.Fatalf("spec validation was not ignored, got: %v, want: nil", err)
39693977
}
39703978
}
3979+
3980+
func TestCheckpointResume(t *testing.T) {
3981+
for name, conf := range configs(t, true /* noOverlay */) {
3982+
t.Run(name, func(t *testing.T) {
3983+
dir, err := os.MkdirTemp(testutil.TmpDir(), "checkpoint-test")
3984+
if err != nil {
3985+
t.Fatalf("os.MkdirTemp failed: %v", err)
3986+
}
3987+
defer os.RemoveAll(dir)
3988+
if err := os.Chmod(dir, 0777); err != nil {
3989+
t.Fatalf("error chmoding file: %q, %v", dir, err)
3990+
}
3991+
3992+
outputPath := filepath.Join(dir, "output")
3993+
outputFile, err := createWriteableOutputFile(outputPath)
3994+
if err != nil {
3995+
t.Fatalf("error creating output file: %v", err)
3996+
}
3997+
defer outputFile.Close()
3998+
3999+
script := fmt.Sprintf("i=0; while true; do echo $i >> %q; sleep 1; i=$((i+1)); done", outputPath)
4000+
spec := testutil.NewSpecWithArgs("bash", "-c", script)
4001+
_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
4002+
if err != nil {
4003+
t.Fatalf("error setting up container: %v", err)
4004+
}
4005+
defer cleanup()
4006+
4007+
// Create and start the container.
4008+
args := Args{
4009+
ID: testutil.RandomContainerID(),
4010+
Spec: spec,
4011+
BundleDir: bundleDir,
4012+
}
4013+
cont, err := New(conf, args)
4014+
if err != nil {
4015+
t.Fatalf("error creating container: %v", err)
4016+
}
4017+
if err := cont.Start(conf); err != nil {
4018+
t.Fatalf("error starting container: %v", err)
4019+
}
4020+
4021+
// Wait until application has ran.
4022+
if err := waitForFileNotEmpty(outputFile); err != nil {
4023+
t.Fatalf("Failed to wait for output file: %v", err)
4024+
}
4025+
4026+
sfOpts := statefile.Options{
4027+
Resume: true,
4028+
}
4029+
// Checkpoint running container; save state into new file.
4030+
if err := cont.Checkpoint(dir, false /* direct */, sfOpts, pgalloc.SaveOpts{}); err != nil {
4031+
t.Fatalf("error checkpointing container to empty file: %v", err)
4032+
}
4033+
4034+
if !cont.Sandbox.Checkpointed {
4035+
t.Fatalf("sandbox returned wrong value for Sandbox.Checkpointed, got: false, want: true")
4036+
}
4037+
4038+
if cont.Sandbox.Restored {
4039+
t.Fatalf("sandbox returned wrong value for Sandbox.Restored, got: true, want: false")
4040+
}
4041+
cont.Destroy()
4042+
})
4043+
}
4044+
}

runsc/metricserver/metricserver.go

+27-3
Original file line numberDiff line numberDiff line change
@@ -531,9 +531,11 @@ func (m *metricServer) loadSandboxesLocked(ctx context.Context) []sandboxLoadRes
531531
// metric data (if querying metrics from the sandbox process succeeded).
532532
type sandboxMetricsResult struct {
533533
sandboxLoadResult
534-
isRunning bool
535-
snapshot *prometheus.Snapshot
536-
err error
534+
isRunning bool
535+
isCheckpointed bool
536+
isRestored bool
537+
snapshot *prometheus.Snapshot
538+
err error
537539
}
538540

539541
// queryMultiSandboxMetrics queries metric data from multiple loaded sandboxes.
@@ -570,17 +572,23 @@ func queryMultiSandboxMetrics(ctx context.Context, loadedSandboxes []sandboxLoad
570572
defer wg.Done()
571573
for s := range loadedSandboxCh {
572574
isRunning := false
575+
isCheckpointed := false
576+
isRestored := false
573577
var snapshot *prometheus.Snapshot
574578
err := s.err
575579
if err == nil {
576580
queryCtx, queryCtxCancel := context.WithTimeout(ctx, perSandboxTime)
577581
snapshot, err = querySandboxMetrics(queryCtx, s.sandbox, s.verifier, metricsFilter)
578582
queryCtxCancel()
579583
isRunning = s.sandbox.IsRunning()
584+
isCheckpointed = s.sandbox.Checkpointed
585+
isRestored = s.sandbox.Restored
580586
}
581587
processSandbox(sandboxMetricsResult{
582588
sandboxLoadResult: s,
583589
isRunning: isRunning,
590+
isCheckpointed: isCheckpointed,
591+
isRestored: isRestored,
584592
snapshot: snapshot,
585593
err: err,
586594
})
@@ -660,6 +668,8 @@ func (m *metricServer) serveMetrics(w *httpResponseWriter, req *http.Request) ht
660668
type metaMetrics struct {
661669
numRunningSandboxes int64
662670
numCannotExportSandboxes int64
671+
numCheckpointedSandboxes int64
672+
numRestoredSandboxes int64
663673
}
664674
meta := metaMetrics{} // Protected by metricsMu.
665675
selfMetrics := prometheus.NewSnapshot() // Protected by metricsMu.
@@ -675,11 +685,23 @@ func (m *metricServer) serveMetrics(w *httpResponseWriter, req *http.Request) ht
675685
defer metricsMu.Unlock()
676686
selfMetrics.Add(prometheus.LabeledIntData(&SandboxPresenceMetric, nil, 1).SetExternalLabels(r.served.extraLabels))
677687
sandboxRunning := int64(0)
688+
sandboxCheckpointed := int64(0)
689+
sandboxRestored := int64(0)
678690
if r.isRunning {
679691
sandboxRunning = 1
680692
meta.numRunningSandboxes++
681693
}
694+
if r.isCheckpointed {
695+
sandboxCheckpointed = 1
696+
meta.numCheckpointedSandboxes++
697+
}
698+
if r.isRestored {
699+
sandboxRestored = 1
700+
meta.numRestoredSandboxes++
701+
}
682702
selfMetrics.Add(prometheus.LabeledIntData(&SandboxRunningMetric, nil, sandboxRunning).SetExternalLabels(r.served.extraLabels))
703+
selfMetrics.Add(prometheus.LabeledIntData(&SandboxCheckpointedMetric, nil, sandboxCheckpointed).SetExternalLabels(r.served.extraLabels))
704+
selfMetrics.Add(prometheus.LabeledIntData(&SandboxRestoredMetric, nil, sandboxRestored).SetExternalLabels(r.served.extraLabels))
683705
if r.err == nil {
684706
selfMetrics.Add(prometheus.LabeledIntData(&SandboxMetadataMetric, r.sandbox.MetricMetadata, 1).SetExternalLabels(r.served.extraLabels))
685707
for _, cap := range r.served.capabilities {
@@ -732,6 +754,8 @@ func (m *metricServer) serveMetrics(w *httpResponseWriter, req *http.Request) ht
732754
selfMetrics.Add(prometheus.NewIntData(&NumRunningSandboxesMetric, meta.numRunningSandboxes))
733755
selfMetrics.Add(prometheus.NewIntData(&NumCannotExportSandboxesMetric, meta.numCannotExportSandboxes))
734756
selfMetrics.Add(prometheus.NewIntData(&NumTotalSandboxesMetric, numSandboxesTotal))
757+
selfMetrics.Add(prometheus.NewIntData(&NumCheckpointedSandboxesMetric, meta.numCheckpointedSandboxes))
758+
selfMetrics.Add(prometheus.NewIntData(&NumRestoredSandboxesMetric, meta.numRestoredSandboxes))
735759

736760
// Write out all data.
737761
lastMetricsWrittenSize := int(m.lastMetricsWrittenSize.Load())

runsc/metricserver/metricserver_metrics.go

+22
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,16 @@ var (
3030
Type: prometheus.TypeGauge,
3131
Help: "Boolean metric set to 1 for each running sandbox.",
3232
}
33+
SandboxCheckpointedMetric = prometheus.Metric{
34+
Name: "sandbox_checkpointed",
35+
Type: prometheus.TypeGauge,
36+
Help: "Boolean metric set to 1 for each checkpointed sandbox.",
37+
}
38+
SandboxRestoredMetric = prometheus.Metric{
39+
Name: "sandbox_restored",
40+
Type: prometheus.TypeGauge,
41+
Help: "Boolean metric set to 1 for each restored sandbox.",
42+
}
3343
SandboxMetadataMetric = prometheus.Metric{
3444
Name: "sandbox_metadata",
3545
Type: prometheus.TypeGauge,
@@ -66,6 +76,16 @@ var (
6676
Type: prometheus.TypeCounter,
6777
Help: "Counter of sandboxes that have ever been started.",
6878
}
79+
NumCheckpointedSandboxesMetric = prometheus.Metric{
80+
Name: "num_sandboxes_checkpointed",
81+
Type: prometheus.TypeCounter,
82+
Help: "Counter of sandboxes that have been checkpointed.",
83+
}
84+
NumRestoredSandboxesMetric = prometheus.Metric{
85+
Name: "num_sandboxes_restored",
86+
Type: prometheus.TypeCounter,
87+
Help: "Counter of sandboxes that have been restored.",
88+
}
6989
)
7090

7191
// Metrics is a list of metrics that the metric server generates.
@@ -79,5 +99,7 @@ var Metrics = []*prometheus.Metric{
7999
&NumRunningSandboxesMetric,
80100
&NumCannotExportSandboxesMetric,
81101
&NumTotalSandboxesMetric,
102+
&NumCheckpointedSandboxesMetric,
103+
&NumRestoredSandboxesMetric,
82104
&prometheus.ProcessStartTimeSeconds,
83105
}

runsc/sandbox/sandbox.go

+8-1
Original file line numberDiff line numberDiff line change
@@ -219,6 +219,12 @@ type Sandbox struct {
219219
// threads to wait on sandbox and get the exit code, since Linux will return
220220
// WaitStatus to one of the waiters only.
221221
status unix.WaitStatus `nojson:"true"`
222+
223+
// Checkpointed will be true when the sandbox has been checkpointed.
224+
Checkpointed bool `json:"checkpointed"`
225+
226+
// Restored will be true when the sandbox has been restored.
227+
Restored bool `json:"restored"`
222228
}
223229

224230
// Getpid returns the process ID of the sandbox process.
@@ -537,7 +543,7 @@ func (s *Sandbox) Restore(conf *config.Config, cid string, imagePath string, dir
537543
if err := conn.Call(boot.ContMgrRestore, &opt, nil); err != nil {
538544
return fmt.Errorf("restoring container %q: %v", cid, err)
539545
}
540-
546+
s.Restored = true
541547
return nil
542548
}
543549

@@ -1439,6 +1445,7 @@ func (s *Sandbox) Checkpoint(cid string, imagePath string, direct bool, sfOpts s
14391445
if err := s.call(boot.ContMgrCheckpoint, &opt, nil); err != nil {
14401446
return fmt.Errorf("checkpointing container %q: %w", cid, err)
14411447
}
1448+
s.Checkpointed = true
14421449
return nil
14431450
}
14441451

0 commit comments

Comments
 (0)