Skip to content

Commit

Permalink
chore: add CPU profile to telemetry heartbeat (#625)
Browse files Browse the repository at this point in the history
# Description

Add a CPU profile to the telemetry heartbeat so that user and system CPU time
can be measured over a given window.

## Related Issue

If this pull request is related to any issue, please mention it here.
Additionally, make sure that the issue is assigned to you before
submitting this pull request.

## Checklist

- [ ] I have read the [contributing
documentation](https://retina.sh/docs/contributing).
- [ ] I signed and signed-off the commits (`git commit -S -s ...`). See
[this
documentation](https://docs.github.com/en/authentication/managing-commit-signature-verification/about-commit-signature-verification)
on signing commits.
- [ ] I have correctly attributed the author(s) of the code.
- [ ] I have tested the changes locally.
- [ ] I have followed the project's style guidelines.
- [ ] I have updated the documentation, if necessary.
- [ ] I have added tests, if applicable.

## Screenshots (if applicable) or Testing Completed

Please add any relevant screenshots or GIFs to showcase the changes
made.

## Additional Notes

Add any additional notes or context about the pull request here.

---

Please refer to the [CONTRIBUTING.md](../CONTRIBUTING.md) file for more
information on how to contribute to this project.
  • Loading branch information
matmerr authored Aug 27, 2024
1 parent 6eab87c commit 9751894
Show file tree
Hide file tree
Showing 10 changed files with 165 additions and 13 deletions.
6 changes: 5 additions & 1 deletion captureworkload/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,14 +28,18 @@ func main() {
l.Info("Version: ", zap.String("version", buildinfo.Version))

var tel telemetry.Telemetry
var err error
if buildinfo.ApplicationInsightsID != "" {
l.Info("telemetry enabled", zap.String("applicationInsightsID", buildinfo.ApplicationInsightsID))
telemetry.InitAppInsights(buildinfo.ApplicationInsightsID, buildinfo.Version)
defer telemetry.ShutdownAppInsights()
tel = telemetry.NewAppInsightsTelemetryClient("retina-capture", map[string]string{
tel, err = telemetry.NewAppInsightsTelemetryClient("retina-capture", map[string]string{
"version": buildinfo.Version,
telemetry.PropertyApiserver: os.Getenv(captureConstants.ApiserverEnvKey),
})
if err != nil {
log.Logger().Panic("failed to create telemetry client", zap.Error(err))
}
} else {
tel = telemetry.NewNoopTelemetry()
}
Expand Down
6 changes: 5 additions & 1 deletion cmd/legacy/daemon.go
Original file line number Diff line number Diff line change
Expand Up @@ -129,11 +129,15 @@ func (d *Daemon) Start() error {
panic("telemetry enabled, but ApplicationInsightsID is empty")
}
mainLogger.Info("telemetry enabled", zap.String("applicationInsightsID", buildinfo.ApplicationInsightsID))
tel = telemetry.NewAppInsightsTelemetryClient("retina-agent", map[string]string{
tel, err = telemetry.NewAppInsightsTelemetryClient("retina-agent", map[string]string{
"version": buildinfo.Version,
"apiserver": cfg.Host,
"plugins": strings.Join(daemonConfig.EnabledPlugin, `,`),
})
if err != nil {
mainLogger.Error("failed to create telemetry client", zap.Error(err))
return fmt.Errorf("error when creating telemetry client: %w", err)
}
} else {
mainLogger.Info("telemetry disabled")
tel = telemetry.NewNoopTelemetry()
Expand Down
6 changes: 5 additions & 1 deletion operator/cmd/legacy/deployment.go
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,11 @@ func (o *Operator) Start() {
"version": buildinfo.Version,
telemetry.PropertyApiserver: apiserverURL,
}
tel = telemetry.NewAppInsightsTelemetryClient("retina-operator", properties)
tel, err = telemetry.NewAppInsightsTelemetryClient("retina-operator", properties)
if err != nil {
mainLogger.Error("failed to create telemetry client", zap.Error(err))
os.Exit(1)
}
} else {
mainLogger.Info("telemetry disabled", zap.String("apiserver", apiserverURL))
tel = telemetry.NewNoopTelemetry()
Expand Down
6 changes: 5 additions & 1 deletion pkg/shared/telemetry/cell_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package telemetry

import (
"context"
"fmt"
"strings"
"time"

Expand Down Expand Up @@ -58,7 +59,10 @@ var (
properties["plugins"] = strings.Join(p.Config.EnabledPlugins, `,`)
}

tel := telemetry.NewAppInsightsTelemetryClient(p.Config.Component, properties)
tel, err := telemetry.NewAppInsightsTelemetryClient(p.Config.Component, properties)
if err != nil {
return nil, fmt.Errorf("failed to create telemetry client: %w", err)
}
return tel, nil
}

Expand Down
15 changes: 15 additions & 0 deletions pkg/telemetry/noop_perf.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
package telemetry

// NoopPerfProfile is a perf profile that collects nothing: both of its usage
// getters hand back empty maps. It carries no state.
type NoopPerfProfile struct{}

// NewNoopPerfProfile returns a ready-to-use NoopPerfProfile.
func NewNoopPerfProfile() *NoopPerfProfile {
	return new(NoopPerfProfile)
}

// GetMemoryUsage returns a fresh, empty, non-nil map on every call.
func (*NoopPerfProfile) GetMemoryUsage() map[string]string {
	return map[string]string{}
}

// GetCPUUsage returns a fresh, empty, non-nil map and a nil error on every
// call.
func (*NoopPerfProfile) GetCPUUsage() (map[string]string, error) { //nolint unnamed results are fine
	return map[string]string{}, nil
}
6 changes: 6 additions & 0 deletions pkg/telemetry/perf.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
package telemetry

// Perf is the set of process-level usage probes a perf profile exposes. Each
// probe reports its readings as string key/value properties.
type Perf interface {
	// GetCPUUsage returns CPU usage properties, or an error when they could
	// not be collected.
	GetCPUUsage() (map[string]string, error)
	// GetMemoryUsage returns memory usage properties.
	GetMemoryUsage() map[string]string
}
78 changes: 78 additions & 0 deletions pkg/telemetry/perf_unix.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
//go:build unix

package telemetry

import (
"errors"
"fmt"
"runtime"
"strconv"
"sync"
"syscall"
)

// microsecondBitShift is a right-shift count that approximates dividing a
// microsecond value by one million (2^20 = 1,048,576).
var microsecondBitShift = 20

// ErrNotInitialized reports that a perf profile has no baseline rusage sample
// to compare against.
var ErrNotInitialized = errors.New("perf profile not initialized")

// userCPUSeconds is the property key for CPU time spent in user mode.
const userCPUSeconds = "usr_cpu_sec"

// sysCPUSeconds is the property key for CPU time spent in system mode.
const sysCPUSeconds = "sys_cpu_sec"

// PerfProfile samples the process's resource usage so that CPU time can be
// reported as a delta between successive samples.
type PerfProfile struct {
	// perflock is taken by GetCPUUsage while it samples and replaces usage.
	perflock sync.RWMutex
	// usage is the most recent rusage sample: set by NewPerfProfile and
	// replaced on every successful GetCPUUsage call. It is nil on a zero-value
	// PerfProfile.
	usage *syscall.Rusage
}

// NewPerfProfile builds a PerfProfile seeded with the calling process's
// current rusage, which becomes the baseline for the first GetCPUUsage call.
// It returns an error if the initial getrusage call fails.
func NewPerfProfile() (*PerfProfile, error) {
	baseline := new(syscall.Rusage)
	if err := syscall.Getrusage(syscall.RUSAGE_SELF, baseline); err != nil {
		return nil, fmt.Errorf("failed to get rusage during init: %w", err)
	}

	return &PerfProfile{usage: baseline}, nil
}

// GetMemoryUsage snapshots the Go runtime's memory statistics and the current
// goroutine count and returns them as string properties keyed by allocatedmem,
// sysmem and goroutines. The two memory figures are passed through bToMb
// (presumably bytes to megabytes; the helper is defined elsewhere).
func (p *PerfProfile) GetMemoryUsage() map[string]string {
	var stats runtime.MemStats
	runtime.ReadMemStats(&stats)

	usage := make(map[string]string, 3)
	usage[allocatedmem] = strconv.FormatUint(bToMb(stats.Alloc), 10)
	usage[sysmem] = strconv.FormatUint(bToMb(stats.Sys), 10)
	usage[goroutines] = strconv.Itoa(runtime.NumGoroutine())
	return usage
}

// GetCPUUsage reports how much user and system CPU time the process consumed
// since the previous successful call (or since NewPerfProfile for the first
// call). The values are whole seconds, rounded down, formatted as base-10
// strings under the userCPUSeconds and sysCPUSeconds keys.
//
// The returned map is never nil. On failure it is empty and the error is
// either ErrNotInitialized (the profile has no baseline sample) or a wrapped
// getrusage error; the stored baseline is left untouched in both cases.
func (p *PerfProfile) GetCPUUsage() (map[string]string, error) { //nolint unnamed results are fine
	props := make(map[string]string)

	// Take the lock before looking at p.usage: the field is replaced at the
	// end of every successful call, so an unlocked nil check would race with
	// a concurrent caller.
	p.perflock.Lock()
	defer p.perflock.Unlock()

	if p.usage == nil {
		return props, ErrNotInitialized
	}

	var currentUsage syscall.Rusage
	if err := syscall.Getrusage(syscall.RUSAGE_SELF, &currentUsage); err != nil {
		return props, fmt.Errorf("failed to get rusage: %w", err)
	}

	// Timeval.Nano widens Sec and Usec to int64 itself, so this compiles on
	// every unix target regardless of the platform's field widths, and a
	// smaller Usec in the newer sample borrows from the seconds naturally.
	// CPU time does not decrease, so integer division rounds down.
	const nanosPerSecond = int64(1e9)
	userTime := (currentUsage.Utime.Nano() - p.usage.Utime.Nano()) / nanosPerSecond
	sysTime := (currentUsage.Stime.Nano() - p.usage.Stime.Nano()) / nanosPerSecond

	// The sample just taken becomes the baseline for the next window.
	p.usage = &currentUsage

	props[userCPUSeconds] = strconv.FormatInt(userTime, 10)
	props[sysCPUSeconds] = strconv.FormatInt(sysTime, 10)

	return props, nil
}
27 changes: 27 additions & 0 deletions pkg/telemetry/perf_windows.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
package telemetry

import (
"runtime"
"strconv"
)

type PerfProfile struct{}

func (p *PerfProfile) GetMemoryUsage() map[string]string {
var m runtime.MemStats
runtime.ReadMemStats(&m)
props := map[string]string{
allocatedmem: strconv.FormatUint(bToMb(m.Alloc), 10),
sysmem: strconv.FormatUint(bToMb(m.Sys), 10),
goroutines: strconv.Itoa(runtime.NumGoroutine()),
}
return props
}

func NewPerfProfile() (*PerfProfile, error) {
return &PerfProfile{}, nil
}

func (p *PerfProfile) GetCPUUsage() (map[string]string, error) { //nolint unnamed results are fine
return make(map[string]string), nil
}
26 changes: 17 additions & 9 deletions pkg/telemetry/telemetry.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,10 @@ package telemetry
import (
"context"
"fmt"
"maps"
"os"
"runtime"
"runtime/debug"
"strconv"
"sync"
"time"

Expand Down Expand Up @@ -77,9 +77,10 @@ type TelemetryClient struct {
sync.RWMutex
processName string
properties map[string]string
profile Perf
}

func NewAppInsightsTelemetryClient(processName string, additionalproperties map[string]string) *TelemetryClient {
func NewAppInsightsTelemetryClient(processName string, additionalproperties map[string]string) (*TelemetryClient, error) {
if client == nil {
fmt.Println("appinsights client not initialized")
}
Expand All @@ -90,10 +91,16 @@ func NewAppInsightsTelemetryClient(processName string, additionalproperties map[
properties[k] = v
}

perfProfile, err := NewPerfProfile()
if err != nil {
return nil, err
}

return &TelemetryClient{
processName: processName,
properties: properties,
}
profile: perfProfile,
}, nil
}

// TrackPanic function sends the stacktrace and flushes logs only in a goroutine where its call is deferred.
Expand Down Expand Up @@ -142,7 +149,7 @@ func GetEnvironmentProperties() map[string]string {
}

func (t *TelemetryClient) trackWarning(err error, msg string) {
t.TrackTrace(err.Error(), contracts.Warning, GetEnvironmentProperties())
t.TrackTrace(msg+": "+err.Error(), contracts.Warning, GetEnvironmentProperties())
}

func (t *TelemetryClient) heartbeat(ctx context.Context) {
Expand All @@ -151,15 +158,16 @@ func (t *TelemetryClient) heartbeat(ctx context.Context) {
t.trackWarning(err, "failed to get kernel version")
}

var m runtime.MemStats
runtime.ReadMemStats(&m)
props := map[string]string{
kernelversion: kernelVersion,
allocatedmem: strconv.FormatUint(bToMb(m.Alloc), 10),
sysmem: strconv.FormatUint(bToMb(m.Sys), 10),
goroutines: strconv.Itoa(runtime.NumGoroutine()),
}

cpuProps, err := t.profile.GetCPUUsage()
if err != nil {
t.trackWarning(err, "failed to get cpu usage")
}
maps.Copy(props, cpuProps)
maps.Copy(props, t.profile.GetMemoryUsage())
t.TrackEvent("heartbeat", props)
}

Expand Down
2 changes: 2 additions & 0 deletions pkg/telemetry/telemetry_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ func TestHeartbeat(t *testing.T) {
tr := &TelemetryClient{
RWMutex: sync.RWMutex{},
properties: tt.fields.properties,
profile: NewNoopPerfProfile(),
}
tr.heartbeat(tt.args.ctx)
})
Expand Down Expand Up @@ -125,6 +126,7 @@ func TestTelemetryClient_StopPerf(t *testing.T) {
tr := &TelemetryClient{
RWMutex: sync.RWMutex{},
properties: tt.fields.properties,
profile: NewNoopPerfProfile(),
}
tr.StopPerf(tt.args.counter)
})
Expand Down

0 comments on commit 9751894

Please sign in to comment.