Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

test: add test to measure rate of growth of metrics #1218

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions deploy/legacy/prometheus/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -67,3 +67,15 @@ prometheus:
- source_labels: [__name__]
action: keep
regex: (.*)
resources:
limits:
memory: 10Gi
enableAdminAPI: true
storageSpec:
volumeClaimTemplate:
spec:
# storageClassName: gp2
accessModes: ["ReadWriteOnce"]
resources:
requests:
storage: 50Gi
4 changes: 3 additions & 1 deletion test/e2e/common/common.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,14 @@ import (
)

// Shared constants for the e2e test suite.
const (
	// RetinaPort is the port number used for Retina in the e2e tests.
	RetinaPort int = 10093
	// PrometheusPort is the port number used for Prometheus in the e2e tests.
	PrometheusPort int = 9090
	// NetObsRGtag is used to tag resources created by this test suite
	NetObsRGtag = "-e2e-netobs-"
	// KubeSystemNamespace is the name of the kube-system namespace.
	KubeSystemNamespace = "kube-system"
	// TestPodNamespace is the namespace name used for test pods.
	TestPodNamespace = "kube-system-test"
	// AzureAppInsightsKeyEnv names the environment variable holding the
	// Application Insights key.
	AzureAppInsightsKeyEnv = "AZURE_APP_INSIGHTS_KEY"
	// OutputFilePathEnv names the environment variable holding the path of
	// the file that metrics are written to.
	OutputFilePathEnv = "OUTPUT_FILEPATH"
)

var (
Expand Down
174 changes: 174 additions & 0 deletions test/e2e/framework/metrics/query-publish.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,174 @@
package metrics

import (
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"io/fs"
	"log"
	"os"
	"sync"
	"time"

	"github.com/microsoft/retina/pkg/telemetry"
	"github.com/microsoft/retina/test/e2e/common"
	prom_client "github.com/prometheus/client_golang/api"
	prom_v1 "github.com/prometheus/client_golang/api/prometheus/v1"
	prom_model "github.com/prometheus/common/model"
)

// QueryAndPublish is a test step that repeatedly runs a Prometheus query and
// publishes the returned samples to Application Insights and/or a local file.
type QueryAndPublish struct {
	// Query is the PromQL expression sent to Prometheus on every poll.
	Query string
	// Endpoint is the address of the Prometheus server to query.
	Endpoint string
	// AdditionalTelemetryProperty is passed to the telemetry client when it
	// is created. Prevalidate requires it to contain a "retinaVersion" key.
	AdditionalTelemetryProperty map[string]string
	// outputFilePath is read from the environment in Prevalidate; when empty,
	// metrics are not written to a file.
	outputFilePath string
	// stop is closed by Stop to end the background polling goroutine.
	stop chan struct{}
	// wg tracks the background polling goroutine so Stop can wait for it.
	wg sync.WaitGroup
	// telemetryClient is set by Run only when appInsightsKey is non-empty.
	telemetryClient *telemetry.TelemetryClient
	// appInsightsKey is read from the environment in Prevalidate.
	appInsightsKey string
}

// Run sets up the optional Application Insights client and starts a
// background goroutine that queries Prometheus once immediately and then on
// every tick until Stop is called.
//
// It returns an error only when the telemetry client cannot be created. A
// failure while querying or publishing inside the goroutine terminates the
// process through log.Fatalf, exactly as before.
func (q *QueryAndPublish) Run() error {
	if q.appInsightsKey != "" {
		telemetry.InitAppInsights(q.appInsightsKey, q.AdditionalTelemetryProperty["retinaVersion"])

		telemetryClient, err := telemetry.NewAppInsightsTelemetryClient("retina-rate-of-growth", q.AdditionalTelemetryProperty)
		if err != nil {
			return fmt.Errorf("error creating telemetry client: %w", err)
		}

		q.telemetryClient = telemetryClient
	}

	q.stop = make(chan struct{})
	q.wg.Add(1)

	go func() {
		// log.Fatalf lives here, away from the deferred cleanup in
		// pollUntilStopped, so the defers are not skipped by os.Exit.
		if err := q.pollUntilStopped(); err != nil {
			log.Fatalf("error getting and publishing metrics: %v", err)
		}
	}()

	return nil
}

// pollUntilStopped publishes metrics immediately and then once per tick. It
// returns nil when q.stop is closed, or the first publishing error.
func (q *QueryAndPublish) pollUntilStopped() error {
	const publishInterval = 2 * time.Second

	defer q.wg.Done()

	ticker := time.NewTicker(publishInterval)
	// Release the ticker's resources once polling ends.
	defer ticker.Stop()

	for {
		if err := q.getAndPublishMetrics(); err != nil {
			return err
		}

		select {
		case <-ticker.C:
		case <-q.stop:
			return nil
		}
	}
}

func (q *QueryAndPublish) getAndPublishMetrics() error {
// ctx, cancel := context.WithTimeout(context.Background(), defaultTimeoutSeconds*time.Second)

Check failure on line 80 in test/e2e/framework/metrics/query-publish.go

View workflow job for this annotation

GitHub Actions / Lint (windows, arm64)

commentedOutCode: may want to remove commented-out code (gocritic)

Check failure on line 80 in test/e2e/framework/metrics/query-publish.go

View workflow job for this annotation

GitHub Actions / Lint (windows, amd64)

commentedOutCode: may want to remove commented-out code (gocritic)

Check failure on line 80 in test/e2e/framework/metrics/query-publish.go

View workflow job for this annotation

GitHub Actions / Lint (linux, arm64)

commentedOutCode: may want to remove commented-out code (gocritic)

Check failure on line 80 in test/e2e/framework/metrics/query-publish.go

View workflow job for this annotation

GitHub Actions / Lint (linux, amd64)

commentedOutCode: may want to remove commented-out code (gocritic)
// defer cancel()

client, err := prom_client.NewClient(prom_client.Config{
Address: q.Endpoint,
})
if err != nil {
return fmt.Errorf("error creating prometheus client: %w", err)
}

promApi := prom_v1.NewAPI(client)

Check failure on line 90 in test/e2e/framework/metrics/query-publish.go

View workflow job for this annotation

GitHub Actions / Lint (windows, arm64)

var-naming: var promApi should be promAPI (revive)

Check failure on line 90 in test/e2e/framework/metrics/query-publish.go

View workflow job for this annotation

GitHub Actions / Lint (windows, amd64)

var-naming: var promApi should be promAPI (revive)

Check failure on line 90 in test/e2e/framework/metrics/query-publish.go

View workflow job for this annotation

GitHub Actions / Lint (linux, arm64)

var-naming: var promApi should be promAPI (revive)

Check failure on line 90 in test/e2e/framework/metrics/query-publish.go

View workflow job for this annotation

GitHub Actions / Lint (linux, amd64)

var-naming: var promApi should be promAPI (revive)
ctx := context.TODO()

result, warnings, err := promApi.Query(ctx, q.Query, time.Now())
if err != nil {
return fmt.Errorf("error querying prometheus: %w", err)
}
if len(warnings) > 0 {
log.Println("query warnings: ", warnings)
}
type metrics map[string]string

allMetrics := []metrics{}

for _, sample := range result.(prom_model.Vector) {
instance := string(sample.Metric["instance"])
samplesScraped := sample.Value.String()

m := map[string]string{
"instance": instance,
"samplesScraped": samplesScraped,
}
allMetrics = append(allMetrics, m)
}

// Publish metrics
if q.telemetryClient != nil {
log.Println("Publishing metrics to AppInsights")
for _, metric := range allMetrics {
q.telemetryClient.TrackEvent("metrics-scraped", metric)

}
}

// Write metrics to file
if q.outputFilePath != "" {
log.Println("Writing metrics to file ", q.outputFilePath)

permissions := 0o644
file, err := os.OpenFile(q.outputFilePath, os.O_APPEND|os.O_CREATE|os.O_WRONLY, fs.FileMode(permissions))
if err != nil {
return fmt.Errorf("error writing to csv file: %w", err)
}
defer file.Close()

for _, m := range allMetrics {
b, err := json.Marshal(m)
if err != nil {
return fmt.Errorf("error marshalling metric: %w", err)
}
file.Write(b)

Check failure on line 140 in test/e2e/framework/metrics/query-publish.go

View workflow job for this annotation

GitHub Actions / Lint (windows, arm64)

Error return value of `file.Write` is not checked (errcheck)

Check failure on line 140 in test/e2e/framework/metrics/query-publish.go

View workflow job for this annotation

GitHub Actions / Lint (windows, amd64)

Error return value of `file.Write` is not checked (errcheck)

Check failure on line 140 in test/e2e/framework/metrics/query-publish.go

View workflow job for this annotation

GitHub Actions / Lint (linux, arm64)

Error return value of `file.Write` is not checked (errcheck)

Check failure on line 140 in test/e2e/framework/metrics/query-publish.go

View workflow job for this annotation

GitHub Actions / Lint (linux, amd64)

Error return value of `file.Write` is not checked (errcheck)
file.WriteString("\n")

Check failure on line 141 in test/e2e/framework/metrics/query-publish.go

View workflow job for this annotation

GitHub Actions / Lint (windows, arm64)

Error return value of `file.WriteString` is not checked (errcheck)

Check failure on line 141 in test/e2e/framework/metrics/query-publish.go

View workflow job for this annotation

GitHub Actions / Lint (windows, amd64)

Error return value of `file.WriteString` is not checked (errcheck)

Check failure on line 141 in test/e2e/framework/metrics/query-publish.go

View workflow job for this annotation

GitHub Actions / Lint (linux, arm64)

Error return value of `file.WriteString` is not checked (errcheck)

Check failure on line 141 in test/e2e/framework/metrics/query-publish.go

View workflow job for this annotation

GitHub Actions / Lint (linux, amd64)

Error return value of `file.WriteString` is not checked (errcheck)
}

}

return nil
}

// Stop ends the background polling goroutine started by Run, waits for it to
// finish, and then shuts down Application Insights. It always returns nil.
func (q *QueryAndPublish) Stop() error {
	// q.stop is nil when Run never started the goroutine; closing a nil
	// channel would panic.
	if q.stop != nil {
		close(q.stop)
		q.wg.Wait()
	}

	// Shut telemetry down only after the goroutine has exited, so no event is
	// tracked during or after shutdown, and only when Run initialised it
	// (Run calls InitAppInsights under this same condition).
	if q.appInsightsKey != "" {
		telemetry.ShutdownAppInsights()
	}

	return nil
}

func (q *QueryAndPublish) Prevalidate() error {
if os.Getenv(common.AzureAppInsightsKeyEnv) == "" {
log.Println("env ", common.AzureAppInsightsKeyEnv, " not provided")
}
q.appInsightsKey = os.Getenv(common.AzureAppInsightsKeyEnv)

if _, ok := q.AdditionalTelemetryProperty["retinaVersion"]; !ok {
return fmt.Errorf("retinaVersion is required in AdditionalTelemetryProperty")

Check failure on line 163 in test/e2e/framework/metrics/query-publish.go

View workflow job for this annotation

GitHub Actions / Lint (windows, arm64)

do not define dynamic errors, use wrapped static errors instead: "fmt.Errorf(\"retinaVersion is required in AdditionalTelemetryProperty\")" (err113)

Check failure on line 163 in test/e2e/framework/metrics/query-publish.go

View workflow job for this annotation

GitHub Actions / Lint (windows, amd64)

do not define dynamic errors, use wrapped static errors instead: "fmt.Errorf(\"retinaVersion is required in AdditionalTelemetryProperty\")" (err113)

Check failure on line 163 in test/e2e/framework/metrics/query-publish.go

View workflow job for this annotation

GitHub Actions / Lint (linux, arm64)

do not define dynamic errors, use wrapped static errors instead: "fmt.Errorf(\"retinaVersion is required in AdditionalTelemetryProperty\")" (err113)

Check failure on line 163 in test/e2e/framework/metrics/query-publish.go

View workflow job for this annotation

GitHub Actions / Lint (linux, amd64)

do not define dynamic errors, use wrapped static errors instead: "fmt.Errorf(\"retinaVersion is required in AdditionalTelemetryProperty\")" (err113)
}

if os.Getenv(common.OutputFilePathEnv) == "" {
log.Println("Output file path not provided. Metrics will not be written to file")
return nil
}
q.outputFilePath = os.Getenv(common.OutputFilePathEnv)

log.Println("Output file path provided: ", q.outputFilePath)
return nil
}
112 changes: 112 additions & 0 deletions test/e2e/growth_metrics_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
package retina

import (
"os"
"path/filepath"
"strconv"
"testing"
"time"

"github.com/microsoft/retina/test/e2e/common"
"github.com/microsoft/retina/test/e2e/framework/azure"
"github.com/microsoft/retina/test/e2e/framework/generic"
"github.com/microsoft/retina/test/e2e/framework/helpers"
"github.com/microsoft/retina/test/e2e/framework/kubernetes"
"github.com/microsoft/retina/test/e2e/framework/metrics"
"github.com/microsoft/retina/test/e2e/framework/types"
"github.com/stretchr/testify/require"
)

func GetKubeconfig(clusterName, subscriptionId, resourceGroup, kubeConfigFilePath string) *types.Job {

Check failure on line 20 in test/e2e/growth_metrics_test.go

View workflow job for this annotation

GitHub Actions / Lint (windows, arm64)

var-naming: func parameter subscriptionId should be subscriptionID (revive)

Check failure on line 20 in test/e2e/growth_metrics_test.go

View workflow job for this annotation

GitHub Actions / Lint (windows, amd64)

var-naming: func parameter subscriptionId should be subscriptionID (revive)

Check failure on line 20 in test/e2e/growth_metrics_test.go

View workflow job for this annotation

GitHub Actions / Lint (linux, arm64)

var-naming: func parameter subscriptionId should be subscriptionID (revive)

Check failure on line 20 in test/e2e/growth_metrics_test.go

View workflow job for this annotation

GitHub Actions / Lint (linux, amd64)

var-naming: func parameter subscriptionId should be subscriptionID (revive)
job := types.NewJob("Get kubeconfig")
job.AddStep(&azure.GetAKSKubeConfig{
ClusterName: clusterName,
SubscriptionID: subscriptionId,
ResourceGroupName: resourceGroup,
Location: "why?",
KubeConfigFilePath: kubeConfigFilePath,
}, nil)
return job
}

func GrowthTest(additionalTelemetryProperty map[string]string, kubeConfigFilePath string) *types.Job {
job := types.NewJob("Growth Test")
labelAffinity := "app.kubernetes.io/instance=prometheus-kube-prometheus-prometheus"
portForwardId := "port-forward"

Check failure on line 35 in test/e2e/growth_metrics_test.go

View workflow job for this annotation

GitHub Actions / Lint (windows, arm64)

var-naming: var portForwardId should be portForwardID (revive)

Check failure on line 35 in test/e2e/growth_metrics_test.go

View workflow job for this annotation

GitHub Actions / Lint (windows, amd64)

var-naming: var portForwardId should be portForwardID (revive)

Check failure on line 35 in test/e2e/growth_metrics_test.go

View workflow job for this annotation

GitHub Actions / Lint (linux, arm64)

var-naming: var portForwardId should be portForwardID (revive)

Check failure on line 35 in test/e2e/growth_metrics_test.go

View workflow job for this annotation

GitHub Actions / Lint (linux, amd64)

var-naming: var portForwardId should be portForwardID (revive)
metricsStepId := "metrics"

Check failure on line 36 in test/e2e/growth_metrics_test.go

View workflow job for this annotation

GitHub Actions / Lint (windows, arm64)

var-naming: var metricsStepId should be metricsStepID (revive)

Check failure on line 36 in test/e2e/growth_metrics_test.go

View workflow job for this annotation

GitHub Actions / Lint (windows, amd64)

var-naming: var metricsStepId should be metricsStepID (revive)

Check failure on line 36 in test/e2e/growth_metrics_test.go

View workflow job for this annotation

GitHub Actions / Lint (linux, arm64)

var-naming: var metricsStepId should be metricsStepID (revive)

Check failure on line 36 in test/e2e/growth_metrics_test.go

View workflow job for this annotation

GitHub Actions / Lint (linux, amd64)

var-naming: var metricsStepId should be metricsStepID (revive)

job.AddStep(&kubernetes.PortForward{
KubeConfigFilePath: kubeConfigFilePath,
Namespace: common.KubeSystemNamespace,
LabelSelector: "app.kubernetes.io/instance=prometheus-kube-prometheus-prometheus",
LocalPort: strconv.Itoa(common.PrometheusPort),
RemotePort: strconv.Itoa(common.PrometheusPort),
Endpoint: "metrics",
OptionalLabelAffinity: labelAffinity,
},
&types.StepOptions{
SkipSavingParametersToJob: true,
RunInBackgroundWithID: portForwardId,
})

job.AddStep(&metrics.QueryAndPublish{
Endpoint: "http://localhost:" + strconv.Itoa(common.PrometheusPort),
Query: "scrape_samples_scraped{job=\"retina-pods\"}",
AdditionalTelemetryProperty: additionalTelemetryProperty,
},
&types.StepOptions{
SkipSavingParametersToJob: true,
RunInBackgroundWithID: metricsStepId,
})

job.AddStep(&types.Sleep{
Duration: 60 * time.Second,
}, nil)

job.AddStep(
&types.Stop{
BackgroundID: metricsStepId,
}, nil)

job.AddStep(
&types.Stop{
BackgroundID: portForwardId,
}, nil)
return job
}

// Test_GrowthOfMetrics fetches the kubeconfig for the e2e cluster and then
// runs the growth job, tagging published metrics with the Retina version,
// cluster name and resource group.
//
// It requires AZURE_SUBSCRIPTION_ID and the image-tag environment variable
// (generic.DefaultTagEnv) to be set; AZURE_RESOURCE_GROUP is optional.
func Test_GrowthOfMetrics(t *testing.T) {
	ctx, cancel := helpers.Context(t)
	defer cancel()

	clusterName := common.ClusterNameForE2ETest(t)

	subscriptionID := os.Getenv("AZURE_SUBSCRIPTION_ID")
	require.NotEmpty(t, subscriptionID)

	// Fall back to the cluster name when no resource group is given.
	resourceGroup := os.Getenv("AZURE_RESOURCE_GROUP")
	if resourceGroup == "" {
		resourceGroup = clusterName
	}

	retinaVersion := os.Getenv(generic.DefaultTagEnv)
	require.NotEmpty(t, retinaVersion)

	telemetryProperties := map[string]string{
		"retinaVersion": retinaVersion,
		"clusterName":   clusterName,
		"resourceGroup": resourceGroup,
	}

	workingDir, err := os.Getwd()
	require.NoError(t, err)

	// The kubeconfig path is built from the directory two levels above the
	// working directory, then back down into test/e2e.
	repoRoot := filepath.Dir(filepath.Dir(workingDir))
	kubeConfigFilePath := filepath.Join(repoRoot, "test", "e2e", "test.pem")

	types.NewRunner(t, GetKubeconfig(clusterName, subscriptionID, resourceGroup, kubeConfigFilePath)).Run(ctx)

	types.NewRunner(t, GrowthTest(telemetryProperties, kubeConfigFilePath)).Run(ctx)
}
Loading