diff --git a/.github/workflows/scale-test.yaml b/.github/workflows/scale-test.yaml index 36a70fe84d..6ce87b25a8 100644 --- a/.github/workflows/scale-test.yaml +++ b/.github/workflows/scale-test.yaml @@ -96,11 +96,12 @@ jobs: NUM_REPLICAS: ${{ inputs.num_replicas }} NUM_NETPOLS: ${{ inputs.num_netpol }} CLEANUP: ${{ inputs.cleanup }} - IMAGE_REGISTRY: ${{ inputs.image_namespace == '' && vars.ACR_NAME || inputs.image_namespace }} + IMAGE_REGISTRY: ${{ vars.ACR_NAME }} IMAGE_NAMESPACE: ${{ github.repository }} TAG: ${{ inputs.image_tag }} AZURE_APP_INSIGHTS_KEY: ${{ secrets.AZURE_APP_INSIGHTS_KEY }} shell: bash run: | set -euo pipefail - go test -v ./test/e2e/. -timeout 300m -tags=scale -count=1 -args -image-tag=$( [[ $TAG == "" ]] && make version || echo $TAG ) -create-infra=false -delete-infra=false + if [[ $TAG == "" ]]; then TAG=$(make version); fi + go test -v ./test/e2e/. -timeout 300m -tags=scale -count=1 -args -create-infra=false -delete-infra=false diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 4aa930117b..e751018cfc 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -7,6 +7,7 @@ on: pull_request: branches: [main] workflow_dispatch: + permissions: actions: read contents: read @@ -15,6 +16,7 @@ permissions: pull-requests: write security-events: write issues: write + jobs: test-image: runs-on: ubuntu-latest @@ -32,8 +34,9 @@ jobs: PULL_REQUEST_NUMBER: ${{ github.event.pull_request.number }} run: | make test-image IMAGE_NAMESPACE=${{ github.repository }} PLATFORM=linux/amd64 + - name: Upload Artifacts uses: actions/upload-artifact@v4 with: name: coverage-files - path: ./coverage* + path: ./output/coverage* diff --git a/.gitignore b/.gitignore index 026134d26a..32c777c697 100644 --- a/.gitignore +++ b/.gitignore @@ -45,3 +45,5 @@ image-metadata-*.json *results*.json netperf-*.json netperf-*.csv + +.certs/ diff --git a/Makefile b/Makefile index 50d3d19cae..190c04f098 100644 --- a/Makefile +++ b/Makefile @@ -241,6 
+241,7 @@ container-docker: buildx # util target to build container images using docker bu image_metadata_filename="image-metadata-$$image_name-$(TAG).json"; \ touch $$image_metadata_filename; \ echo "Building $$image_name for $$os/$$arch "; \ + mkdir -p $(OUTPUT_DIR); \ docker buildx build \ --platform $(PLATFORM) \ --metadata-file=$$image_metadata_filename \ @@ -253,6 +254,7 @@ container-docker: buildx # util target to build container images using docker bu --build-arg VERSION=$(VERSION) $(EXTRA_BUILD_ARGS) \ --target=$(TARGET) \ -t $(IMAGE_REGISTRY)/$(IMAGE):$(TAG) \ + --output type=local,dest=$(OUTPUT_DIR) \ $(BUILDX_ACTION) \ $(CONTEXT_DIR) @@ -549,6 +551,9 @@ get-certs: hubble config set tls true hubble config set tls-server-name instance.hubble-relay.cilium.io +# Replaces every '.' in $(1) with '\.' +escape_dot = $(subst .,\.,$(1)) + .PHONY: clean-certs clean-certs: rm -rf $(CERT_DIR) diff --git a/docs/02-Installation/01-Setup.md b/docs/02-Installation/01-Setup.md index 34caeb1eca..5b436d21ad 100644 --- a/docs/02-Installation/01-Setup.md +++ b/docs/02-Installation/01-Setup.md @@ -6,7 +6,9 @@ Note: you can also run captures with just the [CLI](./02-CLI.md). ## Installation -Requires Helm version >= v3.8.0. +### Requirements + +- Helm version >= v3.8.0. ### Basic Mode diff --git a/docs/02-Installation/03-Config.md b/docs/02-Installation/03-Config.md index 23c893fd3c..799d25ad1b 100644 --- a/docs/02-Installation/03-Config.md +++ b/docs/02-Installation/03-Config.md @@ -2,25 +2,60 @@ ## Overview -To customize metrics and other options, modify the `retina-config` ConfigMap. Default settings for each component are specified in *deploy/legacy/manifests/controller/helm/retina/values.yaml*. +### Default Configuration -## Agent Config +Default settings for each component are specified in [Values file](../../deploy/legacy/manifests/controller/helm/retina/values.yaml). 
+ +### Deployed Configuration + +Configuration of an active Retina deployment can be seen in `retina-config` and `retina-operator-config` configmaps. + +```shell +kubectl get configmap retina-config -n kube-system -o yaml +kubectl get configmap retina-operator-config -n kube-system -o yaml +``` + +### Updating Configuration + +If the Retina installation was done via Helm, configuration updates should be done via `helm upgrade` defining the specific attribute name and value as part of the command. + +The example below enables gathering of advanced pod-level metrics. + +```shell +VERSION=$( curl -sL https://api.github.com/repos/microsoft/retina/releases/latest | jq -r .name) +helm upgrade --install retina oci://ghcr.io/microsoft/retina/charts/retina \ + --version $VERSION \ + --namespace kube-system \ + --set image.tag=$VERSION \ + --set operator.tag=$VERSION \ + --set logLevel=info \ + --set enabledPlugin_linux="\[dropreason\,packetforward\,linuxutil\,dns\]" \ + --set enablePodLevel=true +``` + +## General Configuration + +Apply to both Agent and Operator. + * `enableTelemetry`: Enables telemetry for the agent for managed AKS clusters. Requires `buildinfo.ApplicationInsightsID` to be set if enabled. -* `enablePodLevel`: Enables gathering of advanced pod-level metrics, attaching pods' metadata to Retina's metrics. * `remoteContext`: Enables Retina to watch Pods on the cluster. -* `enableAnnotations`: Enables gathering of metrics for annotated resources. Resources can be annotated with `retina.sh=observe`. Requires the operator and `enableRetinaEndpoint` to be enabled. -* `enabledPlugin`: List of enabled plugins. + +## Agent Configuration + +* `logLevel`: Define the level of logs to store. +* `enabledPlugin_linux`: List of enabled plugins. * `metricsInterval`: Interval for gathering metrics (in seconds). (@deprecated, use `metricsIntervalDuration` instead) * `metricsIntervalDuration`: Interval for gathering metrics (in `time.Duration`). 
+* `enablePodLevel`: Enables gathering of advanced pod-level metrics, attaching pods' metadata to Retina's metrics. +* `enableConntrackMetrics`: Enables conntrack metrics for packets and bytes forwarded/received. +* `enableAnnotations`: Enables gathering of metrics for annotated resources. Resources can be annotated with `retina.sh=observe`. Requires the operator and `operator.enableRetinaEndpoint` to be enabled. * `bypassLookupIPOfInterest`: If true, plugins like `packetparser` and `dropreason` will bypass IP lookup, generating an event for each packet regardless. `enableAnnotations` will not work if this is true. * `dataAggregationLevel`: Defines the level of data aggregation for Retina. See [Data Aggregation](../05-Concepts/data-aggregation.md) for more details. -## Operator Config +## Operator Configuration -* `installCRDs`: Allows the operator to manage the installation of Retina-related CRDs. -* `enableTelemetry`: Enables telemetry for the operator in managed AKS clusters. Requires `buildinfo.ApplicationInsightsID` to be set if enabled. -* `captureDebug`: Toggles debug mode for captures. If true, the operator uses the image from the test container registry for the capture workload. Refer to *pkg/capture/utils/capture_image.go* for details on how the debug capture image version is selected. -* `captureJobNumLimit`: Sets the maximum number of jobs that can be created for each Capture. -* `enableRetinaEndpoint`: Allows the operator to monitor and update the cache with Pod metadata. -* `enableManagedStorageAccount`: Enables the use of a managed storage account for storing artifacts. +* `operator.installCRDs`: Allows the operator to manage the installation of Retina-related CRDs. +* `operator.enableRetinaEndpoint`: Allows the operator to monitor and update the cache with Pod metadata. +* `capture.captureDebug`: Toggles debug mode for captures. If true, the operator uses the image from the test container registry for the capture workload. 
Refer to [Capture Image file](../../pkg/capture/utils/capture_image.go) for details on how the debug capture image version is selected. +* `capture.captureJobNumLimit`: Sets the maximum number of jobs that can be created for each Capture. +* `capture.enableManagedStorageAccount`: Enables the use of a managed storage account for storing artifacts. diff --git a/docs/02-Installation/04-prometheus.md b/docs/02-Installation/04-prometheus.md index 781c610289..bebabb90b6 100644 --- a/docs/02-Installation/04-prometheus.md +++ b/docs/02-Installation/04-prometheus.md @@ -6,6 +6,7 @@ Prometheus is an open-source system monitoring and alerting toolkit originally b 1. Create a Kubernetes cluster. 2. Install Retina DaemonSet (see [Quick Installation](./01-Setup.md)). +3. Clone [Retina Repository](https://github.com/microsoft/retina) or download [Prometheus Values File](../../deploy/legacy/prometheus/values.yaml). ## Install Prometheus via Helm @@ -19,13 +20,17 @@ Prometheus is an open-source system monitoring and alerting toolkit originally b 1. Install the Prometheus chart ```shell - helm install prometheus -n kube-system -f deploy/legacy/prometheus/values.yaml prometheus-community/kube-prometheus-stack + # The value of VALUE_FILE_PATH is relative to the repo root folder. Update this according to the location of your file. + VALUE_FILE_PATH=deploy/legacy/prometheus/values.yaml + helm install prometheus -n kube-system -f $VALUE_FILE_PATH prometheus-community/kube-prometheus-stack ``` Or if you already have the chart installed, upgrade how you see fit, providing the new job name as an additional scrape config, ex: ```shell - helm upgrade prometheus -n kube-system -f deploy/legacy/prometheus/values.yaml prometheus-community/kube-prometheus-stack + # The value of VALUE_FILE_PATH is relative to the repo root folder. Update this according to the location of your file. 
+ VALUE_FILE_PATH=deploy/legacy/prometheus/values.yaml + helm upgrade prometheus -n kube-system -f $VALUE_FILE_PATH prometheus-community/kube-prometheus-stack ``` > Note: Grafana and kube-state metrics may schedule on Windows nodes, the current chart doesn't have node affinity for those components. Some manual intervention may be required. diff --git a/go.mod b/go.mod index fb0bc2a7d1..aa54766e00 100644 --- a/go.mod +++ b/go.mod @@ -37,7 +37,7 @@ require ( github.com/Azure/go-autorest/autorest/date v0.3.0 // indirect github.com/Azure/go-autorest/logger v0.2.1 // indirect github.com/Azure/go-autorest/tracing v0.6.0 // indirect - github.com/AzureAD/microsoft-authentication-library-for-go v1.3.1 // indirect + github.com/AzureAD/microsoft-authentication-library-for-go v1.3.2 // indirect github.com/BurntSushi/toml v1.3.2 // indirect github.com/MakeNowJust/heredoc v1.0.0 // indirect github.com/Masterminds/goutils v1.1.1 // indirect @@ -48,18 +48,18 @@ require ( github.com/armon/go-metrics v0.4.1 // indirect github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2 // indirect github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.6.7 // indirect - github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.16.23 // indirect - github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.27 // indirect - github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.27 // indirect + github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.16.24 // indirect + github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.28 // indirect + github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.28 // indirect github.com/aws/aws-sdk-go-v2/internal/ini v1.8.1 // indirect - github.com/aws/aws-sdk-go-v2/internal/v4a v1.3.27 // indirect + github.com/aws/aws-sdk-go-v2/internal/v4a v1.3.28 // indirect github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.12.1 // indirect - github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.4.8 // indirect - 
github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.12.8 // indirect - github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.18.8 // indirect - github.com/aws/aws-sdk-go-v2/service/sso v1.24.9 // indirect - github.com/aws/aws-sdk-go-v2/service/ssooidc v1.28.8 // indirect - github.com/aws/aws-sdk-go-v2/service/sts v1.33.6 // indirect + github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.5.0 // indirect + github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.12.9 // indirect + github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.18.9 // indirect + github.com/aws/aws-sdk-go-v2/service/sso v1.24.10 // indirect + github.com/aws/aws-sdk-go-v2/service/ssooidc v1.28.9 // indirect + github.com/aws/aws-sdk-go-v2/service/sts v1.33.8 // indirect github.com/aws/smithy-go v1.22.1 // indirect github.com/beorn7/perks v1.0.1 // indirect github.com/blang/semver/v4 v4.0.0 // indirect @@ -262,7 +262,7 @@ require ( golang.org/x/sync v0.10.0 golang.org/x/sys v0.29.0 golang.org/x/term v0.28.0 // indirect - google.golang.org/protobuf v1.36.1 + google.golang.org/protobuf v1.36.3 gopkg.in/yaml.v2 v2.4.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect k8s.io/api v0.30.3 @@ -279,7 +279,7 @@ require ( github.com/Azure/azure-container-networking/zapai v0.0.3 github.com/Azure/azure-sdk-for-go v68.0.0+incompatible github.com/Azure/azure-sdk-for-go/sdk/azcore v1.17.0 - github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.8.0 + github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.8.1 github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/containerservice/armcontainerservice/v4 v4.8.0 github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/dashboard/armdashboard v1.2.0 github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/monitor/armmonitor v0.11.0 @@ -289,10 +289,10 @@ require ( github.com/Azure/azure-sdk-for-go/sdk/storage/azblob v1.5.0 github.com/Microsoft/hcsshim v0.12.0-rc.3 github.com/Sytten/logrus-zap-hook v0.1.0 - github.com/aws/aws-sdk-go-v2 v1.32.8 - 
github.com/aws/aws-sdk-go-v2/config v1.28.7 - github.com/aws/aws-sdk-go-v2/credentials v1.17.51 - github.com/aws/aws-sdk-go-v2/service/s3 v1.72.2 + github.com/aws/aws-sdk-go-v2 v1.33.0 + github.com/aws/aws-sdk-go-v2/config v1.29.0 + github.com/aws/aws-sdk-go-v2/credentials v1.17.53 + github.com/aws/aws-sdk-go-v2/service/s3 v1.73.0 github.com/cakturk/go-netstat v0.0.0-20200220111822-e5b49efee7a5 github.com/cilium/cilium v1.16.0-pre.1.0.20240403152809-b9853ecbcaeb github.com/cilium/ebpf v0.16.0 diff --git a/go.sum b/go.sum index 58a6824915..198a78c413 100644 --- a/go.sum +++ b/go.sum @@ -15,10 +15,10 @@ github.com/Azure/azure-sdk-for-go v68.0.0+incompatible h1:fcYLmCpyNYRnvJbPerq7U0 github.com/Azure/azure-sdk-for-go v68.0.0+incompatible/go.mod h1:9XXNKU+eRnpl9moKnB4QOLf1HestfXbmab5FXxiDBjc= github.com/Azure/azure-sdk-for-go/sdk/azcore v1.17.0 h1:g0EZJwz7xkXQiZAI5xi9f3WWFYBlX1CPTrR+NDToRkQ= github.com/Azure/azure-sdk-for-go/sdk/azcore v1.17.0/go.mod h1:XCW7KnZet0Opnr7HccfUw1PLc4CjHqpcaxW8DHklNkQ= -github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.8.0 h1:B/dfvscEQtew9dVuoxqxrUKKv8Ih2f55PydknDamU+g= -github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.8.0/go.mod h1:fiPSssYvltE08HJchL04dOy+RD4hgrjph0cwGGMntdI= -github.com/Azure/azure-sdk-for-go/sdk/azidentity/cache v0.3.0 h1:+m0M/LFxN43KvULkDNfdXOgrjtg6UYJPFBJyuEcRCAw= -github.com/Azure/azure-sdk-for-go/sdk/azidentity/cache v0.3.0/go.mod h1:PwOyop78lveYMRs6oCxjiVyBdyCgIYH6XHIVZO9/SFQ= +github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.8.1 h1:1mvYtZfWQAnwNah/C+Z+Jb9rQH95LPE2vlmMuWAHJk8= +github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.8.1/go.mod h1:75I/mXtme1JyWFtz8GocPHVFyH421IBoZErnO16dd0k= +github.com/Azure/azure-sdk-for-go/sdk/azidentity/cache v0.3.1 h1:Bk5uOhSAenHyR5P61D/NzeQCv+4fEVV8mOkJ82NqpWw= +github.com/Azure/azure-sdk-for-go/sdk/azidentity/cache v0.3.1/go.mod h1:QZ4pw3or1WPmRBxf0cHd1tknzrT54WPBOQoGutCPvSU= github.com/Azure/azure-sdk-for-go/sdk/internal v1.10.0 
h1:ywEEhmNahHBihViHepv3xPBn1663uRv2t2q/ESv9seY= github.com/Azure/azure-sdk-for-go/sdk/internal v1.10.0/go.mod h1:iZDifYGJTIgIIkYRNWPENUnqx6bJ2xnSDFI2tjwZNuY= github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/authorization/armauthorization/v2 v2.2.0 h1:Hp+EScFOu9HeCbeW8WU2yQPJd4gGwhMgKxWe+G6jNzw= @@ -87,8 +87,8 @@ github.com/Azure/perf-tests/network/benchmarks/netperf v0.0.0-20241008140716-395 github.com/Azure/perf-tests/network/benchmarks/netperf v0.0.0-20241008140716-395a79947d2c/go.mod h1:jeV6A8q9uDVDwffTt5KBk+5g7bXfpEImYW6qLKn0E+I= github.com/AzureAD/microsoft-authentication-extensions-for-go/cache v0.1.1 h1:WJTmL004Abzc5wDB5VtZG2PJk5ndYDgVacGqfirKxjM= github.com/AzureAD/microsoft-authentication-extensions-for-go/cache v0.1.1/go.mod h1:tCcJZ0uHAmvjsVYzEFivsRTN00oz5BEsRgQHu5JZ9WE= -github.com/AzureAD/microsoft-authentication-library-for-go v1.3.1 h1:gUDtaZk8heteyfdmv+pcfHvhR9llnh7c7GMwZ8RVG04= -github.com/AzureAD/microsoft-authentication-library-for-go v1.3.1/go.mod h1:wP83P5OoQ5p6ip3ScPr0BAq0BvuPAvacpEuSzyouqAI= +github.com/AzureAD/microsoft-authentication-library-for-go v1.3.2 h1:kYRSnvJju5gYVyhkij+RTJ/VR6QIUaCfWeaFm2ycsjQ= +github.com/AzureAD/microsoft-authentication-library-for-go v1.3.2/go.mod h1:wP83P5OoQ5p6ip3ScPr0BAq0BvuPAvacpEuSzyouqAI= github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= github.com/BurntSushi/toml v1.3.2 h1:o7IhLm0Msx3BaB+n3Ag7L8EVlByGnpq14C4YWiu/gL8= github.com/BurntSushi/toml v1.3.2/go.mod h1:CxXYINrC8qIiEnFrOxCa7Jy5BFHlXnUU2pbicEuybxQ= @@ -129,40 +129,40 @@ github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5 h1:0CwZNZbxp69SHPd github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5/go.mod h1:wHh0iHkYZB8zMSxRWpUBQtwG5a7fFgvEO+odwuTv2gs= github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2 h1:DklsrG3dyBCFEj5IhUbnKptjxatkF07cF2ak3yi77so= github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2/go.mod h1:WaHUgvxTVq04UNunO+XhnAqY/wQc+bxr74GqbsZ/Jqw= 
-github.com/aws/aws-sdk-go-v2 v1.32.8 h1:cZV+NUS/eGxKXMtmyhtYPJ7Z4YLoI/V8bkTdRZfYhGo= -github.com/aws/aws-sdk-go-v2 v1.32.8/go.mod h1:P5WJBrYqqbWVaOxgH0X/FYYD47/nooaPOZPlQdmiN2U= +github.com/aws/aws-sdk-go-v2 v1.33.0 h1:Evgm4DI9imD81V0WwD+TN4DCwjUMdc94TrduMLbgZJs= +github.com/aws/aws-sdk-go-v2 v1.33.0/go.mod h1:P5WJBrYqqbWVaOxgH0X/FYYD47/nooaPOZPlQdmiN2U= github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.6.7 h1:lL7IfaFzngfx0ZwUGOZdsFFnQ5uLvR0hWqqhyE7Q9M8= github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.6.7/go.mod h1:QraP0UcVlQJsmHfioCrveWOC1nbiWUl3ej08h4mXWoc= -github.com/aws/aws-sdk-go-v2/config v1.28.7 h1:GduUnoTXlhkgnxTD93g1nv4tVPILbdNQOzav+Wpg7AE= -github.com/aws/aws-sdk-go-v2/config v1.28.7/go.mod h1:vZGX6GVkIE8uECSUHB6MWAUsd4ZcG2Yq/dMa4refR3M= -github.com/aws/aws-sdk-go-v2/credentials v1.17.51 h1:F/9Sm6Y6k4LqDesZDPJCLxQGXNNHd/ZtJiWd0lCZKRk= -github.com/aws/aws-sdk-go-v2/credentials v1.17.51/go.mod h1:TKbzCHm43AoPyA+iLGGcruXd4AFhF8tOmLex2R9jWNQ= -github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.16.23 h1:IBAoD/1d8A8/1aA8g4MBVtTRHhXRiNAgwdbo/xRM2DI= -github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.16.23/go.mod h1:vfENuCM7dofkgKpYzuzf1VT1UKkA/YL3qanfBn7HCaA= -github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.27 h1:jSJjSBzw8VDIbWv+mmvBSP8ezsztMYJGH+eKqi9AmNs= -github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.27/go.mod h1:/DAhLbFRgwhmvJdOfSm+WwikZrCuUJiA4WgJG0fTNSw= -github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.27 h1:l+X4K77Dui85pIj5foXDhPlnqcNRG2QUyvca300lXh8= -github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.27/go.mod h1:KvZXSFEXm6x84yE8qffKvT3x8J5clWnVFXphpohhzJ8= +github.com/aws/aws-sdk-go-v2/config v1.29.0 h1:Vk/u4jof33or1qAQLdofpjKV7mQQT7DcUpnYx8kdmxY= +github.com/aws/aws-sdk-go-v2/config v1.29.0/go.mod h1:iXAZK3Gxvpq3tA+B9WaDYpZis7M8KFgdrDPMmHrgbJM= +github.com/aws/aws-sdk-go-v2/credentials v1.17.53 h1:lwrVhiEDW5yXsuVKlFVUnR2R50zt2DklhOyeLETqDuE= +github.com/aws/aws-sdk-go-v2/credentials v1.17.53/go.mod 
h1:CkqM1bIw/xjEpBMhBnvqUXYZbpCFuj6dnCAyDk2AtAY= +github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.16.24 h1:5grmdTdMsovn9kPZPI23Hhvp0ZyNm5cRO+IZFIYiAfw= +github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.16.24/go.mod h1:zqi7TVKTswH3Ozq28PkmBmgzG1tona7mo9G2IJg4Cis= +github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.28 h1:igORFSiH3bfq4lxKFkTSYDhJEUCYo6C8VKiWJjYwQuQ= +github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.28/go.mod h1:3So8EA/aAYm36L7XIvCVwLa0s5N0P7o2b1oqnx/2R4g= +github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.28 h1:1mOW9zAUMhTSrMDssEHS/ajx8JcAj/IcftzcmNlmVLI= +github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.28/go.mod h1:kGlXVIWDfvt2Ox5zEaNglmq0hXPHgQFNMix33Tw22jA= github.com/aws/aws-sdk-go-v2/internal/ini v1.8.1 h1:VaRN3TlFdd6KxX1x3ILT5ynH6HvKgqdiXoTxAF4HQcQ= github.com/aws/aws-sdk-go-v2/internal/ini v1.8.1/go.mod h1:FbtygfRFze9usAadmnGJNc8KsP346kEe+y2/oyhGAGc= -github.com/aws/aws-sdk-go-v2/internal/v4a v1.3.27 h1:AmB5QxnD+fBFrg9LcqzkgF/CaYvMyU/BTlejG4t1S7Q= -github.com/aws/aws-sdk-go-v2/internal/v4a v1.3.27/go.mod h1:Sai7P3xTiyv9ZUYO3IFxMnmiIP759/67iQbU4kdmkyU= +github.com/aws/aws-sdk-go-v2/internal/v4a v1.3.28 h1:7kpeALOUeThs2kEjlAxlADAVfxKmkYAedlpZ3kdoSJ4= +github.com/aws/aws-sdk-go-v2/internal/v4a v1.3.28/go.mod h1:pyaOYEdp1MJWgtXLy6q80r3DhsVdOIOZNB9hdTcJIvI= github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.12.1 h1:iXtILhvDxB6kPvEXgsDhGaZCSC6LQET5ZHSdJozeI0Y= github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.12.1/go.mod h1:9nu0fVANtYiAePIBh2/pFUSwtJ402hLnp854CNoDOeE= -github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.4.8 h1:iwYS40JnrBeA9e9aI5S6KKN4EB2zR4iUVYN0nwVivz4= -github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.4.8/go.mod h1:Fm9Mi+ApqmFiknZtGpohVcBGvpTu542VC4XO9YudRi0= -github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.12.8 h1:cWno7lefSH6Pp+mSznagKCgfDGeZRin66UvYUqAkyeA= -github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.12.8/go.mod 
h1:tPD+VjU3ABTBoEJ3nctu5Nyg4P4yjqSH5bJGGkY4+XE= -github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.18.8 h1:/Mn7gTedG86nbpjT4QEKsN1D/fThiYe1qvq7WsBGNHg= -github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.18.8/go.mod h1:Ae3va9LPmvjj231ukHB6UeT8nS7wTPfC3tMZSZMwNYg= -github.com/aws/aws-sdk-go-v2/service/s3 v1.72.2 h1:a7aQ3RW+ug4IbhoQp29NZdc7vqrzKZZfWZSaQAXOZvQ= -github.com/aws/aws-sdk-go-v2/service/s3 v1.72.2/go.mod h1:xMekrnhmJ5aqmyxtmALs7mlvXw5xRh+eYjOjvrIIFJ4= -github.com/aws/aws-sdk-go-v2/service/sso v1.24.9 h1:YqtxripbjWb2QLyzRK9pByfEDvgg95gpC2AyDq4hFE8= -github.com/aws/aws-sdk-go-v2/service/sso v1.24.9/go.mod h1:lV8iQpg6OLOfBnqbGMBKYjilBlf633qwHnBEiMSPoHY= -github.com/aws/aws-sdk-go-v2/service/ssooidc v1.28.8 h1:6dBT1Lz8fK11m22R+AqfRsFn8320K0T5DTGxxOQBSMw= -github.com/aws/aws-sdk-go-v2/service/ssooidc v1.28.8/go.mod h1:/kiBvRQXBc6xeJTYzhSdGvJ5vm1tjaDEjH+MSeRJnlY= -github.com/aws/aws-sdk-go-v2/service/sts v1.33.6 h1:VwhTrsTuVn52an4mXx29PqRzs2Dvu921NpGk7y43tAM= -github.com/aws/aws-sdk-go-v2/service/sts v1.33.6/go.mod h1:+8h7PZb3yY5ftmVLD7ocEoE98hdc8PoKS0H3wfx1dlc= +github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.5.0 h1:pC19SLXdHsfXTvCwy3sHfiACXaSjRkKlOQYnaTk8loI= +github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.5.0/go.mod h1:dIW8puxSbYLSPv/ju0d9A3CpwXdtqvJtYKDMVmPLOWE= +github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.12.9 h1:TQmKDyETFGiXVhZfQ/I0cCFziqqX58pi4tKJGYGFSz0= +github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.12.9/go.mod h1:HVLPK2iHQBUx7HfZeOQSEu3v2ubZaAY2YPbAm5/WUyY= +github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.18.9 h1:2aInXbh02XsbO0KobPGMNXyv2QP73VDKsWPNJARj/+4= +github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.18.9/go.mod h1:dgXS1i+HgWnYkPXqNoPIPKeUsUUYHaUbThC90aDnNiE= +github.com/aws/aws-sdk-go-v2/service/s3 v1.73.0 h1:sHF4brL/726nbTldh8GGDKFS5LsQ8FwOTKEyvKp9DB4= +github.com/aws/aws-sdk-go-v2/service/s3 v1.73.0/go.mod h1:rGHXqEgGFrz7j58tIGKKAfD1fJzYXeKkN/Jn3eIRZYE= 
+github.com/aws/aws-sdk-go-v2/service/sso v1.24.10 h1:DyZUj3xSw3FR3TXSwDhPhuZkkT14QHBiacdbUVcD0Dg= +github.com/aws/aws-sdk-go-v2/service/sso v1.24.10/go.mod h1:Ro744S4fKiCCuZECXgOi760TiYylUM8ZBf6OGiZzJtY= +github.com/aws/aws-sdk-go-v2/service/ssooidc v1.28.9 h1:I1TsPEs34vbpOnR81GIcAq4/3Ud+jRHVGwx6qLQUHLs= +github.com/aws/aws-sdk-go-v2/service/ssooidc v1.28.9/go.mod h1:Fzsj6lZEb8AkTE5S68OhcbBqeWPsR8RnGuKPr8Todl8= +github.com/aws/aws-sdk-go-v2/service/sts v1.33.8 h1:pqEJQtlKWvnv3B6VRt60ZmsHy3SotlEBvfUBPB1KVcM= +github.com/aws/aws-sdk-go-v2/service/sts v1.33.8/go.mod h1:f6vjfZER1M17Fokn0IzssOTMT2N8ZSq+7jnNF0tArvw= github.com/aws/smithy-go v1.22.1 h1:/HPHZQ0g7f4eUeK6HKglFz8uwVfZKgoI25rb/J+dnro= github.com/aws/smithy-go v1.22.1/go.mod h1:irrKGvNn1InZwb2d7fkIRNucdfwR8R+Ts3wxYa/cJHg= github.com/beorn7/perks v0.0.0-20180321164747-3a771d992973/go.mod h1:Dwedo/Wpr24TaqPxmxbtue+5NUziq4I4S80YR8gNf3Q= @@ -779,8 +779,8 @@ github.com/prometheus/procfs v0.0.3/go.mod h1:4A/X28fw3Fc593LaREMrKMqOKvUAntwMDa github.com/prometheus/procfs v0.0.8/go.mod h1:7Qr8sr6344vo1JqZ6HhLceV9o3AJ1Ff+GxbHq6oeK9A= github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0learggepc= github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk= -github.com/redis/go-redis/v9 v9.6.1 h1:HHDteefn6ZkTtY5fGUE8tj8uy85AHk6zP7CpzIAM0y4= -github.com/redis/go-redis/v9 v9.6.1/go.mod h1:0C0c6ycQsdpVNQpxb1njEQIqkx5UcsM8FJCQLgE9+RA= +github.com/redis/go-redis/v9 v9.7.0 h1:HhLSs+B6O021gwzl+locl0zEDnyNkxMtf/Z3NNBMa9E= +github.com/redis/go-redis/v9 v9.7.0/go.mod h1:f6zhXITC7JUJIlPEiBOTXxJgPLdZcA93GewI7inzyWw= github.com/rogpeppe/fastuuid v1.2.0/go.mod h1:jVj6XXZzXRy/MSR5jhDC/2q6DgLz+nrA6LYCDYWNEvQ= github.com/rogpeppe/go-internal v1.6.1/go.mod h1:xXDCJY+GAPziupqXw64V24skbSoqbTEfhy4qGm1nDQc= github.com/rogpeppe/go-internal v1.9.0/go.mod h1:WtVeX8xhTBvf0smdhujwtBcq4Qrzq/fJaraNFVN+nFs= @@ -1153,8 +1153,8 @@ google.golang.org/protobuf v1.26.0-rc.1/go.mod 
h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp0 google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= google.golang.org/protobuf v1.27.1/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= google.golang.org/protobuf v1.28.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I= -google.golang.org/protobuf v1.36.1 h1:yBPeRvTftaleIgM3PZ/WBIZ7XM/eEYAaEyCwvyjq/gk= -google.golang.org/protobuf v1.36.1/go.mod h1:9fA7Ob0pmnwhb644+1+CVWFRbNajQ6iRojtC/QF5bRE= +google.golang.org/protobuf v1.36.3 h1:82DV7MYdb8anAVi3qge1wSnMDrnKK7ebr+I0hHRN1BU= +google.golang.org/protobuf v1.36.3/go.mod h1:9fA7Ob0pmnwhb644+1+CVWFRbNajQ6iRojtC/QF5bRE= gopkg.in/alecthomas/kingpin.v2 v2.2.6/go.mod h1:FMv+mEhP44yOT+4EoQTLFTRgOQ1FBLkstjWtayDeSgw= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= diff --git a/test/e2e/common/common.go b/test/e2e/common/common.go index 9772320685..2aa29f05dd 100644 --- a/test/e2e/common/common.go +++ b/test/e2e/common/common.go @@ -22,6 +22,7 @@ const ( KubeSystemNamespace = "kube-system" TestPodNamespace = "kube-system-test" AzureAppInsightsKeyEnv = "AZURE_APP_INSIGHTS_KEY" + OutputFilePathEnv = "OUTPUT_FILEPATH" ) var ( diff --git a/test/e2e/framework/kubernetes/check-pod-status.go b/test/e2e/framework/kubernetes/check-pod-status.go index 27405031bb..197b32c964 100644 --- a/test/e2e/framework/kubernetes/check-pod-status.go +++ b/test/e2e/framework/kubernetes/check-pod-status.go @@ -14,8 +14,9 @@ import ( ) const ( - RetryTimeoutPodsReady = 5 * time.Minute - RetryIntervalPodsReady = 5 * time.Second + RetryTimeoutPodsReady = 5 * time.Minute + RetryIntervalPodsReady = 5 * time.Second + timeoutWaitForPodsSeconds = 1200 printInterval = 5 // print to stdout every 5 iterations ) @@ -48,7 +49,7 @@ func (w *WaitPodsReady) Run() error { return fmt.Errorf("error creating Kubernetes client: 
%w", err) } - ctx, cancel := context.WithTimeout(context.Background(), defaultTimeoutSeconds*time.Second) + ctx, cancel := context.WithTimeout(context.Background(), timeoutWaitForPodsSeconds*time.Second) defer cancel() return WaitForPodReady(ctx, clientset, w.Namespace, w.LabelSelector) @@ -60,7 +61,6 @@ func (w *WaitPodsReady) Stop() error { } func WaitForPodReady(ctx context.Context, clientset *kubernetes.Clientset, namespace, labelSelector string) error { - podReadyMap := make(map[string]bool) printIterator := 0 conditionFunc := wait.ConditionWithContextFunc(func(context.Context) (bool, error) { @@ -78,34 +78,25 @@ func WaitForPodReady(ctx context.Context, clientset *kubernetes.Clientset, names return false, nil } - // check each indviidual pod to see if it's in Running state + // check each individual pod to see if it's in Running state for i := range podList.Items { - var pod *corev1.Pod - pod, err = clientset.CoreV1().Pods(namespace).Get(ctx, podList.Items[i].Name, metav1.GetOptions{}) - if err != nil { - return false, fmt.Errorf("error getting Pod: %w", err) - } // Check the Pod phase - if pod.Status.Phase != corev1.PodRunning { + if podList.Items[i].Status.Phase != corev1.PodRunning { if printIterator%printInterval == 0 { - log.Printf("pod \"%s\" is not in Running state yet. Waiting...\n", pod.Name) + log.Printf("pod \"%s\" is not in Running state yet. Waiting...\n", podList.Items[i].Name) } return false, nil } // Check all container status. - for _, containerStatus := range pod.Status.ContainerStatuses { - if !containerStatus.Ready { - log.Printf("container \"%s\" in pod \"%s\" is not ready yet. Waiting...\n", containerStatus.Name, pod.Name) + for j := range podList.Items[i].Status.ContainerStatuses { + if !podList.Items[i].Status.ContainerStatuses[j].Ready { + log.Printf("container \"%s\" in pod \"%s\" is not ready yet. 
Waiting...\n", podList.Items[i].Status.ContainerStatuses[j].Name, podList.Items[i].Name) return false, nil } } - if !podReadyMap[pod.Name] { - log.Printf("pod \"%s\" is in Running state\n", pod.Name) - podReadyMap[pod.Name] = true - } } log.Printf("all pods in namespace \"%s\" with label \"%s\" are in Running state\n", namespace, labelSelector) return true, nil diff --git a/test/e2e/framework/kubernetes/create-kapinger-deployment.go b/test/e2e/framework/kubernetes/create-kapinger-deployment.go index 06862e1c09..a895625e32 100644 --- a/test/e2e/framework/kubernetes/create-kapinger-deployment.go +++ b/test/e2e/framework/kubernetes/create-kapinger-deployment.go @@ -138,7 +138,7 @@ func (c *CreateKapingerDeployment) GetKapingerDeployment() *appsv1.Deployment { "memory": resource.MustParse("20Mi"), }, Limits: v1.ResourceList{ - "memory": resource.MustParse("20Mi"), + "memory": resource.MustParse("100Mi"), }, }, Ports: []v1.ContainerPort{ diff --git a/test/e2e/framework/kubernetes/delete-namespace.go b/test/e2e/framework/kubernetes/delete-namespace.go index c5fa3dbc66..a8bd41c3ef 100644 --- a/test/e2e/framework/kubernetes/delete-namespace.go +++ b/test/e2e/framework/kubernetes/delete-namespace.go @@ -30,7 +30,7 @@ func (d *DeleteNamespace) Run() error { return fmt.Errorf("error creating Kubernetes client: %w", err) } - ctx, cancel := context.WithTimeout(context.Background(), defaultTimeoutSeconds*time.Second) + ctx, cancel := context.WithTimeout(context.Background(), 1200*time.Second) defer cancel() err = clientset.CoreV1().Namespaces().Delete(ctx, d.Namespace, metaV1.DeleteOptions{}) @@ -41,7 +41,7 @@ func (d *DeleteNamespace) Run() error { } backoff := wait.Backoff{ - Steps: 6, + Steps: 9, Duration: 10 * time.Second, Factor: 2.0, // Jitter: 0.1, diff --git a/test/e2e/framework/kubernetes/install-retina-helm.go b/test/e2e/framework/kubernetes/install-retina-helm.go index 7f1828f17c..ba74d64eac 100644 --- a/test/e2e/framework/kubernetes/install-retina-helm.go +++ 
b/test/e2e/framework/kubernetes/install-retina-helm.go @@ -91,6 +91,7 @@ func (i *InstallHelmChart) Run() error { chart.Values["image"].(map[string]interface{})["repository"] = imageRegistry + "/" + imageNamespace + "/retina-agent" chart.Values["image"].(map[string]interface{})["initRepository"] = imageRegistry + "/" + imageNamespace + "/retina-init" chart.Values["operator"].(map[string]interface{})["repository"] = imageRegistry + "/" + imageNamespace + "/retina-operator" + chart.Values["operator"].(map[string]interface{})["enabled"] = true getclient := action.NewGet(actionConfig) release, err := getclient.Run(i.ReleaseName) diff --git a/test/e2e/framework/scaletest/add-shared-labels.go b/test/e2e/framework/scaletest/add-shared-labels.go index d76139c0be..6a38be4f5d 100644 --- a/test/e2e/framework/scaletest/add-shared-labels.go +++ b/test/e2e/framework/scaletest/add-shared-labels.go @@ -4,6 +4,7 @@ import ( "context" "encoding/json" "fmt" + "log" "time" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -50,32 +51,21 @@ func (a *AddSharedLabelsToAllPods) Run() error { return fmt.Errorf("error creating Kubernetes client: %w", err) } - ctx, cancel := context.WithTimeout(context.Background(), defaultTimeoutSeconds*time.Second) + ctx, cancel := contextToLabelAllPods() defer cancel() resources, err := clientset.CoreV1().Pods(a.Namespace).List(ctx, metav1.ListOptions{}) - patch := []patchStringValue{} - - for i := 0; i < a.NumSharedLabelsPerPod; i++ { - patch = append(patch, patchStringValue{ - Op: "add", - Path: "/metadata/labels/shared-lab-" + fmt.Sprintf("%05d", i), - Value: "val", - }) - } - - patchBytes, err := json.Marshal(patch) + patchBytes, err := getSharedLabelsPatch(a.NumSharedLabelsPerPod) if err != nil { - return fmt.Errorf("error marshalling patch: %w", err) + return fmt.Errorf("error getting label patch: %w", err) } for _, resource := range resources.Items { - clientset.CoreV1().Pods(a.Namespace).Patch(ctx, resource.Name, - types.JSONPatchType, - patchBytes, 
- metav1.PatchOptions{}, - ) + err = patchLabel(ctx, clientset, a.Namespace, resource.Name, patchBytes) + if err != nil { + log.Printf("Error adding shared labels to pod %s: %s\n", resource.Name, err) + } } return nil @@ -85,3 +75,38 @@ func (a *AddSharedLabelsToAllPods) Run() error { func (a *AddSharedLabelsToAllPods) Stop() error { return nil } + +func patchLabel(ctx context.Context, clientset *kubernetes.Clientset, namespace, podName string, patchBytes []byte) error { + log.Println("Labeling Pod", podName) + _, err := clientset.CoreV1().Pods(namespace).Patch(ctx, podName, + types.JSONPatchType, + patchBytes, + metav1.PatchOptions{}, + ) + if err != nil { + return fmt.Errorf("failed to patch pod: %w", err) + } + + return nil +} + +func getSharedLabelsPatch(numLabels int) ([]byte, error) { + patch := []patchStringValue{} + for i := 0; i < numLabels; i++ { + patch = append(patch, patchStringValue{ + Op: "add", + Path: "/metadata/labels/shared-lab-" + fmt.Sprintf("%05d", i), + Value: "val", + }) + } + b, err := json.Marshal(patch) + if err != nil { + return nil, fmt.Errorf("error marshalling patch: %w", err) + } + + return b, nil +} + +func contextToLabelAllPods() (context.Context, context.CancelFunc) { + return context.WithTimeout(context.Background(), 120*time.Minute) +} diff --git a/test/e2e/framework/scaletest/add-unique-labels.go b/test/e2e/framework/scaletest/add-unique-labels.go index cfdd458c82..ff85764d8f 100644 --- a/test/e2e/framework/scaletest/add-unique-labels.go +++ b/test/e2e/framework/scaletest/add-unique-labels.go @@ -1,13 +1,10 @@ package scaletest import ( - "context" "encoding/json" "fmt" - "time" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/types" "k8s.io/client-go/kubernetes" "k8s.io/client-go/tools/clientcmd" ) @@ -44,7 +41,7 @@ func (a *AddUniqueLabelsToAllPods) Run() error { return fmt.Errorf("error creating Kubernetes client: %w", err) } - ctx, cancel := context.WithTimeout(context.Background(), 
defaultTimeoutSeconds*time.Second) + ctx, cancel := contextToLabelAllPods() defer cancel() resources, err := clientset.CoreV1().Pods(a.Namespace).List(ctx, metav1.ListOptions{}) @@ -53,7 +50,6 @@ func (a *AddUniqueLabelsToAllPods) Run() error { for _, resource := range resources.Items { patch := []patchStringValue{} - for i := 0; i < a.NumUniqueLabelsPerPod; i++ { patch = append(patch, patchStringValue{ Op: "add", @@ -65,14 +61,13 @@ func (a *AddUniqueLabelsToAllPods) Run() error { patchBytes, err := json.Marshal(patch) if err != nil { - return fmt.Errorf("error marshalling patch: %w", err) + return fmt.Errorf("failed to marshal patch: %w", err) } - clientset.CoreV1().Pods(a.Namespace).Patch(ctx, resource.Name, - types.JSONPatchType, - patchBytes, - metav1.PatchOptions{}, - ) + err = patchLabel(ctx, clientset, a.Namespace, resource.Name, patchBytes) + if err != nil { + return fmt.Errorf("error adding unique label to pod: %w", err) + } } return nil diff --git a/test/e2e/framework/scaletest/create-resources.go b/test/e2e/framework/scaletest/create-resources.go index 688ab57747..4057cdc826 100644 --- a/test/e2e/framework/scaletest/create-resources.go +++ b/test/e2e/framework/scaletest/create-resources.go @@ -7,6 +7,7 @@ import ( "time" e2ekubernetes "github.com/microsoft/retina/test/e2e/framework/kubernetes" + "github.com/microsoft/retina/test/retry" "k8s.io/apimachinery/pkg/runtime" "k8s.io/client-go/kubernetes" "k8s.io/client-go/tools/clientcmd" @@ -48,11 +49,18 @@ func (c *CreateResources) Run() error { return fmt.Errorf("error creating Kubernetes client: %w", err) } - ctx, cancel := context.WithTimeout(context.Background(), defaultTimeoutSeconds*time.Second) + ctx, cancel := context.WithTimeout(context.Background(), 1200*time.Second) defer cancel() + retrier := retry.Retrier{Attempts: defaultRetryAttempts, Delay: defaultRetryDelay} + for _, resource := range resources { - e2ekubernetes.CreateResource(ctx, resource, clientset) + err := retrier.Do(ctx, func() error 
{ + return e2ekubernetes.CreateResource(ctx, resource, clientset) + }) + if err != nil { + return fmt.Errorf("error creating resource: %w", err) + } } return nil @@ -71,12 +79,6 @@ func (c *CreateResources) getResources() []runtime.Object { // kwokDeployments := c.generateDeployments(c.NumKwokDeployments, c.NumKwokReplicas, "kwok") // objs = append(objs, kwokDeployments...) - realDeployments := c.generateDeployments() - objs = append(objs, realDeployments...) - - services := c.generateServices("real") - objs = append(objs, services...) - kapinger := e2ekubernetes.CreateKapingerDeployment{ KapingerNamespace: c.Namespace, KubeConfigFilePath: c.KubeConfigFilePath, @@ -88,6 +90,13 @@ func (c *CreateResources) getResources() []runtime.Object { kapingerSA := kapinger.GetKapingerServiceAccount() objs = append(objs, kapingerClusterRole, kapingerClusterRoleBinding, kapingerSA) + + realDeployments := c.generateDeployments() + objs = append(objs, realDeployments...) + + services := c.generateServices() + objs = append(objs, services...) 
+ // c.generateKwokNodes() log.Println("Finished generating YAMLs") return objs @@ -118,6 +127,8 @@ func (c *CreateResources) generateDeployments() []runtime.Object { labelPrefix := fmt.Sprintf("%s-dep-lab", name) deployment.Name = name + deployment.Labels["name"] = name + deployment.Spec.Template.Labels["name"] = name r := int32(c.NumRealReplicas) deployment.Spec.Replicas = &r @@ -135,7 +146,7 @@ func (c *CreateResources) generateDeployments() []runtime.Object { return objs } -func (c *CreateResources) generateServices(svcKind string) []runtime.Object { +func (c *CreateResources) generateServices() []runtime.Object { objs := []runtime.Object{} kapingerSvc := e2ekubernetes.CreateKapingerDeployment{ @@ -146,10 +157,10 @@ func (c *CreateResources) generateServices(svcKind string) []runtime.Object { for i := 0; i < c.NumRealServices; i++ { template := kapingerSvc.GetKapingerService() - name := fmt.Sprintf("%s-svc-%05d", svcKind, i) + name := fmt.Sprintf("%s-svc-%05d", c.RealPodType, i) template.Name = name - template.Spec.Selector["name"] = fmt.Sprintf("%s-%s-dep-%05d", svcKind, c.RealPodType, i) + template.Spec.Selector["name"] = fmt.Sprintf("%s-dep-%05d", c.RealPodType, i) objs = append(objs, template) } diff --git a/test/e2e/framework/scaletest/delete-and-re-add-labels.go b/test/e2e/framework/scaletest/delete-and-re-add-labels.go index 5897b4d766..3403ea2488 100644 --- a/test/e2e/framework/scaletest/delete-and-re-add-labels.go +++ b/test/e2e/framework/scaletest/delete-and-re-add-labels.go @@ -48,7 +48,7 @@ func (d *DeleteAndReAddLabels) Run() error { return fmt.Errorf("error creating Kubernetes client: %w", err) } - ctx, cancel := context.WithTimeout(context.Background(), defaultTimeoutSeconds*time.Second) + ctx, cancel := contextToLabelAllPods() defer cancel() labelsToDelete := `"shared-lab-00000": null, "shared-lab-00001": null, "shared-lab-00002": null` @@ -91,6 +91,7 @@ func (d *DeleteAndReAddLabels) Run() error { func (d *DeleteAndReAddLabels) addLabels(ctx 
context.Context, clientset *kubernetes.Clientset, pods *corev1.PodList, patch string) error { for _, pod := range pods.Items { + log.Println("Labeling Pod", pod.Name) _, err := clientset.CoreV1().Pods(d.Namespace).Patch(ctx, pod.Name, types.StrategicMergePatchType, []byte(patch), metav1.PatchOptions{}) if err != nil { return fmt.Errorf("error patching pod: %w", err) @@ -103,6 +104,7 @@ func (d *DeleteAndReAddLabels) addLabels(ctx context.Context, clientset *kuberne func (d *DeleteAndReAddLabels) deleteLabels(ctx context.Context, clientset *kubernetes.Clientset, pods *corev1.PodList, patch string) error { for _, pod := range pods.Items { + log.Println("Deleting label from Pod", pod.Name) _, err := clientset.CoreV1().Pods(d.Namespace).Patch(ctx, pod.Name, types.StrategicMergePatchType, []byte(patch), metav1.PatchOptions{}) if err != nil { return fmt.Errorf("error patching pod: %w", err) diff --git a/test/e2e/framework/scaletest/get-publish-metrics.go b/test/e2e/framework/scaletest/get-publish-metrics.go index 3495addf33..b8e168b239 100644 --- a/test/e2e/framework/scaletest/get-publish-metrics.go +++ b/test/e2e/framework/scaletest/get-publish-metrics.go @@ -6,29 +6,41 @@ import ( "fmt" "log" "os" - "sync" + "strconv" "time" "github.com/microsoft/retina/pkg/telemetry" "github.com/microsoft/retina/test/e2e/common" + "github.com/microsoft/retina/test/retry" "github.com/pkg/errors" + "golang.org/x/sync/errgroup" + v1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/labels" "k8s.io/client-go/kubernetes" "k8s.io/client-go/tools/clientcmd" + v1beta1 "k8s.io/metrics/pkg/apis/metrics/v1beta1" metrics "k8s.io/metrics/pkg/client/clientset/versioned" ) +const ( + defaultRetryAttempts = 10 + defaultRetryDelay = 500 * time.Millisecond + defaultInterval = 2 * time.Minute +) + type GetAndPublishMetrics struct { KubeConfigFilePath string AdditionalTelemetryProperty map[string]string Labels 
map[string]string - OutputFilePath string + outputFilePath string stop chan struct{} - wg sync.WaitGroup + errs *errgroup.Group telemetryClient *telemetry.TelemetryClient appInsightsKey string + k8sClient *kubernetes.Clientset + metricsClient *metrics.Clientset } func (g *GetAndPublishMetrics) Run() error { @@ -43,12 +55,36 @@ func (g *GetAndPublishMetrics) Run() error { g.telemetryClient = telemetryClient } + config, err := clientcmd.BuildConfigFromFlags("", g.KubeConfigFilePath) + if err != nil { + return fmt.Errorf("error building kubeconfig: %w", err) + } + + k8sClient, err := kubernetes.NewForConfig(config) + if err != nil { + return fmt.Errorf("error creating Kubernetes client: %w", err) + } + g.k8sClient = k8sClient + + metricsClient, err := metrics.NewForConfig(config) + if err != nil { + return fmt.Errorf("error creating metrics client: %w", err) + } + g.metricsClient = metricsClient + g.stop = make(chan struct{}) - g.wg.Add(1) + g.errs = new(errgroup.Group) - go func() { + g.errs.Go(func() error { - t := time.NewTicker(5 * time.Minute) + t := time.NewTicker(defaultInterval) + defer t.Stop() + + // First execution + err := g.getAndPublishMetrics() + if err != nil { + return fmt.Errorf("failed to get and publish test metrics: %w", err) + } for { select { @@ -56,18 +92,16 @@ func (g *GetAndPublishMetrics) Run() error { case <-t.C: err := g.getAndPublishMetrics() if err != nil { - log.Fatalf("error getting and publishing number of restarts: %v", err) - return + return fmt.Errorf("failed to get and publish test metrics: %w", err) } case <-g.stop: - g.wg.Done() - return + return nil } } - }() + }) return nil } @@ -75,66 +109,74 @@ func (g *GetAndPublishMetrics) Run() error { func (g *GetAndPublishMetrics) Stop() error { telemetry.ShutdownAppInsights() close(g.stop) - g.wg.Wait() + if err := g.errs.Wait(); err != nil { + return err //nolint:wrapcheck // already wrapped in goroutine + } + return nil } func (g *GetAndPublishMetrics) Prevalidate() error { - if 
os.Getenv(common.AzureAppInsightsKeyEnv) == "" { + if g.appInsightsKey == "" { log.Println("env ", common.AzureAppInsightsKeyEnv, " not provided") } - g.appInsightsKey = os.Getenv(common.AzureAppInsightsKeyEnv) if _, ok := g.AdditionalTelemetryProperty["retinaVersion"]; !ok { return fmt.Errorf("retinaVersion is required in AdditionalTelemetryProperty") } + + if g.outputFilePath == "" { + log.Println("Output file path not provided. Metrics will not be written to file") + return nil + } + + log.Println("Output file path provided: ", g.outputFilePath) return nil } func (g *GetAndPublishMetrics) getAndPublishMetrics() error { - config, err := clientcmd.BuildConfigFromFlags("", g.KubeConfigFilePath) - if err != nil { - return fmt.Errorf("error building kubeconfig: %w", err) - } + ctx, cancel := context.WithTimeout(context.Background(), defaultTimeoutSeconds*time.Second) + defer cancel() - clientset, err := kubernetes.NewForConfig(config) - if err != nil { - return fmt.Errorf("error creating Kubernetes client: %w", err) - } + labelSelector := labels.Set(g.Labels).String() - mc, err := metrics.NewForConfig(config) + agentsMetrics, err := g.getPodsMetrics(ctx, labelSelector) if err != nil { - return fmt.Errorf("error creating metrics client: %w", err) + log.Println("Error getting agents' metrics, will try again later:", err) + return nil } - ctx, cancel := context.WithTimeout(context.Background(), defaultTimeoutSeconds*time.Second) - defer cancel() - - metrics, err := g.getMetrics(ctx, clientset, mc) + operatorMetrics, err := g.getPodsMetrics(ctx, "app=retina-operator") if err != nil { - return fmt.Errorf("error getting metrics: %w", err) + log.Println("Error getting operator's metrics, will try again later:", err) + return nil } + allMetrics := []metric{} + allMetrics = append(allMetrics, agentsMetrics...) + allMetrics = append(allMetrics, operatorMetrics...) 
+ // Publish metrics if g.telemetryClient != nil { log.Println("Publishing metrics to AppInsights") - for _, metric := range metrics { + for _, metric := range allMetrics { g.telemetryClient.TrackEvent("scale-test", metric) } } // Write metrics to file - if g.OutputFilePath != "" { - log.Println("Writing metrics to file ", g.OutputFilePath) - file, err := os.OpenFile(g.OutputFilePath, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644) + if g.outputFilePath != "" { + log.Println("Writing metrics to file ", g.outputFilePath) + + file, err := os.OpenFile(g.outputFilePath, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0o644) if err != nil { return fmt.Errorf("error writing to csv file: %w", err) } defer file.Close() - for _, m := range metrics { + for _, m := range allMetrics { b, err := json.Marshal(m) if err != nil { return fmt.Errorf("error marshalling metric: %w", err) @@ -150,45 +192,83 @@ func (g *GetAndPublishMetrics) getAndPublishMetrics() error { type metric map[string]string -func (g *GetAndPublishMetrics) getMetrics(ctx context.Context, k8sClient *kubernetes.Clientset, metricsClient *metrics.Clientset) ([]metric, error) { +func (g *GetAndPublishMetrics) getPodsMetrics(ctx context.Context, labelSelector string) ([]metric, error) { - labelSelector := labels.Set(g.Labels).String() + var pods *v1.PodList - pods, err := k8sClient.CoreV1().Pods(common.KubeSystemNamespace).List(ctx, metav1.ListOptions{LabelSelector: labelSelector}) + retrier := retry.Retrier{Attempts: defaultRetryAttempts, Delay: defaultRetryDelay} + + err := retrier.Do(ctx, func() error { + var err error + pods, err = g.k8sClient.CoreV1().Pods(common.KubeSystemNamespace).List(ctx, metav1.ListOptions{LabelSelector: labelSelector}) + if err != nil { + return fmt.Errorf("error listing pods: %w", err) + } + return nil + }) if err != nil { - return nil, errors.Wrap(err, "error getting nodes") + return nil, errors.Wrap(err, "error getting pods") } - nodesMetricsInt := metricsClient.MetricsV1beta1().NodeMetricses() - 
podMetricsInt := metricsClient.MetricsV1beta1().PodMetricses(common.KubeSystemNamespace) + var nodeMetricsList *v1beta1.NodeMetricsList + err = retrier.Do(ctx, func() error { + nodeMetricsList, err = g.metricsClient.MetricsV1beta1().NodeMetricses().List(ctx, metav1.ListOptions{}) + if err != nil { + return fmt.Errorf("error listing node metrics: %w", err) + } + return nil + }) + if err != nil { + log.Println("Error getting node metrics:", err) + } + + var podMetricsList *v1beta1.PodMetricsList + err = retrier.Do(ctx, func() error { + podMetricsList, err = g.metricsClient.MetricsV1beta1().PodMetricses(common.KubeSystemNamespace).List(ctx, metav1.ListOptions{LabelSelector: labelSelector}) + if err != nil { + return fmt.Errorf("error listing pod metrics: %w", err) + } + return nil + }) + if err != nil { + log.Println("Error getting pod metrics:", err) + } var allPodsHealth []metric timestamp := time.Now().UTC().Format(time.RFC3339) + // List -> map for lookup + podMetrics := make(map[string]*v1beta1.PodMetrics) + for i := range podMetricsList.Items { + podMetrics[podMetricsList.Items[i].Name] = podMetricsList.Items[i].DeepCopy() + } + + // List -> map for lookup + nodeMetrics := make(map[string]*v1beta1.NodeMetrics) + for i := range nodeMetricsList.Items { + nodeMetrics[nodeMetricsList.Items[i].Name] = nodeMetricsList.Items[i].DeepCopy() + } + for _, pod := range pods.Items { var podHealth metric = make(map[string]string) - podMetrics, err := podMetricsInt.Get(ctx, pod.Name, metav1.GetOptions{}) - if err != nil { - return nil, errors.Wrap(err, "error getting pod metrics") - } - podMem := resource.MustParse("0") podCpu := resource.MustParse("0") - for _, cm := range podMetrics.Containers { - podMem.Add(cm.Usage["memory"]) - podCpu.Add(cm.Usage["cpu"]) + if podMetrics[pod.Name] != nil { + for _, cm := range podMetrics[pod.Name].Containers { + podMem.Add(cm.Usage["memory"]) + podCpu.Add(cm.Usage["cpu"]) + } } - nodeMetrics, err := nodesMetricsInt.Get(ctx, 
pod.Spec.NodeName, metav1.GetOptions{}) - if err != nil { - return nil, errors.Wrap(err, "error getting node metrics") + nodeMem := resource.MustParse("0") + nodeCPU := resource.MustParse("0") + if nodeMetrics[pod.Spec.NodeName] != nil { + nodeMem = nodeMetrics[pod.Spec.NodeName].Usage["memory"] + nodeCPU = nodeMetrics[pod.Spec.NodeName].Usage["cpu"] } - nodeMem := nodeMetrics.Usage["memory"] - nodeCpu := nodeMetrics.Usage["cpu"] - restarts := 0 for _, containerStatus := range pod.Status.ContainerStatuses { @@ -196,13 +276,14 @@ func (g *GetAndPublishMetrics) getMetrics(ctx context.Context, k8sClient *kubern } podHealth["timestamp"] = timestamp - podHealth["pod"] = pod.Name - podHealth["podCpuInMilliCore"] = fmt.Sprintf("%d", podCpu.MilliValue()) - podHealth["podMemoryInMB"] = fmt.Sprintf("%d", podMem.Value()/(1048576)) - podHealth["podRestarts"] = fmt.Sprintf("%d", restarts) - podHealth["node"] = pod.Spec.NodeName - podHealth["nodeCpuInMilliCore"] = fmt.Sprintf("%d", nodeCpu.MilliValue()) - podHealth["nodeMemoryInMB"] = fmt.Sprintf("%d", nodeMem.Value()/(1048576)) + podHealth["retinaPod"] = pod.Name + podHealth["podStatus"] = string(pod.Status.Phase) + podHealth["podCpuInMilliCore"] = strconv.FormatInt(podCpu.MilliValue(), 10) + podHealth["podMemoryInMB"] = strconv.FormatInt(podMem.Value()/(1048576), 10) + podHealth["podRestarts"] = strconv.FormatInt(int64(restarts), 10) + podHealth["retinaNode"] = pod.Spec.NodeName + podHealth["nodeCpuInMilliCore"] = strconv.FormatInt(nodeCPU.MilliValue(), 10) + podHealth["nodeMemoryInMB"] = strconv.FormatInt(nodeMem.Value()/(1048576), 10) allPodsHealth = append(allPodsHealth, podHealth) @@ -210,3 +291,13 @@ func (g *GetAndPublishMetrics) getMetrics(ctx context.Context, k8sClient *kubern return allPodsHealth, nil } + +func (g *GetAndPublishMetrics) SetAppInsightsKey(appInsightsKey string) *GetAndPublishMetrics { + g.appInsightsKey = appInsightsKey + return g +} + +func (g *GetAndPublishMetrics) SetOutputFilePath(outputFilePath 
string) *GetAndPublishMetrics { + g.outputFilePath = outputFilePath + return g +} diff --git a/test/e2e/framework/scaletest/options.go b/test/e2e/framework/scaletest/options.go index 6b5284422b..a7d27683b6 100644 --- a/test/e2e/framework/scaletest/options.go +++ b/test/e2e/framework/scaletest/options.go @@ -37,4 +37,5 @@ type Options struct { numRealPods int LabelsToGetMetrics map[string]string AdditionalTelemetryProperty map[string]string + CleanUp bool } diff --git a/test/e2e/jobs/scale.go b/test/e2e/jobs/scale.go index 89215785c1..58b5d49864 100644 --- a/test/e2e/jobs/scale.go +++ b/test/e2e/jobs/scale.go @@ -4,6 +4,7 @@ import ( "os" "time" + "github.com/microsoft/retina/test/e2e/common" "github.com/microsoft/retina/test/e2e/framework/kubernetes" "github.com/microsoft/retina/test/e2e/framework/scaletest" "github.com/microsoft/retina/test/e2e/framework/types" @@ -15,7 +16,7 @@ func DefaultScaleTestOptions() scaletest.Options { MaxKwokPodsPerNode: 0, NumKwokDeployments: 0, NumKwokReplicas: 0, - MaxRealPodsPerNode: 100, + MaxRealPodsPerNode: 250, NumRealDeployments: 1000, RealPodType: "kapinger", NumRealReplicas: 40, @@ -32,7 +33,7 @@ func DefaultScaleTestOptions() scaletest.Options { DeletePodsInterval: 60 * time.Second, DeleteRealPods: false, DeletePodsTimes: 1, - DeleteLabels: false, + DeleteLabels: true, DeleteLabelsInterval: 60 * time.Second, DeleteLabelsTimes: 1, DeleteNetworkPolicies: false, @@ -40,6 +41,7 @@ func DefaultScaleTestOptions() scaletest.Options { DeleteNetworkPoliciesTimes: 1, LabelsToGetMetrics: map[string]string{}, AdditionalTelemetryProperty: map[string]string{}, + CleanUp: true, } } @@ -63,14 +65,18 @@ func ScaleTest(opt *scaletest.Options) *types.Job { job.AddStep(&kubernetes.CreateNamespace{}, nil) - job.AddStep(&scaletest.GetAndPublishMetrics{ + // There's a known limitation on leaving empty fields in Steps. 
+ // Set methods are used to set private fields and keep environment variables accessed within jobs, rather than spread through steps. + job.AddStep((&scaletest.GetAndPublishMetrics{ Labels: opt.LabelsToGetMetrics, AdditionalTelemetryProperty: opt.AdditionalTelemetryProperty, - OutputFilePath: os.Getenv("OUTPUT_FILEPATH"), - }, &types.StepOptions{ - SkipSavingParametersToJob: true, - RunInBackgroundWithID: "get-metrics", - }) + }). + SetOutputFilePath(os.Getenv(common.OutputFilePathEnv)). + SetAppInsightsKey(os.Getenv(common.AzureAppInsightsKeyEnv)), + &types.StepOptions{ + SkipSavingParametersToJob: true, + RunInBackgroundWithID: "get-metrics", + }) job.AddStep(&scaletest.CreateResources{ NumKwokDeployments: opt.NumKwokDeployments, @@ -111,7 +117,9 @@ func ScaleTest(opt *scaletest.Options) *types.Job { BackgroundID: "get-metrics", }, nil) - job.AddStep(&kubernetes.DeleteNamespace{}, nil) + if opt.CleanUp { + job.AddStep(&kubernetes.DeleteNamespace{}, nil) + } return job } diff --git a/test/e2e/scale_test.go b/test/e2e/scale_test.go index 6769dccc09..687d32ceb5 100644 --- a/test/e2e/scale_test.go +++ b/test/e2e/scale_test.go @@ -58,7 +58,7 @@ func TestE2ERetina_Scale(t *testing.T) { NumDeployments := os.Getenv("NUM_DEPLOYMENTS") NumReplicas := os.Getenv("NUM_REPLICAS") - NumNetworkPolicies := os.Getenv("NUM_NET_POL") + NumNetworkPolicies := os.Getenv("NUM_NETPOLS") CleanUp := os.Getenv("CLEANUP") if NumDeployments != "" { @@ -75,7 +75,7 @@ func TestE2ERetina_Scale(t *testing.T) { require.NoError(t, err) } if CleanUp != "" { - opt.DeleteLabels, err = strconv.ParseBool(CleanUp) + opt.CleanUp, err = strconv.ParseBool(CleanUp) require.NoError(t, err) } diff --git a/test/image/Dockerfile b/test/image/Dockerfile index 8c681a3fee..9d5dd44627 100644 --- a/test/image/Dockerfile +++ b/test/image/Dockerfile @@ -4,7 +4,7 @@ FROM mcr.microsoft.com/oss/go/microsoft/golang@sha256:88225e171f29fe5f1f6ffca8eb ENV CGO_ENABLED=0 COPY . 
/go/src/github.com/microsoft/retina WORKDIR /go/src/github.com/microsoft/retina -RUN tdnf install -y clang16 lld16 bpftool libbpf-devel make git +RUN tdnf install -y clang16 lld16 bpftool libbpf-devel make git jq RUN go generate /go/src/github.com/microsoft/retina/pkg/plugin/... # RUN go mod edit -module retina # RUN make all generate