Skip to content

Commit

Permalink
Merge branch 'main' into architecture-docs
Browse files Browse the repository at this point in the history
  • Loading branch information
kamilprz authored Jan 23, 2025
2 parents bfa83cc + 387a905 commit 7aa0b22
Show file tree
Hide file tree
Showing 32 changed files with 654 additions and 357 deletions.
11 changes: 11 additions & 0 deletions .github/workflows/release-charts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -46,3 +46,14 @@ jobs:
cat helm_push_result.txt
cosign sign --yes ghcr.io/${{ github.repository }}/charts/retina@$(tail -n 1 helm_push_result.txt | awk '{ print $2 }')
- name: Build, Push and Sign Hubble chart
id: build_hubble_chart
shell: bash
run: |
set -euo pipefail
export TAG=$(make version)
helm package ./deploy/hubble/manifests/controller/helm/retina --version $TAG
# Get Helm chart's SHA digest from helm push cmd output
helm push retina-hubble-$TAG.tgz oci://ghcr.io/${{ github.repository }}/charts >> helm_push_hubble.txt 2>&1
cat helm_push_hubble.txt
cosign sign --yes ghcr.io/${{ github.repository }}/charts/retina-hubble@$(tail -n 1 helm_push_hubble.txt | awk '{ print $2 }')
5 changes: 3 additions & 2 deletions .github/workflows/scale-test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -96,11 +96,12 @@ jobs:
NUM_REPLICAS: ${{ inputs.num_replicas }}
NUM_NETPOLS: ${{ inputs.num_netpol }}
CLEANUP: ${{ inputs.cleanup }}
IMAGE_REGISTRY: ${{ inputs.image_namespace == '' && vars.ACR_NAME || inputs.image_namespace }}
IMAGE_REGISTRY: ${{ vars.ACR_NAME }}
IMAGE_NAMESPACE: ${{ github.repository }}
TAG: ${{ inputs.image_tag }}
AZURE_APP_INSIGHTS_KEY: ${{ secrets.AZURE_APP_INSIGHTS_KEY }}
shell: bash
run: |
set -euo pipefail
go test -v ./test/e2e/. -timeout 300m -tags=scale -count=1 -args -image-tag=$( [[ $TAG == "" ]] && make version || echo $TAG ) -create-infra=false -delete-infra=false
[[ $TAG == "" ]] && TAG=$(make version)
go test -v ./test/e2e/. -timeout 300m -tags=scale -count=1 -args -create-infra=false -delete-infra=false
5 changes: 4 additions & 1 deletion .github/workflows/test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ on:
pull_request:
branches: [main]
workflow_dispatch:

permissions:
actions: read
contents: read
Expand All @@ -15,6 +16,7 @@ permissions:
pull-requests: write
security-events: write
issues: write

jobs:
test-image:
runs-on: ubuntu-latest
Expand All @@ -32,8 +34,9 @@ jobs:
PULL_REQUEST_NUMBER: ${{ github.event.pull_request.number }}
run: |
make test-image IMAGE_NAMESPACE=${{ github.repository }} PLATFORM=linux/amd64
- name: Upload Artifacts
uses: actions/upload-artifact@v4
with:
name: coverage-files
path: ./coverage*
path: ./artifacts/coverage*
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -45,3 +45,5 @@ image-metadata-*.json
*results*.json
netperf-*.json
netperf-*.csv

.certs/
6 changes: 6 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ ifndef TAG
TAG ?= $(shell git describe --tags --always)
endif
OUTPUT_DIR = $(REPO_ROOT)/output
ARTIFACTS_DIR = $(REPO_ROOT)/artifacts
BUILD_DIR = $(OUTPUT_DIR)/$(GOOS)_$(GOARCH)
RETINA_BUILD_DIR = $(BUILD_DIR)/retina
RETINA_DIR = $(REPO_ROOT)/controller
Expand Down Expand Up @@ -241,6 +242,7 @@ container-docker: buildx # util target to build container images using docker bu
image_metadata_filename="image-metadata-$$image_name-$(TAG).json"; \
touch $$image_metadata_filename; \
echo "Building $$image_name for $$os/$$arch "; \
mkdir -p $(ARTIFACTS_DIR); \
docker buildx build \
--platform $(PLATFORM) \
--metadata-file=$$image_metadata_filename \
Expand All @@ -253,6 +255,7 @@ container-docker: buildx # util target to build container images using docker bu
--build-arg VERSION=$(VERSION) $(EXTRA_BUILD_ARGS) \
--target=$(TARGET) \
-t $(IMAGE_REGISTRY)/$(IMAGE):$(TAG) \
--output type=local,dest=$(ARTIFACTS_DIR) \
$(BUILDX_ACTION) \
$(CONTEXT_DIR)

Expand Down Expand Up @@ -549,6 +552,9 @@ get-certs:
hubble config set tls true
hubble config set tls-server-name instance.hubble-relay.cilium.io

# escape_dot: expands to $(1) with every '.' replaced by '\.'.
# $(1) is the first argument, so invoke this as $(call escape_dot,<text>).
# The definition uses '=' (recursive) so that $(1) is expanded at call time
# rather than when the Makefile is parsed.
# NOTE(review): presumably used to make strings such as version tags safe to
# embed in a regular expression — confirm against callers.
escape_dot = $(subst .,\.,$(1))

.PHONY: clean-certs
clean-certs:
rm -rf $(CERT_DIR)
Expand Down
2 changes: 1 addition & 1 deletion deploy/hubble/manifests/controller/helm/retina/Chart.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
apiVersion: v2
name: retina
name: retina-hubble
description: A Helm chart for Retina Network Observability in Kubernetes with dependencies

# A chart can be either an 'application' or a 'library' chart.
Expand Down
44 changes: 39 additions & 5 deletions docs/02-Installation/01-Setup.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,19 @@ Note: you can also run captures with just the [CLI](./02-CLI.md).

## Installation

Requires Helm version >= v3.8.0.
### Requirements

- Helm version >= v3.8.0.

### Control Plane and Modes

The installation of Retina can be configured using different control planes and modes.

You can choose between the "legacy" control plane (the original implementation of Retina) and Hubble.

If the "legacy" control plane is chosen, different modes are available. The available metric dimensions depend on the selected mode. For an explanation of the available modes, see [Legacy Metric Modes](../03-Metrics/modes/modes.md).

Modes are not applicable to the Hubble control plane. For metrics related to the Hubble control plane, refer to the [Hubble metrics](../03-Metrics/02-hubble_metrics.md) documentation.

### Basic Mode

Expand Down Expand Up @@ -42,8 +54,6 @@ helm upgrade --install retina oci://ghcr.io/microsoft/retina/charts/retina \

### Advanced Mode with Remote Context (with Capture support)

See [Metric Modes](../03-Metrics/modes/modes.md).

```shell
VERSION=$( curl -sL https://api.github.com/repos/microsoft/retina/releases/latest | jq -r .name)
helm upgrade --install retina oci://ghcr.io/microsoft/retina/charts/retina \
Expand All @@ -64,8 +74,6 @@ helm upgrade --install retina oci://ghcr.io/microsoft/retina/charts/retina \

### Advanced Mode with Local Context (with Capture support)

See [Metric Modes](../03-Metrics/modes/modes.md).

```shell
VERSION=$( curl -sL https://api.github.com/repos/microsoft/retina/releases/latest | jq -r .name)
helm upgrade --install retina oci://ghcr.io/microsoft/retina/charts/retina \
Expand All @@ -84,6 +92,32 @@ helm upgrade --install retina oci://ghcr.io/microsoft/retina/charts/retina \
--set enableAnnotations=true
```

### Hubble control plane

```shell
VERSION=$( curl -sL https://api.github.com/repos/microsoft/retina/releases/latest | jq -r .name)
helm upgrade --install retina oci://ghcr.io/microsoft/retina/charts/retina-hubble \
--version $VERSION \
--namespace kube-system \
--set os.windows=true \
--set operator.enabled=true \
--set operator.repository=ghcr.io/microsoft/retina/retina-operator \
--set operator.tag=$VERSION \
--set agent.enabled=true \
--set agent.repository=ghcr.io/microsoft/retina/retina-agent \
--set agent.tag=$VERSION \
--set agent.init.enabled=true \
--set agent.init.repository=ghcr.io/microsoft/retina/retina-init \
--set agent.init.tag=$VERSION \
--set logLevel=info \
--set hubble.tls.enabled=false \
--set hubble.relay.tls.server.enabled=false \
--set hubble.tls.auto.enabled=false \
--set hubble.tls.auto.method=cronJob \
--set hubble.tls.auto.certValidityDuration=1 \
--set hubble.tls.auto.schedule="*/10 * * * *"
```

## Next Steps: Configuring Prometheus and Grafana

- [Prometheus](./04-prometheus.md)
Expand Down
59 changes: 47 additions & 12 deletions docs/02-Installation/03-Config.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,25 +2,60 @@

## Overview

To customize metrics and other options, modify the `retina-config` ConfigMap. Default settings for each component are specified in *deploy/legacy/manifests/controller/helm/retina/values.yaml*.
### Default Configuration

## Agent Config
Default settings for each component are specified in the [Values file](../../deploy/legacy/manifests/controller/helm/retina/values.yaml).

### Deployed Configuration

The configuration of an active Retina deployment can be inspected in the `retina-config` and `retina-operator-config` ConfigMaps.

```shell
kubectl get configmap retina-config -n kube-system -o yaml
kubectl get configmap retina-operator-config -n kube-system -o yaml
```

### Updating Configuration

If Retina was installed via Helm, update its configuration with `helm upgrade`, passing the name and value of each attribute you want to change as part of the command.

The example below enables gathering of advanced pod-level metrics.

```shell
VERSION=$( curl -sL https://api.github.com/repos/microsoft/retina/releases/latest | jq -r .name)
helm upgrade --install retina oci://ghcr.io/microsoft/retina/charts/retina \
--version $VERSION \
--namespace kube-system \
--set image.tag=$VERSION \
--set operator.tag=$VERSION \
--set logLevel=info \
--set enabledPlugin_linux="\[dropreason\,packetforward\,linuxutil\,dns\]" \
--set enablePodLevel=true
```

## General Configuration

These settings apply to both the Agent and the Operator.

* `enableTelemetry`: Enables telemetry for the agent for managed AKS clusters. Requires `buildinfo.ApplicationInsightsID` to be set if enabled.
* `enablePodLevel`: Enables gathering of advanced pod-level metrics, attaching pods' metadata to Retina's metrics.
* `remoteContext`: Enables Retina to watch Pods on the cluster.
* `enableAnnotations`: Enables gathering of metrics for annotated resources. Resources can be annotated with `retina.sh=observe`. Requires the operator and `enableRetinaEndpoint` to be enabled.
* `enabledPlugin`: List of enabled plugins.

## Agent Configuration

* `logLevel`: Define the level of logs to store.
* `enabledPlugin_linux`: List of enabled plugins.
* `metricsInterval`: Interval for gathering metrics (in seconds). (@deprecated, use `metricsIntervalDuration` instead)
* `metricsIntervalDuration`: Interval for gathering metrics (in `time.Duration`).
* `enablePodLevel`: Enables gathering of advanced pod-level metrics, attaching pods' metadata to Retina's metrics.
* `enableConntrackMetrics`: Enables conntrack metrics for packets and bytes forwarded/received.
* `enableAnnotations`: Enables gathering of metrics for annotated resources. Resources can be annotated with `retina.sh=observe`. Requires the operator and `operator.enableRetinaEndpoint` to be enabled.
* `bypassLookupIPOfInterest`: If true, plugins like `packetparser` and `dropreason` will bypass IP lookup, generating an event for each packet regardless. `enableAnnotations` will not work if this is true.
* `dataAggregationLevel`: Defines the level of data aggregation for Retina. See [Data Aggregation](../05-Concepts/data-aggregation.md) for more details.

## Operator Config
## Operator Configuration

* `installCRDs`: Allows the operator to manage the installation of Retina-related CRDs.
* `enableTelemetry`: Enables telemetry for the operator in managed AKS clusters. Requires `buildinfo.ApplicationInsightsID` to be set if enabled.
* `captureDebug`: Toggles debug mode for captures. If true, the operator uses the image from the test container registry for the capture workload. Refer to *pkg/capture/utils/capture_image.go* for details on how the debug capture image version is selected.
* `captureJobNumLimit`: Sets the maximum number of jobs that can be created for each Capture.
* `enableRetinaEndpoint`: Allows the operator to monitor and update the cache with Pod metadata.
* `enableManagedStorageAccount`: Enables the use of a managed storage account for storing artifacts.
* `operator.installCRDs`: Allows the operator to manage the installation of Retina-related CRDs.
* `operator.enableRetinaEndpoint`: Allows the operator to monitor and update the cache with Pod metadata.
* `capture.captureDebug`: Toggles debug mode for captures. If true, the operator uses the image from the test container registry for the capture workload. Refer to the [Capture Image file](../../pkg/capture/utils/capture_image.go) for details on how the debug capture image version is selected.
* `capture.captureJobNumLimit`: Sets the maximum number of jobs that can be created for each Capture.
* `capture.enableManagedStorageAccount`: Enables the use of a managed storage account for storing artifacts.
9 changes: 7 additions & 2 deletions docs/02-Installation/04-prometheus.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ Prometheus is an open-source system monitoring and alerting toolkit originally b

1. Create a Kubernetes cluster.
2. Install Retina DaemonSet (see [Quick Installation](./01-Setup.md)).
3. Clone the [Retina Repository](https://github.com/microsoft/retina) or download the [Prometheus Values File](../../deploy/legacy/prometheus/values.yaml).

## Install Prometheus via Helm

Expand All @@ -19,13 +20,17 @@ Prometheus is an open-source system monitoring and alerting toolkit originally b
1. Install the Prometheus chart

```shell
helm install prometheus -n kube-system -f deploy/legacy/prometheus/values.yaml prometheus-community/kube-prometheus-stack
# The value of VALUE_FILE_PATH is relative to the repo root folder. Update this according to the location of your file.
VALUE_FILE_PATH=deploy/legacy/prometheus/values.yaml
helm install prometheus -n kube-system -f $VALUE_FILE_PATH prometheus-community/kube-prometheus-stack
```

Or, if you already have the chart installed, upgrade it as you see fit, providing the new job name as an additional scrape config, e.g.:

```shell
helm upgrade prometheus -n kube-system -f deploy/legacy/prometheus/values.yaml prometheus-community/kube-prometheus-stack
# The value of VALUE_FILE_PATH is relative to the repo root folder. Update this according to the location of your file.
VALUE_FILE_PATH=deploy/legacy/prometheus/values.yaml
helm upgrade prometheus -n kube-system -f $VALUE_FILE_PATH prometheus-community/kube-prometheus-stack
```

> Note: Grafana and kube-state-metrics may be scheduled on Windows nodes because the current chart doesn't define node affinity for those components. Some manual intervention may be required.
Expand Down
34 changes: 17 additions & 17 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ require (
github.com/spf13/cobra v1.8.1
go.uber.org/zap v1.27.0
k8s.io/client-go v0.30.3
sigs.k8s.io/cloud-provider-azure/pkg/azclient v0.4.9
sigs.k8s.io/cloud-provider-azure/pkg/azclient v0.4.12
sigs.k8s.io/cloud-provider-azure/pkg/azclient/configloader v0.0.20
)

Expand Down Expand Up @@ -37,7 +37,7 @@ require (
github.com/Azure/go-autorest/autorest/date v0.3.0 // indirect
github.com/Azure/go-autorest/logger v0.2.1 // indirect
github.com/Azure/go-autorest/tracing v0.6.0 // indirect
github.com/AzureAD/microsoft-authentication-library-for-go v1.3.1 // indirect
github.com/AzureAD/microsoft-authentication-library-for-go v1.3.2 // indirect
github.com/BurntSushi/toml v1.3.2 // indirect
github.com/MakeNowJust/heredoc v1.0.0 // indirect
github.com/Masterminds/goutils v1.1.1 // indirect
Expand All @@ -52,14 +52,14 @@ require (
github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.28 // indirect
github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.28 // indirect
github.com/aws/aws-sdk-go-v2/internal/ini v1.8.1 // indirect
github.com/aws/aws-sdk-go-v2/internal/v4a v1.3.27 // indirect
github.com/aws/aws-sdk-go-v2/internal/v4a v1.3.28 // indirect
github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.12.1 // indirect
github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.4.8 // indirect
github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.5.2 // indirect
github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.12.9 // indirect
github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.18.8 // indirect
github.com/aws/aws-sdk-go-v2/service/sso v1.24.10 // indirect
github.com/aws/aws-sdk-go-v2/service/ssooidc v1.28.9 // indirect
github.com/aws/aws-sdk-go-v2/service/sts v1.33.8 // indirect
github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.18.9 // indirect
github.com/aws/aws-sdk-go-v2/service/sso v1.24.11 // indirect
github.com/aws/aws-sdk-go-v2/service/ssooidc v1.28.10 // indirect
github.com/aws/aws-sdk-go-v2/service/sts v1.33.9 // indirect
github.com/aws/smithy-go v1.22.1 // indirect
github.com/beorn7/perks v1.0.1 // indirect
github.com/blang/semver/v4 v4.0.0 // indirect
Expand Down Expand Up @@ -262,7 +262,7 @@ require (
golang.org/x/sync v0.10.0
golang.org/x/sys v0.29.0
golang.org/x/term v0.28.0 // indirect
google.golang.org/protobuf v1.36.1
google.golang.org/protobuf v1.36.3
gopkg.in/yaml.v2 v2.4.0 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
k8s.io/api v0.30.3
Expand All @@ -279,7 +279,7 @@ require (
github.com/Azure/azure-container-networking/zapai v0.0.3
github.com/Azure/azure-sdk-for-go v68.0.0+incompatible
github.com/Azure/azure-sdk-for-go/sdk/azcore v1.17.0
github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.8.0
github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.8.1
github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/containerservice/armcontainerservice/v4 v4.8.0
github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/dashboard/armdashboard v1.2.0
github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/monitor/armmonitor v0.11.0
Expand All @@ -290,9 +290,9 @@ require (
github.com/Microsoft/hcsshim v0.12.0-rc.3
github.com/Sytten/logrus-zap-hook v0.1.0
github.com/aws/aws-sdk-go-v2 v1.33.0
github.com/aws/aws-sdk-go-v2/config v1.28.11
github.com/aws/aws-sdk-go-v2/credentials v1.17.53
github.com/aws/aws-sdk-go-v2/service/s3 v1.72.2
github.com/aws/aws-sdk-go-v2/config v1.29.1
github.com/aws/aws-sdk-go-v2/credentials v1.17.54
github.com/aws/aws-sdk-go-v2/service/s3 v1.73.2
github.com/cakturk/go-netstat v0.0.0-20200220111822-e5b49efee7a5
github.com/cilium/cilium v1.16.0-pre.1.0.20240403152809-b9853ecbcaeb
github.com/cilium/ebpf v0.16.0
Expand All @@ -313,14 +313,14 @@ require (
github.com/onsi/gomega v1.36.2
github.com/pkg/errors v0.9.1
github.com/prometheus/client_model v0.6.1
github.com/prometheus/common v0.61.0
github.com/prometheus/common v0.62.0
github.com/safchain/ethtool v0.5.9
github.com/sirupsen/logrus v1.9.3
github.com/spf13/viper v1.19.0
github.com/vishvananda/netlink v1.2.1-beta.2.0.20240524165444-4d4ba1473f21
go.opentelemetry.io/otel v1.33.0
go.opentelemetry.io/otel/metric v1.33.0
go.opentelemetry.io/otel/trace v1.33.0
go.opentelemetry.io/otel v1.34.0
go.opentelemetry.io/otel/metric v1.34.0
go.opentelemetry.io/otel/trace v1.34.0
go.uber.org/mock v0.5.0
golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56
google.golang.org/grpc v1.66.2
Expand Down
Loading

0 comments on commit 7aa0b22

Please sign in to comment.