From cd219ca30aa92d1f90b780dde04c94491b0ccf42 Mon Sep 17 00:00:00 2001 From: Beegii Khurelsukh <72892628+BeegiiK@users.noreply.github.com> Date: Fri, 3 Jan 2025 18:39:03 +0000 Subject: [PATCH] fix(bug): Ensure windows agent stability using hubble/legacy helm values (#1128) # Description This PR aims to fix the stability of the Retina Windows agent. Four causes were identified, and each commit resolves one of them. 1. Invalid rendering of the namespace helm value (1st commit) ``` matmerr@matmerr-cloud-dev: ~/go/src/github.com/Azure/telescope [06:56:29 PM][matmerr-aks-pktmon-11][matmerr/enable-ama]$ k logs -f retina-agent-win-7f7kb Starting Retina Agent starting Retina daemon with legacy control plane v0.0.17 2024/12/02 18:56:22 metricsInterval is deprecated, please use metricsIntervalDuration instead init client-go KUBECONFIG set, using kubeconfig: C:\hpc\kubeconfig Error: starting daemon: creating controller-runtime manager: error loading config file "C:\hpc\kubeconfig": yaml: invalid map key: map[interface {}]interface {}{".Values.namespace":interface {}(nil)} ``` 2. The default operator value is enabled and will cause RBAC issues for the Windows agents (2nd commit) ``` ts=2024-12-10T16:58:48.634Z level=info caller=hnsstats/hnsstats_windows.go:212 msg="Start hnsstats plugin..." W1210 16:58:49.990792 7108 reflector.go:547] pkg/mod/k8s.io/client-go@v0.30.3/tools/cache/reflector.go:232: failed to list *v1alpha1.MetricsConfiguration: metricsconfigurations.retina.sh is forbidden: User "system:serviceaccount:kube-system:retina-agent" cannot list resource "metricsconfigurations" in API group "retina.sh" at the cluster scope ``` 3. Enabling telemetry also causes the agent to panic if Application Insights is not defined. Users can change the config map as desired, but the default values should not cause the agent to crash (3rd commit) 4. The `kubeconfig` file cannot be found for the legacy chart values. 
Executing `setkubeconfigpath.ps1` was required for the container setup (4th commit). Update: It was later found that the missing `kubeconfig` error only occurs on redeploy if the initial Retina deployment predates this change (https://github.com/microsoft/retina/pull/1118). A follow-up GitHub issue was created: https://github.com/microsoft/retina/issues/1138 ``` beegii@bignamboi:~/src/retina$ k logs retina-agent-win-4tl7m -n kube-system Starting Retina Agent starting Retina daemon with legacy control plane v0.0.17 2024/12/11 18:40:15 metricsInterval is deprecated, please use metricsIntervalDuration instead init client-go KUBECONFIG set, using kubeconfig: C:\hpc\kubeconfig Error: starting daemon: creating controller-runtime manager: CreateFile C:\hpc\kubeconfig: The system cannot find the file specified. ``` ## Related Issue https://github.com/microsoft/retina/issues/1122 ## Checklist - [x] I have read the [contributing documentation](https://retina.sh/docs/contributing). - [x] I signed and signed-off the commits (`git commit -S -s ...`). See [this documentation](https://docs.github.com/en/authentication/managing-commit-signature-verification/about-commit-signature-verification) on signing commits. - [x] I have correctly attributed the author(s) of the code. - [x] I have tested the changes locally. - [x] I have followed the project's style guidelines. - [x] I have updated the documentation, if necessary. - [x] I have added tests, if applicable. ## Screenshots (if applicable) or Testing Completed The image corresponding to each commit was built and tested on the cluster to confirm that each fix works! ![image](https://github.com/user-attachments/assets/dde7fe23-22ff-49bf-8c96-2c1a42c96f9d) ## Additional Notes The first three problems were experienced when deploying Retina using the hubble path, and the last issue was experienced when deploying Retina using the legacy path. --- Please refer to the [CONTRIBUTING.md](../CONTRIBUTING.md) file for more information on how to contribute to this project. 
--- .../controller/helm/retina/templates/agent/configmap.yaml | 2 +- .../hubble/manifests/controller/helm/retina/values.yaml | 2 +- .../controller/helm/retina/templates/daemonset.yaml | 8 +++++++- windows/kubeconfigtemplate.yaml | 2 +- windows/manifests/windows.yaml | 4 ++-- 5 files changed, 12 insertions(+), 6 deletions(-) diff --git a/deploy/hubble/manifests/controller/helm/retina/templates/agent/configmap.yaml b/deploy/hubble/manifests/controller/helm/retina/templates/agent/configmap.yaml index 704cf5a9c5..0dd67890c4 100644 --- a/deploy/hubble/manifests/controller/helm/retina/templates/agent/configmap.yaml +++ b/deploy/hubble/manifests/controller/helm/retina/templates/agent/configmap.yaml @@ -132,7 +132,7 @@ data: metricsInterval: {{ .Values.metricsInterval }} metricsIntervalDuration: {{ .Values.metricsIntervalDuration }} enableTelemetry: {{ .Values.enableTelemetry }} - enablePodLevel: {{ .Values.enablePodLevel }} + enablePodLevel: false remoteContext: {{ .Values.remoteContext }} bypassLookupIPOfInterest: {{ .Values.bypassLookupIPOfInterest }} {{- end}} diff --git a/deploy/hubble/manifests/controller/helm/retina/values.yaml b/deploy/hubble/manifests/controller/helm/retina/values.yaml index 0717e0b500..5d93b8e6e3 100644 --- a/deploy/hubble/manifests/controller/helm/retina/values.yaml +++ b/deploy/hubble/manifests/controller/helm/retina/values.yaml @@ -90,7 +90,7 @@ logLevel: info enabledPlugin_linux: '["linuxutil","packetforward","packetparser","dns", "dropreason"]' enabledPlugin_win: '["hnsstats"]' -enableTelemetry: true +enableTelemetry: false # Interval, in duration, to scrape/publish metrics. 
metricsIntervalDuration: "10s" diff --git a/deploy/legacy/manifests/controller/helm/retina/templates/daemonset.yaml b/deploy/legacy/manifests/controller/helm/retina/templates/daemonset.yaml index f4e44332e0..f222ccc2fa 100644 --- a/deploy/legacy/manifests/controller/helm/retina/templates/daemonset.yaml +++ b/deploy/legacy/manifests/controller/helm/retina/templates/daemonset.yaml @@ -203,7 +203,13 @@ spec: containerPort: {{ .Values.retinaPort }} workingDir: $env:CONTAINER_SANDBOX_MOUNT_POINT command: - - controller.exe --config ./retina/config.yaml + - powershell.exe + - -command + {{- if semverCompare ">=1.28" .Capabilities.KubeVersion.GitVersion }} + - $env:CONTAINER_SANDBOX_MOUNT_POINT/controller.exe --config ./retina/config.yaml + {{- else }} + - .\setkubeconfigpath.ps1; ./controller.exe --config ./retina/config.yaml --kubeconfig ./kubeconfig + {{- end }} env: - name: POD_NAME valueFrom: diff --git a/windows/kubeconfigtemplate.yaml b/windows/kubeconfigtemplate.yaml index 21a6c054f2..49e6c8ad50 100644 --- a/windows/kubeconfigtemplate.yaml +++ b/windows/kubeconfigtemplate.yaml @@ -9,7 +9,7 @@ contexts: - name: azure-retina-windows@kubernetes context: cluster: kubernetes - namespace: {{ .Values.namespace }} + namespace: kube-system user: azure-retina-windows current-context: azure-retina-windows@kubernetes users: diff --git a/windows/manifests/windows.yaml b/windows/manifests/windows.yaml index 4fd7cb2d5a..62c78ba49c 100644 --- a/windows/manifests/windows.yaml +++ b/windows/manifests/windows.yaml @@ -4,7 +4,7 @@ metadata: labels: app: retina name: retina-win - namespace: {{ .Values.namespace }} + namespace: kube-system annotations: prometheus.io/port: "10093" prometheus.io/scrape: "true" @@ -62,7 +62,7 @@ apiVersion: v1 kind: ConfigMap metadata: name: retina-config-win - namespace: {{ .Values.namespace }} + namespace: kube-system data: config.yaml: |- apiServer: