From ef0037bc08058af13ee8b0e5798576e7ff548870 Mon Sep 17 00:00:00 2001 From: Alex Castilio dos Santos Date: Mon, 20 Jan 2025 14:43:44 +0000 Subject: [PATCH 01/10] test: automate scale test execution Signed-off-by: Alex Castilio dos Santos --- .github/workflows/daily-scale-test.yaml | 24 ++++++ .github/workflows/scale-test.yaml | 26 +++---- test/e2e/common/common.go | 53 ++++++++++++- test/e2e/framework/azure/create-cluster.go | 44 ++++++++++- .../kubernetes/create-kapinger-deployment.go | 2 +- test/e2e/framework/kubernetes/label-nodes.go | 76 +++++++++++++++++++ test/e2e/framework/params/params.go | 17 +++++ .../framework/scaletest/create-resources.go | 2 +- .../scaletest/delete-and-re-add-labels.go | 49 ++++++++++-- test/e2e/jobs/scale.go | 47 ++++++++++++ test/e2e/scale_test.go | 40 ++++------ 11 files changed, 328 insertions(+), 52 deletions(-) create mode 100644 .github/workflows/daily-scale-test.yaml create mode 100644 test/e2e/framework/kubernetes/label-nodes.go create mode 100644 test/e2e/framework/params/params.go diff --git a/.github/workflows/daily-scale-test.yaml b/.github/workflows/daily-scale-test.yaml new file mode 100644 index 0000000000..397503c041 --- /dev/null +++ b/.github/workflows/daily-scale-test.yaml @@ -0,0 +1,24 @@ +name: Daily Scale Test + +on: + push: + branches: + - alexcastilio/scale-test-workflow + schedule: + - cron: "0 0 * * *" + +permissions: + contents: read + id-token: write + +jobs: + call-scale-test: + uses: ./.github/workflows/scale-test.yaml + with: + num_deployments: 1000 + num_replicas: 20 + # TODO: Fix values + num_netpol: 0 + num_nodes: 1000 + cleanup: false + secrets: inherit diff --git a/.github/workflows/scale-test.yaml b/.github/workflows/scale-test.yaml index 75367fff48..aa31586d97 100644 --- a/.github/workflows/scale-test.yaml +++ b/.github/workflows/scale-test.yaml @@ -15,7 +15,7 @@ on: description: "Image Namespace (if not set, default namespace will be used)" type: string image_tag: - description: "Image Tag (if not set, default for this commit will be used)" + description: "Image Tag (if not set, latest commit from 'main' will be used)" type: string num_deployments: description: "Number of Traffic Deployments" @@ -36,25 +36,21 @@ on: workflow_call: inputs: - resource_group: - description: "Azure Resource Group" - required: true - type: string - cluster_name: - description: "AKS Cluster Name" - required: true - type: string num_deployments: description: "Number of Traffic Deployments" - default: 1000 + default: 100 type: number num_replicas: description: "Number of Traffic Replicas per Deployment" - default: 40 + default: 10 type: number num_netpol: description: "Number of Network Policies" - default: 1000 + default: 100 + type: number + num_nodes: + description: "Number of nodes per pool" + default: 100 type: number cleanup: description: "Clean up environment after test" @@ -100,8 +96,10 @@ jobs: IMAGE_NAMESPACE: ${{ inputs.image_namespace == '' && github.repository || inputs.image_namespace }} TAG: ${{ inputs.image_tag }} AZURE_APP_INSIGHTS_KEY: ${{ secrets.AZURE_APP_INSIGHTS_KEY }} + NODES: ${{ inputs.num_nodes }} + CREATE_INFRA: ${{ github.event_name != 'workflow_dispatch' }} shell: bash run: | set -euo pipefail - [[ $TAG == "" ]] && TAG=$(make version) - go test -v ./test/e2e/. -timeout 300m -tags=scale -count=1 -args -create-infra=false -delete-infra=false + [[ $TAG == "" ]] && TAG=$(curl -s https://api.github.com/repos/microsoft/retina/commits | jq -r '.[0].sha' | cut -c1-7) + go test -v ./test/e2e/. 
-timeout 300m -tags=scale -count=1 -args -create-infra=$(echo $CREATE_INFRA) -delete-infra=$(echo $CREATE_INFRA) diff --git a/test/e2e/common/common.go b/test/e2e/common/common.go index 1d4f5b00a2..19a04172b0 100644 --- a/test/e2e/common/common.go +++ b/test/e2e/common/common.go @@ -6,13 +6,13 @@ package common import ( "flag" - "os" "os/user" "path/filepath" "strconv" "testing" "time" + "github.com/microsoft/retina/test/e2e/framework/params" "github.com/stretchr/testify/require" ) @@ -31,6 +31,13 @@ var ( Architectures = []string{"amd64", "arm64"} CreateInfra = flag.Bool("create-infra", true, "create a Resource group, vNET and AKS cluster for testing") DeleteInfra = flag.Bool("delete-infra", true, "delete a Resource group, vNET and AKS cluster for testing") + ScaleTestInfra = ScaleTestInfraHandler{ + location: params.Location, + subscriptionID: params.SubscriptionID, + resourceGroup: params.ResourceGroup, + clusterName: params.ClusterName, + nodes: params.Nodes, + } // kubeconfig: path to kubeconfig file, in not provided, // a new k8s cluster will be created @@ -49,8 +56,50 @@ var ( } ) +type ScaleTestInfraHandler struct { + location string + subscriptionID string + resourceGroup string + clusterName string + nodes string +} + +func (s ScaleTestInfraHandler) GetSubscriptionID() string { + return s.subscriptionID +} + +func (s ScaleTestInfraHandler) GetLocation() string { + if s.location == "" { + return "westus2" + } + return s.location +} + +func (s ScaleTestInfraHandler) GetResourceGroup() string { + if s.resourceGroup != "" { + return s.resourceGroup + } + // Use the cluster name as the resource group name by default. + return s.GetClusterName() +} + +func (s ScaleTestInfraHandler) GetNodes() string { + if s.nodes == "" { + // Default to 100 nodes per pool + return "100" + } + return s.nodes +} + +func (s ScaleTestInfraHandler) GetClusterName() string { + if s.clusterName != "" { + return s.clusterName + } + return "retina-scale-test" +} + func ClusterNameForE2ETest(t *testing.T) string { - clusterName := os.Getenv("CLUSTER_NAME") + clusterName := params.ClusterName if clusterName == "" { curuser, err := user.Current() require.NoError(t, err) diff --git a/test/e2e/framework/azure/create-cluster.go b/test/e2e/framework/azure/create-cluster.go index 160d81fa86..161c2ae06c 100644 --- a/test/e2e/framework/azure/create-cluster.go +++ b/test/e2e/framework/azure/create-cluster.go @@ -3,6 +3,7 @@ package azure import ( "context" "fmt" + "log" "time" "github.com/Azure/azure-sdk-for-go/sdk/azcore/to" @@ -23,6 +24,24 @@ type CreateCluster struct { ResourceGroupName string Location string ClusterName string + podCidr string + vmSize string + networkPluginMode string + Nodes int32 +} + +func (c *CreateCluster) SetPodCidr(podCidr string) *CreateCluster { + c.podCidr = podCidr + return c +} + +func (c *CreateCluster) SetVMSize(vmSize string) *CreateCluster { + c.vmSize = vmSize + return c +} +func (c *CreateCluster) SetNetworkPluginMode(networkPluginMode string) *CreateCluster { + c.networkPluginMode = networkPluginMode + return c } func (c *CreateCluster) Run() error { @@ -36,8 +55,30 @@ func (c *CreateCluster) Run() error { if err != nil { return fmt.Errorf("failed to create client: %w", err) } + if c.Nodes == 0 { + c.Nodes = MaxNumberOfNodes + } + + template := GetStarterClusterTemplate(c.Location) + + if c.Nodes > 0 { + template.Properties.AgentPoolProfiles[0].Count = to.Ptr(c.Nodes) + } + + if c.podCidr != "" { + template.Properties.NetworkProfile.PodCidr = to.Ptr(c.podCidr) + } + + if c.vmSize 
!= "" { + template.Properties.AgentPoolProfiles[0].VMSize = to.Ptr(c.vmSize) + } + + if c.networkPluginMode != "" { + template.Properties.NetworkProfile.NetworkPluginMode = to.Ptr(armcontainerservice.NetworkPluginMode(c.networkPluginMode)) + } - poller, err := clientFactory.NewManagedClustersClient().BeginCreateOrUpdate(ctx, c.ResourceGroupName, c.ClusterName, GetStarterClusterTemplate(c.Location), nil) + log.Printf("creating cluster %s in location %s...", c.ClusterName, c.Location) + poller, err := clientFactory.NewManagedClustersClient().BeginCreateOrUpdate(ctx, c.ResourceGroupName, c.ClusterName, template, nil) if err != nil { return fmt.Errorf("failed to finish the create cluster request: %w", err) } @@ -45,6 +86,7 @@ func (c *CreateCluster) Run() error { if err != nil { return fmt.Errorf("failed to pull the create cluster result: %w", err) } + log.Printf("cluster created %s in location %s...", c.ClusterName, c.Location) return nil } diff --git a/test/e2e/framework/kubernetes/create-kapinger-deployment.go b/test/e2e/framework/kubernetes/create-kapinger-deployment.go index a895625e32..7e25320d1b 100644 --- a/test/e2e/framework/kubernetes/create-kapinger-deployment.go +++ b/test/e2e/framework/kubernetes/create-kapinger-deployment.go @@ -132,7 +132,7 @@ func (c *CreateKapingerDeployment) GetKapingerDeployment() *appsv1.Deployment { Containers: []v1.Container{ { Name: "kapinger", - Image: "acnpublic.azurecr.io/kapinger:20241014.7", + Image: "acnpublic.azurecr.io/kapinger:v0.0.23-9-g23ef222", Resources: v1.ResourceRequirements{ Requests: v1.ResourceList{ "memory": resource.MustParse("20Mi"), diff --git a/test/e2e/framework/kubernetes/label-nodes.go b/test/e2e/framework/kubernetes/label-nodes.go new file mode 100644 index 0000000000..078c630457 --- /dev/null +++ b/test/e2e/framework/kubernetes/label-nodes.go @@ -0,0 +1,76 @@ +package kubernetes + +import ( + "context" + "encoding/json" + "fmt" + "log" + "time" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/tools/clientcmd" +) + +type patchStringValue struct { + Op string `json:"op"` + Path string `json:"path"` + Value string `json:"value"` +} + +type LabelNodes struct { + KubeConfigFilePath string + Labels map[string]string +} + +func (l *LabelNodes) Prevalidate() error { + return nil +} + +func (l *LabelNodes) Run() error { + config, err := clientcmd.BuildConfigFromFlags("", l.KubeConfigFilePath) + if err != nil { + return fmt.Errorf("error building kubeconfig: %w", err) + } + + clientset, err := kubernetes.NewForConfig(config) + if err != nil { + return fmt.Errorf("error creating Kubernetes client: %w", err) + } + + ctx, cancel := context.WithTimeout(context.Background(), defaultTimeoutSeconds*time.Second) + defer cancel() + + nodes, err := clientset.CoreV1().Nodes().List(ctx, metav1.ListOptions{}) + if err != nil { + return fmt.Errorf("failed to get nodes: %w", err) + } + + patch := []patchStringValue{} + for k, v := range l.Labels { + patch = append(patch, patchStringValue{ + Op: "add", + Path: "/metadata/labels/" + k, + Value: v, + }) + } + b, err := json.Marshal(patch) + if err != nil { + return fmt.Errorf("failed to marshal patch: %w", err) + } + + for i := range nodes.Items { + log.Println("Labeling node", nodes.Items[i].Name) + _, err = clientset.CoreV1().Nodes().Patch(ctx, nodes.Items[i].Name, types.JSONPatchType, b, metav1.PatchOptions{}) + if err != nil { + return fmt.Errorf("failed to patch pod: %w", err) + } + } + + return nil +} + +func (l 
*LabelNodes) Stop() error { + return nil +} diff --git a/test/e2e/framework/params/params.go b/test/e2e/framework/params/params.go new file mode 100644 index 0000000000..233944ae3f --- /dev/null +++ b/test/e2e/framework/params/params.go @@ -0,0 +1,17 @@ +package params + +import ( + "os" +) + +var ( + Location = os.Getenv("LOCATION") + SubscriptionID = os.Getenv("AZURE_SUBSCRIPTION_ID") + ResourceGroup = os.Getenv("AZURE_RESOURCE_GROUP") + ClusterName = os.Getenv("CLUSTER_NAME") + Nodes = os.Getenv("NODES") + NumDeployments = os.Getenv("NUM_DEPLOYMENTS") + NumReplicas = os.Getenv("NUM_REPLICAS") + NumNetworkPolicies = os.Getenv("NUM_NET_POL") + CleanUp = os.Getenv("CLEANUP") +) diff --git a/test/e2e/framework/scaletest/create-resources.go b/test/e2e/framework/scaletest/create-resources.go index 4057cdc826..4a8f522dc6 100644 --- a/test/e2e/framework/scaletest/create-resources.go +++ b/test/e2e/framework/scaletest/create-resources.go @@ -49,7 +49,7 @@ func (c *CreateResources) Run() error { return fmt.Errorf("error creating Kubernetes client: %w", err) } - ctx, cancel := context.WithTimeout(context.Background(), 1200*time.Second) + ctx, cancel := context.WithTimeout(context.Background(), 1800*time.Second) defer cancel() retrier := retry.Retrier{Attempts: defaultRetryAttempts, Delay: defaultRetryDelay} diff --git a/test/e2e/framework/scaletest/delete-and-re-add-labels.go b/test/e2e/framework/scaletest/delete-and-re-add-labels.go index 3403ea2488..d3709880cf 100644 --- a/test/e2e/framework/scaletest/delete-and-re-add-labels.go +++ b/test/e2e/framework/scaletest/delete-and-re-add-labels.go @@ -6,6 +6,7 @@ import ( "log" "time" + "github.com/microsoft/retina/test/retry" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" @@ -48,15 +49,25 @@ func (d *DeleteAndReAddLabels) Run() error { return fmt.Errorf("error creating Kubernetes client: %w", err) } - ctx, cancel := contextToLabelAllPods() + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) defer cancel() labelsToDelete := `"shared-lab-00000": null, "shared-lab-00001": null, "shared-lab-00002": null` labelsToAdd := `"shared-lab-00000": "val", "shared-lab-00001": "val", "shared-lab-00002": "val"` - pods, err := clientset.CoreV1().Pods(d.Namespace).List(ctx, metav1.ListOptions{}) + var pods *corev1.PodList + + retrier := retry.Retrier{Attempts: defaultRetryAttempts, Delay: defaultRetryDelay} + + retrier.Do(ctx, func() error { + pods, err = clientset.CoreV1().Pods(d.Namespace).List(ctx, metav1.ListOptions{}) + if err != nil { + return fmt.Errorf("failed to list pods: %w", err) + } + return nil + }) if err != nil { - return fmt.Errorf("error listing pods: %w", err) + return fmt.Errorf("retrier failed: %w", err) } for i := 0; i < d.DeleteLabelsTimes; i++ { @@ -64,6 +75,9 @@ func (d *DeleteAndReAddLabels) Run() error { patch := fmt.Sprintf(`{"metadata": {"labels": {%s}}}`, labelsToDelete) + ctx, cancel = contextToLabelAllPods() + defer cancel() + err = d.deleteLabels(ctx, clientset, pods, patch) if err != nil { return fmt.Errorf("error deleting labels: %w", err) @@ -76,6 +90,9 @@ func (d *DeleteAndReAddLabels) Run() error { patch = fmt.Sprintf(`{"metadata": {"labels": {%s}}}`, labelsToAdd) + ctx, cancel = contextToLabelAllPods() + defer cancel() + err = d.addLabels(ctx, clientset, pods, patch) if err != nil { return fmt.Errorf("error adding labels: %w", err) @@ -92,9 +109,18 @@ func (d *DeleteAndReAddLabels) addLabels(ctx context.Context, clientset *kuberne for _, pod := range 
pods.Items { log.Println("Labeling Pod", pod.Name) - _, err := clientset.CoreV1().Pods(d.Namespace).Patch(ctx, pod.Name, types.StrategicMergePatchType, []byte(patch), metav1.PatchOptions{}) + + retrier := retry.Retrier{Attempts: defaultRetryAttempts, Delay: defaultRetryDelay} + err := retrier.Do(ctx, func() error { + _, err := clientset.CoreV1().Pods(d.Namespace).Patch(ctx, pod.Name, types.StrategicMergePatchType, []byte(patch), metav1.PatchOptions{}) + if err != nil { + return fmt.Errorf("could not patch pod: %w", err) + } + return nil + }) + if err != nil { - return fmt.Errorf("error patching pod: %w", err) + return fmt.Errorf("could not patch pod: %w", err) } } @@ -105,9 +131,18 @@ func (d *DeleteAndReAddLabels) deleteLabels(ctx context.Context, clientset *kube for _, pod := range pods.Items { log.Println("Deleting label from Pod", pod.Name) - _, err := clientset.CoreV1().Pods(d.Namespace).Patch(ctx, pod.Name, types.StrategicMergePatchType, []byte(patch), metav1.PatchOptions{}) + + retrier := retry.Retrier{Attempts: defaultRetryAttempts, Delay: defaultRetryDelay} + err := retrier.Do(ctx, func() error { + _, err := clientset.CoreV1().Pods(d.Namespace).Patch(ctx, pod.Name, types.StrategicMergePatchType, []byte(patch), metav1.PatchOptions{}) + if err != nil { + return fmt.Errorf("could not patch pod: %w", err) + } + return nil + }) + if err != nil { - return fmt.Errorf("error patching pod: %w", err) + return fmt.Errorf("could not patch pod: %w", err) } } return nil diff --git a/test/e2e/jobs/scale.go b/test/e2e/jobs/scale.go index 58b5d49864..aef257e155 100644 --- a/test/e2e/jobs/scale.go +++ b/test/e2e/jobs/scale.go @@ -5,6 +5,8 @@ import ( "time" "github.com/microsoft/retina/test/e2e/common" + "github.com/microsoft/retina/test/e2e/framework/azure" + "github.com/microsoft/retina/test/e2e/framework/generic" "github.com/microsoft/retina/test/e2e/framework/kubernetes" "github.com/microsoft/retina/test/e2e/framework/scaletest" "github.com/microsoft/retina/test/e2e/framework/types" @@ -45,6 +47,51 @@ func DefaultScaleTestOptions() scaletest.Options { } } +func GetScaleTestInfra(subID, rg, clusterName, location, kubeConfigFilePath string, nodes int32, createInfra bool) *types.Job { + job := types.NewJob("Get scale test infrastructure") + + if createInfra { + job.AddStep(&azure.CreateResourceGroup{ + SubscriptionID: subID, + ResourceGroupName: rg, + Location: location, + }, nil) + + job.AddStep((&azure.CreateCluster{ + ClusterName: clusterName, + Nodes: nodes, + }). + SetPodCidr("100.64.0.0/10"). + SetVMSize("Standard_D4_v3"). 
+ SetNetworkPluginMode("overlay"), nil) + + job.AddStep(&azure.GetAKSKubeConfig{ + KubeConfigFilePath: kubeConfigFilePath, + }, nil) + + } else { + job.AddStep(&azure.GetAKSKubeConfig{ + KubeConfigFilePath: kubeConfigFilePath, + ClusterName: clusterName, + SubscriptionID: subID, + ResourceGroupName: rg, + Location: location, + }, nil) + } + + job.AddStep(&kubernetes.LabelNodes{ + Labels: map[string]string{"scale-test": "true"}, + }, nil) + + job.AddStep(&generic.LoadFlags{ + TagEnv: generic.DefaultTagEnv, + ImageNamespaceEnv: generic.DefaultImageNamespace, + ImageRegistryEnv: generic.DefaultImageRegistry, + }, nil) + + return job +} + func ScaleTest(opt *scaletest.Options) *types.Job { job := types.NewJob("Scale Test") diff --git a/test/e2e/scale_test.go b/test/e2e/scale_test.go index e8ca228767..3922971c73 100644 --- a/test/e2e/scale_test.go +++ b/test/e2e/scale_test.go @@ -3,8 +3,6 @@ package retina import ( - "crypto/rand" - "math/big" "os" "path/filepath" "strconv" @@ -14,6 +12,7 @@ import ( "github.com/microsoft/retina/test/e2e/framework/azure" "github.com/microsoft/retina/test/e2e/framework/generic" "github.com/microsoft/retina/test/e2e/framework/helpers" + "github.com/microsoft/retina/test/e2e/framework/params" "github.com/microsoft/retina/test/e2e/framework/types" jobs "github.com/microsoft/retina/test/e2e/jobs" "github.com/stretchr/testify/require" @@ -23,25 +22,13 @@ func TestE2ERetina_Scale(t *testing.T) { ctx, cancel := helpers.Context(t) defer cancel() - clusterName := common.ClusterNameForE2ETest(t) - - subID := os.Getenv("AZURE_SUBSCRIPTION_ID") + clusterName := common.ScaleTestInfra.GetClusterName() + subID := common.ScaleTestInfra.GetSubscriptionID() require.NotEmpty(t, subID) - - location := os.Getenv("AZURE_LOCATION") - if location == "" { - nBig, err := rand.Int(rand.Reader, big.NewInt(int64(len(common.AzureLocations)))) - if err != nil { - t.Fatal("Failed to generate a secure random index", err) - } - location = common.AzureLocations[nBig.Int64()] - } - - rg := os.Getenv("AZURE_RESOURCE_GROUP") - if rg == "" { - // Use the cluster name as the resource group name by default. 
- rg = clusterName - } + location := common.ScaleTestInfra.GetLocation() + rg := common.ScaleTestInfra.GetResourceGroup() + nodes, err := strconv.ParseInt(common.ScaleTestInfra.GetNodes(), 10, 32) + require.NoError(t, err, "NODES must be an integer within int32 range") cwd, err := os.Getwd() require.NoError(t, err) @@ -56,10 +43,10 @@ func TestE2ERetina_Scale(t *testing.T) { opt := jobs.DefaultScaleTestOptions() opt.KubeconfigPath = common.KubeConfigFilePath(rootDir) - NumDeployments := os.Getenv("NUM_DEPLOYMENTS") - NumReplicas := os.Getenv("NUM_REPLICAS") - NumNetworkPolicies := os.Getenv("NUM_NETPOLS") - CleanUp := os.Getenv("CLEANUP") + NumDeployments := params.NumDeployments + NumReplicas := params.NumReplicas + NumNetworkPolicies := params.NumNetworkPolicies + CleanUp := params.CleanUp if NumDeployments != "" { opt.NumRealDeployments, err = strconv.Atoi(NumDeployments) @@ -90,12 +77,13 @@ func TestE2ERetina_Scale(t *testing.T) { opt.LabelsToGetMetrics = map[string]string{"k8s-app": "retina"} // CreateTestInfra - createTestInfra := types.NewRunner(t, jobs.CreateTestInfra(subID, rg, clusterName, location, common.KubeConfigFilePath(rootDir), *common.CreateInfra)) + infra := types.NewRunner(t, jobs.GetScaleTestInfra(subID, rg, clusterName, location, common.KubeConfigFilePath(rootDir), int32(nodes), *common.CreateInfra)) + t.Cleanup(func() { _ = jobs.DeleteTestInfra(subID, rg, location, *common.DeleteInfra).Run() }) - createTestInfra.Run(ctx) + infra.Run(ctx) fqdn, err := azure.GetFqdnFn(subID, rg, clusterName) require.NoError(t, err) From 741b38e86e7af21a9d3f7b86ecc629db1c0759f4 Mon Sep 17 00:00:00 2001 From: Alex Castilio dos Santos Date: Tue, 4 Feb 2025 14:27:25 +0000 Subject: [PATCH 02/10] fix(test): propagate test ctx to steps of scale test Signed-off-by: Alex Castilio dos Santos --- test/e2e/framework/kubernetes/check-pod-status.go | 11 ++++------- test/e2e/framework/scaletest/add-shared-labels.go | 8 +++----- test/e2e/framework/scaletest/add-unique-labels.go | 9 ++++----- .../framework/scaletest/create-network-policies.go | 3 ++- test/e2e/framework/scaletest/create-resources.go | 9 +++------ .../framework/scaletest/delete-and-re-add-labels.go | 13 ++++--------- test/e2e/framework/scaletest/get-publish-metrics.go | 3 ++- test/e2e/framework/scaletest/options.go | 6 +++++- test/e2e/jobs/scale.go | 9 ++++++++- test/e2e/scale_test.go | 1 + 10 files changed, 36 insertions(+), 36 deletions(-) diff --git a/test/e2e/framework/kubernetes/check-pod-status.go b/test/e2e/framework/kubernetes/check-pod-status.go index 197b32c964..7b740f9a76 100644 --- a/test/e2e/framework/kubernetes/check-pod-status.go +++ b/test/e2e/framework/kubernetes/check-pod-status.go @@ -14,14 +14,14 @@ import ( ) const ( - RetryTimeoutPodsReady = 5 * time.Minute - RetryIntervalPodsReady = 5 * time.Second - timeoutWaitForPodsSeconds = 1200 + RetryTimeoutPodsReady = 5 * time.Minute + RetryIntervalPodsReady = 5 * time.Second printInterval = 5 // print to stdout every 5 iterations ) type WaitPodsReady struct { + Ctx context.Context KubeConfigFilePath string Namespace string LabelSelector string @@ -49,10 +49,7 @@ func (w *WaitPodsReady) Run() error { return fmt.Errorf("error creating Kubernetes client: %w", err) } - ctx, cancel := context.WithTimeout(context.Background(), timeoutWaitForPodsSeconds*time.Second) - defer cancel() - - return WaitForPodReady(ctx, clientset, w.Namespace, w.LabelSelector) + return WaitForPodReady(w.Ctx, clientset, w.Namespace, w.LabelSelector) } // Require for background steps diff --git 
a/test/e2e/framework/scaletest/add-shared-labels.go b/test/e2e/framework/scaletest/add-shared-labels.go index 6a38be4f5d..759c784226 100644 --- a/test/e2e/framework/scaletest/add-shared-labels.go +++ b/test/e2e/framework/scaletest/add-shared-labels.go @@ -20,6 +20,7 @@ type patchStringValue struct { } type AddSharedLabelsToAllPods struct { + Ctx context.Context KubeConfigFilePath string NumSharedLabelsPerPod int Namespace string @@ -51,10 +52,7 @@ func (a *AddSharedLabelsToAllPods) Run() error { return fmt.Errorf("error creating Kubernetes client: %w", err) } - ctx, cancel := contextToLabelAllPods() - defer cancel() - - resources, err := clientset.CoreV1().Pods(a.Namespace).List(ctx, metav1.ListOptions{}) + resources, err := clientset.CoreV1().Pods(a.Namespace).List(a.Ctx, metav1.ListOptions{}) patchBytes, err := getSharedLabelsPatch(a.NumSharedLabelsPerPod) if err != nil { @@ -62,7 +60,7 @@ func (a *AddSharedLabelsToAllPods) Run() error { } for _, resource := range resources.Items { - err = patchLabel(ctx, clientset, a.Namespace, resource.Name, patchBytes) + err = patchLabel(a.Ctx, clientset, a.Namespace, resource.Name, patchBytes) if err != nil { log.Printf("Error adding shared labels to pod %s: %s\n", resource.Name, err) } diff --git a/test/e2e/framework/scaletest/add-unique-labels.go b/test/e2e/framework/scaletest/add-unique-labels.go index ff85764d8f..1f67876674 100644 --- a/test/e2e/framework/scaletest/add-unique-labels.go +++ b/test/e2e/framework/scaletest/add-unique-labels.go @@ -1,6 +1,7 @@ package scaletest import ( + "context" "encoding/json" "fmt" @@ -10,6 +11,7 @@ import ( ) type AddUniqueLabelsToAllPods struct { + Ctx context.Context KubeConfigFilePath string NumUniqueLabelsPerPod int Namespace string @@ -41,10 +43,7 @@ func (a *AddUniqueLabelsToAllPods) Run() error { return fmt.Errorf("error creating Kubernetes client: %w", err) } - ctx, cancel := contextToLabelAllPods() - defer cancel() - - resources, err := clientset.CoreV1().Pods(a.Namespace).List(ctx, metav1.ListOptions{}) + resources, err := clientset.CoreV1().Pods(a.Namespace).List(a.Ctx, metav1.ListOptions{}) count := 0 @@ -64,7 +63,7 @@ func (a *AddUniqueLabelsToAllPods) Run() error { return fmt.Errorf("failed to marshal patch: %w", err) } - err = patchLabel(ctx, clientset, a.Namespace, resource.Name, patchBytes) + err = patchLabel(a.Ctx, clientset, a.Namespace, resource.Name, patchBytes) if err != nil { return fmt.Errorf("error adding unique label to pod: %w", err) } diff --git a/test/e2e/framework/scaletest/create-network-policies.go b/test/e2e/framework/scaletest/create-network-policies.go index c38f9597cc..3cbdc32ade 100644 --- a/test/e2e/framework/scaletest/create-network-policies.go +++ b/test/e2e/framework/scaletest/create-network-policies.go @@ -16,6 +16,7 @@ import ( ) type CreateNetworkPolicies struct { + Ctx context.Context KubeConfigFilePath string Namespace string NumNetworkPolicies int @@ -45,7 +46,7 @@ func (c *CreateNetworkPolicies) Run() error { return fmt.Errorf("error creating Kubernetes client: %w", err) } - ctx, cancel := context.WithTimeout(context.Background(), defaultTimeoutSeconds*time.Second) + ctx, cancel := context.WithTimeout(c.Ctx, defaultTimeoutSeconds*time.Second) defer cancel() networkPolicies := c.generateNetworkPolicies(c.NumNetworkPolicies) diff --git a/test/e2e/framework/scaletest/create-resources.go b/test/e2e/framework/scaletest/create-resources.go index 4a8f522dc6..0948c6ad15 100644 --- a/test/e2e/framework/scaletest/create-resources.go +++ 
b/test/e2e/framework/scaletest/create-resources.go @@ -4,7 +4,6 @@ import ( "context" "fmt" "log" - "time" e2ekubernetes "github.com/microsoft/retina/test/e2e/framework/kubernetes" "github.com/microsoft/retina/test/retry" @@ -14,6 +13,7 @@ import ( ) type CreateResources struct { + Ctx context.Context Namespace string KubeConfigFilePath string NumKwokDeployments int @@ -49,14 +49,11 @@ func (c *CreateResources) Run() error { return fmt.Errorf("error creating Kubernetes client: %w", err) } - ctx, cancel := context.WithTimeout(context.Background(), 1800*time.Second) - defer cancel() - retrier := retry.Retrier{Attempts: defaultRetryAttempts, Delay: defaultRetryDelay} for _, resource := range resources { - err := retrier.Do(ctx, func() error { - return e2ekubernetes.CreateResource(ctx, resource, clientset) + err := retrier.Do(c.Ctx, func() error { + return e2ekubernetes.CreateResource(c.Ctx, resource, clientset) }) if err != nil { return fmt.Errorf("error creating resource: %w", err) diff --git a/test/e2e/framework/scaletest/delete-and-re-add-labels.go b/test/e2e/framework/scaletest/delete-and-re-add-labels.go index d3709880cf..1742d5f37c 100644 --- a/test/e2e/framework/scaletest/delete-and-re-add-labels.go +++ b/test/e2e/framework/scaletest/delete-and-re-add-labels.go @@ -15,6 +15,7 @@ import ( ) type DeleteAndReAddLabels struct { + Ctx context.Context KubeConfigFilePath string NumSharedLabelsPerPod int DeleteLabels bool @@ -49,7 +50,7 @@ func (d *DeleteAndReAddLabels) Run() error { return fmt.Errorf("error creating Kubernetes client: %w", err) } - ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + ctx, cancel := context.WithTimeout(d.Ctx, 10*time.Second) defer cancel() labelsToDelete := `"shared-lab-00000": null, "shared-lab-00001": null, "shared-lab-00002": null` @@ -75,10 +76,7 @@ func (d *DeleteAndReAddLabels) Run() error { patch := fmt.Sprintf(`{"metadata": {"labels": {%s}}}`, labelsToDelete) - ctx, cancel = contextToLabelAllPods() - defer cancel() - - err = d.deleteLabels(ctx, clientset, pods, patch) + err = d.deleteLabels(d.Ctx, clientset, pods, patch) if err != nil { return fmt.Errorf("error deleting labels: %w", err) } @@ -90,10 +88,7 @@ func (d *DeleteAndReAddLabels) Run() error { patch = fmt.Sprintf(`{"metadata": {"labels": {%s}}}`, labelsToAdd) - ctx, cancel = contextToLabelAllPods() - defer cancel() - - err = d.addLabels(ctx, clientset, pods, patch) + err = d.addLabels(d.Ctx, clientset, pods, patch) if err != nil { return fmt.Errorf("error adding labels: %w", err) } diff --git a/test/e2e/framework/scaletest/get-publish-metrics.go b/test/e2e/framework/scaletest/get-publish-metrics.go index b8e168b239..935aab9ced 100644 --- a/test/e2e/framework/scaletest/get-publish-metrics.go +++ b/test/e2e/framework/scaletest/get-publish-metrics.go @@ -31,6 +31,7 @@ const ( ) type GetAndPublishMetrics struct { + Ctx context.Context KubeConfigFilePath string AdditionalTelemetryProperty map[string]string Labels map[string]string @@ -136,7 +137,7 @@ func (g *GetAndPublishMetrics) Prevalidate() error { func (g *GetAndPublishMetrics) getAndPublishMetrics() error { - ctx, cancel := context.WithTimeout(context.Background(), defaultTimeoutSeconds*time.Second) + ctx, cancel := context.WithTimeout(g.Ctx, defaultTimeoutSeconds*time.Second) defer cancel() labelSelector := labels.Set(g.Labels).String() diff --git a/test/e2e/framework/scaletest/options.go b/test/e2e/framework/scaletest/options.go index a7d27683b6..03dfcdfe22 100644 --- a/test/e2e/framework/scaletest/options.go +++ 
b/test/e2e/framework/scaletest/options.go @@ -1,9 +1,13 @@ package scaletest -import "time" +import ( + "context" + "time" +) // Options holds parameters for the scale test type Options struct { + Ctx context.Context Namespace string MaxKwokPodsPerNode int NumKwokDeployments int diff --git a/test/e2e/jobs/scale.go b/test/e2e/jobs/scale.go index aef257e155..041fcc3f3d 100644 --- a/test/e2e/jobs/scale.go +++ b/test/e2e/jobs/scale.go @@ -59,7 +59,7 @@ func GetScaleTestInfra(subID, rg, clusterName, location, kubeConfigFilePath stri job.AddStep((&azure.CreateCluster{ ClusterName: clusterName, - Nodes: nodes, + Nodes: nodes, }). SetPodCidr("100.64.0.0/10"). SetVMSize("Standard_D4_v3"). @@ -115,6 +115,7 @@ func ScaleTest(opt *scaletest.Options) *types.Job { // There's a known limitation on leaving empty fields in Steps. // Set methods are used to set private fields and keep environment variables accessed within jobs, rather then spread through steps. job.AddStep((&scaletest.GetAndPublishMetrics{ + Ctx: opt.Ctx, Labels: opt.LabelsToGetMetrics, AdditionalTelemetryProperty: opt.AdditionalTelemetryProperty, }). @@ -126,6 +127,7 @@ func ScaleTest(opt *scaletest.Options) *types.Job { }) job.AddStep(&scaletest.CreateResources{ + Ctx: opt.Ctx, NumKwokDeployments: opt.NumKwokDeployments, NumKwokReplicas: opt.NumKwokReplicas, RealPodType: opt.RealPodType, @@ -136,24 +138,29 @@ func ScaleTest(opt *scaletest.Options) *types.Job { }, nil) job.AddStep(&scaletest.AddSharedLabelsToAllPods{ + Ctx: opt.Ctx, NumSharedLabelsPerPod: opt.NumSharedLabelsPerPod, }, nil) job.AddStep(&scaletest.AddUniqueLabelsToAllPods{ + Ctx: opt.Ctx, NumUniqueLabelsPerPod: opt.NumUniqueLabelsPerPod, }, nil) // Apply network policies (applied and unapplied) job.AddStep(&scaletest.CreateNetworkPolicies{ + Ctx: opt.Ctx, NumNetworkPolicies: opt.NumNetworkPolicies, NumSharedLabelsPerPod: opt.NumSharedLabelsPerPod, }, nil) job.AddStep(&kubernetes.WaitPodsReady{ + Ctx: opt.Ctx, LabelSelector: "is-real=true", }, nil) job.AddStep(&scaletest.DeleteAndReAddLabels{ + Ctx: opt.Ctx, DeleteLabels: opt.DeleteLabels, DeleteLabelsInterval: opt.DeleteLabelsInterval, DeleteLabelsTimes: opt.DeleteLabelsTimes, diff --git a/test/e2e/scale_test.go b/test/e2e/scale_test.go index 3922971c73..efdf7002c7 100644 --- a/test/e2e/scale_test.go +++ b/test/e2e/scale_test.go @@ -42,6 +42,7 @@ func TestE2ERetina_Scale(t *testing.T) { // Scale test parameters opt := jobs.DefaultScaleTestOptions() opt.KubeconfigPath = common.KubeConfigFilePath(rootDir) + opt.Ctx = ctx NumDeployments := params.NumDeployments NumReplicas := params.NumReplicas From 6e8b13d33a08bba50562aa1784e338631696b1d7 Mon Sep 17 00:00:00 2001 From: Alex Castilio dos Santos Date: Tue, 4 Feb 2025 15:17:39 +0000 Subject: [PATCH 03/10] fix(test): add retrier to step LabelNodes Signed-off-by: Alex Castilio dos Santos --- test/e2e/framework/kubernetes/label-nodes.go | 25 ++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/test/e2e/framework/kubernetes/label-nodes.go b/test/e2e/framework/kubernetes/label-nodes.go index 078c630457..2d2d2bfc58 100644 --- a/test/e2e/framework/kubernetes/label-nodes.go +++ b/test/e2e/framework/kubernetes/label-nodes.go @@ -7,6 +7,8 @@ import ( "log" "time" + retry "github.com/microsoft/retina/test/retry" + corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" "k8s.io/client-go/kubernetes" @@ -42,9 +44,18 @@ func (l *LabelNodes) Run() error { ctx, cancel := context.WithTimeout(context.Background(), 
defaultTimeoutSeconds*time.Second) defer cancel() - nodes, err := clientset.CoreV1().Nodes().List(ctx, metav1.ListOptions{}) + var nodes *corev1.NodeList + + retrier := retry.Retrier{Attempts: defaultRetryAttempts, Delay: defaultRetryDelay} + retrier.Do(ctx, func() error { + nodes, err = clientset.CoreV1().Nodes().List(ctx, metav1.ListOptions{}) + if err != nil { + return fmt.Errorf("failed to get nodes: %w", err) + } + return nil + }) if err != nil { - return fmt.Errorf("failed to get nodes: %w", err) + return fmt.Errorf("retrier failed: %w", err) } patch := []patchStringValue{} @@ -62,9 +73,15 @@ func (l *LabelNodes) Run() error { for i := range nodes.Items { log.Println("Labeling node", nodes.Items[i].Name) - _, err = clientset.CoreV1().Nodes().Patch(ctx, nodes.Items[i].Name, types.JSONPatchType, b, metav1.PatchOptions{}) + retrier.Do(ctx, func() error { + _, err = clientset.CoreV1().Nodes().Patch(ctx, nodes.Items[i].Name, types.JSONPatchType, b, metav1.PatchOptions{}) + if err != nil { + return fmt.Errorf("failed to patch pod: %w", err) + } + return nil + }) if err != nil { - return fmt.Errorf("failed to patch pod: %w", err) + return fmt.Errorf("retrier failed: %w", err) } } From 2f1b9c0702867e28241919eb0e0e591f25dd2673 Mon Sep 17 00:00:00 2001 From: Alex Castilio dos Santos Date: Tue, 4 Feb 2025 16:57:36 +0000 Subject: [PATCH 04/10] fix(test): adjust defaultRetryDelay for retriers Signed-off-by: Alex Castilio dos Santos --- test/e2e/framework/scaletest/get-publish-metrics.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/e2e/framework/scaletest/get-publish-metrics.go b/test/e2e/framework/scaletest/get-publish-metrics.go index 935aab9ced..2d73de4eba 100644 --- a/test/e2e/framework/scaletest/get-publish-metrics.go +++ b/test/e2e/framework/scaletest/get-publish-metrics.go @@ -26,7 +26,7 @@ import ( const ( defaultRetryAttempts = 10 - defaultRetryDelay = 500 * time.Millisecond + defaultRetryDelay = 3 * time.Second defaultInterval = 2 * time.Minute ) From 056a666b6b4ec7c5effbbc027b34c81c8f50518d Mon Sep 17 00:00:00 2001 From: Alex Castilio dos Santos Date: Tue, 4 Feb 2025 17:46:49 +0000 Subject: [PATCH 05/10] fix(test): adjust timeout for step LabelNodes Signed-off-by: Alex Castilio dos Santos --- test/e2e/framework/kubernetes/label-nodes.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/e2e/framework/kubernetes/label-nodes.go b/test/e2e/framework/kubernetes/label-nodes.go index 2d2d2bfc58..e02b5607cc 100644 --- a/test/e2e/framework/kubernetes/label-nodes.go +++ b/test/e2e/framework/kubernetes/label-nodes.go @@ -41,7 +41,7 @@ func (l *LabelNodes) Run() error { return fmt.Errorf("error creating Kubernetes client: %w", err) } - ctx, cancel := context.WithTimeout(context.Background(), defaultTimeoutSeconds*time.Second) + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute) defer cancel() var nodes *corev1.NodeList From 2c0f6825093c447919e87421b7c8df415506c6ad Mon Sep 17 00:00:00 2001 From: Alex Castilio dos Santos Date: Wed, 5 Feb 2025 09:17:52 +0000 Subject: [PATCH 06/10] fix(test): small fixes Signed-off-by: Alex Castilio dos Santos --- .github/workflows/daily-scale-test.yaml | 6 +----- .github/workflows/scale-test.yaml | 6 +++--- test/e2e/framework/azure/create-cluster.go | 1 + test/e2e/framework/kubernetes/label-nodes.go | 4 ++-- test/e2e/framework/scaletest/add-shared-labels.go | 3 +++ test/e2e/framework/scaletest/add-unique-labels.go | 3 +++ test/e2e/framework/scaletest/delete-and-re-add-labels.go | 6 +----- 7 files 
changed, 14 insertions(+), 15 deletions(-) diff --git a/.github/workflows/daily-scale-test.yaml b/.github/workflows/daily-scale-test.yaml index 397503c041..c62b353973 100644 --- a/.github/workflows/daily-scale-test.yaml +++ b/.github/workflows/daily-scale-test.yaml @@ -1,9 +1,6 @@ name: Daily Scale Test on: - push: - branches: - - alexcastilio/scale-test-workflow schedule: - cron: "0 0 * * *" @@ -17,8 +14,7 @@ jobs: with: num_deployments: 1000 num_replicas: 20 - # TODO: Fix values num_netpol: 0 num_nodes: 1000 - cleanup: false + cleanup: true secrets: inherit diff --git a/.github/workflows/scale-test.yaml b/.github/workflows/scale-test.yaml index aa31586d97..060b673338 100644 --- a/.github/workflows/scale-test.yaml +++ b/.github/workflows/scale-test.yaml @@ -46,10 +46,10 @@ on: type: number num_netpol: description: "Number of Network Policies" - default: 100 + default: 0 type: number num_nodes: - description: "Number of nodes per pool" + description: "Number of nodes" default: 100 type: number cleanup: @@ -102,4 +102,4 @@ jobs: run: | set -euo pipefail [[ $TAG == "" ]] && TAG=$(curl -s https://api.github.com/repos/microsoft/retina/commits | jq -r '.[0].sha' | cut -c1-7) - go test -v ./test/e2e/. -timeout 300m -tags=scale -count=1 -args -create-infra=$(echo $CREATE_INFRA) -delete-infra=$(echo $CREATE_INFRA) + go test -v ./test/e2e/. -timeout 300m -tags=scale -count=1 -args -create-infra=$(echo $CREATE_INFRA) -delete-infra=$(echo $CLEANUP) diff --git a/test/e2e/framework/azure/create-cluster.go b/test/e2e/framework/azure/create-cluster.go index 161c2ae06c..5c252ac703 100644 --- a/test/e2e/framework/azure/create-cluster.go +++ b/test/e2e/framework/azure/create-cluster.go @@ -39,6 +39,7 @@ func (c *CreateCluster) SetVMSize(vmSize string) *CreateCluster { c.vmSize = vmSize return c } + func (c *CreateCluster) SetNetworkPluginMode(networkPluginMode string) *CreateCluster { c.networkPluginMode = networkPluginMode return c diff --git a/test/e2e/framework/kubernetes/label-nodes.go b/test/e2e/framework/kubernetes/label-nodes.go index e02b5607cc..12f2d46c92 100644 --- a/test/e2e/framework/kubernetes/label-nodes.go +++ b/test/e2e/framework/kubernetes/label-nodes.go @@ -47,7 +47,7 @@ func (l *LabelNodes) Run() error { var nodes *corev1.NodeList retrier := retry.Retrier{Attempts: defaultRetryAttempts, Delay: defaultRetryDelay} - retrier.Do(ctx, func() error { + err = retrier.Do(ctx, func() error { nodes, err = clientset.CoreV1().Nodes().List(ctx, metav1.ListOptions{}) if err != nil { return fmt.Errorf("failed to get nodes: %w", err) @@ -73,7 +73,7 @@ func (l *LabelNodes) Run() error { for i := range nodes.Items { log.Println("Labeling node", nodes.Items[i].Name) - retrier.Do(ctx, func() error { + err = retrier.Do(ctx, func() error { _, err = clientset.CoreV1().Nodes().Patch(ctx, nodes.Items[i].Name, types.JSONPatchType, b, metav1.PatchOptions{}) if err != nil { return fmt.Errorf("failed to patch pod: %w", err) diff --git a/test/e2e/framework/scaletest/add-shared-labels.go b/test/e2e/framework/scaletest/add-shared-labels.go index 759c784226..c1581c0c72 100644 --- a/test/e2e/framework/scaletest/add-shared-labels.go +++ b/test/e2e/framework/scaletest/add-shared-labels.go @@ -53,6 +53,9 @@ func (a *AddSharedLabelsToAllPods) Run() error { } resources, err := clientset.CoreV1().Pods(a.Namespace).List(a.Ctx, metav1.ListOptions{}) + if err != nil { + return fmt.Errorf("failed to list pods: %w", err) + } patchBytes, err := getSharedLabelsPatch(a.NumSharedLabelsPerPod) if err != nil { diff --git 
a/test/e2e/framework/scaletest/add-unique-labels.go b/test/e2e/framework/scaletest/add-unique-labels.go index 1f67876674..2362a7d66c 100644 --- a/test/e2e/framework/scaletest/add-unique-labels.go +++ b/test/e2e/framework/scaletest/add-unique-labels.go @@ -44,6 +44,9 @@ func (a *AddUniqueLabelsToAllPods) Run() error { } resources, err := clientset.CoreV1().Pods(a.Namespace).List(a.Ctx, metav1.ListOptions{}) + if err != nil { + return fmt.Errorf("failed to list pods: %w", err) + } count := 0 diff --git a/test/e2e/framework/scaletest/delete-and-re-add-labels.go b/test/e2e/framework/scaletest/delete-and-re-add-labels.go index 1742d5f37c..e7377d7c84 100644 --- a/test/e2e/framework/scaletest/delete-and-re-add-labels.go +++ b/test/e2e/framework/scaletest/delete-and-re-add-labels.go @@ -35,7 +35,6 @@ func (d *DeleteAndReAddLabels) Prevalidate() error { // Primary step where test logic is executed // Returning an error will cause the test to fail func (d *DeleteAndReAddLabels) Run() error { - if d.NumSharedLabelsPerPod <= 2 || !d.DeleteLabels { return nil } @@ -60,7 +59,7 @@ func (d *DeleteAndReAddLabels) Run() error { retrier := retry.Retrier{Attempts: defaultRetryAttempts, Delay: defaultRetryDelay} - retrier.Do(ctx, func() error { + err = retrier.Do(ctx, func() error { pods, err = clientset.CoreV1().Pods(d.Namespace).List(ctx, metav1.ListOptions{}) if err != nil { return fmt.Errorf("failed to list pods: %w", err) @@ -101,7 +100,6 @@ func (d *DeleteAndReAddLabels) Run() error { } func (d *DeleteAndReAddLabels) addLabels(ctx context.Context, clientset *kubernetes.Clientset, pods *corev1.PodList, patch string) error { - for _, pod := range pods.Items { log.Println("Labeling Pod", pod.Name) @@ -113,7 +111,6 @@ func (d *DeleteAndReAddLabels) addLabels(ctx context.Context, clientset *kuberne } return nil }) - if err != nil { return fmt.Errorf("could not patch pod: %w", err) } @@ -135,7 +132,6 @@ func (d *DeleteAndReAddLabels) deleteLabels(ctx context.Context, clientset *kube } return nil }) - if err != nil { return fmt.Errorf("could not patch pod: %w", err) } From 7d44787359ae65913fcf181be14f8fcf5707f53f Mon Sep 17 00:00:00 2001 From: Alex Castilio dos Santos Date: Thu, 6 Feb 2025 11:06:13 +0000 Subject: [PATCH 07/10] fix(test): change kapinger memory limit to 20Mi Signed-off-by: Alex Castilio dos Santos --- test/e2e/framework/kubernetes/create-kapinger-deployment.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/e2e/framework/kubernetes/create-kapinger-deployment.go b/test/e2e/framework/kubernetes/create-kapinger-deployment.go index 7e25320d1b..9d5a74354a 100644 --- a/test/e2e/framework/kubernetes/create-kapinger-deployment.go +++ b/test/e2e/framework/kubernetes/create-kapinger-deployment.go @@ -138,7 +138,7 @@ func (c *CreateKapingerDeployment) GetKapingerDeployment() *appsv1.Deployment { "memory": resource.MustParse("20Mi"), }, Limits: v1.ResourceList{ - "memory": resource.MustParse("100Mi"), + "memory": resource.MustParse("20Mi"), }, }, Ports: []v1.ContainerPort{ From 13c5e07dc585328a4ea6a3f92f332eaeeffc6416 Mon Sep 17 00:00:00 2001 From: Alex Castilio dos Santos Date: Thu, 6 Feb 2025 12:14:22 +0000 Subject: [PATCH 08/10] fix(test): Notify ctx to be cancelled on OS interrupt and termination signals Signed-off-by: Alex Castilio dos Santos --- test/e2e/framework/helpers/helpers.go | 7 ++++++- test/e2e/framework/types/runner.go | 2 +- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/test/e2e/framework/helpers/helpers.go b/test/e2e/framework/helpers/helpers.go index 
5102d7cae4..d47c2f87b9 100644 --- a/test/e2e/framework/helpers/helpers.go +++ b/test/e2e/framework/helpers/helpers.go @@ -2,6 +2,8 @@ package helpers import ( "context" + "os/signal" + "syscall" "testing" "time" ) @@ -20,5 +22,8 @@ func Context(t *testing.T) (context.Context, context.CancelFunc) { // Subtract a minute from the deadline to ensure we have time to cleanup deadline = deadline.Add(-time.Minute) - return context.WithDeadline(context.Background(), deadline) + ctx, cancel := context.WithDeadline(context.Background(), deadline) + ctx, cancel = signal.NotifyContext(ctx, syscall.SIGINT, syscall.SIGTERM) + + return ctx, cancel } diff --git a/test/e2e/framework/types/runner.go b/test/e2e/framework/types/runner.go index 241205dc26..ac094ea5cd 100644 --- a/test/e2e/framework/types/runner.go +++ b/test/e2e/framework/types/runner.go @@ -33,7 +33,7 @@ func (r *Runner) Run(ctx context.Context) { }() select { case <-ctx.Done(): - r.t.Fatal("Test deadline exceeded. If more time is needed, set -timeout flag to a higher value") + r.t.Fatal("Failed to complete execution:", ctx.Err()) case err := <-runComplete: require.NoError(r.t, err) } From c05ce6fabbd531e460fe12f7d0c2b91692f0cd51 Mon Sep 17 00:00:00 2001 From: Alex Castilio dos Santos Date: Thu, 6 Feb 2025 12:16:14 +0000 Subject: [PATCH 09/10] fix(test): add trigger to run workflow on push to PR branch Signed-off-by: Alex Castilio dos Santos --- .github/workflows/daily-scale-test.yaml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/daily-scale-test.yaml b/.github/workflows/daily-scale-test.yaml index c62b353973..4b38e41ec5 100644 --- a/.github/workflows/daily-scale-test.yaml +++ b/.github/workflows/daily-scale-test.yaml @@ -1,6 +1,9 @@ name: Daily Scale Test on: + push: + branches: + - alexcastilio/scale-test-workflow schedule: - cron: "0 0 * * *" @@ -16,5 +19,6 @@ jobs: num_replicas: 20 num_netpol: 0 num_nodes: 1000 - cleanup: true + # TODO: Change to true after checking cluster + cleanup: false secrets: inherit From 4b0866f1b58437c57aaa386147b4a649d7c38037 Mon Sep 17 00:00:00 2001 From: Alex Castilio dos Santos Date: Thu, 6 Feb 2025 16:40:47 +0000 Subject: [PATCH 10/10] fix(test): fix propagation of signals when manually cancelling workflow execution Signed-off-by: Alex Castilio dos Santos --- .github/workflows/scale-test.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/scale-test.yaml b/.github/workflows/scale-test.yaml index 060b673338..7b64d8798e 100644 --- a/.github/workflows/scale-test.yaml +++ b/.github/workflows/scale-test.yaml @@ -102,4 +102,4 @@ jobs: run: | set -euo pipefail [[ $TAG == "" ]] && TAG=$(curl -s https://api.github.com/repos/microsoft/retina/commits | jq -r '.[0].sha' | cut -c1-7) - go test -v ./test/e2e/. -timeout 300m -tags=scale -count=1 -args -create-infra=$(echo $CREATE_INFRA) -delete-infra=$(echo $CLEANUP) + exec go test -v ./test/e2e/. -timeout 300m -tags=scale -count=1 -args -create-infra=$(echo $CREATE_INFRA) -delete-infra=$(echo $CLEANUP)
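
A few notes on the patterns these patches lean on. Several hunks wrap Kubernetes API calls in `retry.Retrier{Attempts: defaultRetryAttempts, Delay: defaultRetryDelay}` from `test/retry`, whose implementation is not part of this series. The sketch below shows the minimal contract the call sites rely on — an `Attempts`/`Delay` struct with a context-aware `Do` — and is an inference from usage, not the repository's actual code:

```go
package retry

import (
	"context"
	"time"
)

// Retrier retries fn up to Attempts times, sleeping Delay between tries.
// Minimal sketch inferred from the call sites in this series; the real
// test/retry package may differ.
type Retrier struct {
	Attempts int
	Delay    time.Duration
}

// Do runs fn until it succeeds, attempts are exhausted, or ctx is done.
func (r Retrier) Do(ctx context.Context, fn func() error) error {
	var err error
	for i := 0; i < r.Attempts; i++ {
		if err = fn(); err == nil {
			return nil
		}
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-time.After(r.Delay):
		}
	}
	return err
}
```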
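
Patch 08 makes the test context cancel on SIGINT/SIGTERM via `signal.NotifyContext`. As written, `helpers.Context` reassigns `cancel`, so the `context.WithDeadline` cancel func is dropped and only the signal watcher's stop func is returned. A variant that composes both cancels, as a sketch using only the standard library:

```go
package helpers

import (
	"context"
	"os/signal"
	"syscall"
	"time"
)

// ContextWithDeadlineAndSignals keeps both cancel funcs: the returned
// CancelFunc unregisters the signal watcher and releases the deadline
// timer. Helper name is illustrative, not from the framework.
func ContextWithDeadlineAndSignals(deadline time.Time) (context.Context, context.CancelFunc) {
	ctx, cancelDeadline := context.WithDeadline(context.Background(), deadline)
	ctx, stopSignals := signal.NotifyContext(ctx, syscall.SIGINT, syscall.SIGTERM)
	return ctx, func() {
		stopSignals()
		cancelDeadline()
	}
}
```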
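
The `LabelNodes` step builds a JSON patch with `Path: "/metadata/labels/" + k`. JSON Pointer segments (RFC 6901) require `~` to be escaped as `~0` and `/` as `~1`, so a prefixed label key such as `kubernetes.io/role` would produce an invalid path; the helper as merged assumes plain keys like `scale-test`. A hedged sketch of an escaping variant (`labelPatchPath` is a hypothetical helper, not in the patch):

```go
package kubernetes

import "strings"

// jsonPointerEscaper escapes a map key for use as a JSON Pointer segment
// (RFC 6901): "~" becomes "~0" and "/" becomes "~1". strings.Replacer does
// a single pass, so inserted "~0" sequences are not re-escaped.
var jsonPointerEscaper = strings.NewReplacer("~", "~0", "/", "~1")

func labelPatchPath(key string) string {
	return "/metadata/labels/" + jsonPointerEscaper.Replace(key)
}
```

For example, `labelPatchPath("kubernetes.io/role")` yields `/metadata/labels/kubernetes.io~1role`, which the API server resolves back to the original key.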
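
`DeleteAndReAddLabels` hand-writes its strategic-merge-patch bodies, where `"shared-lab-00000": null` removes a label and a string value re-adds it. The same payloads can be built with `json.Marshal` over `map[string]*string`, since a nil pointer marshals to JSON `null` — a sketch under that assumption (helper name is illustrative):

```go
package scaletest

import "encoding/json"

// labelsMergePatch builds a strategic-merge-patch body for pod labels.
// A nil value deletes the label; a non-nil value sets it.
func labelsMergePatch(labels map[string]*string) ([]byte, error) {
	return json.Marshal(map[string]any{
		"metadata": map[string]any{"labels": labels},
	})
}
```

So `labelsMergePatch(map[string]*string{"shared-lab-00000": nil})` produces `{"metadata":{"labels":{"shared-lab-00000":null}}}`, matching the string the step currently formats by hand, while avoiding quoting mistakes as the label set grows.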
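
Finally, the new `LabelNodes` step implements `Prevalidate`, `Run`, and `Stop`, the same method set as the scaletest steps it sits alongside. The contract below is inferred from those method sets; the actual interface lives in `test/e2e/framework/types` and may differ:

```go
package types

// Step is the contract the scale-test steps in this series appear to
// satisfy; inferred from their methods, not copied from the framework.
type Step interface {
	Prevalidate() error // cheap checks before the job runs
	Run() error         // the step's actual work; an error fails the test
	Stop() error        // teardown, required for background steps
}
```

Steps that do no background work, like `LabelNodes`, satisfy `Stop` with a no-op returning nil.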