From b4284fc0aadba49058ea17ce3a6900a46a4e60b6 Mon Sep 17 00:00:00 2001 From: Alex Castilio dos Santos Date: Mon, 20 Jan 2025 14:43:44 +0000 Subject: [PATCH] test: automate scale test execution Signed-off-by: Alex Castilio dos Santos --- .github/workflows/daily-scale-test.yaml | 24 +++++ .github/workflows/scale-test.yaml | 26 +++--- test/e2e/common/common.go | 56 ++++++++++- .../azure/create-cluster-with-npm.go | 10 +- test/e2e/framework/azure/create-cluster.go | 6 +- test/e2e/framework/azure/enable-ama.go | 2 +- test/e2e/framework/kubernetes/label-nodes.go | 76 +++++++++++++++ test/e2e/framework/params/params.go | 17 ++++ test/e2e/jobs/jobs.go | 1 + test/e2e/jobs/scale.go | 92 +++++++++++++++++++ test/e2e/scale_test.go | 41 +++------ 11 files changed, 299 insertions(+), 52 deletions(-) create mode 100644 .github/workflows/daily-scale-test.yaml create mode 100644 test/e2e/framework/kubernetes/label-nodes.go create mode 100644 test/e2e/framework/params/params.go diff --git a/.github/workflows/daily-scale-test.yaml b/.github/workflows/daily-scale-test.yaml new file mode 100644 index 0000000000..9aeea0b2f6 --- /dev/null +++ b/.github/workflows/daily-scale-test.yaml @@ -0,0 +1,24 @@ +name: Daily Scale Test + +on: + push: + branches: + - alexcastilio/scale-test-workflow + # schedule: + # - cron: "0 0 * * *" + +permissions: + contents: read + id-token: write + +jobs: + call-scale-test: + uses: ./.github/workflows/scale-test.yaml + with: + num_deployments: 300 + num_replicas: 100 + # TODO: Fix values + num_netpol: 300 + # num_nodes: 100 + cleanup: false + secrets: inherit diff --git a/.github/workflows/scale-test.yaml b/.github/workflows/scale-test.yaml index 6ce87b25a8..56245cc503 100644 --- a/.github/workflows/scale-test.yaml +++ b/.github/workflows/scale-test.yaml @@ -15,7 +15,7 @@ on: description: "Image Namespace (if not set, default namespace will be used)" type: string image_tag: - description: "Image Tag (if not set, default for this commit will be used)" + 
description: "Image Tag (if not set, latest commit from 'main' will be used)" type: string num_deployments: description: "Number of Traffic Deployments" @@ -36,25 +36,21 @@ on: workflow_call: inputs: - resource_group: - description: "Azure Resource Group" - required: true - type: string - cluster_name: - description: "AKS Cluster Name" - required: true - type: string num_deployments: description: "Number of Traffic Deployments" - default: 1000 + default: 100 type: number num_replicas: description: "Number of Traffic Replicas per Deployment" - default: 40 + default: 10 type: number num_netpol: description: "Number of Network Policies" - default: 1000 + default: 100 + type: number + num_nodes: + description: "Number of nodes per pool" + default: 100 type: number cleanup: description: "Clean up environment after test" @@ -100,8 +96,10 @@ jobs: IMAGE_NAMESPACE: ${{ github.repository }} TAG: ${{ inputs.image_tag }} AZURE_APP_INSIGHTS_KEY: ${{ secrets.AZURE_APP_INSIGHTS_KEY }} + NODES_PER_POOL: ${{ inputs.num_nodes }} + CREATE_INFRA: ${{ github.event_name != 'workflow_dispatch' }} shell: bash run: | set -euo pipefail - [[ $TAG == "" ]] && TAG=$(make version) - go test -v ./test/e2e/. -timeout 300m -tags=scale -count=1 -args -create-infra=false -delete-infra=false + [[ $TAG == "" ]] && TAG=$(curl -s https://api.github.com/repos/microsoft/retina/commits | jq -r '.[0].sha' | cut -c1-7) + go test -v ./test/e2e/. 
-timeout 300m -tags=scale -count=1 -args -create-infra=$(echo $CREATE_INFRA) -delete-infra=$(echo $CREATE_INFRA) diff --git a/test/e2e/common/common.go b/test/e2e/common/common.go index 2aa29f05dd..d72ba7dd5f 100644 --- a/test/e2e/common/common.go +++ b/test/e2e/common/common.go @@ -6,12 +6,12 @@ package common import ( "flag" - "os" "os/user" "strconv" "testing" "time" + "github.com/microsoft/retina/test/e2e/framework/params" "github.com/stretchr/testify/require" ) @@ -30,10 +30,62 @@ var ( Architectures = []string{"amd64", "arm64"} CreateInfra = flag.Bool("create-infra", true, "create a Resource group, vNET and AKS cluster for testing") DeleteInfra = flag.Bool("delete-infra", true, "delete a Resource group, vNET and AKS cluster for testing") + ScaleTestInfra = ScaleTestInfraHandler{ + location: params.Location, + subscriptionID: params.SubscriptionID, + resourceGroup: params.ResourceGroup, + clusterName: params.ClusterName, + nodesPerPool: params.NodesPerPool, + } ) +type ScaleTestInfraHandler struct { + location string + subscriptionID string + resourceGroup string + clusterName string + nodesPerPool string +} + +func (s ScaleTestInfraHandler) GetSubscriptionID(t *testing.T) string { + require.NotEmpty(t, s.subscriptionID) + return s.subscriptionID +} + +func (s ScaleTestInfraHandler) GetLocation(t *testing.T) string { + if s.location == "" { + return "eastus2" + } + return s.location +} + +func (s ScaleTestInfraHandler) GetResourceGroup(t *testing.T) string { + if s.resourceGroup != "" { + return s.resourceGroup + } + // Use the cluster name as the resource group name by default. 
+ return s.GetClusterName(t) +} + +func (s ScaleTestInfraHandler) GetNodesPerPool(t *testing.T) int32 { + if s.nodesPerPool == "" { + // Default to 100 nodes per pool + return 100 + } + nodesPerPool, err := strconv.Atoi(s.nodesPerPool) + require.NoError(t, err, "NODES_PER_POOL must be an integer") + return int32(nodesPerPool) +} + +func (s ScaleTestInfraHandler) GetClusterName(t *testing.T) string { + if s.clusterName != "" { + return s.clusterName + } + return "retina-scale-test" +} + func ClusterNameForE2ETest(t *testing.T) string { - clusterName := os.Getenv("CLUSTER_NAME") + clusterName := params.ClusterName if clusterName == "" { curuser, err := user.Current() require.NoError(t, err) diff --git a/test/e2e/framework/azure/create-cluster-with-npm.go b/test/e2e/framework/azure/create-cluster-with-npm.go index fe9ea656f6..1f1a449fb8 100644 --- a/test/e2e/framework/azure/create-cluster-with-npm.go +++ b/test/e2e/framework/azure/create-cluster-with-npm.go @@ -21,8 +21,7 @@ const ( clusterTimeout = 15 * time.Minute clusterCreateTicker = 30 * time.Second pollFrequency = 5 * time.Second - AgentARMSKU = "Standard_D4pls_v5" - AuxilaryNodeCount = 1 + AgentARMSKU = "Standard_D4pls_v6" ) type CreateNPMCluster struct { @@ -35,6 +34,7 @@ type CreateNPMCluster struct { PodCidr string DNSServiceIP string ServiceCidr string + NodesPerPool int32 } func (c *CreateNPMCluster) Prevalidate() error { @@ -47,7 +47,7 @@ func (c *CreateNPMCluster) Stop() error { func (c *CreateNPMCluster) Run() error { // Start with default cluster template - npmCluster := GetStarterClusterTemplate(c.Location) + npmCluster := GetStarterClusterTemplate(c.Location, c.NodesPerPool) npmCluster.Properties.NetworkProfile.NetworkPolicy = to.Ptr(armcontainerservice.NetworkPolicyAzure) @@ -55,7 +55,7 @@ func (c *CreateNPMCluster) Run() error { npmCluster.Properties.AgentPoolProfiles = append(npmCluster.Properties.AgentPoolProfiles, &armcontainerservice.ManagedClusterAgentPoolProfile{ //nolint:all Type: 
to.Ptr(armcontainerservice.AgentPoolTypeVirtualMachineScaleSets), // AvailabilityZones: []*string{to.Ptr("1")}, - Count: to.Ptr[int32](AuxilaryNodeCount), + Count: to.Ptr[int32](c.NodesPerPool), EnableNodePublicIP: to.Ptr(false), Mode: to.Ptr(armcontainerservice.AgentPoolModeUser), OSType: to.Ptr(armcontainerservice.OSTypeWindows), @@ -86,7 +86,7 @@ func (c *CreateNPMCluster) Run() error { npmCluster.Properties.AgentPoolProfiles = append(npmCluster.Properties.AgentPoolProfiles, &armcontainerservice.ManagedClusterAgentPoolProfile{ //nolint:all Type: to.Ptr(armcontainerservice.AgentPoolTypeVirtualMachineScaleSets), // AvailabilityZones: []*string{to.Ptr("1")}, - Count: to.Ptr[int32](AuxilaryNodeCount), + Count: to.Ptr[int32](c.NodesPerPool), EnableNodePublicIP: to.Ptr(false), Mode: to.Ptr(armcontainerservice.AgentPoolModeUser), OSType: to.Ptr(armcontainerservice.OSTypeLinux), diff --git a/test/e2e/framework/azure/create-cluster.go b/test/e2e/framework/azure/create-cluster.go index 160d81fa86..98ebbd67ee 100644 --- a/test/e2e/framework/azure/create-cluster.go +++ b/test/e2e/framework/azure/create-cluster.go @@ -37,7 +37,7 @@ func (c *CreateCluster) Run() error { return fmt.Errorf("failed to create client: %w", err) } - poller, err := clientFactory.NewManagedClustersClient().BeginCreateOrUpdate(ctx, c.ResourceGroupName, c.ClusterName, GetStarterClusterTemplate(c.Location), nil) + poller, err := clientFactory.NewManagedClustersClient().BeginCreateOrUpdate(ctx, c.ResourceGroupName, c.ClusterName, GetStarterClusterTemplate(c.Location, MaxNumberOfNodes), nil) if err != nil { return fmt.Errorf("failed to finish the create cluster request: %w", err) } @@ -49,7 +49,7 @@ func (c *CreateCluster) Run() error { return nil } -func GetStarterClusterTemplate(location string) armcontainerservice.ManagedCluster { +func GetStarterClusterTemplate(location string, numOfNodes int32) armcontainerservice.ManagedCluster { id := armcontainerservice.ResourceIdentityTypeSystemAssigned return 
armcontainerservice.ManagedCluster{ Location: to.Ptr(location), @@ -70,7 +70,7 @@ func GetStarterClusterTemplate(location string) armcontainerservice.ManagedClust { Type: to.Ptr(armcontainerservice.AgentPoolTypeVirtualMachineScaleSets), // AvailabilityZones: []*string{to.Ptr("1")}, - Count: to.Ptr[int32](MaxNumberOfNodes), + Count: to.Ptr[int32](numOfNodes), EnableNodePublicIP: to.Ptr(false), Mode: to.Ptr(armcontainerservice.AgentPoolModeSystem), OSType: to.Ptr(armcontainerservice.OSTypeLinux), diff --git a/test/e2e/framework/azure/enable-ama.go b/test/e2e/framework/azure/enable-ama.go index 5dcd89ed5a..aface80165 100644 --- a/test/e2e/framework/azure/enable-ama.go +++ b/test/e2e/framework/azure/enable-ama.go @@ -95,7 +95,7 @@ az aks update --enable-azure-monitor-metrics \ return fmt.Errorf("failed to write cluster JSON to file for AMA: %w", err) } - poller, err := aksClientFactory.NewManagedClustersClient().BeginCreateOrUpdate(ctx, c.ResourceGroupName, c.ClusterName, GetStarterClusterTemplate(c.Location), nil) + poller, err := aksClientFactory.NewManagedClustersClient().BeginCreateOrUpdate(ctx, c.ResourceGroupName, c.ClusterName, GetStarterClusterTemplate(c.Location, MaxNumberOfNodes), nil) if err != nil { return fmt.Errorf("failed to finish the update cluster request for AMA: %w", err) } diff --git a/test/e2e/framework/kubernetes/label-nodes.go b/test/e2e/framework/kubernetes/label-nodes.go new file mode 100644 index 0000000000..87ab72ae47 --- /dev/null +++ b/test/e2e/framework/kubernetes/label-nodes.go @@ -0,0 +1,76 @@ +package kubernetes + +import ( + "context" + "encoding/json" + "log" + "fmt" + "time" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/tools/clientcmd" +) + +type patchStringValue struct { + Op string `json:"op"` + Path string `json:"path"` + Value string `json:"value"` +} + +type LabelNodes struct { + KubeConfigFilePath string + Labels map[string]string +} + 
+func (l *LabelNodes) Prevalidate() error {
+	return nil
+}
+
+func (l *LabelNodes) Run() error {
+	config, err := clientcmd.BuildConfigFromFlags("", l.KubeConfigFilePath)
+	if err != nil {
+		return fmt.Errorf("error building kubeconfig: %w", err)
+	}
+
+	clientset, err := kubernetes.NewForConfig(config)
+	if err != nil {
+		return fmt.Errorf("error creating Kubernetes client: %w", err)
+	}
+
+	ctx, cancel := context.WithTimeout(context.Background(), defaultTimeoutSeconds*time.Second)
+	defer cancel()
+
+	nodes, err := clientset.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
+	if err != nil {
+		return fmt.Errorf("failed to get nodes: %w", err)
+	}
+
+	patch := []patchStringValue{}
+	for k, v := range l.Labels {
+		patch = append(patch, patchStringValue{
+			Op:    "add",
+			Path:  "/metadata/labels/" + k,
+			Value: v,
+		})
+	}
+	b, err := json.Marshal(patch)
+	if err != nil {
+		return fmt.Errorf("failed to marshal patch: %w", err)
+	}
+
+	for _, node := range nodes.Items {
+		log.Println("Labeling node", node.Name)
+		_, err = clientset.CoreV1().Nodes().Patch(ctx, node.Name, types.JSONPatchType, b, metav1.PatchOptions{})
+		if err != nil {
+			return fmt.Errorf("failed to patch node: %w", err)
+		}
+	}
+
+	return nil
+}
+
+func (l *LabelNodes) Stop() error {
+	return nil
+}
diff --git a/test/e2e/framework/params/params.go b/test/e2e/framework/params/params.go
new file mode 100644
index 0000000000..88706d1e9f
--- /dev/null
+++ b/test/e2e/framework/params/params.go
@@ -0,0 +1,17 @@
+package params
+
+import (
+	"os"
+)
+
+var (
+	Location           = os.Getenv("LOCATION")
+	SubscriptionID     = os.Getenv("AZURE_SUBSCRIPTION_ID")
+	ResourceGroup      = os.Getenv("AZURE_RESOURCE_GROUP")
+	ClusterName        = os.Getenv("CLUSTER_NAME")
+	NodesPerPool       = os.Getenv("NODES_PER_POOL")
+	NumDeployments     = os.Getenv("NUM_DEPLOYMENTS")
+	NumReplicas        = os.Getenv("NUM_REPLICAS")
+	NumNetworkPolicies = os.Getenv("NUM_NET_POL")
+	CleanUp            = os.Getenv("CLEANUP")
+)
diff --git a/test/e2e/jobs/jobs.go 
index 375f69aa4e..92f6e55953 100644 --- a/test/e2e/jobs/jobs.go +++ b/test/e2e/jobs/jobs.go @@ -43,6 +43,7 @@ func CreateTestInfra(subID, rg, clusterName, location, kubeConfigFilePath string PodCidr: "10.128.0.0/9", DNSServiceIP: "192.168.0.10", ServiceCidr: "192.168.0.0/28", + NodesPerPool: 1, }, nil) job.AddStep(&azure.GetAKSKubeConfig{ diff --git a/test/e2e/jobs/scale.go b/test/e2e/jobs/scale.go index 58b5d49864..f5ef000676 100644 --- a/test/e2e/jobs/scale.go +++ b/test/e2e/jobs/scale.go @@ -5,12 +5,49 @@ import ( "time" "github.com/microsoft/retina/test/e2e/common" + "github.com/microsoft/retina/test/e2e/framework/azure" + "github.com/microsoft/retina/test/e2e/framework/generic" "github.com/microsoft/retina/test/e2e/framework/kubernetes" "github.com/microsoft/retina/test/e2e/framework/scaletest" "github.com/microsoft/retina/test/e2e/framework/types" ) func DefaultScaleTestOptions() scaletest.Options { + // var NumRealDeployments int + // var NumReplicas int + // var err error + // + // if env.NumDeployments != "" { + // NumRealDeployments, err = strconv.Atoi(env.NumDeployments) + // require.NoError(t, err, "Failed to convert NUM_DEPLOYMENTS to int") + // } else { + // NumRealDeployments = 1000 + // } + // + // if env.NumReplicas != "" { + // NumReplicas, err = strconv.Atoi(env.NumReplicas) + // require.NoError(t, err, "Failed to convert NUM_REPLICAS to int") + // } else { + // NumReplicas = "40" + // NumNetworkPolicies := env.NumNetworkPolicies + // CleanUp := env.CleanUp + // + // if NumDeployments != "" { + // } else { + // NumRealDeployments = 1000 + // } + // if NumReplicas != "" { + // opt.NumRealReplicas, err = strconv.Atoi(NumReplicas) + // require.NoError(t, err) + // } + // if NumNetworkPolicies != "" { + // opt.NumNetworkPolicies, err = strconv.Atoi(NumNetworkPolicies) + // require.NoError(t, err) + // } + // if CleanUp != "" { + // opt.DeleteLabels, err = strconv.ParseBool(CleanUp) + // require.NoError(t, err) + // } return scaletest.Options{ 
Namespace: "scale-test", MaxKwokPodsPerNode: 0, @@ -45,6 +82,61 @@ func DefaultScaleTestOptions() scaletest.Options { } } +func GetScaleTestInfra(subID, rg, clusterName, location, kubeConfigFilePath string, nodesPerPool int32, createInfra bool) *types.Job { + job := types.NewJob("Get scale test infrastructure") + + if createInfra { + job.AddStep(&azure.CreateResourceGroup{ + SubscriptionID: subID, + ResourceGroupName: rg, + Location: location, + }, nil) + + job.AddStep(&azure.CreateVNet{ + VnetName: "testvnet", + VnetAddressSpace: "10.0.0.0/9", + }, nil) + + job.AddStep(&azure.CreateSubnet{ + SubnetName: "testsubnet", + SubnetAddressSpace: "10.0.0.0/12", + }, nil) + + job.AddStep(&azure.CreateNPMCluster{ + ClusterName: clusterName, + PodCidr: "10.128.0.0/9", + DNSServiceIP: "192.168.0.10", + ServiceCidr: "192.168.0.0/28", + NodesPerPool: nodesPerPool, + }, nil) + + job.AddStep(&azure.GetAKSKubeConfig{ + KubeConfigFilePath: kubeConfigFilePath, + }, nil) + + } else { + job.AddStep(&azure.GetAKSKubeConfig{ + KubeConfigFilePath: kubeConfigFilePath, + ClusterName: clusterName, + SubscriptionID: subID, + ResourceGroupName: rg, + Location: location, + }, nil) + } + + job.AddStep(&kubernetes.LabelNodes{ + Labels: map[string]string{"scale-test": "true"}, + }, nil) + + job.AddStep(&generic.LoadFlags{ + TagEnv: generic.DefaultTagEnv, + ImageNamespaceEnv: generic.DefaultImageNamespace, + ImageRegistryEnv: generic.DefaultImageRegistry, + }, nil) + + return job +} + func ScaleTest(opt *scaletest.Options) *types.Job { job := types.NewJob("Scale Test") diff --git a/test/e2e/scale_test.go b/test/e2e/scale_test.go index 687d32ceb5..984b3fdd19 100644 --- a/test/e2e/scale_test.go +++ b/test/e2e/scale_test.go @@ -3,8 +3,6 @@ package retina import ( - "crypto/rand" - "math/big" "os" "path/filepath" "strconv" @@ -14,6 +12,7 @@ import ( "github.com/microsoft/retina/test/e2e/framework/azure" "github.com/microsoft/retina/test/e2e/framework/generic" 
"github.com/microsoft/retina/test/e2e/framework/helpers" + "github.com/microsoft/retina/test/e2e/framework/params" "github.com/microsoft/retina/test/e2e/framework/types" jobs "github.com/microsoft/retina/test/e2e/jobs" "github.com/stretchr/testify/require" @@ -23,25 +22,11 @@ func TestE2ERetina_Scale(t *testing.T) { ctx, cancel := helpers.Context(t) defer cancel() - clusterName := common.ClusterNameForE2ETest(t) - - subID := os.Getenv("AZURE_SUBSCRIPTION_ID") - require.NotEmpty(t, subID) - - location := os.Getenv("AZURE_LOCATION") - if location == "" { - nBig, err := rand.Int(rand.Reader, big.NewInt(int64(len(common.AzureLocations)))) - if err != nil { - t.Fatal("Failed to generate a secure random index", err) - } - location = common.AzureLocations[nBig.Int64()] - } - - rg := os.Getenv("AZURE_RESOURCE_GROUP") - if rg == "" { - // Use the cluster name as the resource group name by default. - rg = clusterName - } + clusterName := common.ScaleTestInfra.GetClusterName(t) + subID := common.ScaleTestInfra.GetSubscriptionID(t) + location := common.ScaleTestInfra.GetLocation(t) + rg := common.ScaleTestInfra.GetResourceGroup(t) + nodesPerPool := common.ScaleTestInfra.GetNodesPerPool(t) cwd, err := os.Getwd() require.NoError(t, err) @@ -56,10 +41,10 @@ func TestE2ERetina_Scale(t *testing.T) { opt := jobs.DefaultScaleTestOptions() opt.KubeconfigPath = kubeConfigFilePath - NumDeployments := os.Getenv("NUM_DEPLOYMENTS") - NumReplicas := os.Getenv("NUM_REPLICAS") - NumNetworkPolicies := os.Getenv("NUM_NETPOLS") - CleanUp := os.Getenv("CLEANUP") + NumDeployments := params.NumDeployments + NumReplicas := params.NumReplicas + NumNetworkPolicies := params.NumNetworkPolicies + CleanUp := params.CleanUp if NumDeployments != "" { opt.NumRealDeployments, err = strconv.Atoi(NumDeployments) @@ -89,9 +74,11 @@ func TestE2ERetina_Scale(t *testing.T) { opt.LabelsToGetMetrics = map[string]string{"k8s-app": "retina"} + createInfra := *common.CreateInfra + // CreateTestInfra - createTestInfra 
:= types.NewRunner(t, jobs.CreateTestInfra(subID, rg, clusterName, location, kubeConfigFilePath, *common.CreateInfra)) - createTestInfra.Run(ctx) + infra := types.NewRunner(t, jobs.GetScaleTestInfra(subID, rg, clusterName, location, kubeConfigFilePath, nodesPerPool, createInfra)) + infra.Run(ctx) t.Cleanup(func() { if *common.DeleteInfra {