diff --git a/.github/workflows/daily-scale-test.yaml b/.github/workflows/daily-scale-test.yaml
new file mode 100644
index 0000000000..397503c041
--- /dev/null
+++ b/.github/workflows/daily-scale-test.yaml
@@ -0,0 +1,24 @@
+name: Daily Scale Test
+
+on:
+  push:
+    branches:
+      - alexcastilio/scale-test-workflow
+  schedule:
+    - cron: "0 0 * * *"
+
+permissions:
+  contents: read
+  id-token: write
+
+jobs:
+  call-scale-test:
+    uses: ./.github/workflows/scale-test.yaml
+    with:
+      num_deployments: 1000
+      num_replicas: 20
+      # TODO: Fix values
+      num_netpol: 0
+      num_nodes: 1000
+      cleanup: false
+    secrets: inherit
diff --git a/.github/workflows/scale-test.yaml b/.github/workflows/scale-test.yaml
index 75367fff48..aa31586d97 100644
--- a/.github/workflows/scale-test.yaml
+++ b/.github/workflows/scale-test.yaml
@@ -15,7 +15,7 @@ on:
         description: "Image Namespace (if not set, default namespace will be used)"
         type: string
       image_tag:
-        description: "Image Tag (if not set, default for this commit will be used)"
+        description: "Image Tag (if not set, latest commit from 'main' will be used)"
         type: string
       num_deployments:
         description: "Number of Traffic Deployments"
@@ -36,25 +36,21 @@ on:
 
   workflow_call:
     inputs:
-      resource_group:
-        description: "Azure Resource Group"
-        required: true
-        type: string
-      cluster_name:
-        description: "AKS Cluster Name"
-        required: true
-        type: string
       num_deployments:
         description: "Number of Traffic Deployments"
-        default: 1000
+        default: 100
         type: number
       num_replicas:
         description: "Number of Traffic Replicas per Deployment"
-        default: 40
+        default: 10
         type: number
       num_netpol:
         description: "Number of Network Policies"
-        default: 1000
+        default: 100
+        type: number
+      num_nodes:
+        description: "Number of nodes per pool"
+        default: 100
         type: number
       cleanup:
         description: "Clean up environment after test"
@@ -100,8 +96,10 @@ jobs:
           IMAGE_NAMESPACE: ${{ inputs.image_namespace == '' && github.repository || inputs.image_namespace }}
           TAG: ${{ inputs.image_tag }}
           AZURE_APP_INSIGHTS_KEY: ${{ secrets.AZURE_APP_INSIGHTS_KEY }}
+          NODES: ${{ inputs.num_nodes }}
+          CREATE_INFRA: ${{ github.event_name != 'workflow_dispatch' }}
         shell: bash
         run: |
           set -euo pipefail
-          [[ $TAG == "" ]] && TAG=$(make version)
-          go test -v ./test/e2e/. -timeout 300m -tags=scale -count=1 -args -create-infra=false -delete-infra=false
+          [[ $TAG == "" ]] && TAG=$(curl -s https://api.github.com/repos/microsoft/retina/commits | jq -r '.[0].sha' | cut -c1-7)
+          go test -v ./test/e2e/. -timeout 300m -tags=scale -count=1 -args -create-infra=$(echo $CREATE_INFRA) -delete-infra=$(echo $CREATE_INFRA)
diff --git a/test/e2e/common/common.go b/test/e2e/common/common.go
index 1d4f5b00a2..19a04172b0 100644
--- a/test/e2e/common/common.go
+++ b/test/e2e/common/common.go
@@ -6,13 +6,13 @@ package common
 
 import (
 	"flag"
-	"os"
 	"os/user"
 	"path/filepath"
 	"strconv"
 	"testing"
 	"time"
 
+	"github.com/microsoft/retina/test/e2e/framework/params"
 	"github.com/stretchr/testify/require"
 )
 
@@ -31,6 +31,13 @@ var (
 	Architectures  = []string{"amd64", "arm64"}
 	CreateInfra    = flag.Bool("create-infra", true, "create a Resource group, vNET and AKS cluster for testing")
 	DeleteInfra    = flag.Bool("delete-infra", true, "delete a Resource group, vNET and AKS cluster for testing")
+	ScaleTestInfra = ScaleTestInfraHandler{
+		location:       params.Location,
+		subscriptionID: params.SubscriptionID,
+		resourceGroup:  params.ResourceGroup,
+		clusterName:    params.ClusterName,
+		nodes:          params.Nodes,
+	}
 
 	// kubeconfig: path to kubeconfig file, in not provided,
 	// a new k8s cluster will be created
@@ -49,8 +56,50 @@ var (
 	}
 )
 
+type ScaleTestInfraHandler struct {
+	location       string
+	subscriptionID string
+	resourceGroup  string
+	clusterName    string
+	nodes          string
+}
+
+func (s ScaleTestInfraHandler) GetSubscriptionID() string {
+	return s.subscriptionID
+}
+
+func (s ScaleTestInfraHandler) GetLocation() string {
+	if s.location == "" {
+		return "westus2"
+	}
+	return s.location
+}
+
+func (s ScaleTestInfraHandler) GetResourceGroup() string {
+	if s.resourceGroup != "" {
+		return s.resourceGroup
+	}
+	// Use the cluster name as the resource group name by default.
+	return s.GetClusterName()
+}
+
+func (s ScaleTestInfraHandler) GetNodes() string {
+	if s.nodes == "" {
+		// Default to 100 nodes per pool
+		return "100"
+	}
+	return s.nodes
+}
+
+func (s ScaleTestInfraHandler) GetClusterName() string {
+	if s.clusterName != "" {
+		return s.clusterName
+	}
+	return "retina-scale-test"
+}
+
 func ClusterNameForE2ETest(t *testing.T) string {
-	clusterName := os.Getenv("CLUSTER_NAME")
+	clusterName := params.ClusterName
 	if clusterName == "" {
 		curuser, err := user.Current()
 		require.NoError(t, err)
diff --git a/test/e2e/framework/azure/create-cluster.go b/test/e2e/framework/azure/create-cluster.go
index 160d81fa86..161c2ae06c 100644
--- a/test/e2e/framework/azure/create-cluster.go
+++ b/test/e2e/framework/azure/create-cluster.go
@@ -3,6 +3,7 @@ package azure
 import (
 	"context"
 	"fmt"
+	"log"
 	"time"
 
 	"github.com/Azure/azure-sdk-for-go/sdk/azcore/to"
@@ -23,6 +24,24 @@ type CreateCluster struct {
 	ResourceGroupName string
 	Location          string
 	ClusterName       string
+	podCidr           string
+	vmSize            string
+	networkPluginMode string
+	Nodes             int32
+}
+
+func (c *CreateCluster) SetPodCidr(podCidr string) *CreateCluster {
+	c.podCidr = podCidr
+	return c
+}
+
+func (c *CreateCluster) SetVMSize(vmSize string) *CreateCluster {
+	c.vmSize = vmSize
+	return c
+}
+func (c *CreateCluster) SetNetworkPluginMode(networkPluginMode string) *CreateCluster {
+	c.networkPluginMode = networkPluginMode
+	return c
 }
 
 func (c *CreateCluster) Run() error {
@@ -36,8 +55,30 @@ func (c *CreateCluster) Run() error {
 	if err != nil {
 		return fmt.Errorf("failed to create client: %w", err)
 	}
+	if c.Nodes == 0 {
+		c.Nodes = MaxNumberOfNodes
+	}
+
+	template := GetStarterClusterTemplate(c.Location)
+
+	if c.Nodes > 0 {
+		template.Properties.AgentPoolProfiles[0].Count = to.Ptr(c.Nodes)
+	}
+
+	if c.podCidr != "" {
+		template.Properties.NetworkProfile.PodCidr = to.Ptr(c.podCidr)
+	}
+
+	if c.vmSize != "" {
!= "" { + template.Properties.AgentPoolProfiles[0].VMSize = to.Ptr(c.vmSize) + } + + if c.networkPluginMode != "" { + template.Properties.NetworkProfile.NetworkPluginMode = to.Ptr(armcontainerservice.NetworkPluginMode(c.networkPluginMode)) + } - poller, err := clientFactory.NewManagedClustersClient().BeginCreateOrUpdate(ctx, c.ResourceGroupName, c.ClusterName, GetStarterClusterTemplate(c.Location), nil) + log.Printf("creating cluster %s in location %s...", c.ClusterName, c.Location) + poller, err := clientFactory.NewManagedClustersClient().BeginCreateOrUpdate(ctx, c.ResourceGroupName, c.ClusterName, template, nil) if err != nil { return fmt.Errorf("failed to finish the create cluster request: %w", err) } @@ -45,6 +86,7 @@ func (c *CreateCluster) Run() error { if err != nil { return fmt.Errorf("failed to pull the create cluster result: %w", err) } + log.Printf("cluster created %s in location %s...", c.ClusterName, c.Location) return nil } diff --git a/test/e2e/framework/kubernetes/create-kapinger-deployment.go b/test/e2e/framework/kubernetes/create-kapinger-deployment.go index a895625e32..7e25320d1b 100644 --- a/test/e2e/framework/kubernetes/create-kapinger-deployment.go +++ b/test/e2e/framework/kubernetes/create-kapinger-deployment.go @@ -132,7 +132,7 @@ func (c *CreateKapingerDeployment) GetKapingerDeployment() *appsv1.Deployment { Containers: []v1.Container{ { Name: "kapinger", - Image: "acnpublic.azurecr.io/kapinger:20241014.7", + Image: "acnpublic.azurecr.io/kapinger:v0.0.23-9-g23ef222", Resources: v1.ResourceRequirements{ Requests: v1.ResourceList{ "memory": resource.MustParse("20Mi"), diff --git a/test/e2e/framework/kubernetes/label-nodes.go b/test/e2e/framework/kubernetes/label-nodes.go new file mode 100644 index 0000000000..078c630457 --- /dev/null +++ b/test/e2e/framework/kubernetes/label-nodes.go @@ -0,0 +1,76 @@ +package kubernetes + +import ( + "context" + "encoding/json" + "fmt" + "log" + "time" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/tools/clientcmd" +) + +type patchStringValue struct { + Op string `json:"op"` + Path string `json:"path"` + Value string `json:"value"` +} + +type LabelNodes struct { + KubeConfigFilePath string + Labels map[string]string +} + +func (l *LabelNodes) Prevalidate() error { + return nil +} + +func (l *LabelNodes) Run() error { + config, err := clientcmd.BuildConfigFromFlags("", l.KubeConfigFilePath) + if err != nil { + return fmt.Errorf("error building kubeconfig: %w", err) + } + + clientset, err := kubernetes.NewForConfig(config) + if err != nil { + return fmt.Errorf("error creating Kubernetes client: %w", err) + } + + ctx, cancel := context.WithTimeout(context.Background(), defaultTimeoutSeconds*time.Second) + defer cancel() + + nodes, err := clientset.CoreV1().Nodes().List(ctx, metav1.ListOptions{}) + if err != nil { + return fmt.Errorf("failed to get nodes: %w", err) + } + + patch := []patchStringValue{} + for k, v := range l.Labels { + patch = append(patch, patchStringValue{ + Op: "add", + Path: "/metadata/labels/" + k, + Value: v, + }) + } + b, err := json.Marshal(patch) + if err != nil { + return fmt.Errorf("failed to marshal patch: %w", err) + } + + for i := range nodes.Items { + log.Println("Labeling node", nodes.Items[i].Name) + _, err = clientset.CoreV1().Nodes().Patch(ctx, nodes.Items[i].Name, types.JSONPatchType, b, metav1.PatchOptions{}) + if err != nil { + return fmt.Errorf("failed to patch pod: %w", err) + } + } + + return nil +} + +func (l 
+	return nil
+}
diff --git a/test/e2e/framework/params/params.go b/test/e2e/framework/params/params.go
new file mode 100644
index 0000000000..233944ae3f
--- /dev/null
+++ b/test/e2e/framework/params/params.go
@@ -0,0 +1,17 @@
+package params
+
+import (
+	"os"
+)
+
+var (
+	Location           = os.Getenv("LOCATION")
+	SubscriptionID     = os.Getenv("AZURE_SUBSCRIPTION_ID")
+	ResourceGroup      = os.Getenv("AZURE_RESOURCE_GROUP")
+	ClusterName        = os.Getenv("CLUSTER_NAME")
+	Nodes              = os.Getenv("NODES")
+	NumDeployments     = os.Getenv("NUM_DEPLOYMENTS")
+	NumReplicas        = os.Getenv("NUM_REPLICAS")
+	NumNetworkPolicies = os.Getenv("NUM_NET_POL")
+	CleanUp            = os.Getenv("CLEANUP")
+)
diff --git a/test/e2e/jobs/scale.go b/test/e2e/jobs/scale.go
index 58b5d49864..aef257e155 100644
--- a/test/e2e/jobs/scale.go
+++ b/test/e2e/jobs/scale.go
@@ -5,6 +5,8 @@ import (
 	"time"
 
 	"github.com/microsoft/retina/test/e2e/common"
+	"github.com/microsoft/retina/test/e2e/framework/azure"
+	"github.com/microsoft/retina/test/e2e/framework/generic"
 	"github.com/microsoft/retina/test/e2e/framework/kubernetes"
 	"github.com/microsoft/retina/test/e2e/framework/scaletest"
 	"github.com/microsoft/retina/test/e2e/framework/types"
@@ -45,6 +47,51 @@ func DefaultScaleTestOptions() scaletest.Options {
 	}
 }
 
+func GetScaleTestInfra(subID, rg, clusterName, location, kubeConfigFilePath string, nodes int32, createInfra bool) *types.Job {
+	job := types.NewJob("Get scale test infrastructure")
+
+	if createInfra {
+		job.AddStep(&azure.CreateResourceGroup{
+			SubscriptionID:    subID,
+			ResourceGroupName: rg,
+			Location:          location,
+		}, nil)
+
+		job.AddStep((&azure.CreateCluster{
+			ClusterName: clusterName,
+			Nodes:       nodes,
+		}).
+			SetPodCidr("100.64.0.0/10").
+			SetVMSize("Standard_D4_v3").
+			SetNetworkPluginMode("overlay"), nil)
+
+		job.AddStep(&azure.GetAKSKubeConfig{
+			KubeConfigFilePath: kubeConfigFilePath,
+		}, nil)
+
+	} else {
+		job.AddStep(&azure.GetAKSKubeConfig{
+			KubeConfigFilePath: kubeConfigFilePath,
+			ClusterName:        clusterName,
+			SubscriptionID:     subID,
+			ResourceGroupName:  rg,
+			Location:           location,
+		}, nil)
+	}
+
+	job.AddStep(&kubernetes.LabelNodes{
+		Labels: map[string]string{"scale-test": "true"},
+	}, nil)
+
+	job.AddStep(&generic.LoadFlags{
+		TagEnv:            generic.DefaultTagEnv,
+		ImageNamespaceEnv: generic.DefaultImageNamespace,
+		ImageRegistryEnv:  generic.DefaultImageRegistry,
+	}, nil)
+
+	return job
+}
+
 func ScaleTest(opt *scaletest.Options) *types.Job {
 	job := types.NewJob("Scale Test")
 
diff --git a/test/e2e/scale_test.go b/test/e2e/scale_test.go
index c5af899b1a..cddefa729d 100644
--- a/test/e2e/scale_test.go
+++ b/test/e2e/scale_test.go
@@ -3,8 +3,6 @@
 package retina
 
 import (
-	"crypto/rand"
-	"math/big"
 	"os"
 	"path/filepath"
 	"strconv"
@@ -14,6 +12,7 @@ import (
 	"github.com/microsoft/retina/test/e2e/framework/azure"
 	"github.com/microsoft/retina/test/e2e/framework/generic"
 	"github.com/microsoft/retina/test/e2e/framework/helpers"
+	"github.com/microsoft/retina/test/e2e/framework/params"
 	"github.com/microsoft/retina/test/e2e/framework/types"
 	jobs "github.com/microsoft/retina/test/e2e/jobs"
 	"github.com/stretchr/testify/require"
 )
@@ -23,25 +22,13 @@ func TestE2ERetina_Scale(t *testing.T) {
 	ctx, cancel := helpers.Context(t)
 	defer cancel()
 
-	clusterName := common.ClusterNameForE2ETest(t)
-
-	subID := os.Getenv("AZURE_SUBSCRIPTION_ID")
+	clusterName := common.ScaleTestInfra.GetClusterName()
+	subID := common.ScaleTestInfra.GetSubscriptionID()
 	require.NotEmpty(t, subID)
-
-	location := os.Getenv("AZURE_LOCATION")
-	if location == "" {
-		nBig, err := rand.Int(rand.Reader, big.NewInt(int64(len(common.AzureLocations))))
-		if err != nil {
-			t.Fatal("Failed to generate a secure random index", err)
-		}
-		location = common.AzureLocations[nBig.Int64()]
-	}
-
-	rg := os.Getenv("AZURE_RESOURCE_GROUP")
-	if rg == "" {
-		// Use the cluster name as the resource group name by default.
-		rg = clusterName
-	}
+	location := common.ScaleTestInfra.GetLocation()
+	rg := common.ScaleTestInfra.GetResourceGroup()
+	nodes, err := strconv.ParseInt(common.ScaleTestInfra.GetNodes(), 10, 32)
+	require.NoError(t, err, "NODES must be an integer within int32 range")
 
 	cwd, err := os.Getwd()
 	require.NoError(t, err)
@@ -56,10 +43,10 @@ func TestE2ERetina_Scale(t *testing.T) {
 	opt := jobs.DefaultScaleTestOptions()
 	opt.KubeconfigPath = common.KubeConfigFilePath(rootDir)
 
-	NumDeployments := os.Getenv("NUM_DEPLOYMENTS")
-	NumReplicas := os.Getenv("NUM_REPLICAS")
-	NumNetworkPolicies := os.Getenv("NUM_NETPOLS")
-	CleanUp := os.Getenv("CLEANUP")
+	NumDeployments := params.NumDeployments
+	NumReplicas := params.NumReplicas
+	NumNetworkPolicies := params.NumNetworkPolicies
+	CleanUp := params.CleanUp
 
 	if NumDeployments != "" {
 		opt.NumRealDeployments, err = strconv.Atoi(NumDeployments)
@@ -89,9 +76,11 @@ func TestE2ERetina_Scale(t *testing.T) {
 	opt.LabelsToGetMetrics = map[string]string{"k8s-app": "retina"}
 
+	createInfra := *common.CreateInfra
+
 	// CreateTestInfra
-	createTestInfra := types.NewRunner(t, jobs.CreateTestInfra(subID, rg, clusterName, location, common.KubeConfigFilePath(rootDir), *common.CreateInfra))
-	createTestInfra.Run(ctx)
+	infra := types.NewRunner(t, jobs.GetScaleTestInfra(subID, rg, clusterName, location, common.KubeConfigFilePath(rootDir), int32(nodes), createInfra))
+	infra.Run(ctx)
 
 	t.Cleanup(func() {
 		_ = jobs.DeleteTestInfra(subID, rg, location, *common.DeleteInfra).Run()