test: automate scale test execution
Signed-off-by: Alex Castilio dos Santos <[email protected]>
alexcastilio committed Jan 21, 2025
1 parent b3cd0ec commit c997428
Showing 11 changed files with 354 additions and 56 deletions.
24 changes: 24 additions & 0 deletions .github/workflows/daily-scale-test.yaml
@@ -0,0 +1,24 @@
name: Daily Scale Test

on:
  push:
    branches:
      - alexcastilio/scale-test-workflow
  schedule:
    - cron: "0 0 * * *"

permissions:
  contents: read
  id-token: write

jobs:
  call-scale-test:
    uses: ./.github/workflows/scale-test.yaml
    with:
      num_deployments: 300
      num_replicas: 100
      # TODO: Fix values
      num_netpol: 300
      num_nodes: 300
      cleanup: false
    secrets: inherit
26 changes: 12 additions & 14 deletions .github/workflows/scale-test.yaml
@@ -15,7 +15,7 @@ on:
        description: "Image Namespace (if not set, default namespace will be used)"
        type: string
      image_tag:
        description: "Image Tag (if not set, default for this commit will be used)"
        description: "Image Tag (if not set, latest commit from 'main' will be used)"
        type: string
      num_deployments:
        description: "Number of Traffic Deployments"
@@ -36,25 +36,21 @@ on:

  workflow_call:
    inputs:
      resource_group:
        description: "Azure Resource Group"
        required: true
        type: string
      cluster_name:
        description: "AKS Cluster Name"
        required: true
        type: string
      num_deployments:
        description: "Number of Traffic Deployments"
        default: 1000
        default: 100
        type: number
      num_replicas:
        description: "Number of Traffic Replicas per Deployment"
        default: 40
        default: 10
        type: number
      num_netpol:
        description: "Number of Network Policies"
        default: 1000
        default: 100
        type: number
      num_nodes:
        description: "Number of nodes per pool"
        default: 100
        type: number
      cleanup:
        description: "Clean up environment after test"
@@ -100,8 +96,10 @@ jobs:
          IMAGE_NAMESPACE: ${{ github.repository }}
          TAG: ${{ inputs.image_tag }}
          AZURE_APP_INSIGHTS_KEY: ${{ secrets.AZURE_APP_INSIGHTS_KEY }}
          NODES_PER_POOL: ${{ inputs.num_nodes }}
          CREATE_INFRA: ${{ github.event_name != 'workflow_dispatch' }}
        shell: bash
        run: |
          set -euo pipefail
          [[ $TAG == "" ]] && TAG=$(make version)
          go test -v ./test/e2e/. -timeout 300m -tags=scale -count=1 -args -create-infra=false -delete-infra=false
          [[ $TAG == "" ]] && TAG=$(curl -s https://api.github.com/repos/microsoft/retina/commits | jq -r '.[0].sha' | cut -c1-7)
          go test -v ./test/e2e/. -timeout 300m -tags=scale -count=1 -args -create-infra=$(echo $CREATE_INFRA) -delete-infra=$(echo $CREATE_INFRA)
56 changes: 54 additions & 2 deletions test/e2e/common/common.go
@@ -6,12 +6,12 @@ package common

import (
    "flag"
    "os"
    "os/user"
    "strconv"
    "testing"
    "time"

    "github.com/microsoft/retina/test/e2e/framework/params"
    "github.com/stretchr/testify/require"
)

@@ -30,10 +30,62 @@ var (
    Architectures = []string{"amd64", "arm64"}
    CreateInfra = flag.Bool("create-infra", true, "create a Resource group, vNET and AKS cluster for testing")
    DeleteInfra = flag.Bool("delete-infra", true, "delete a Resource group, vNET and AKS cluster for testing")
    ScaleTestInfra = ScaleTestInfraHandler{
        location: params.Location,
        subscriptionID: params.SubscriptionID,
        resourceGroup: params.ResourceGroup,
        clusterName: params.ClusterName,
        nodesPerPool: params.NodesPerPool,
    }
)

type ScaleTestInfraHandler struct {
    location string
    subscriptionID string
    resourceGroup string
    clusterName string
    nodesPerPool string
}

func (s ScaleTestInfraHandler) GetSubscriptionID(t *testing.T) string {
    require.NotEmpty(t, s.subscriptionID)
    return s.subscriptionID
}

func (s ScaleTestInfraHandler) GetLocation(t *testing.T) string {

Check failure on line 55 in test/e2e/common/common.go (GitHub Actions / Lint, windows/linux, amd64/arm64): unused-parameter: parameter 't' seems to be unused, consider removing or renaming it as _ (revive)
    if s.location == "" {
        return "westus2"
    }
    return s.location
}

func (s ScaleTestInfraHandler) GetResourceGroup(t *testing.T) string {
    if s.resourceGroup != "" {
        return s.resourceGroup
    }
    // Use the cluster name as the resource group name by default.
    return s.GetClusterName(t)
}

func (s ScaleTestInfraHandler) GetNodesPerPool(t *testing.T) int32 {
    if s.nodesPerPool == "" {
        // Default to 100 nodes per pool
        return 100
    }
    nodesPerPool, err := strconv.Atoi(s.nodesPerPool)
    require.NoError(t, err, "NODES_PER_POOL must be an integer")
    return int32(nodesPerPool)

Check failure on line 77 in test/e2e/common/common.go (GitHub Actions / Lint, windows/linux, amd64/arm64): G109: Potential Integer overflow made by strconv.Atoi result conversion to int16/32 (gosec)
Check failure (Code scanning / CodeQL, High, test): Incorrect conversion between integer types: incorrect conversion of an integer with architecture-dependent bit size from strconv.Atoi to a lower bit size type int32 without an upper bound check. (One way to address the revive and gosec/CodeQL findings is sketched after this file's diff.)
}

func (s ScaleTestInfraHandler) GetClusterName(t *testing.T) string {

Check failure on line 80 in test/e2e/common/common.go (GitHub Actions / Lint, windows/linux, amd64/arm64): unused-parameter: parameter 't' seems to be unused, consider removing or renaming it as _ (revive)
    if s.clusterName != "" {
        return s.clusterName
    }
    return "retina-scale-test"
}

func ClusterNameForE2ETest(t *testing.T) string {
    clusterName := os.Getenv("CLUSTER_NAME")
    clusterName := params.ClusterName
    if clusterName == "" {
        curuser, err := user.Current()
        require.NoError(t, err)
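A hedged sketch of how the revive and gosec/CodeQL findings above could be addressed; this is illustrative, not part of the commit, and it assumes the package context (strconv, testing, require imports) and the getter signatures shown in the diff:

    // Illustrative only: rename the unused *testing.T parameter to _ to satisfy revive.
    func (s ScaleTestInfraHandler) GetLocation(_ *testing.T) string {
        if s.location == "" {
            return "westus2"
        }
        return s.location
    }

    // Illustrative only: strconv.ParseInt with an explicit 32-bit size bounds the
    // conversion, which addresses the gosec G109 and CodeQL findings.
    func (s ScaleTestInfraHandler) GetNodesPerPool(t *testing.T) int32 {
        if s.nodesPerPool == "" {
            // Default to 100 nodes per pool
            return 100
        }
        nodesPerPool, err := strconv.ParseInt(s.nodesPerPool, 10, 32)
        require.NoError(t, err, "NODES_PER_POOL must be a 32-bit integer")
        return int32(nodesPerPool)
    }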
29 changes: 23 additions & 6 deletions test/e2e/framework/azure/create-cluster-with-npm.go
@@ -18,11 +18,11 @@ var (
)

const (
    largeClusterTimeout = 30 * time.Minute
    clusterTimeout = 15 * time.Minute
    clusterCreateTicker = 30 * time.Second
    pollFrequency = 5 * time.Second
    AgentARMSKU = "Standard_D4pls_v5"
    AuxilaryNodeCount = 1
    AgentARMSKU = "Standard_D4pls_v6"
)

type CreateNPMCluster struct {
@@ -35,6 +35,7 @@ type CreateNPMCluster struct {
    PodCidr string
    DNSServiceIP string
    ServiceCidr string
    NodesPerPool int32
}

func (c *CreateNPMCluster) Prevalidate() error {
@@ -47,15 +48,21 @@ func (c *CreateNPMCluster) Stop() error {

func (c *CreateNPMCluster) Run() error {
    // Start with default cluster template
    npmCluster := GetStarterClusterTemplate(c.Location)
    npmCluster := GetStarterClusterTemplate(c.Location, c.NodesPerPool)

    npmCluster.Properties.NetworkProfile.NetworkPolicy = to.Ptr(armcontainerservice.NetworkPolicyAzure)
    npmCluster.Properties.NetworkProfile.PodCidr = to.Ptr(c.PodCidr)
    npmCluster.Properties.NetworkProfile.ServiceCidr = to.Ptr(c.ServiceCidr)

    // podSubnetId := fmt.Sprintf("/subscriptions/%s/resourceGroups/%s/providers/Microsoft.Network/virtualNetworks/%s/subnets/%s", c.SubscriptionID, c.ResourceGroupName, c.VnetName, c.SubnetName)

Check failure on line 57 in test/e2e/framework/azure/create-cluster-with-npm.go (GitHub Actions / Lint, windows/linux, amd64/arm64): commentedOutCode: may want to remove commented-out code (gocritic)
    // vnetSubnetId := fmt.Sprintf("/subscriptions/%s/resourceGroups/%s/providers/Microsoft.Network/virtualNetworks/%s/subnets/%s", c.SubscriptionID, c.ResourceGroupName, c.VnetName, c.SubnetName)
    // npmCluster.Properties.AgentPoolProfiles[0].PodSubnetID = to.Ptr(podSubnetId)

    //nolint:appendCombine // separate for verbosity
    npmCluster.Properties.AgentPoolProfiles = append(npmCluster.Properties.AgentPoolProfiles, &armcontainerservice.ManagedClusterAgentPoolProfile{ //nolint:all
        Type: to.Ptr(armcontainerservice.AgentPoolTypeVirtualMachineScaleSets),
        // AvailabilityZones: []*string{to.Ptr("1")},
        Count: to.Ptr[int32](AuxilaryNodeCount),
        Count: to.Ptr[int32](c.NodesPerPool),
        EnableNodePublicIP: to.Ptr(false),
        Mode: to.Ptr(armcontainerservice.AgentPoolModeUser),
        OSType: to.Ptr(armcontainerservice.OSTypeWindows),
@@ -64,6 +71,8 @@ func (c *CreateNPMCluster) Run() error {
        VMSize: to.Ptr(AgentSKU),
        Name: to.Ptr("ws22"),
        MaxPods: to.Ptr(int32(MaxPodsPerNode)),
        // VnetSubnetID: to.Ptr(vnetSubnetId),
        // PodSubnetID: to.Ptr(podSubnetId),
    })

    /* todo: add azlinux node pool
@@ -86,14 +95,15 @@ func (c *CreateNPMCluster) Run() error {
    npmCluster.Properties.AgentPoolProfiles = append(npmCluster.Properties.AgentPoolProfiles, &armcontainerservice.ManagedClusterAgentPoolProfile{ //nolint:all
        Type: to.Ptr(armcontainerservice.AgentPoolTypeVirtualMachineScaleSets),
        // AvailabilityZones: []*string{to.Ptr("1")},
        Count: to.Ptr[int32](AuxilaryNodeCount),
        Count: to.Ptr[int32](c.NodesPerPool),
        EnableNodePublicIP: to.Ptr(false),
        Mode: to.Ptr(armcontainerservice.AgentPoolModeUser),
        OSType: to.Ptr(armcontainerservice.OSTypeLinux),
        ScaleDownMode: to.Ptr(armcontainerservice.ScaleDownModeDelete),
        VMSize: to.Ptr(AgentARMSKU),
        Name: to.Ptr("arm64"),
        MaxPods: to.Ptr(int32(MaxPodsPerNode)),
        // PodSubnetID: to.Ptr(podSubnetId),
    })

    npmCluster.Properties.AutoUpgradeProfile = &armcontainerservice.ManagedClusterAutoUpgradeProfile{
@@ -105,7 +115,14 @@ func (c *CreateNPMCluster) Run() error {
    if err != nil {
        return fmt.Errorf("failed to obtain a credential: %w", err)
    }
    ctx, cancel := context.WithTimeout(context.Background(), clusterTimeout)

    var timeout time.Duration
    if c.NodesPerPool > 20 {
        timeout = largeClusterTimeout
    } else {
        timeout = clusterTimeout
    }
    ctx, cancel := context.WithTimeout(context.Background(), timeout)
    defer cancel()

    clientFactory, err := armcontainerservice.NewClientFactory(c.SubscriptionID, cred, nil)
45 changes: 39 additions & 6 deletions test/e2e/framework/azure/create-cluster.go
@@ -23,6 +23,24 @@ type CreateCluster struct {
    ResourceGroupName string
    Location string
    ClusterName string
    podCidr string
    vmSize string
    networkPluginMode string
    NodesPerPool int32
}

func (c *CreateCluster) SetPodCidr(podCidr string) *CreateCluster {
    c.podCidr = podCidr
    return c
}

func (c *CreateCluster) SetVmSize(vmSize string) *CreateCluster {

Check failure on line 37 in test/e2e/framework/azure/create-cluster.go (GitHub Actions / Lint, windows/linux, amd64/arm64): var-naming: method SetVmSize should be SetVMSize (revive). (A hedged usage sketch follows this file's diff.)
    c.vmSize = vmSize
    return c
}
func (c *CreateCluster) SetNetworkPluginMode(networkPluginMode string) *CreateCluster {
    c.networkPluginMode = networkPluginMode
    return c
}

func (c *CreateCluster) Run() error {
@@ -36,8 +54,22 @@ func (c *CreateCluster) Run() error {
    if err != nil {
        return fmt.Errorf("failed to create client: %w", err)
    }
    if c.NodesPerPool == 0 {
        c.NodesPerPool = MaxNumberOfNodes
    }

    template := GetStarterClusterTemplate(c.Location, c.NodesPerPool)
    if c.podCidr != "" {
        template.Properties.NetworkProfile.PodCidr = to.Ptr(c.podCidr)
    }
    if c.vmSize != "" {
        template.Properties.AgentPoolProfiles[0].VMSize = to.Ptr(c.vmSize)
    }
    if c.networkPluginMode != "" {
        template.Properties.NetworkProfile.NetworkPluginMode = to.Ptr(armcontainerservice.NetworkPluginMode(c.networkPluginMode))
    }

    poller, err := clientFactory.NewManagedClustersClient().BeginCreateOrUpdate(ctx, c.ResourceGroupName, c.ClusterName, GetStarterClusterTemplate(c.Location), nil)
    poller, err := clientFactory.NewManagedClustersClient().BeginCreateOrUpdate(ctx, c.ResourceGroupName, c.ClusterName, template, nil)
    if err != nil {
        return fmt.Errorf("failed to finish the create cluster request: %w", err)
    }
@@ -49,7 +81,7 @@ func (c *CreateCluster) Run() error {
    return nil
}

func GetStarterClusterTemplate(location string) armcontainerservice.ManagedCluster {
func GetStarterClusterTemplate(location string, numOfNodes int32) armcontainerservice.ManagedCluster {
    id := armcontainerservice.ResourceIdentityTypeSystemAssigned
    return armcontainerservice.ManagedCluster{
        Location: to.Ptr(location),
@@ -70,7 +102,7 @@ func GetStarterClusterTemplate(location string) armcontainerservice.ManagedClust
            {
                Type: to.Ptr(armcontainerservice.AgentPoolTypeVirtualMachineScaleSets),
                // AvailabilityZones: []*string{to.Ptr("1")},
                Count: to.Ptr[int32](MaxNumberOfNodes),
                Count: to.Ptr[int32](numOfNodes),
                EnableNodePublicIP: to.Ptr(false),
                Mode: to.Ptr(armcontainerservice.AgentPoolModeSystem),
                OSType: to.Ptr(armcontainerservice.OSTypeLinux),
@@ -86,9 +118,10 @@ func GetStarterClusterTemplate(location string) armcontainerservice.ManagedClust
            EnableRBAC: to.Ptr(true),
            LinuxProfile: nil,
            NetworkProfile: &armcontainerservice.NetworkProfile{
                LoadBalancerSKU: to.Ptr(armcontainerservice.LoadBalancerSKUStandard),
                OutboundType: to.Ptr(armcontainerservice.OutboundTypeLoadBalancer),
                NetworkPlugin: to.Ptr(armcontainerservice.NetworkPluginAzure),
                LoadBalancerSKU: to.Ptr(armcontainerservice.LoadBalancerSKUStandard),
                OutboundType: to.Ptr(armcontainerservice.OutboundTypeLoadBalancer),
                NetworkPlugin: to.Ptr(armcontainerservice.NetworkPluginAzure),
                NetworkPluginMode: to.Ptr(armcontainerservice.NetworkPluginModeOverlay),
            },
            WindowsProfile: &armcontainerservice.ManagedClusterWindowsProfile{
                AdminPassword: to.Ptr("replacePassword1234$"),
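For context, the new setters are meant to be chained on a CreateCluster before Run. A hedged usage sketch follows; the values are illustrative, other required fields are omitted, and SetVMSize is only the rename suggested by the revive finding, not part of this commit:

    // Hypothetical usage of the builder-style configuration added above.
    cluster := (&CreateCluster{
        ClusterName:       "retina-scale-test",
        ResourceGroupName: "retina-scale-test",
        Location:          "westus2",
        NodesPerPool:      100,
    }).
        SetPodCidr("10.244.0.0/16").
        SetVmSize("Standard_D4s_v3").
        SetNetworkPluginMode("overlay")

    if err := cluster.Run(); err != nil {
        // handle cluster creation failure
    }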
2 changes: 1 addition & 1 deletion test/e2e/framework/azure/enable-ama.go
@@ -95,7 +95,7 @@ az aks update --enable-azure-monitor-metrics \
        return fmt.Errorf("failed to write cluster JSON to file for AMA: %w", err)
    }

    poller, err := aksClientFactory.NewManagedClustersClient().BeginCreateOrUpdate(ctx, c.ResourceGroupName, c.ClusterName, GetStarterClusterTemplate(c.Location), nil)
    poller, err := aksClientFactory.NewManagedClustersClient().BeginCreateOrUpdate(ctx, c.ResourceGroupName, c.ClusterName, GetStarterClusterTemplate(c.Location, MaxNumberOfNodes), nil)
    if err != nil {
        return fmt.Errorf("failed to finish the update cluster request for AMA: %w", err)
    }