Skip to content

Commit 395a799

Browse files
authored
fix: resiliency and logging for network benchmark tests (#8)
1 parent 6f29c97 commit 395a799

File tree

4 files changed

+54
-15
lines changed

4 files changed

+54
-15
lines changed

network/benchmarks/netperf/Makefile

+1-1
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
all: docker push launch runtests
1616

1717
repo_owner := $(shell echo $(REPO_OWNER) | tr '[:upper:]' '[:lower:]')
18-
dockerrepo := $(if $(repo_owner), ghcr.io/$(repo_owner)/nptest, girishkalele/netperf-latest)
18+
dockerrepo := $(if $(repo_owner),ghcr.io/$(repo_owner)/nptest,girishkalele/netperf-latest)
1919
image_tag := $(or $(IMAGE_TAG), latest)
2020

2121
docker: test

network/benchmarks/netperf/lib/outputlib.go

+10-1
Original file line numberDiff line numberDiff line change
@@ -8,14 +8,23 @@ import (
88
"time"
99

1010
api "k8s.io/api/core/v1"
11+
"k8s.io/apimachinery/pkg/util/wait"
1112
"k8s.io/client-go/kubernetes"
1213
"k8s.io/client-go/util/retry"
1314
)
1415

1516
func getLogsFromPod(c *kubernetes.Clientset, podName, testNamespace string) (*string, error) {
1617
var logData *string
1718

18-
err := retry.OnError(retry.DefaultBackoff, func(err error) bool {
19+
// Retry fetching logs from the pod: we poll at intervals and there
20+
// may be intermittent network issues, so a long retry time is
21+
// acceptable.
22+
err := retry.OnError(wait.Backoff{
23+
Steps: 5,
24+
Duration: 2 * time.Second,
25+
Factor: 2.0,
26+
Jitter: 100,
27+
}, func(err error) bool {
1928
return true
2029
}, func() error {
2130
body, err := c.CoreV1().Pods(testNamespace).GetLogs(podName, &api.PodLogOptions{}).DoRaw(context.Background())

network/benchmarks/netperf/lib/utilslib.go

+41-13
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@ package lib
33
import (
44
"context"
55
"fmt"
6-
"strings"
76
"time"
87

98
api "k8s.io/api/core/v1"
@@ -242,7 +241,10 @@ func executeTests(c *kubernetes.Clientset, testParams TestParams, primaryNode, s
242241
}
243242
fmt.Println("Waiting for netperf pods to start up")
244243

245-
orchestratorPodName := getOrchestratorPodName(c, testParams.TestNamespace)
244+
orchestratorPodName, err := getOrchestratorPodName(c, testParams.TestNamespace, 3*time.Minute)
245+
if err != nil {
246+
return nil, fmt.Errorf("failed to get orchestrator pod name: %v", err)
247+
}
246248
fmt.Println("Orchestrator Pod is", orchestratorPodName)
247249

248250
var jsonFilePath string
@@ -290,19 +292,45 @@ func executeTests(c *kubernetes.Clientset, testParams TestParams, primaryNode, s
290292
return results, nil
291293
}
292294

293-
func getOrchestratorPodName(c *kubernetes.Clientset, testNamespace string) string {
295+
func getOrchestratorPodName(c *kubernetes.Clientset, testNamespace string, timeout time.Duration) (string, error) {
296+
timeoutCh := time.After(timeout)
297+
ticker := time.NewTicker(5 * time.Second)
298+
defer ticker.Stop()
299+
294300
for {
295-
fmt.Println("Waiting for orchestrator pod creation")
296-
time.Sleep(60 * time.Second)
297-
pods, err := c.CoreV1().Pods(testNamespace).List(context.Background(), everythingSelector)
298-
if err != nil {
299-
fmt.Println("Failed to fetch pods - waiting for pod creation", err)
300-
continue
301-
}
302-
for _, pod := range pods.Items {
303-
if strings.Contains(pod.GetName(), "netperf-orch-") {
304-
return pod.GetName()
301+
select {
302+
case <-ticker.C:
303+
fmt.Println("Waiting for orchestrator pod creation")
304+
pods, err := c.CoreV1().Pods(testNamespace).List(context.Background(), metav1.ListOptions{
305+
LabelSelector: "app=netperf-orch",
306+
})
307+
if err != nil {
308+
fmt.Println("Failed to fetch pods - waiting for pod creation", err)
309+
continue
310+
}
311+
if len(pods.Items) == 0 {
312+
fmt.Println("No orchestrator pods found yet")
313+
continue
314+
}
315+
316+
pod := pods.Items[0]
317+
podStatus := pod.Status
318+
319+
if podStatus.Phase == api.PodRunning {
320+
return pod.GetName(), nil
321+
}
322+
323+
for _, containerStatus := range podStatus.ContainerStatuses {
324+
if waiting := containerStatus.State.Waiting; waiting != nil {
325+
switch waiting.Reason {
326+
case "ErrImagePull", "CrashLoopBackOff", "ImagePullBackOff":
327+
return "", fmt.Errorf("orchestrator pod error: %s - %v", waiting.Reason, waiting.Message)
328+
}
329+
}
305330
}
331+
fmt.Println("Orchestrator pod is not running yet")
332+
case <-timeoutCh:
333+
return "", fmt.Errorf("timed out waiting for orchestrator pod to be created")
306334
}
307335
}
308336
}

network/benchmarks/netperf/nptest/Dockerfile

+2
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,8 @@ RUN go build -o nptests
3131
FROM debian:bullseye
3232
ENV LD_LIBRARY_PATH=/usr/local/lib
3333

34+
LABEL org.opencontainers.image.description "Network performance tests in k8s engine"
35+
3436
# install binary and remove cache
3537
RUN apt-get update \
3638
&& apt-get install -y curl wget net-tools gcc make libsctp-dev git autotools-dev automake \

0 commit comments

Comments
 (0)