[RayService] Trim Redis Cleanup Job name to less than 63 chars #2846

Merged
14 changes: 9 additions & 5 deletions ray-operator/controllers/ray/raycluster_controller.go
@@ -1143,14 +1143,15 @@ func (r *RayClusterReconciler) buildWorkerPod(ctx context.Context, instance rayv
 func (r *RayClusterReconciler) buildRedisCleanupJob(ctx context.Context, instance rayv1.RayCluster) batchv1.Job {
 	logger := ctrl.LoggerFrom(ctx)
 
+	// Build the head pod
 	pod := r.buildHeadPod(ctx, instance)
 	pod.Labels[utils.RayNodeTypeLabelKey] = string(rayv1.RedisCleanupNode)
 
 	// Only keep the Ray container in the Redis cleanup Job.
 	pod.Spec.Containers = []corev1.Container{pod.Spec.Containers[utils.RayContainerIndex]}
 	pod.Spec.Containers[utils.RayContainerIndex].Command = []string{"/bin/bash", "-lc", "--"}
 	pod.Spec.Containers[utils.RayContainerIndex].Args = []string{
-		"echo \"To get more information about manually delete the storage namespace in Redis and remove the RayCluster's finalizer, please check https://docs.ray.io/en/master/cluster/kubernetes/user-guides/kuberay-gcs-ft.html for more details.\" && " +
+		"echo \"To get more information about manually deleting the storage namespace in Redis and removing the RayCluster's finalizer, please check https://docs.ray.io/en/master/cluster/kubernetes/user-guides/kuberay-gcs-ft.html for more details.\" && " +
 			"python -c " +
 			"\"from ray._private.gcs_utils import cleanup_redis_storage; " +
 			"from urllib.parse import urlparse; " +
@@ -1180,8 +1181,8 @@ func (r *RayClusterReconciler) buildRedisCleanupJob(ctx context.Context, instanc
Value: "500",
})

// The container's resource consumption remains constant. so hard-coding the resources is acceptable.
// In addition, avoid using the GPU for the Redis cleanup Job.
// The container's resource consumption remains constant. Hard-coding the resources is acceptable.
// Avoid using the GPU for the Redis cleanup Job.
pod.Spec.Containers[utils.RayContainerIndex].Resources = corev1.ResourceRequirements{
Limits: corev1.ResourceList{
corev1.ResourceCPU: resource.MustParse("200m"),
@@ -1196,9 +1197,12 @@ func (r *RayClusterReconciler) buildRedisCleanupJob(ctx context.Context, instanc
 	// For Kubernetes Job, the valid values for Pod's `RestartPolicy` are `Never` and `OnFailure`.
 	pod.Spec.RestartPolicy = corev1.RestartPolicyNever
 
+	// Trim the job name to ensure it is within the 63-character limit.
+	jobName := utils.TrimJobName(fmt.Sprintf("%s-%s", instance.Name, "redis-cleanup"))
+
Collaborator: note for future: the job name is only relevant when RayClusters are being deleted, so changing the name here is not a breaking change for existing KubeRay deployments since no existing RayCluster should rely on this name
 	redisCleanupJob := batchv1.Job{
 		ObjectMeta: metav1.ObjectMeta{
-			Name: fmt.Sprintf("%s-%s", instance.Name, "redis-cleanup"),
+			Name: jobName,
 			Namespace: instance.Namespace,
 			Labels: pod.Labels,
 			Annotations: pod.Annotations,
@@ -1209,7 +1213,7 @@ func (r *RayClusterReconciler) buildRedisCleanupJob(ctx context.Context, instanc
 			ObjectMeta: pod.ObjectMeta,
 			Spec: pod.Spec,
 		},
-		// make this job be best-effort only for 5 minutes.
+		// Make this job best-effort only for 5 minutes.
 		ActiveDeadlineSeconds: ptr.To[int64](300),
 	},
 }
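For a concrete sense of the bug being fixed: the Job name is reused in places with a 63-character cap (notably label values such as the job-name label Kubernetes derives from it), and the composed name `<cluster-name>-redis-cleanup` can overshoot that cap for long RayCluster names. A minimal Go sketch of the arithmetic, where trimJobName is a hypothetical stand-in for utils.TrimJobName/CheckLabel — the merged helper's exact truncation strategy is not visible in this diff:

```go
package main

import (
	"fmt"
	"strings"
)

// maxLabelLength mirrors the 63-character cap on Kubernetes label values,
// which is why the cleanup Job name must be trimmed.
const maxLabelLength = 63

// trimJobName is a hypothetical stand-in for utils.TrimJobName: this sketch
// keeps the trailing characters and strips leading punctuation, loosely
// following the CheckLabel doc comment shown in the util.go diff below.
func trimJobName(name string) string {
	if len(name) > maxLabelLength {
		name = name[len(name)-maxLabelLength:]
	}
	return strings.TrimLeft(name, "-._")
}

func main() {
	cluster := "raycluster-with-a-very-long-name-exceeding-k8s-limit" // 52 chars
	jobName := fmt.Sprintf("%s-%s", cluster, "redis-cleanup")
	fmt.Println(len(jobName), jobName)     // 66 chars: over the 63-char limit
	fmt.Println(len(trimJobName(jobName))) // at most 63 after trimming
}
```

Keeping the tail rather than the head preserves the distinctive "-redis-cleanup" suffix in the trimmed name; whether the merged helper keeps head or tail is an assumption here.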
5 changes: 5 additions & 0 deletions ray-operator/controllers/ray/utils/util.go
@@ -205,6 +205,11 @@ func CheckName(s string) string {
 	return s
 }
 
+// TrimJobName uses CheckLabel to trim the Kubernetes Job name to meet the label length constraint
+func TrimJobName(jobName string) string {
+	return CheckLabel(jobName)
+}
+

Collaborator: In a follow-up PR, I would create a private function trimString and then call trimString from both TrimJobName and CheckLabels

 // CheckLabel makes sure the label value does not start with a punctuation and the total length is < 63 char
 func CheckLabel(s string) string {
 	maxLenght := 63
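As a sketch of the follow-up the reviewer proposes above, a private trimString helper could hold the truncation logic shared by TrimJobName and CheckLabel. This is an assumption about a future refactor, not code from this PR, and the tail-keeping strategy is likewise assumed:

```go
package utils

// trimString is the hypothetical private helper from the review comment
// above; both TrimJobName and CheckLabel could delegate to it.
func trimString(s string, maxLength int) string {
	if len(s) > maxLength {
		// Keep the trailing characters so suffixes like "-redis-cleanup"
		// stay recognizable. (Assumed strategy; not confirmed by this diff.)
		s = s[len(s)-maxLength:]
	}
	return s
}

// TrimJobName would then become a thin wrapper around the shared helper.
func TrimJobName(jobName string) string {
	return trimString(jobName, 63)
}
```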
14 changes: 14 additions & 0 deletions ray-operator/test/e2e/raycluster_gcsft_test.go
@@ -74,6 +74,20 @@ func TestGcsFaultToleranceOptions(t *testing.T) {
 		},
 		createSecret: true,
 	},
+	{
+		name: "Long RayCluster Name",
Collaborator: @rueian @aviadshimoni do you know if this e2e test actually exercises the clean-up job? I guess it does because at the end it waits for RayCluster deletion to succeed?

Contributor: Yes, it does. defer g.Eventually(checkRedisDBSize, time.Second*30, time.Second).Should(BeEquivalentTo("0")) makes sure the redis is cleaned up.

Collaborator: thanks!
+		redisPassword: "",
+		rayClusterFn: func(namespace string) *rayv1ac.RayClusterApplyConfiguration {
+			// Intentionally using a long name to test job name trimming
+			return rayv1ac.RayCluster("raycluster-with-a-very-long-name-exceeding-k8s-limit", namespace).WithSpec(
+				newRayClusterSpec().WithGcsFaultToleranceOptions(
+					rayv1ac.GcsFaultToleranceOptions().
+						WithRedisAddress("redis:6379"),
+				),
+			)
+		},
+		createSecret: false,
+	},
 	}
 
 	for _, tc := range testCases {
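The e2e case above exercises trimming only indirectly, through cluster deletion and the Redis-size check quoted in the review thread. A cheaper unit-level check of the same property might look like the following hypothetical test, with the import path inferred from the file paths shown in this PR:

```go
package utils_test

import (
	"fmt"
	"testing"

	"github.com/ray-project/kuberay/ray-operator/controllers/ray/utils"
)

// Verifies the property the new e2e case relies on: however long the
// RayCluster name, the derived cleanup Job name fits the 63-character limit.
func TestTrimJobNameLength(t *testing.T) {
	cluster := "raycluster-with-a-very-long-name-exceeding-k8s-limit"
	name := utils.TrimJobName(fmt.Sprintf("%s-%s", cluster, "redis-cleanup"))
	if len(name) > 63 {
		t.Fatalf("job name %q is %d characters, exceeding the 63-character limit", name, len(name))
	}
}
```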