# axlearn-job.yml
---
# Kubernetes batch Job that runs test-axlearn.sh over the AXLearn
# `common/*_test.py` files in a single pod, then copies the summary and log
# files to the volume mounted at /jax-toolbox-eks-output.
#
# NOTE(review): every PLACEHOLDER value (job name, image, RUN_ID, pull secret)
# is presumably substituted by the calling workflow before this manifest is
# applied -- confirm against the caller.
apiVersion: batch/v1
kind: Job
metadata:
  name: PLACEHOLDER
  labels:
    # Kueue queue-name label: submits this Job to the queue named p5-queue.
    kueue.x-k8s.io/queue-name: p5-queue
spec:
  # Exactly one pod, run once.
  completions: 1
  parallelism: 1
  template:
    spec:
      # Failed containers are not restarted in place.
      restartPolicy: Never
      containers:
        - name: axlearn
          image: PLACEHOLDER
          # `-x` traces each command; `-o pipefail` makes a pipeline fail if
          # any stage fails.
          # NOTE(review): `-e` is not set, so the copy steps below still run
          # after a failing test run and the container's exit status is that
          # of the final `cp`, not of the tests -- this looks intentional
          # (results are always uploaded); confirm.
          command:
            - bash
            - -xo
            - pipefail
            - -c
            - |
              test-axlearn.sh \
                --directory "." \
                --output "/opt/output/" \
                --test-files "/opt/axlearn/axlearn/common/*_test.py"
              # flush pending writes; `wait` returns immediately here because
              # this shell starts no background jobs
              sync
              wait
              # copy results to the mounted s3 bucket
              mkdir -p "/jax-toolbox-eks-output/axlearn/${RUN_ID}"
              cp /opt/output/summary.txt "/jax-toolbox-eks-output/axlearn/${RUN_ID}/summary.txt"
              # copy all the log files
              cp /opt/output/*.log "/jax-toolbox-eks-output/axlearn/${RUN_ID}/."
          env:
            # Names the per-run results sub-directory used by the script.
            - name: RUN_ID
              value: PLACEHOLDER
          resources:
            limits:
              nvidia.com/gpu: 8
          volumeMounts:
            # Scratch directory the test script writes its output to.
            - name: output
              mountPath: /opt/output
            # Destination for the copied summary and log files.
            - name: s3-storage
              mountPath: /jax-toolbox-eks-output
      imagePullSecrets:
        - name: PLACEHOLDER
      volumes:
        # Pod-local scratch space; removed together with the pod.
        - name: output
          emptyDir: {}
        # Storage backing /jax-toolbox-eks-output, bound via claim s3-pvc.
        - name: s3-storage
          persistentVolumeClaim:
            claimName: s3-pvc