Skip to content

Commit 5e5ef25

Browse files
committed
Merge branch 'emma/add_draco_oci_ci' into 'main'
Add draco oci ci. See merge request dl/hugectr/hugectr!1525
2 parents e5f021f + e7637f6 commit 5e5ef25

File tree

7 files changed

+568
-109
lines changed

7 files changed

+568
-109
lines changed

.gitlab-ci.yml

+51-93
Original file line number | Diff line number | Diff line change
@@ -221,15 +221,15 @@ build_pytorch_hps_trt_plugin:
221221
variables:
222222
FROM_IMAGE: ${IMAGE_PYTORCH}
223223
DST_IMAGE: $PYTORCH_TRT_IMAGE_VERSIONED
224-
BUILD_TORCH_PLUGIN: 1
224+
BUILD_TORCH_PLUGIN: 1
225225
BUILD_TRT_PLUGIN: 1
226226
TRT_CMAKE_OPTION: "-DSM=\"70;75;80;90\""
227227
#BUILD_HPS_BACKEND: 1
228228
#HUGECTR_BACKEND_VER: main
229229
#TRITON_BRANCH: r22.11
230230

231-
# Check Selene busy or not
232-
check_selene_status:
231+
# Check cluster busy or not
232+
check_cluster_status:
233233
extends: .trigger:rules:selene
234234
stage: pre_test
235235
tags:
@@ -239,16 +239,16 @@ check_selene_status:
239239
- docker login -u ${CI_PRIVATE_USER} -p "${CI_PRIVATE_KEY}" "${CI_REGISTRY}"
240240
- docker pull ${CONT}
241241
- RC=0
242-
- docker run -d --rm --name selene_idle_${CI_PIPELINE_ID} ${EXTRA_DOCKER_RUN_ARGS} ${CONT} sleep infinity
243-
- docker exec selene_idle_${CI_PIPELINE_ID} bash -cx "python get_selene_runner_status.py --quota ${SELENE_QUEUE_QUOTA} --token \"${CLUSTER_TOKEN}\" " || RC=$?
242+
- docker run -d --rm --name cluster_idle_${CI_PIPELINE_ID} ${EXTRA_DOCKER_RUN_ARGS} ${CONT} sleep infinity
243+
- docker exec cluster_idle_${CI_PIPELINE_ID} bash -cx "python get_selene_runner_status.py --quota ${SELENE_QUEUE_QUOTA} --token \"${CLUSTER_TOKEN}\" " || RC=$?
244244
- echo "$RC"
245245
- echo "NEW_CI_CONCURRENT_ID=${CI_CONCURRENT_ID}" >> other_param.env
246246
- if [[ $RC == 0 ]]; then
247-
echo "Selene is idle!";
248-
cp ./ci/selene/ci.yml ./test-ci.yml;
247+
echo "Run jobs in draco-oci cluster!";
248+
cp ./ci/draco-oci/ci.yml ./test-ci.yml;
249249
echo "NEW_SBATCH_OTHER_PARAMS=" >> other_param.env;
250250
else
251-
echo "Selene is busy!";
251+
echo "Run jobs in other cluster!";
252252
cp ./ci/dracorno/ci.yml ./test-ci.yml;
253253
echo "NEW_SBATCH_OTHER_PARAMS=--nv-meta ml-model.hugectr --gpus-per-node=8" >> other_param.env;
254254
fi
@@ -268,11 +268,11 @@ trigger_test_pipeline:
268268
stage:
269269
test
270270
needs:
271-
- check_selene_status
271+
- check_cluster_status
272272
trigger:
273273
include:
274274
- artifact: test-ci.yml
275-
job: check_selene_status
275+
job: check_cluster_status
276276
strategy: depend
277277
variables:
278278
PARENT_SOURCE: ${CI_PIPELINE_SOURCE}
@@ -286,103 +286,87 @@ criteo_multi_node:
286286
needs:
287287
- build_train_multi_node
288288
variables:
289-
GPFSFOLDER: $LOGDIR/criteo_multi_node
290-
GIT_CLONE_PATH: ${GIT_CLONE_PATH_SELENE}
291289
CONT: $TRAIN_IMAGE_MULTINODE_VERSIONED
292-
MOUNTS: ${DATASET}:${DATASET_MOUNT}
293-
WALLTIME: "00:15:00"
294-
DGXNNODES: 2
290+
MOUNTS: ${DRACO_OCI_DATASET}:${DATASET_MOUNT}
291+
CI_SLURM_TIME: "00:15:00"
292+
CI_SLURM_NODES: 2
293+
SLURM_JOB_NUM_NODES: 2
295294
TEST_CMD: ./ci/integration_test/criteo/criteo_multi_node.sub
296295

297296
dlrm_dcnv2_benchmark_8node:
298297
extends: .cluster_test_job_daily
299298
needs:
300299
- build_train_multi_node
301300
variables:
302-
GPFSFOLDER: $LOGDIR/dlrm_dcnv2_benchmark_8node
303-
GIT_CLONE_PATH: ${GIT_CLONE_PATH_SELENE}
304301
CONT: $TRAIN_IMAGE_MULTINODE_VERSIONED
305-
MOUNTS: /lustre/fsw/mlperf/mlperft-dlrm/datasets/criteo_multihot_raw:/data,/lustre/fsw/mlperf/mlperft-dlrm/datasets/criteo_multihot_raw:/data_val
306-
WALLTIME: "00:15:00"
307-
DGXNNODES: 8
302+
MOUNTS: /lustre/fsw/portfolios/coreai/projects/coreai_devtech_all/hugectr/hpc-hugectr/dlrm/datasets/criteo_multihot_raw:/data,/lustre/fsw/portfolios/coreai/projects/coreai_devtech_all/hugectr/hpc-hugectr/dlrm/datasets/criteo_multihot_raw:/data_val
303+
CI_SLURM_TIME: "02:00:00"
304+
CI_SLURM_NODES: 8
305+
SLURM_JOB_NUM_NODES: 8
308306
TEST_CMD: ./ci/integration_test/dlrm/train_dcnv2_8node.sub
309307

310308
wdl_multi_gpu:
311309
extends: .cluster_test_job_daily # test on selene needs to extend .cluster_test_job
312310
needs:
313311
- build_train_single_node
314312
variables:
315-
GPFSFOLDER: $LOGDIR/wdl_multi_gpu # log dir, usually $LOGDIR + job name
316-
GIT_CLONE_PATH: ${GIT_CLONE_PATH_SELENE} # should not change
317313
CONT: $TRAIN_IMAGE_VERSIONED # image name
318-
MOUNTS: ${DATASET_NEW_CRITEO_SELENE}:${DATASET_MOUNT} # mount
319-
WALLTIME: "00:15:00" # estimate job time. Less time, higher priority
320-
DGXNNODES: 1 # node num
314+
MOUNTS: ${DRACO_OCI_DATASET_NEW_CRITEO}:${DATASET_MOUNT} # mount
315+
CI_SLURM_TIME: "00:15:00" # estimate job time. Less time, higher priority
321316
TEST_CMD: ./ci/integration_test/wdl/wdl_daily.sub
322317

323318
deepfm_multi_gpu:
324319
extends: .cluster_test_job_daily
325320
needs:
326321
- build_train_single_node
327322
variables:
328-
GPFSFOLDER: $LOGDIR/deepfm_multi_gpu
329-
GIT_CLONE_PATH: ${GIT_CLONE_PATH_SELENE}
330323
CONT: $TRAIN_IMAGE_VERSIONED
331-
MOUNTS: ${DATASET}:${DATASET_MOUNT}
332-
WALLTIME: "00:15:00"
333-
DGXNNODES: 1
324+
MOUNTS: ${DRACO_OCI_DATASET}:${DATASET_MOUNT}
325+
CI_SLURM_TIME: "00:15:00"
334326
TEST_CMD: ./ci/integration_test/deepfm/deepfm_daily.sub
335327

336328
dcn_multi_node:
337329
extends: .cluster_test_job_daily
338330
needs:
339331
- build_train_multi_node
340332
variables:
341-
GPFSFOLDER: $LOGDIR/dcn_multi_node
342-
GIT_CLONE_PATH: ${GIT_CLONE_PATH_SELENE}
343333
CONT: $TRAIN_IMAGE_MULTINODE_VERSIONED
344-
MOUNTS: ${DATASET}:${DATASET_MOUNT}
345-
WALLTIME: "01:00:00"
346-
DGXNNODES: 4 # using 4 node
334+
MOUNTS: ${DRACO_OCI_DATASET}:${DATASET_MOUNT}
335+
CI_SLURM_TIME: "01:00:00"
336+
CI_SLURM_NODES: 4
337+
SLURM_JOB_NUM_NODES: 4
347338
TEST_CMD: ./ci/integration_test/dcn/dcn_multi_node.sub
348339

349340
py_low_level:
350341
extends: .cluster_test_job_daily
351342
needs:
352343
- build_train_single_node
353344
variables:
354-
GPFSFOLDER: $LOGDIR/py_low_level
355-
GIT_CLONE_PATH: ${GIT_CLONE_PATH_SELENE}
356345
CONT: $TRAIN_IMAGE_VERSIONED
357-
MOUNTS: /raid:/raid,${DATASET_NEW_CRITEO_SELENE}:${NEW_CRITEO_MOUNT}
358-
WALLTIME: "01:00:00"
359-
DGXNNODES: 1
346+
MOUNTS: /raid:/raid,${DRACO_OCI_DATASET_NEW_CRITEO}:${NEW_CRITEO_MOUNT}
347+
CI_SLURM_TIME: "01:00:00"
360348
TEST_CMD: ./ci/integration_test/py_interface/py_low_level.sub
361349

362350
ebc_single_node:
363351
extends: .cluster_test_job_daily
364352
needs:
365353
- build_train_single_node
366354
variables:
367-
GPFSFOLDER: $LOGDIR/ebc_single_node
368-
GIT_CLONE_PATH: ${GIT_CLONE_PATH_SELENE}
369355
CONT: $TRAIN_IMAGE_VERSIONED
370-
MOUNTS: ${DATASET_NEW_CRITEO_SELENE}:${DATASET_MOUNT},/raid:/raid
371-
WALLTIME: "00:45:00"
372-
DGXNNODES: 1
356+
MOUNTS: ${DRACO_OCI_DATASET_NEW_CRITEO}:${DATASET_MOUNT},/raid:/raid
357+
CI_SLURM_TIME: "02:00:00"
373358
TEST_CMD: ./ci/integration_test/ebc/ebc.sub
374359

375360
py_multi_node:
376361
extends: .cluster_test_job_daily
377362
needs:
378363
- build_train_multi_node
379364
variables:
380-
GPFSFOLDER: $LOGDIR/py_multi_node
381-
GIT_CLONE_PATH: ${GIT_CLONE_PATH_SELENE}
382365
CONT: $TRAIN_IMAGE_MULTINODE_VERSIONED
383-
MOUNTS: ${DATASET}:${DATASET_MOUNT}
384-
WALLTIME: "00:15:00"
385-
DGXNNODES: 4
366+
MOUNTS: ${DRACO_OCI_DATASET}:${DATASET_MOUNT}
367+
CI_SLURM_TIME: "00:15:00"
368+
CI_SLURM_NODES: 4
369+
SLURM_JOB_NUM_NODES: 4
386370
TEST_CMD: ./ci/integration_test/py_interface/py_multi_node.sub
387371

388372
inference_benchmark:
@@ -393,13 +377,10 @@ inference_benchmark:
393377
- export BZ=1
394378
- export MIXED_PRECISION=FP32
395379
variables:
396-
GPFSFOLDER: $LOGDIR/inference_benchmark
397-
GIT_CLONE_PATH: ${GIT_CLONE_PATH_SELENE}
398380
CONT: $INFER_IMAGE_VERSIONED
399-
MOUNTS: /lustre/fsw/devtech/hpc-hugectr/inference/dlrm_regression/dlrm/1:/model/dlrm/1,/lustre/fsw/devtech/hpc-hugectr/keynote_inference/perf_data:/perf_data
381+
MOUNTS: /lustre/fsw/portfolios/coreai/projects/coreai_devtech_all/hugectr/hpc-hugectr/inference/dlrm_regression/dlrm/1:/model/dlrm/1,/lustre/fsw/portfolios/coreai/projects/coreai_devtech_all/hugectr/hpc-hugectr/keynote_inference/perf_data:/perf_data
400382
WORKDIR: /workdir
401-
WALLTIME: "00:15:00"
402-
DGXNNODES: 1
383+
CI_SLURM_TIME: "00:15:00"
403384
TEST_CMD: ./ci/benchmark/inference_benchmark/run.sub
404385

405386
inference_ps_test:
@@ -444,36 +425,27 @@ e2e_nvt_regression_test:
444425
needs:
445426
- build_train_single_node_latest
446427
variables:
447-
GPFSFOLDER: $LOGDIR/e2e_nvt_regression_test
448-
GIT_CLONE_PATH: ${GIT_CLONE_PATH_SELENE}
449428
CONT: $TRAIN_IMAGE_VERSIONED_LATEST
450-
MOUNTS: /lustre/fsw/devtech/hpc-hugectr/criteo_1TB/day_1:/workdir/tools/day_1,/lustre/fsw/devtech/hpc-hugectr/inference/nvt_regression:/workdir/samples/din/raw_data,/lustre/fsw/devtech/hpc-hugectr/criteo_1TB/day_0:/dir/to/criteo/day_0
451-
WALLTIME: "01:00:00"
452-
DGXNNODES: 1
429+
MOUNTS: /lustre/fsw/portfolios/coreai/projects/coreai_devtech_all/hugectr/hpc-hugectr/criteo_1TB/day_1:/workdir/tools/day_1,/lustre/fsw/portfolios/coreai/projects/coreai_devtech_all/hugectr/hpc-hugectr/inference/nvt_regression:/workdir/samples/din/raw_data,/lustre/fsw/portfolios/coreai/projects/coreai_devtech_all/hugectr/hpc-hugectr/criteo_1TB/day_0:/dir/to/criteo/day_0
430+
CI_SLURM_TIME: "01:00:00"
453431
TEST_CMD: ./ci/integration_test/nvt/nvt_regression_test.sub
454432

455433
nb_hps_demo:
456434
extends: .cluster_test_job_daily
457435
needs:
458436
- build_train_single_node
459437
variables:
460-
GPFSFOLDER: $LOGDIR/nb_hps_demo
461-
GIT_CLONE_PATH: ${GIT_CLONE_PATH_SELENE}
462438
CONT: $TRAIN_IMAGE_VERSIONED
463-
WALLTIME: "00:15:00"
464-
DGXNNODES: 1
439+
CI_SLURM_TIME: "00:45:00"
465440
TEST_CMD: ./ci/integration_test/notebooks/hps_demo.sub
466441

467442
test_sok_pypi:
468443
extends: .cluster_test_job_daily
469444
needs:
470445
- build_sok_tf2
471446
variables:
472-
GPFSFOLDER: $LOGDIR/test_sok_pypi
473-
GIT_CLONE_PATH: ${GIT_CLONE_PATH_SELENE}
474447
CONT: $SOK_IMAGE_VERSIONED_TF2
475-
WALLTIME: "00:15:00"
476-
DGXNNODES: 1
448+
CI_SLURM_TIME: "00:15:00"
477449
TEST_CMD: ./ci/integration_test/sok/test_sok_pypi.sub
478450

479451
wdl_check:
@@ -482,12 +454,9 @@ wdl_check:
482454
needs:
483455
- wdl_multi_gpu
484456
variables:
485-
GPFSFOLDER: $LOGDIR/wdl_check
486-
GIT_CLONE_PATH: ${GIT_CLONE_PATH_SELENE}
487457
CONT: $TRAIN_IMAGE_VERSIONED
488-
MOUNTS: $LOGDIR/wdl_multi_gpu:/logs
489-
WALLTIME: "00:15:00"
490-
DGXNNODES: 1
458+
MOUNTS: ${DRACO_OCI_LOGDIR}/wdl_multi_gpu:/logs
459+
CI_SLURM_TIME: "00:15:00"
491460
TEST_CMD: ./ci/post_test/check_wdl.sub
492461

493462
inference_benchmark_check:
@@ -496,12 +465,9 @@ inference_benchmark_check:
496465
- inference_benchmark
497466
- build_train_single_node
498467
variables:
499-
GPFSFOLDER: $LOGDIR/inference_benchmark_check
500-
GIT_CLONE_PATH: ${GIT_CLONE_PATH_SELENE}
501468
CONT: $TRAIN_IMAGE_VERSIONED
502-
MOUNTS: $LOGDIR/inference_benchmark:/logs
503-
WALLTIME: "00:15:00"
504-
DGXNNODES: 1
469+
MOUNTS: ${DRACO_OCI_LOGDIR}/inference_benchmark:/logs
470+
CI_SLURM_TIME: "00:15:00"
505471
TEST_CMD: ./ci/post_test/check_inference_benchmark.sub
506472

507473
inference_cpu_memory_usage:
@@ -511,14 +477,12 @@ inference_cpu_memory_usage:
511477
before_script:
512478
- export BZ=1
513479
- export MIXED_PRECISION=FP32
480+
- mkdir -p ${DRACO_OCI_LOGDIR}/inference_cpu_memory
514481
variables:
515-
GPFSFOLDER: $LOGDIR/inference_cpu_memory
516-
GIT_CLONE_PATH: ${GIT_CLONE_PATH_SELENE}
517482
CONT: $INFER_IMAGE_VERSIONED
518-
MOUNTS: /lustre/fsw/devtech/hpc-hugectr/inference/dlrm_regression/dlrm/1:/model/dlrm/1,$LOGDIR/inference_cpu_memory:/logs
483+
MOUNTS: /lustre/fsw/portfolios/coreai/projects/coreai_devtech_all/hugectr/hpc-hugectr/inference/dlrm_regression/dlrm/1:/model/dlrm/1,${DRACO_OCI_LOGDIR}/inference_cpu_memory:/logs
519484
WORKDIR: /workdir
520-
WALLTIME: "00:30:00"
521-
DGXNNODES: 1
485+
CI_SLURM_TIME: "00:30:00"
522486
TEST_CMD: ./ci/benchmark/hps_memory_check/run.sub
523487

524488
inference_CPU_Memory_check:
@@ -527,12 +491,9 @@ inference_CPU_Memory_check:
527491
- inference_cpu_memory_usage
528492
- build_train_single_node
529493
variables:
530-
GPFSFOLDER: $LOGDIR/inference_cpu_memory
531-
GIT_CLONE_PATH: ${GIT_CLONE_PATH_SELENE}
532494
CONT: $TRAIN_IMAGE_VERSIONED
533-
MOUNTS: $LOGDIR/inference_cpu_memory:/logs
534-
WALLTIME: "00:15:00"
535-
DGXNNODES: 1
495+
MOUNTS: ${DRACO_OCI_LOGDIR}/inference_cpu_memory:/logs
496+
CI_SLURM_TIME: "00:15:00"
536497
TEST_CMD: ./ci/post_test/check_cpu_usage.sub
537498

538499
dlrm_dcnv2_8node_check:
@@ -541,12 +502,9 @@ dlrm_dcnv2_8node_check:
541502
needs:
542503
- dlrm_dcnv2_benchmark_8node
543504
variables:
544-
GPFSFOLDER: $LOGDIR/dlrm_dcnv2_8node_check
545-
GIT_CLONE_PATH: ${GIT_CLONE_PATH_SELENE}
546505
CONT: $TRAIN_IMAGE_VERSIONED
547-
MOUNTS: $LOGDIR/dlrm_dcnv2_benchmark_8node:/logs
548-
WALLTIME: "00:15:00"
549-
DGXNNODES: 1
506+
MOUNTS: ${DRACO_OCI_LOGDIR}/dlrm_dcnv2_benchmark_8node:/logs
507+
CI_SLURM_TIME: "00:15:00"
550508
TEST_CMD: ./ci/post_test/check_dcnv2_dlrm_8node.sub
551509

552510
# rm_logs:
+1-1
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,3 @@
11
#!/bin/bash
22

3-
srun --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx "bash /workdir/ci/benchmark/hps_plugin_benchmark/test.sh"
3+
srun --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx "bash /workdir/ci/benchmark/hps_plugin_benchmark/test.sh"

ci/common.yml

+8
Original file line number | Diff line number | Diff line change
@@ -56,3 +56,11 @@ variables:
5656
DRACO_BST_DATASET: ${DATA_PREFIX}/fs1/projects/gpu_compute/datasets/hugectr-ci/bst
5757
DRACO_CLUSTER: "dracorno"
5858
DRACO_LOGDIR: ${DATA_PREFIX}/fs1/projects/gpu_compute/users/svcnvdlfw/hugectr_ci/${PARENT_PIPELINE_ID}
59+
# DRACO-OCI cluster
60+
DRACO_OCI_DATASET: /lustre/fsw/portfolios/coreai/projects/coreai_devtech_all/hugectr/hpc-hugectr/criteo_kaggle
61+
DRACO_OCI_DIN_DATASET: /lustre/fsw/portfolios/coreai/projects/coreai_devtech_all/hugectr/hpc-hugectr/din
62+
DRACO_OCI_NCF_DATASET: /lustre/fsw/portfolios/coreai/projects/coreai_devtech_all/hugectr/hpc-hugectr/movie_len
63+
DRACO_OCI_DATASET_NEW_CRITEO: /lustre/fsw/portfolios/coreai/projects/coreai_devtech_all/hugectr/hpc-hugectr/new-criteo-dataset
64+
DRACO_OCI_MMOE_DATASET: /lustre/fsw/portfolios/coreai/projects/coreai_devtech_all/hugectr/hpc-hugectr/mmoe_data
65+
DRACO_OCI_BST_DATASET: /lustre/fsw/portfolios/coreai/projects/coreai_devtech_all/hugectr/hpc-hugectr/bst
66+
DRACO_OCI_LOGDIR: /lustre/fsw/portfolios/coreai/users/svcnvdlfw/hugectr_ci/${PARENT_PIPELINE_ID}

0 commit comments

Comments
 (0)