@@ -221,15 +221,15 @@ build_pytorch_hps_trt_plugin:
221
221
variables :
222
222
FROM_IMAGE : ${IMAGE_PYTORCH}
223
223
DST_IMAGE : $PYTORCH_TRT_IMAGE_VERSIONED
224
- BUILD_TORCH_PLUGIN : 1
224
+ BUILD_TORCH_PLUGIN : 1
225
225
BUILD_TRT_PLUGIN : 1
226
226
TRT_CMAKE_OPTION : " -DSM=\" 70;75;80;90\" "
227
227
# BUILD_HPS_BACKEND: 1
228
228
# HUGECTR_BACKEND_VER: main
229
229
# TRITON_BRANCH: r22.11
230
230
231
- # Check Selene busy or not
232
- check_selene_status :
231
+ # Check cluster busy or not
232
+ check_cluster_status :
233
233
extends : .trigger:rules:selene
234
234
stage : pre_test
235
235
tags :
@@ -239,16 +239,16 @@ check_selene_status:
239
239
- docker login -u ${CI_PRIVATE_USER} -p "${CI_PRIVATE_KEY}" "${CI_REGISTRY}"
240
240
- docker pull ${CONT}
241
241
- RC=0
242
- - docker run -d --rm --name selene_idle_ ${CI_PIPELINE_ID} ${EXTRA_DOCKER_RUN_ARGS} ${CONT} sleep infinity
243
- - docker exec selene_idle_ ${CI_PIPELINE_ID} bash -cx "python get_selene_runner_status.py --quota ${SELENE_QUEUE_QUOTA} --token \"${CLUSTER_TOKEN}\" " || RC=$?
242
+ - docker run -d --rm --name cluster_idle_ ${CI_PIPELINE_ID} ${EXTRA_DOCKER_RUN_ARGS} ${CONT} sleep infinity
243
+ - docker exec cluster_idle_ ${CI_PIPELINE_ID} bash -cx "python get_selene_runner_status.py --quota ${SELENE_QUEUE_QUOTA} --token \"${CLUSTER_TOKEN}\" " || RC=$?
244
244
- echo "$RC"
245
245
- echo "NEW_CI_CONCURRENT_ID=${CI_CONCURRENT_ID}" >> other_param.env
246
246
- if [[ $RC == 0 ]]; then
247
- echo "Selene is idle !";
248
- cp ./ci/selene /ci.yml ./test-ci.yml;
247
+ echo "Run jobs in draco-oci cluster !";
248
+ cp ./ci/draco-oci /ci.yml ./test-ci.yml;
249
249
echo "NEW_SBATCH_OTHER_PARAMS=" >> other_param.env;
250
250
else
251
- echo "Selene is busy !";
251
+ echo "Run jobs in other cluster !";
252
252
cp ./ci/dracorno/ci.yml ./test-ci.yml;
253
253
echo "NEW_SBATCH_OTHER_PARAMS=--nv-meta ml-model.hugectr --gpus-per-node=8" >> other_param.env;
254
254
fi
@@ -268,11 +268,11 @@ trigger_test_pipeline:
268
268
stage :
269
269
test
270
270
needs :
271
- - check_selene_status
271
+ - check_cluster_status
272
272
trigger :
273
273
include :
274
274
- artifact : test-ci.yml
275
- job : check_selene_status
275
+ job : check_cluster_status
276
276
strategy : depend
277
277
variables :
278
278
PARENT_SOURCE : ${CI_PIPELINE_SOURCE}
@@ -286,103 +286,87 @@ criteo_multi_node:
286
286
needs :
287
287
- build_train_multi_node
288
288
variables :
289
- GPFSFOLDER : $LOGDIR/criteo_multi_node
290
- GIT_CLONE_PATH : ${GIT_CLONE_PATH_SELENE}
291
289
CONT : $TRAIN_IMAGE_MULTINODE_VERSIONED
292
- MOUNTS : ${DATASET}:${DATASET_MOUNT}
293
- WALLTIME : " 00:15:00"
294
- DGXNNODES : 2
290
+ MOUNTS : ${DRACO_OCI_DATASET}:${DATASET_MOUNT}
291
+ CI_SLURM_TIME : " 00:15:00"
292
+ CI_SLURM_NODES : 2
293
+ SLURM_JOB_NUM_NODES : 2
295
294
TEST_CMD : ./ci/integration_test/criteo/criteo_multi_node.sub
296
295
297
296
dlrm_dcnv2_benchmark_8node :
298
297
extends : .cluster_test_job_daily
299
298
needs :
300
299
- build_train_multi_node
301
300
variables :
302
- GPFSFOLDER : $LOGDIR/dlrm_dcnv2_benchmark_8node
303
- GIT_CLONE_PATH : ${GIT_CLONE_PATH_SELENE}
304
301
CONT : $TRAIN_IMAGE_MULTINODE_VERSIONED
305
- MOUNTS : /lustre/fsw/mlperf/mlperft-dlrm/datasets/criteo_multihot_raw:/data,/lustre/fsw/mlperf/mlperft-dlrm/datasets/criteo_multihot_raw:/data_val
306
- WALLTIME : " 00:15:00"
307
- DGXNNODES : 8
302
+ MOUNTS : /lustre/fsw/portfolios/coreai/projects/coreai_devtech_all/hugectr/hpc-hugectr/dlrm/datasets/criteo_multihot_raw:/data,/lustre/fsw/portfolios/coreai/projects/coreai_devtech_all/hugectr/hpc-hugectr/dlrm/datasets/criteo_multihot_raw:/data_val
303
+ CI_SLURM_TIME : " 02:00:00"
304
+ CI_SLURM_NODES : 8
305
+ SLURM_JOB_NUM_NODES : 8
308
306
TEST_CMD : ./ci/integration_test/dlrm/train_dcnv2_8node.sub
309
307
310
308
wdl_multi_gpu :
311
309
extends : .cluster_test_job_daily # test on selene needs to extend .cluster_test_job
312
310
needs :
313
311
- build_train_single_node
314
312
variables :
315
- GPFSFOLDER : $LOGDIR/wdl_multi_gpu # log dir, usually $LOGDIR + job name
316
- GIT_CLONE_PATH : ${GIT_CLONE_PATH_SELENE} # should not change
317
313
CONT : $TRAIN_IMAGE_VERSIONED # image name
318
- MOUNTS : ${DATASET_NEW_CRITEO_SELENE}:${DATASET_MOUNT} # mount
319
- WALLTIME : " 00:15:00" # estimate job time. Less time, higher priority
320
- DGXNNODES : 1 # node num
314
+ MOUNTS : ${DRACO_OCI_DATASET_NEW_CRITEO}:${DATASET_MOUNT} # mount
315
+ CI_SLURM_TIME : " 00:15:00" # estimate job time. Less time, higher priority
321
316
TEST_CMD : ./ci/integration_test/wdl/wdl_daily.sub
322
317
323
318
deepfm_multi_gpu :
324
319
extends : .cluster_test_job_daily
325
320
needs :
326
321
- build_train_single_node
327
322
variables :
328
- GPFSFOLDER : $LOGDIR/deepfm_multi_gpu
329
- GIT_CLONE_PATH : ${GIT_CLONE_PATH_SELENE}
330
323
CONT : $TRAIN_IMAGE_VERSIONED
331
- MOUNTS : ${DATASET}:${DATASET_MOUNT}
332
- WALLTIME : " 00:15:00"
333
- DGXNNODES : 1
324
+ MOUNTS : ${DRACO_OCI_DATASET}:${DATASET_MOUNT}
325
+ CI_SLURM_TIME : " 00:15:00"
334
326
TEST_CMD : ./ci/integration_test/deepfm/deepfm_daily.sub
335
327
336
328
dcn_multi_node :
337
329
extends : .cluster_test_job_daily
338
330
needs :
339
331
- build_train_multi_node
340
332
variables :
341
- GPFSFOLDER : $LOGDIR/dcn_multi_node
342
- GIT_CLONE_PATH : ${GIT_CLONE_PATH_SELENE}
343
333
CONT : $TRAIN_IMAGE_MULTINODE_VERSIONED
344
- MOUNTS : ${DATASET}:${DATASET_MOUNT}
345
- WALLTIME : " 01:00:00"
346
- DGXNNODES : 4 # using 4 node
334
+ MOUNTS : ${DRACO_OCI_DATASET}:${DATASET_MOUNT}
335
+ CI_SLURM_TIME : " 01:00:00"
336
+ CI_SLURM_NODES : 4
337
+ SLURM_JOB_NUM_NODES : 4
347
338
TEST_CMD : ./ci/integration_test/dcn/dcn_multi_node.sub
348
339
349
340
py_low_level :
350
341
extends : .cluster_test_job_daily
351
342
needs :
352
343
- build_train_single_node
353
344
variables :
354
- GPFSFOLDER : $LOGDIR/py_low_level
355
- GIT_CLONE_PATH : ${GIT_CLONE_PATH_SELENE}
356
345
CONT : $TRAIN_IMAGE_VERSIONED
357
- MOUNTS : /raid:/raid,${DATASET_NEW_CRITEO_SELENE}:${NEW_CRITEO_MOUNT}
358
- WALLTIME : " 01:00:00"
359
- DGXNNODES : 1
346
+ MOUNTS : /raid:/raid,${DRACO_OCI_DATASET_NEW_CRITEO}:${NEW_CRITEO_MOUNT}
347
+ CI_SLURM_TIME : " 01:00:00"
360
348
TEST_CMD : ./ci/integration_test/py_interface/py_low_level.sub
361
349
362
350
ebc_single_node :
363
351
extends : .cluster_test_job_daily
364
352
needs :
365
353
- build_train_single_node
366
354
variables :
367
- GPFSFOLDER : $LOGDIR/ebc_single_node
368
- GIT_CLONE_PATH : ${GIT_CLONE_PATH_SELENE}
369
355
CONT : $TRAIN_IMAGE_VERSIONED
370
- MOUNTS : ${DATASET_NEW_CRITEO_SELENE}:${DATASET_MOUNT},/raid:/raid
371
- WALLTIME : " 00:45:00"
372
- DGXNNODES : 1
356
+ MOUNTS : ${DRACO_OCI_DATASET_NEW_CRITEO}:${DATASET_MOUNT},/raid:/raid
357
+ CI_SLURM_TIME : " 02:00:00"
373
358
TEST_CMD : ./ci/integration_test/ebc/ebc.sub
374
359
375
360
py_multi_node :
376
361
extends : .cluster_test_job_daily
377
362
needs :
378
363
- build_train_multi_node
379
364
variables :
380
- GPFSFOLDER : $LOGDIR/py_multi_node
381
- GIT_CLONE_PATH : ${GIT_CLONE_PATH_SELENE}
382
365
CONT : $TRAIN_IMAGE_MULTINODE_VERSIONED
383
- MOUNTS : ${DATASET}:${DATASET_MOUNT}
384
- WALLTIME : " 00:15:00"
385
- DGXNNODES : 4
366
+ MOUNTS : ${DRACO_OCI_DATASET}:${DATASET_MOUNT}
367
+ CI_SLURM_TIME : " 00:15:00"
368
+ CI_SLURM_NODES : 4
369
+ SLURM_JOB_NUM_NODES : 4
386
370
TEST_CMD : ./ci/integration_test/py_interface/py_multi_node.sub
387
371
388
372
inference_benchmark :
@@ -393,13 +377,10 @@ inference_benchmark:
393
377
- export BZ=1
394
378
- export MIXED_PRECISION=FP32
395
379
variables :
396
- GPFSFOLDER : $LOGDIR/inference_benchmark
397
- GIT_CLONE_PATH : ${GIT_CLONE_PATH_SELENE}
398
380
CONT : $INFER_IMAGE_VERSIONED
399
- MOUNTS : /lustre/fsw/devtech/ hpc-hugectr/inference/dlrm_regression/dlrm/1:/model/dlrm/1,/lustre/fsw/devtech /hpc-hugectr/keynote_inference/perf_data:/perf_data
381
+ MOUNTS : /lustre/fsw/portfolios/coreai/projects/coreai_devtech_all/hugectr/ hpc-hugectr/inference/dlrm_regression/dlrm/1:/model/dlrm/1,/lustre/fsw/portfolios/coreai/projects/coreai_devtech_all/hugectr /hpc-hugectr/keynote_inference/perf_data:/perf_data
400
382
WORKDIR : /workdir
401
- WALLTIME : " 00:15:00"
402
- DGXNNODES : 1
383
+ CI_SLURM_TIME : " 00:15:00"
403
384
TEST_CMD : ./ci/benchmark/inference_benchmark/run.sub
404
385
405
386
inference_ps_test :
@@ -444,36 +425,27 @@ e2e_nvt_regression_test:
444
425
needs :
445
426
- build_train_single_node_latest
446
427
variables :
447
- GPFSFOLDER : $LOGDIR/e2e_nvt_regression_test
448
- GIT_CLONE_PATH : ${GIT_CLONE_PATH_SELENE}
449
428
CONT : $TRAIN_IMAGE_VERSIONED_LATEST
450
- MOUNTS : /lustre/fsw/devtech/hpc-hugectr/criteo_1TB/day_1:/workdir/tools/day_1,/lustre/fsw/devtech/hpc-hugectr/inference/nvt_regression:/workdir/samples/din/raw_data,/lustre/fsw/devtech/hpc-hugectr/criteo_1TB/day_0:/dir/to/criteo/day_0
451
- WALLTIME : " 01:00:00"
452
- DGXNNODES : 1
429
+ MOUNTS : /lustre/fsw/portfolios/coreai/projects/coreai_devtech_all/hugectr/hpc-hugectr/criteo_1TB/day_1:/workdir/tools/day_1,/lustre/fsw/portfolios/coreai/projects/coreai_devtech_all/hugectr/hpc-hugectr/inference/nvt_regression:/workdir/samples/din/raw_data,/lustre/fsw/portfolios/coreai/projects/coreai_devtech_all/hugectr/hpc-hugectr/criteo_1TB/day_0:/dir/to/criteo/day_0
430
+ CI_SLURM_TIME : " 01:00:00"
453
431
TEST_CMD : ./ci/integration_test/nvt/nvt_regression_test.sub
454
432
455
433
nb_hps_demo :
456
434
extends : .cluster_test_job_daily
457
435
needs :
458
436
- build_train_single_node
459
437
variables :
460
- GPFSFOLDER : $LOGDIR/nb_hps_demo
461
- GIT_CLONE_PATH : ${GIT_CLONE_PATH_SELENE}
462
438
CONT : $TRAIN_IMAGE_VERSIONED
463
- WALLTIME : " 00:15:00"
464
- DGXNNODES : 1
439
+ CI_SLURM_TIME : " 00:45:00"
465
440
TEST_CMD : ./ci/integration_test/notebooks/hps_demo.sub
466
441
467
442
test_sok_pypi :
468
443
extends : .cluster_test_job_daily
469
444
needs :
470
445
- build_sok_tf2
471
446
variables :
472
- GPFSFOLDER : $LOGDIR/test_sok_pypi
473
- GIT_CLONE_PATH : ${GIT_CLONE_PATH_SELENE}
474
447
CONT : $SOK_IMAGE_VERSIONED_TF2
475
- WALLTIME : " 00:15:00"
476
- DGXNNODES : 1
448
+ CI_SLURM_TIME : " 00:15:00"
477
449
TEST_CMD : ./ci/integration_test/sok/test_sok_pypi.sub
478
450
479
451
wdl_check :
@@ -482,12 +454,9 @@ wdl_check:
482
454
needs :
483
455
- wdl_multi_gpu
484
456
variables :
485
- GPFSFOLDER : $LOGDIR/wdl_check
486
- GIT_CLONE_PATH : ${GIT_CLONE_PATH_SELENE}
487
457
CONT : $TRAIN_IMAGE_VERSIONED
488
- MOUNTS : $LOGDIR/wdl_multi_gpu:/logs
489
- WALLTIME : " 00:15:00"
490
- DGXNNODES : 1
458
+ MOUNTS : ${DRACO_OCI_LOGDIR}/wdl_multi_gpu:/logs
459
+ CI_SLURM_TIME : " 00:15:00"
491
460
TEST_CMD : ./ci/post_test/check_wdl.sub
492
461
493
462
inference_benchmark_check :
@@ -496,12 +465,9 @@ inference_benchmark_check:
496
465
- inference_benchmark
497
466
- build_train_single_node
498
467
variables :
499
- GPFSFOLDER : $LOGDIR/inference_benchmark_check
500
- GIT_CLONE_PATH : ${GIT_CLONE_PATH_SELENE}
501
468
CONT : $TRAIN_IMAGE_VERSIONED
502
- MOUNTS : $LOGDIR/inference_benchmark:/logs
503
- WALLTIME : " 00:15:00"
504
- DGXNNODES : 1
469
+ MOUNTS : ${DRACO_OCI_LOGDIR}/inference_benchmark:/logs
470
+ CI_SLURM_TIME : " 00:15:00"
505
471
TEST_CMD : ./ci/post_test/check_inference_benchmark.sub
506
472
507
473
inference_cpu_memory_usage :
@@ -511,14 +477,12 @@ inference_cpu_memory_usage:
511
477
before_script :
512
478
- export BZ=1
513
479
- export MIXED_PRECISION=FP32
480
+ - mkdir -p ${DRACO_OCI_LOGDIR}/inference_cpu_memory
514
481
variables :
515
- GPFSFOLDER : $LOGDIR/inference_cpu_memory
516
- GIT_CLONE_PATH : ${GIT_CLONE_PATH_SELENE}
517
482
CONT : $INFER_IMAGE_VERSIONED
518
- MOUNTS : /lustre/fsw/devtech/ hpc-hugectr/inference/dlrm_regression/dlrm/1:/model/dlrm/1,$LOGDIR /inference_cpu_memory:/logs
483
+ MOUNTS : /lustre/fsw/portfolios/coreai/projects/coreai_devtech_all/hugectr/ hpc-hugectr/inference/dlrm_regression/dlrm/1:/model/dlrm/1,${DRACO_OCI_LOGDIR} /inference_cpu_memory:/logs
519
484
WORKDIR : /workdir
520
- WALLTIME : " 00:30:00"
521
- DGXNNODES : 1
485
+ CI_SLURM_TIME : " 00:30:00"
522
486
TEST_CMD : ./ci/benchmark/hps_memory_check/run.sub
523
487
524
488
inference_CPU_Memory_check :
@@ -527,12 +491,9 @@ inference_CPU_Memory_check:
527
491
- inference_cpu_memory_usage
528
492
- build_train_single_node
529
493
variables :
530
- GPFSFOLDER : $LOGDIR/inference_cpu_memory
531
- GIT_CLONE_PATH : ${GIT_CLONE_PATH_SELENE}
532
494
CONT : $TRAIN_IMAGE_VERSIONED
533
- MOUNTS : $LOGDIR/inference_cpu_memory:/logs
534
- WALLTIME : " 00:15:00"
535
- DGXNNODES : 1
495
+ MOUNTS : ${DRACO_OCI_LOGDIR}/inference_cpu_memory:/logs
496
+ CI_SLURM_TIME : " 00:15:00"
536
497
TEST_CMD : ./ci/post_test/check_cpu_usage.sub
537
498
538
499
dlrm_dcnv2_8node_check :
@@ -541,12 +502,9 @@ dlrm_dcnv2_8node_check:
541
502
needs :
542
503
- dlrm_dcnv2_benchmark_8node
543
504
variables :
544
- GPFSFOLDER : $LOGDIR/dlrm_dcnv2_8node_check
545
- GIT_CLONE_PATH : ${GIT_CLONE_PATH_SELENE}
546
505
CONT : $TRAIN_IMAGE_VERSIONED
547
- MOUNTS : $LOGDIR/dlrm_dcnv2_benchmark_8node:/logs
548
- WALLTIME : " 00:15:00"
549
- DGXNNODES : 1
506
+ MOUNTS : ${DRACO_OCI_LOGDIR}/dlrm_dcnv2_benchmark_8node:/logs
507
+ CI_SLURM_TIME : " 00:15:00"
550
508
TEST_CMD : ./ci/post_test/check_dcnv2_dlrm_8node.sub
551
509
552
510
# rm_logs:
0 commit comments