Skip to content

Commit 277b9ef

Browse files
authored
Update EFA/AWS-OFI-NCCL installation recipe (#1116)
This is not run automatically at present. EFA-enabled containers can be prepared by adding a new `RUN install-efa.sh` layer on top of the published JAX-Toolbox containers.
1 parent 867dc8e commit 277b9ef

File tree

1 file changed

+31
-29
lines changed

1 file changed

+31
-29
lines changed

.github/container/install-efa.sh

+31-29
Original file line numberDiff line numberDiff line change
@@ -1,35 +1,37 @@
11
#!/bin/bash
2-
32
set -exo pipefail # pipefail: a failing command must not be masked when its output is piped into `tee` for logging
43

5-
# Update distro
6-
apt-get update
7-
8-
# Install required packages
9-
apt-get install -y curl
10-
11-
# clean up all previously installed library to avoid conflicts
12-
# while installing Amazon EFA version
13-
dpkg --purge efa-config efa-profile libfabric openmpi \
14-
ibacm ibverbs-providers ibverbs-utils infiniband-diags \
15-
libibmad-dev libibmad5 libibnetdisc-dev libibnetdisc5 \
16-
libibumad-dev libibumad3 libibverbs-dev libibverbs1 librdmacm-dev \
17-
librdmacm1 rdma-core rdmacm-utils
18-
19-
# Download Amazon EFA package and install
20-
EFA_INSTALLER_VERSION=latest
21-
WORKDIR=$(mktemp -d)
22-
23-
pushd ${WORKDIR}
24-
25-
AMAZON_EFA_LINK="https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz"
26-
curl -O "$AMAZON_EFA_LINK"
27-
tar -xf aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz && cd aws-efa-installer
28-
./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify
29-
4+
EFA_INSTALLER_VERSION=1.34.0 # or: latest
5+
AWS_OFI_NCCL_PREFIX=/opt/aws-ofi-nccl
6+
AWS_OFI_NCCL_VERSION=1.11.0
7+
8+
apt-get update # apt-get (not apt): stable CLI intended for scripts, consistent with the other package commands here
9+
10+
EFA_TMP=$(mktemp -d)
11+
pushd $EFA_TMP
12+
curl -fO "https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz" # -f: abort on HTTP errors instead of saving the error page as the tarball
13+
tar -xf aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz
14+
cd aws-efa-installer
15+
rm -v DEBS/UBUNTU2204/x86_64/{libpmix,openmpi,prrte}* # block installation of MPI components
16+
apt-get purge -y ibverbs-providers libibverbs-dev libibverbs1 libibumad-dev libibumad3 librdmacm1 librdmacm-dev ibverbs-utils
17+
./efa_installer.sh -g -y --skip-kmod --skip-limit-conf --no-verify |& tee install.log
18+
mv -v install.log /opt/amazon/efa/install.log
3019
popd
20+
rm -rf -- "$EFA_TMP" # quoted so the recursive delete only ever targets the mktemp directory
21+
22+
AWS_OFI_NCCL_TMP=$(mktemp -d)
23+
pushd $AWS_OFI_NCCL_TMP
24+
apt-get install -y libhwloc-dev
25+
curl -fOL "https://github.com/aws/aws-ofi-nccl/releases/download/v${AWS_OFI_NCCL_VERSION}-aws/aws-ofi-nccl-${AWS_OFI_NCCL_VERSION}-aws.tar.gz" # -f: abort on HTTP errors (e.g. unknown release tag) instead of saving the error page
26+
tar -xf aws-ofi-nccl-${AWS_OFI_NCCL_VERSION}-aws.tar.gz
27+
cd aws-ofi-nccl-${AWS_OFI_NCCL_VERSION}-aws
28+
./configure --prefix=${AWS_OFI_NCCL_PREFIX} --with-libfabric=/opt/amazon/efa --with-cuda=/usr/local/cuda --with-mpi=/usr/local/mpi
29+
make -j$(nproc) install
30+
popd
31+
rm -rf -- "$AWS_OFI_NCCL_TMP" # quoted so the recursive delete only ever targets the mktemp directory
3132

32-
# Clean up
33-
apt-get clean
3433
rm -rf /var/lib/apt/lists/*
35-
rm -rf ${WORKDIR}
34+
35+
# Ranks higher than HPC-X => newly-installed libnccl-net.so becomes the default
36+
echo "${AWS_OFI_NCCL_PREFIX}/lib" > /etc/ld.so.conf.d/000_aws_ofi_nccl.conf
37+
ldconfig

0 commit comments

Comments
 (0)