#!/bin/bash
#
# Install the Amazon EFA user-space packages and build the aws-ofi-nccl
# plugin against them.
#
# Steps:
#   1. Download the EFA installer tarball, delete its bundled MPI packages,
#      purge the distro RDMA packages listed below, and run the installer.
#   2. Download, configure, build and install aws-ofi-nccl under
#      AWS_OFI_NCCL_PREFIX.
#   3. Register ${AWS_OFI_NCCL_PREFIX}/lib with the dynamic linker.
#
# The script drives apt-get and writes under /opt and /etc, so it is
# expected to run as root (presumably during an image build -- confirm).

# -e: abort on the first failing command.
# -u: treat unset variables as errors.
# -x: trace every command.
# pipefail: without it, `efa_installer.sh ... |& tee` would report tee's
#           exit status and a failed installer would go unnoticed.
set -euxo pipefail

readonly EFA_INSTALLER_VERSION=1.34.0 # or: latest
readonly AWS_OFI_NCCL_PREFIX=/opt/aws-ofi-nccl
readonly AWS_OFI_NCCL_VERSION=1.11.0

# Temporary build directories; empty until created so that cleanup() can
# tell which ones exist.
EFA_TMP=
AWS_OFI_NCCL_TMP=

# Remove whichever temporary build directories were created, regardless of
# how the script exits (success, failing command, or signal-triggered exit).
cleanup() {
  local dir
  for dir in "${EFA_TMP}" "${AWS_OFI_NCCL_TMP}"; do
    if [[ -n "${dir}" ]]; then
      rm -rf -- "${dir}"
    fi
  done
}
trap cleanup EXIT

# apt-get rather than apt: apt's command-line interface is not guaranteed
# to be stable for use in scripts.
apt-get update

# ---------------------------------------------------------------------------
# Amazon EFA installer
# ---------------------------------------------------------------------------
EFA_TMP=$(mktemp -d)
pushd "${EFA_TMP}"
# -f: fail on HTTP errors instead of saving the error page as the tarball.
curl -fO "https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz"
tar -xf "aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz"
cd aws-efa-installer
# An unmatched pattern is passed to rm literally and makes it fail, which
# stops the script if the tarball layout ever changes.
rm -v DEBS/UBUNTU2204/x86_64/{libpmix,openmpi,prrte}* # block installation of MPI components
apt-get purge -y \
  ibverbs-providers libibverbs-dev libibverbs1 libibumad-dev libibumad3 \
  librdmacm1 librdmacm-dev ibverbs-utils
./efa_installer.sh -g -y --skip-kmod --skip-limit-conf --no-verify |& tee install.log
# Keep the installer output next to the installed EFA tree.
mv -v install.log /opt/amazon/efa/install.log
popd
rm -rf -- "${EFA_TMP}"

# ---------------------------------------------------------------------------
# aws-ofi-nccl plugin
# ---------------------------------------------------------------------------
AWS_OFI_NCCL_TMP=$(mktemp -d)
pushd "${AWS_OFI_NCCL_TMP}"
apt-get install -y libhwloc-dev
curl -fOL "https://github.com/aws/aws-ofi-nccl/releases/download/v${AWS_OFI_NCCL_VERSION}-aws/aws-ofi-nccl-${AWS_OFI_NCCL_VERSION}-aws.tar.gz"
tar -xf "aws-ofi-nccl-${AWS_OFI_NCCL_VERSION}-aws.tar.gz"
cd "aws-ofi-nccl-${AWS_OFI_NCCL_VERSION}-aws"
./configure \
  --prefix="${AWS_OFI_NCCL_PREFIX}" \
  --with-libfabric=/opt/amazon/efa \
  --with-cuda=/usr/local/cuda \
  --with-mpi=/usr/local/mpi
make "-j$(nproc)" install
popd
rm -rf -- "${AWS_OFI_NCCL_TMP}"

# Drop the package index files fetched by `apt-get update`.
rm -rf /var/lib/apt/lists/*

# Ranks higher than HPC-X => newly-installed libnccl-net.so becomes the default
printf '%s\n' "${AWS_OFI_NCCL_PREFIX}/lib" > /etc/ld.so.conf.d/000_aws_ofi_nccl.conf
ldconfig