#!/usr/bin/env bash
set -euxo pipefail
source /opt/gpu/config.sh
source /opt/gpu/package_manager_helpers.sh
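# timestamp each xtrace (-x) line in UTC; the exit trap restores the default PS4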
trap 'PS4="+ "' exit
PS4='+ $(date -u -I"seconds" | cut -c1-19) '
KERNEL_NAME=$(uname -r)
LOG_FILE_NAME="/var/log/nvidia-installer-$(date +%s).log"
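# log any processes holding the nvidia devices or nvidia-gridd open before installing;
# error handling is relaxed here because lsof exits non-zero when nothing matches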
set +euo pipefail
open_devices="$(lsof /dev/nvidia* 2>/dev/null)"
echo "Open devices: $open_devices"
open_gridd="$(lsof /usr/bin/nvidia-gridd 2>/dev/null)"
echo "Open gridd: $open_gridd"
set -euo pipefail
# install cached nvidia debian packages for container runtime compatibility
install_cached_nvidia_packages() {
    for apt_package in $NVIDIA_PACKAGES; do
        dpkg -i --force-overwrite /opt/gpu/${apt_package}_${NVIDIA_CONTAINER_TOOLKIT_VER}*
    done
}
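# run the install through the helper's dpkg-lock wait/retry wrapper
# (the trailing arguments are assumed to be the retry count and delay, as defined in package_manager_helpers.sh)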
use_package_manager_with_retries wait_for_dpkg_lock install_cached_nvidia_packages 10 3
# blacklist the nouveau driver, which conflicts with the nvidia driver
cp /opt/gpu/blacklist-nouveau.conf /etc/modprobe.d/blacklist-nouveau.conf
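# regenerate the initramfs so the nouveau blacklist takes effect at early boot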
update-initramfs -u
# clean up lingering files from previous install
set +e
umount -l /usr/lib/x86_64-linux-gnu || true
umount -l /tmp/overlay || true
rm -r /tmp/overlay || true
set -e
# set up overlayfs to change install location of nvidia libs from /usr/lib/x86_64-linux-gnu to /usr/local/nvidia
# add an extra layer of indirection via tmpfs because an overlayfs upperdir/workdir cannot itself live on an overlayfs (which is the case when running inside a container)
mkdir /tmp/overlay
mount -t tmpfs tmpfs /tmp/overlay
mkdir /tmp/overlay/{workdir,lib64}
mkdir -p ${GPU_DEST}/lib64
mount -t overlay overlay -o lowerdir=/usr/lib/x86_64-linux-gnu,upperdir=/tmp/overlay/lib64,workdir=/tmp/overlay/workdir /usr/lib/x86_64-linux-gnu
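# from this point, anything the installer writes to /usr/lib/x86_64-linux-gnu is captured in the upper layer /tmp/overlay/lib64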
if [[ "${DRIVER_KIND}" == "cuda" ]]; then
RUNFILE="NVIDIA-Linux-x86_64-${DRIVER_VERSION}"
elif [[ "${DRIVER_KIND}" == "grid" ]]; then
RUNFILE="NVIDIA-Linux-x86_64-${DRIVER_VERSION}-grid-azure"
else
echo "Invalid driver kind: ${DRIVER_KIND}"
exit 1
fi
# install nvidia drivers
pushd /opt/gpu
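# -s: silent, -a: accept license, --dkms: build the kernel module via DKMS, --no-drm: skip the DRM KMS module;
# the utility/opengl prefixes redirect the userspace tools and GL libraries into ${GPU_DEST}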
/opt/gpu/${RUNFILE}/nvidia-installer -s -k=$KERNEL_NAME --log-file-name=${LOG_FILE_NAME} -a --no-drm --dkms --utility-prefix="${GPU_DEST}" --opengl-prefix="${GPU_DEST}"
popd
# move nvidia libs from the temporary overlayfs upper layer into ${GPU_DEST}/lib64 (the trailing /. copies the directory contents, not the directory itself)
cp -a /tmp/overlay/lib64/. ${GPU_DEST}/lib64
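# relocate any systemd units the installer dropped under /usr/lib/nvidia/systemd (typically suspend/resume helpers)
# into /etc/systemd/system, then enable and start them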
handle_nvidia_systemd_units() {
    SYSTEMD_SRC="/usr/lib/nvidia/systemd"
    SYSTEMD_DEST="/etc/systemd/system"
    moved_units=()
    # Check if the source directory exists
    if [[ -d "$SYSTEMD_SRC" ]]; then
        # Use find to list *.service files in the source directory
        found_files=$(find "$SYSTEMD_SRC" -maxdepth 1 -name "*.service")
        if [ -z "$found_files" ]; then
            echo "No systemd unit files found in $SYSTEMD_SRC"
        else
            # Loop through each found file, move it, and record its name
            while IFS= read -r unit; do
                mv "$unit" "$SYSTEMD_DEST/"
                unit_name=$(basename "$unit")
                moved_units+=("$unit_name")
                echo "Moved $unit_name to $SYSTEMD_DEST"
            done <<< "$found_files"
        fi
    else
        echo "Source directory $SYSTEMD_SRC does not exist. Skipping systemd unit file move."
    fi
    # Reload systemd to pick up the moved unit files
    systemctl daemon-reload
    # Enable and restart only the moved units
    for unit_name in "${moved_units[@]}"; do
        systemctl enable "$unit_name"
        systemctl restart "$unit_name"
        echo "$unit_name enabled and restarted."
    done
}
# the grid driver runs the nvidia-gridd daemon, which keeps binaries busy and blocks copying
if [ "${DRIVER_KIND}" == "grid" ]; then
    systemctl stop nvidia-gridd || true
fi
# copy nvidia binaries (nvidia-smi, etc.) to /usr/bin so they are on the default PATH
cp -rvT ${GPU_DEST}/bin /usr/bin || true
# start the grid daemon again now that the binaries are in place
if [ "${DRIVER_KIND}" == "grid" ]; then
    systemctl restart nvidia-gridd || true
fi
handle_nvidia_systemd_units
# configure system to know about nvidia lib paths
echo "${GPU_DEST}/lib64" > /etc/ld.so.conf.d/nvidia.conf
ldconfig
# unmount, cleanup
set +e
umount -l /usr/lib/x86_64-linux-gnu
umount /tmp/overlay
rm -r /tmp/overlay
set -e
# validate that the nvidia driver is working
dkms status
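# load the nvidia-uvm (unified memory) module (-u) and create the device node for minor number 0 (-c0)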
nvidia-modprobe -u -c0
# configure the persistence daemon: it decreases latency for later driver loads
# and reduces nvidia-smi invocation time roughly 10x (from ~30s to ~2s),
# which is noticeable on large VM sizes with multiple GPUs,
# especially when the nvidia-smi process runs inside a CPU cgroup
cp /opt/gpu/nvidia-persistenced.service /etc/systemd/system/nvidia-persistenced.service
systemctl enable nvidia-persistenced.service
systemctl restart nvidia-persistenced.service
nvidia-smi
# install fabricmanager for nvlink based systems
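# fabric manager configures the NVLink/NVSwitch fabric on multi-GPU systems; here it is only bundled alongside the cuda driver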
if [[ "${DRIVER_KIND}" == "cuda" ]]; then
bash /opt/gpu/fabricmanager-linux-x86_64-${DRIVER_VERSION}/sbin/fm_run_package_installer.sh
fi
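# drop-in containerd config registering the nvidia runtime; assumed to be merged via an imports stanza in the main containerd config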
mkdir -p /etc/containerd/config.d
cp /opt/gpu/10-nvidia-runtime.toml /etc/containerd/config.d/10-nvidia-runtime.toml
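# the udev rule and nvidia-ctk maintain /dev/char/<major>:<minor> symlinks for the nvidia device nodes,
# which container runtimes rely on to resolve GPU devices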
mkdir -p "$(dirname /lib/udev/rules.d/71-nvidia-dev-char.rules)"
cp /opt/gpu/71-nvidia-char-dev.rules /lib/udev/rules.d/71-nvidia-dev-char.rules
/usr/bin/nvidia-ctk system create-dev-char-symlinks --create-all
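# installation artifacts are no longer needed; remove the staging directory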
rm -r /opt/gpu