Skip to content

Commit 4ece056

Browse files
atalman and pytorchmergebot
authored and committed
Nccl update to 2.25.1 for cuda 12.4-12.8 (pytorch#146073)
Should resolve: pytorch#144768. We use one common NCCL version for CUDA builds 12.4-12.8: ``NCCL_VERSION=v2.25.1-1``. For CUDA 11.8 we use the legacy ``NCCL_VERSION=v2.21.5-1``. We use a pinned version of NCCL rather than a submodule. Move the NCCL location from ``third_party/nccl/nccl`` to ``third_party/nccl``. Pull Request resolved: pytorch#146073. Approved by: https://github.com/Skylion007, https://github.com/malfet, https://github.com/kwen2501, https://github.com/fduwjj
1 parent bd370c1 commit 4ece056

19 files changed

+137
-111
lines changed
+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
v2.21.5-1
+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
v2.25.1-1

.ci/docker/common/install_base.sh

+4
Original file line numberDiff line numberDiff line change
@@ -32,8 +32,12 @@ install_ubuntu() {
3232

3333
# HACK: UCC testing relies on libnccl library from NVIDIA repo, and version 2.16 crashes
3434
# See https://github.com/pytorch/pytorch/pull/105260#issuecomment-1673399729
35+
# TODO: Eliminate this hack; we should not rely on apt-get installation
36+
# See https://github.com/pytorch/pytorch/issues/144768
3537
if [[ "$UBUNTU_VERSION" == "20.04"* && "$CUDA_VERSION" == "11.8"* ]]; then
3638
maybe_libnccl_dev="libnccl2=2.15.5-1+cuda11.8 libnccl-dev=2.15.5-1+cuda11.8 --allow-downgrades --allow-change-held-packages"
39+
elif [[ "$UBUNTU_VERSION" == "20.04"* && "$CUDA_VERSION" == "12.4"* ]]; then
40+
maybe_libnccl_dev="libnccl2=2.25.1-1+cuda12.4 libnccl-dev=2.25.1-1+cuda12.4 --allow-downgrades --allow-change-held-packages"
3741
else
3842
maybe_libnccl_dev=""
3943
fi

.ci/docker/common/install_cuda.sh

+3-2
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
set -ex
44

5-
NCCL_VERSION=v2.21.5-1
5+
NCCL_VERSION=v2.25.1-1
66
CUDNN_VERSION=9.5.1.17
77

88
function install_cusparselt_040 {
@@ -40,6 +40,7 @@ function install_cusparselt_063 {
4040

4141
function install_118 {
4242
CUDNN_VERSION=9.1.0.70
43+
NCCL_VERSION=v2.21.5-1
4344
echo "Installing CUDA 11.8 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.4.0"
4445
rm -rf /usr/local/cuda-11.8 /usr/local/cuda
4546
# install CUDA 11.8.0 in the same container
@@ -288,4 +289,4 @@ do
288289
;;
289290
esac
290291
shift
291-
done
292+
done

.github/scripts/generate_binary_build_matrix.py

+21-33
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@
6969
"nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | "
7070
"nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | "
7171
"nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | "
72-
"nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
72+
"nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
7373
"nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | "
7474
"nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'"
7575
),
@@ -84,7 +84,7 @@
8484
"nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | "
8585
"nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | "
8686
"nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | "
87-
"nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
87+
"nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
8888
"nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | "
8989
"nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | "
9090
"nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'"
@@ -100,7 +100,7 @@
100100
"nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | "
101101
"nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | "
102102
"nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | "
103-
"nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
103+
"nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
104104
"nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | "
105105
"nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | "
106106
"nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'"
@@ -117,32 +117,6 @@
117117
}
118118

119119

120-
def get_nccl_submodule_version() -> str:
121-
from pathlib import Path
122-
123-
nccl_version_mk = (
124-
Path(__file__).absolute().parents[2]
125-
/ "third_party"
126-
/ "nccl"
127-
/ "nccl"
128-
/ "makefiles"
129-
/ "version.mk"
130-
)
131-
if not nccl_version_mk.exists():
132-
raise RuntimeError(
133-
"Please make sure that nccl submodule is checked out when importing this script"
134-
)
135-
with nccl_version_mk.open("r") as f:
136-
content = f.read()
137-
d = {}
138-
for l in content.split("\n"):
139-
if not l.startswith("NCCL_"):
140-
continue
141-
(k, v) = l.split(":=")
142-
d[k.strip()] = v.strip()
143-
return f"{d['NCCL_MAJOR']}.{d['NCCL_MINOR']}.{d['NCCL_PATCH']}"
144-
145-
146120
def get_nccl_wheel_version(arch_version: str) -> str:
147121
import re
148122

@@ -154,12 +128,26 @@ def get_nccl_wheel_version(arch_version: str) -> str:
154128
]
155129

156130

131+
def read_nccl_pin(arch_version: str) -> str:
132+
from pathlib import Path
133+
134+
nccl_pin_path = os.path.join(
135+
Path(__file__).absolute().parents[2],
136+
".ci",
137+
"docker",
138+
"ci_commit_pins",
139+
f"nccl-cu{arch_version[:2]}.txt",
140+
)
141+
with open(nccl_pin_path) as f:
142+
return f.read().strip()
143+
144+
157145
def validate_nccl_dep_consistency(arch_version: str) -> None:
146+
nccl_release_tag = read_nccl_pin(arch_version)
158147
wheel_ver = get_nccl_wheel_version(arch_version)
159-
submodule_ver = get_nccl_submodule_version()
160-
if wheel_ver != submodule_ver:
148+
if not nccl_release_tag.startswith(f"v{wheel_ver}"):
161149
raise RuntimeError(
162-
f"NCCL submodule version {submodule_ver} differs from wheel version {wheel_ver}"
150+
f"{arch_version} NCCL release tag version {nccl_release_tag} does not correspond to wheel version {wheel_ver}"
163151
)
164152

165153

@@ -356,7 +344,7 @@ def generate_wheels_matrix(
356344
else arch_version
357345
)
358346

359-
# TODO: Enable python 3.13t cpu-s390x or MacOS or Windows
347+
# TODO: Enable python 3.13t on cpu-s390x
360348
if gpu_arch_type == "cpu-s390x" and python_version == "3.13t":
361349
continue
362350

.github/workflows/build-manywheel-images.yml

+3-3
Original file line numberDiff line numberDiff line change
@@ -11,15 +11,15 @@ on:
1111
# Release candidate tags look like: v1.11.0-rc1
1212
- v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+
1313
paths:
14+
- '.ci/docker/common/*'
1415
- '.ci/docker/manywheel/*'
1516
- '.ci/docker/manywheel/build_scripts/*'
16-
- '.ci/docker/common/*'
1717
- .github/workflows/build-manywheel-images.yml
1818
pull_request:
1919
paths:
20+
- '.ci/docker/common/*'
2021
- '.ci/docker/manywheel/*'
2122
- '.ci/docker/manywheel/build_scripts/*'
22-
- '.ci/docker/common/*'
2323
- .github/workflows/build-manywheel-images.yml
2424

2525

@@ -442,4 +442,4 @@ jobs:
442442
max_attempts: 3
443443
retry_wait_seconds: 90
444444
command: |
445-
.ci/docker/manywheel/build.sh manylinux2_28-builder:xpu
445+
.ci/docker/manywheel/build.sh manylinux2_28-builder:xpu

0 commit comments

Comments
 (0)