Skip to content

Add OSS GPU tests #231

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 7 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
90 changes: 90 additions & 0 deletions .github/scripts/install_nvidia_utils_linux.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
#!/usr/bin/env bash

# Installs a pinned NVIDIA driver and the nvidia-docker2 package.
# Strict mode: stop on command failure, on use of an unset variable, and on a
# failure anywhere inside a pipeline.
set -o errexit -o nounset -o pipefail

# Distribution tag: ID immediately followed by VERSION_ID, both read from
# /etc/os-release inside a command substitution so they do not leak into this
# shell's environment.
DISTRIBUTION=$(
  . /etc/os-release
  echo $ID$VERSION_ID
)

# Driver release to install and the file name of its installer.
DRIVER_VERSION="515.57"
DRIVER_FN="NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run"

# Location of the nvidia-docker yum repo definition for this distribution.
YUM_REPO_URL="https://nvidia.github.io/nvidia-docker/${DISTRIBUTION}/nvidia-docker.repo"

# Register the nvidia-docker yum repo (YUM_REPO_URL), install nvidia-docker2,
# and restart the docker service.
#
# The function body is a subshell so that `set -x` tracing stays local to it.
install_nvidia_docker2_amzn2() (
  set -x

  # yum-utils is what provides yum-config-manager.
  sudo yum install -y yum-utils
  sudo yum-config-manager --add-repo "${YUM_REPO_URL}"

  sudo yum install -y nvidia-docker2
  sudo systemctl restart docker
)

# Install NVIDIA driver ${DRIVER_VERSION} unless exactly that version is
# already installed, then verify the result by running nvidia-smi.
#
# Reads globals: DRIVER_VERSION, DRIVER_FN.
# Exit status: non-zero if any install step fails, or if the final nvidia-smi
# check exits with a status other than 0 or 14.
install_nvidia_driver_amzn2() {
  (
    set -x

    # Purge any nvidia driver installed from RHEL repo
    sudo yum remove -y nvidia-driver-latest-dkms

    HAS_NVIDIA_DRIVER=0
    # Check if NVIDIA driver has already been installed
    if [ -x "$(command -v nvidia-smi)" ]; then
      # The driver exists, check its version next.
      #
      # --id=0 restricts the query to the first GPU: without it nvidia-smi
      # prints one line per GPU, and the multi-line value can never equal
      # DRIVER_VERSION, forcing a reinstall on every multi-GPU host.
      #
      # The query runs as the `if` condition so that a present-but-broken
      # nvidia-smi falls through to a reinstall instead of tripping `set -e`.
      if INSTALLED_DRIVER_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader --id=0); then
        if [ "$INSTALLED_DRIVER_VERSION" != "$DRIVER_VERSION" ]; then
          echo "NVIDIA driver ($INSTALLED_DRIVER_VERSION) has been installed, but we expect to have $DRIVER_VERSION instead. Continuing"
        else
          HAS_NVIDIA_DRIVER=1
          echo "NVIDIA driver ($INSTALLED_DRIVER_VERSION) has already been installed. Skipping NVIDIA driver installation"
        fi
      else
        echo "nvidia-smi is present but could not report a driver version. Continuing with the installation of $DRIVER_VERSION"
      fi
    fi

    if [ "$HAS_NVIDIA_DRIVER" -eq 0 ]; then
      sudo yum groupinstall -y "Development Tools"
      # ensure our kernel install is the same as our underlying kernel,
      # groupinstall "Development Tools" has a habit of mismatching kernel headers
      sudo yum install -y "kernel-devel-uname-r == $(uname -r)"
      sudo modprobe backlight
      # -S keeps curl quiet on success but prints the error if the download fails.
      sudo curl -fsSL -o /tmp/nvidia_driver "https://s3.amazonaws.com/ossci-linux/nvidia_driver/$DRIVER_FN"
      # On installer failure, dump its log and still fail the step.
      sudo /bin/bash /tmp/nvidia_driver -s --no-drm || (sudo cat /var/log/nvidia-installer.log && false)
      sudo rm -fv /tmp/nvidia_driver
    fi

    (
      # errexit is disabled here so the exit status can be inspected manually.
      set +e
      nvidia-smi
      status=$?
      # Allowable exit statuses for nvidia-smi, see: https://github.com/NVIDIA/gpu-operator/issues/285
      if [ $status -eq 0 ] || [ $status -eq 14 ]; then
        echo "INFO: Ignoring allowed status ${status}"
      else
        echo "ERROR: nvidia-smi exited with unresolved status ${status}"
        exit ${status}
      fi
    )
  )
}

# Entry point: install the driver first, then the container toolkit.
# Only Amazon Linux (DISTRIBUTION starting with "amzn") is supported; any other
# distribution is reported and the script exits with status 1.
echo "== Installing nvidia driver ${DRIVER_FN} =="
case "${DISTRIBUTION}" in
  amzn*)
    install_nvidia_driver_amzn2

    # Install container toolkit based on distribution
    echo "== Installing nvidia container toolkit for ${DISTRIBUTION} =="
    install_nvidia_docker2_amzn2
    ;;
  *)
    echo "ERROR: Unknown distribution ${DISTRIBUTION}"
    exit 1
    ;;
esac
20 changes: 17 additions & 3 deletions .github/workflows/runtime_tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,15 +13,27 @@ jobs:
matrix:
python-major-version: [3]
python-minor-version: [7,8,9,10]
platform: [ubuntu-18.04]
platform: [linux.4xlarge.nvidia.gpu]
fail-fast: false
runs-on: ${{ matrix.platform }}
steps:
- name: Checkout MultiPy
uses: actions/checkout@v2
with:
submodules: true

- name: Clean up previous CUDA driver installations
shell: bash
run: |
set -x
yum list installed | grep nvidia || true
yum list installed | grep cuda || true
sudo yum remove -y cuda || true
sudo yum remove -y cuda-drivers || true
sudo yum remove -y "*nvidia*" || true
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
run: |
bash .github/scripts/install_nvidia_utils_linux.sh || true
echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}"
- name: Setup SSH (Click me for login details)
uses: ./.github/actions/setup-ssh
with:
Expand All @@ -30,11 +42,13 @@ jobs:
- name: Build
env:
DOCKER_BUILDKIT: 1
run: docker build -t multipy --progress=plain --build-arg PYTHON_MAJOR_VERSION=${{ matrix.python-major-version }} --build-arg PYTHON_MINOR_VERSION=${{ matrix.python-minor-version }} .
run: nvidia-docker build -t multipy --progress=plain --build-arg PYTHON_MAJOR_VERSION=${{ matrix.python-major-version }} --build-arg PYTHON_MINOR_VERSION=${{ matrix.python-minor-version }} --build-arg BUILD_CUDA_TESTS=1 .

- name: Test
run: |
docker run --rm multipy bash -c "if [[ ${{ matrix.python-minor-version }} -lt 8 ]]; then source ~/venvs/multipy/bin/activate; fi && multipy/runtime/build/test_deploy"
nvidia-docker run --rm multipy bash -c "if [[ ${{ matrix.python-minor-version }} -lt 8 ]]; then source ~/venvs/multipy/bin/activate; fi && multipy/runtime/build/test_deploy_gpu"


- name: Examples
run: |
Expand Down
27 changes: 14 additions & 13 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
ARG BASE_IMAGE=nvidia/cuda:11.3.1-devel-ubuntu18.04
ARG BASE_IMAGE=nvidia/cuda:11.6.1-devel-ubuntu18.04

FROM ${BASE_IMAGE} as dev-base

Expand Down Expand Up @@ -59,13 +59,17 @@ COPY .git .git
COPY .gitmodules .gitmodules
COPY multipy multipy
COPY compat-requirements.txt compat-requirements.txt
COPY setup.py setup.py
COPY README.md README.md
COPY dev-requirements.txt dev-requirements.txt

RUN git submodule update --init --recursive --jobs 0

# Install conda/pyenv + necessary python dependencies
FROM dev-base as conda-pyenv
ARG PYTHON_MAJOR_VERSION=3
ARG PYTHON_MINOR_VERSION=8
ARG BUILD_CUDA_TESTS=0
ENV PYTHON_MINOR_VERSION=${PYTHON_MINOR_VERSION}
ENV PYTHON_VERSION=${PYTHON_MAJOR_VERSION}.${PYTHON_MINOR_VERSION}
RUN if [[ ${PYTHON_MINOR_VERSION} -gt 7 ]]; then \
Expand All @@ -75,7 +79,7 @@ RUN if [[ ${PYTHON_MINOR_VERSION} -gt 7 ]]; then \
rm ~/miniconda.sh && \
/opt/conda/bin/conda install -y python=${PYTHON_VERSION} mkl mkl-include conda-build pyyaml numpy ipython && \
/opt/conda/bin/conda install -y -c conda-forge libpython-static=${PYTHON_VERSION} && \
/opt/conda/bin/conda install -y pytorch torchvision torchaudio cudatoolkit=11.3 -c pytorch-nightly && \
/opt/conda/bin/conda install -y pytorch torchvision torchaudio pytorch-cuda=11.6 -c pytorch-nightly -c nvidia && \
/opt/conda/bin/conda clean -ya; \
else \
pip3 install virtualenv && \
Expand All @@ -84,29 +88,26 @@ RUN if [[ ${PYTHON_MINOR_VERSION} -gt 7 ]]; then \
~/.pyenv/bin/pyenv install --force 3.7.10 && \
virtualenv -p ~/.pyenv/versions/3.7.10/bin/python3 ~/venvs/multipy && \
source ~/venvs/multipy/bin/activate && \
pip3 install --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/cu113; \
pip3 install --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/cu116; \
fi

# Build/Install pytorch with post-cxx11 ABI
FROM conda-pyenv as build
WORKDIR /opt/multipy/multipy/runtime/third-party/pytorch
COPY --from=conda-pyenv /opt/conda* /opt/conda
COPY --from=submodule-update /opt/multipy /opt/multipy

WORKDIR /opt/multipy

# Build Multipy
RUN rm -r multipy/runtime/build; mkdir multipy/runtime/build && \
cd multipy/runtime/build && \
RUN ls && pwd && rm -rf multipy/runtime/build && \
if [[ ${PYTHON_MINOR_VERSION} -lt 8 ]]; then \
source ~/venvs/multipy/bin/activate && \
cmake -DLEGACY_PYTHON_PRE_3_8=ON ..; \
source ~/venvs/multipy/bin/activate; \
fi && \
if [[ ${BUILD_CUDA_TESTS} -eq 1 ]]; then \
python -m pip install -e . --install-option="--cudatests"; \
else \
cmake -DLEGACY_PYTHON_PRE_3_8=OFF ..; \
python -m pip install -e .; \
fi && \
cmake --build . --config Release -j && \
cmake --install . --prefix "." && \
cd ../example && python generate_examples.py
python multipy/runtime/example/generate_examples.py

# Build examples
COPY examples examples
Expand Down
19 changes: 15 additions & 4 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,11 @@ def get_cmake_version():


class MultipyRuntimeCmake(object):
user_options = [("cmakeoff", None, None), ("abicxx", None, None)]
user_options = [
("cmakeoff", None, None),
("cudatests", None, None),
("abicxx", None, None),
]


class MultipyRuntimeDevelop(MultipyRuntimeCmake, develop):
Expand All @@ -41,24 +45,29 @@ def initialize_options(self):
# TODO(tristanr): remove once unused
self.abicxx = None

self.cudatests = None

def finalize_options(self):
develop.finalize_options(self)
if self.cmakeoff is not None:
self.distribution.get_command_obj("build_ext").cmake_off = True
if self.cudatests is not None:
self.distribution.get_command_obj("build_ext").cuda_tests_flag = "ON"


class MultipyRuntimeBuild(MultipyRuntimeCmake, build_ext):
user_options = build_ext.user_options + MultipyRuntimeCmake.user_options
cmake_off = False
cuda_tests_flag = "OFF"

def run(self):
if self.cmake_off:
return
try:
cmake_version_comps = get_cmake_version().split(".")
if cmake_version_comps[0] < "3" or cmake_version_comps[1] < "19":
if cmake_version_comps[0] < "3" or cmake_version_comps[1] < "12":
raise RuntimeError(
"CMake 3.19 or later required for multipy runtime installation."
"CMake 3.12 or later required for multipy runtime installation."
)
except OSError:
raise RuntimeError(
Expand All @@ -74,7 +83,9 @@ def run(self):
print(f"-- Running multipy runtime makefile in dir {build_dir_abs}")
try:
subprocess.run(
[f"cmake -DLEGACY_PYTHON_PRE_3_8={legacy_python_cmake_flag} .."],
[
f"cmake -DBUILD_CUDA_TESTS={self.cuda_tests_flag} -DLEGACY_PYTHON_PRE_3_8={legacy_python_cmake_flag} .."
],
cwd=build_dir_abs,
shell=True,
check=True,
Expand Down