Commit e772ffc

narendasan (Naren Dasan) and Naren Dasan authored
fix: Fix the CUDAGraphs C++ runtime implementation (#3067)
Signed-off-by: Naren Dasan <[email protected]>
Signed-off-by: Naren Dasan <[email protected]>
Co-authored-by: Naren Dasan <[email protected]>
1 parent 03092ba · commit e772ffc

28 files changed, +930 −569 lines changed

.github/workflows/build-test-linux.yml

+45 −15
@@ -8,9 +8,9 @@ on:
       - nightly
       - release/*
     tags:
-      # NOTE: Binary build pipelines should only get triggered on release candidate builds
-      # Release candidate tags look like: v1.11.0-rc1
-      - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+
+      # NOTE: Binary build pipelines should only get triggered on release candidate builds
+      # Release candidate tags look like: v1.11.0-rc1
+      - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+
   workflow_dispatch:

 jobs:
@@ -84,9 +84,9 @@ jobs:
         popd
         pushd .
         cd tests/py/ts
-        python -m pytest --junitxml=${RUNNER_TEST_RESULTS_DIR}/ts_api_test_results.xml api/
-        python -m pytest --junitxml=${RUNNER_TEST_RESULTS_DIR}/ts_models_test_results.xml models/
-        python -m pytest --junitxml=${RUNNER_TEST_RESULTS_DIR}/ts_integrations_test_results.xml integrations/
+        python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/ts_api_test_results.xml api/
+        python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/ts_models_test_results.xml models/
+        python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/ts_integrations_test_results.xml integrations/
         popd

   tests-py-dynamo-converters:
@@ -114,7 +114,7 @@ jobs:
         export USE_HOST_DEPS=1
         pushd .
         cd tests/py/dynamo
-        python -m pytest --junitxml=${RUNNER_TEST_RESULTS_DIR}/dynamo_converters_test_results.xml -n 10 conversion/
+        python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dynamo_converters_test_results.xml -n 8 conversion/
         popd

   tests-py-dynamo-fe:
@@ -170,7 +170,7 @@ jobs:
         export USE_HOST_DEPS=1
         pushd .
         cd tests/py/dynamo
-        python -m pytest --junitxml=${RUNNER_TEST_RESULTS_DIR}/export_serde_test_results.xml --ir dynamo models/test_export_serde.py
+        python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/export_serde_test_results.xml --ir dynamo models/test_export_serde.py
         popd

   tests-py-torch-compile-be:
@@ -198,9 +198,9 @@ jobs:
         export USE_HOST_DEPS=1
         pushd .
         cd tests/py/dynamo
-        python -m pytest -n 10 --junitxml=${RUNNER_TEST_RESULTS_DIR}/torch_compile_be_test_results.xml backend/
-        python -m pytest -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/torch_complete_be_e2e_test_results.xml --ir torch_compile models/test_models.py
-        python -m pytest --junitxml=${RUNNER_TEST_RESULTS_DIR}/torch_compile_dyn_models_export.xml --ir torch_compile models/test_dyn_models.py
+        python -m pytest -ra -n 10 --junitxml=${RUNNER_TEST_RESULTS_DIR}/torch_compile_be_test_results.xml backend/
+        python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/torch_complete_be_e2e_test_results.xml --ir torch_compile models/test_models.py
+        python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/torch_compile_dyn_models_export.xml --ir torch_compile models/test_dyn_models.py
         popd

   tests-py-dynamo-core:
@@ -228,9 +228,39 @@ jobs:
         export USE_HOST_DEPS=1
         pushd .
         cd tests/py/dynamo
-        python -m pytest -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_runtime_test_results.xml runtime/
-        python -m pytest -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_partitioning_test_results.xml partitioning/
-        python -m pytest -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_lowering_test_results.xml lowering/
+        python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_runtime_test_results.xml --ignore runtime/test_002_cudagraphs_py.py --ignore runtime/test_002_cudagraphs_cpp.py runtime/
+        python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_partitioning_test_results.xml partitioning/
+        python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_lowering_test_results.xml lowering/
+        popd
+
+  tests-py-dynamo-cudagraphs:
+    name: Test dynamo cudagraphs [Python]
+    needs: [generate-matrix, build]
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - repository: pytorch/tensorrt
+            package-name: torch_tensorrt
+            pre-script: packaging/pre_build_script.sh
+            post-script: packaging/post_build_script.sh
+            smoke-test-script: packaging/smoke_test_script.sh
+    uses: ./.github/workflows/linux-test.yml
+    with:
+      job-name: tests-py-dynamo-cudagraphs
+      repository: "pytorch/tensorrt"
+      ref: ""
+      test-infra-repository: pytorch/test-infra
+      test-infra-ref: main
+      build-matrix: ${{ needs.generate-matrix.outputs.matrix }}
+      pre-script: ${{ matrix.pre-script }}
+      script: |
+        export USE_HOST_DEPS=1
+        pushd .
+        cd tests/py/dynamo
+        nvidia-smi
+        python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_runtime_cudagraphs_cpp_test_results.xml runtime/test_002_cudagraphs_cpp.py || true
+        python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_runtime_cudagraphs_py_test_results.xml runtime/test_002_cudagraphs_py.py || true
         popd

   tests-py-core:
@@ -258,7 +288,7 @@ jobs:
         export USE_HOST_DEPS=1
         pushd .
         cd tests/py/core
-        python -m pytest -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_core_test_results.xml .
+        python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_core_test_results.xml .
         popd

 concurrency:

.github/workflows/build-test-windows.yml

+38 −9
@@ -8,9 +8,9 @@ on:
       - nightly
       - release/*
     tags:
-      # NOTE: Binary build pipelines should only get triggered on release candidate builds
-      # Release candidate tags look like: v1.11.0-rc1
-      - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+
+      # NOTE: Binary build pipelines should only get triggered on release candidate builds
+      # Release candidate tags look like: v1.11.0-rc1
+      - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+
   workflow_dispatch:

 jobs:
@@ -191,8 +191,8 @@ jobs:
         export USE_HOST_DEPS=1
         pushd .
         cd tests/py/dynamo
-        python -m pytest -n 10 --junitxml=${RUNNER_TEST_RESULTS_DIR}/torch_compile_be_test_results.xml backend/
-        python -m pytest -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/torch_complete_be_e2e_test_results.xml --ir torch_compile models/test_models.py
+        python -m pytest -ra -n 10 --junitxml=${RUNNER_TEST_RESULTS_DIR}/torch_compile_be_test_results.xml backend/
+        python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/torch_complete_be_e2e_test_results.xml --ir torch_compile models/test_models.py
         python -m pytest --junitxml=${RUNNER_TEST_RESULTS_DIR}/torch_compile_dyn_models_export.xml --ir torch_compile models/test_dyn_models.py
         popd

@@ -218,9 +218,38 @@ jobs:
         export USE_HOST_DEPS=1
         pushd .
         cd tests/py/dynamo
-        python -m pytest -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_runtime_test_results.xml runtime/
-        python -m pytest -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_partitioning_test_results.xml partitioning/
-        python -m pytest -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_lowering_test_results.xml lowering/
+        python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_runtime_test_results.xml --ignore runtime/test_002_cudagraphs_py.py --ignore runtime/test_002_cudagraphs_cpp.py runtime/
+        python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_partitioning_test_results.xml partitioning/
+        python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_lowering_test_results.xml lowering/
+        popd
+
+  tests-py-dynamo-cudagraphs:
+    name: Test dynamo cudagraphs [Python]
+    needs: [generate-matrix, build]
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - repository: pytorch/tensorrt
+            package-name: torch_tensorrt
+            pre-script: packaging/pre_build_script.sh
+            post-script: packaging/post_build_script.sh
+            smoke-test-script: packaging/smoke_test_script.sh
+    uses: ./.github/workflows/linux-test.yml
+    with:
+      job-name: tests-py-dynamo-cudagraphs
+      repository: "pytorch/tensorrt"
+      ref: ""
+      test-infra-repository: pytorch/test-infra
+      test-infra-ref: main
+      build-matrix: ${{ needs.generate-matrix.outputs.matrix }}
+      pre-script: ${{ matrix.pre-script }}
+      script: |
+        export USE_HOST_DEPS=1
+        pushd .
+        cd tests/py/dynamo
+        python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_runtime_cudagraphs_cpp_test_results.xml runtime/test_002_cudagraphs_cpp.py
+        python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_runtime_cudagraphs_py_test_results.xml runtime/test_002_cudagraphs_py.py
         popd

   tests-py-core:
@@ -245,7 +274,7 @@ jobs:
         export USE_HOST_DEPS=1
         pushd .
         cd tests/py/core
-        python -m pytest -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_core_test_results.xml .
+        python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_core_test_results.xml .
         popd

 concurrency:

core/conversion/var/Var.cpp

+2 −2
@@ -153,7 +153,7 @@ bool Var::isITensorList() {
   // Unpack the Var as a List and check if each entry is a custom class since
   // ITensors are stored in CustomClassHolder
   auto ival_list = ptr_.ivalue->toList();
-  for (int i = 0; i < ival_list.size(); i++) {
+  for (size_t i = 0; i < ival_list.size(); i++) {
     if (!ival_list.get(i).isCustomClass()) {
       return false;
     }
@@ -167,7 +167,7 @@ std::vector<nvinfer1::ITensor*> Var::unwrapToITensorList() {
   TORCHTRT_CHECK(isITensorList(), "Expected IValue to be an ITensorList");
   auto ivalue_list = ptr_.ivalue->toList();
   std::vector<nvinfer1::ITensor*> outputs;
-  for (int i = 0; i < ivalue_list.size(); i++) {
+  for (size_t i = 0; i < ivalue_list.size(); i++) {
     auto element = ivalue_list.get(i).toCustomClass<TensorContainer>()->tensor();
     outputs.push_back(std::move(element));
   }
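
For context on the loop-index change: the list's size() returns an unsigned type, so comparing it against an int index raises -Wsign-compare and can fail builds that treat warnings as errors. A standalone illustration of the pattern (plain std::vector here, not the Torch-TensorRT c10::List code):

// Standalone illustration (not Torch-TensorRT code): size() returns an unsigned
// type, so an int loop index trips -Wsign-compare, which fails -Werror builds.
#include <cstddef>
#include <vector>

int count_nonzero_entries(const std::vector<int>& entries) {
  int matches = 0;
  // for (int i = 0; i < entries.size(); i++)    // warning: comparison of int and size_t
  for (size_t i = 0; i < entries.size(); i++) {  // index type matches the container's size_t
    if (entries[i] != 0) {
      matches++;
    }
  }
  return matches;
}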

core/runtime/TRTEngine.cpp

+2 −9
@@ -71,15 +71,6 @@ TRTEngine::TRTEngine(
   multi_gpu_device_check();
   set_rt_device(device_info);

-  // Set active stream to non-default stream
-  auto current_stream = c10::cuda::getCurrentCUDAStream(device_info.id);
-  if (current_stream == c10::cuda::getDefaultCUDAStream(device_info.id)) {
-    active_stream = c10::cuda::getStreamFromPool(false, device_info.id);
-    c10::cuda::setCurrentCUDAStream(active_stream);
-  } else {
-    active_stream = current_stream;
-  }
-
   rt = make_trt(nvinfer1::createInferRuntime(util::logging::get_logger()));

   name = slugify(mod_name);
@@ -205,6 +196,7 @@ TRTEngine::TRTEngine(
 }

 TRTEngine::~TRTEngine() {
+  cudagraph.reset();
   trt_engine_profiler.reset();
   exec_ctx.reset();
   cuda_engine.reset();
@@ -253,6 +245,7 @@ void TRTEngine::set_profiling_paths() {
   enqueue_profile_path = std::filesystem::path{profile_path_prefix + "/" + name + "_enqueue_profile.trace"}.string();
   trt_engine_profile_path =
       std::filesystem::path{profile_path_prefix + "/" + name + "_engine_exectuion_profile.trace"}.string();
+  cuda_graph_debug_path = std::filesystem::path{profile_path_prefix + "/" + name + "_cudagraph.dot"}.string();
 }

 std::string TRTEngine::to_str() const {
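
With this change the constructor no longer pins an active stream at engine-construction time; the caller_stream/engine_stream pair added in TRTEngine.h below moves stream selection into the execution path. A minimal sketch of that hand-off pattern, assuming a simplified StreamState struct and run_on_engine_stream helper (illustrative names, not the runtime's actual execution code):

// Sketch only: illustrates the caller_stream / engine_stream hand-off implied by
// this commit, not the actual Torch-TensorRT execute_engine implementation.
#include <ATen/cuda/CUDAEvent.h>
#include <c10/cuda/CUDAStream.h>

struct StreamState {
  // Mirrors the two fields added to TRTEngine in TRTEngine.h below.
  c10::cuda::CUDAStream caller_stream = c10::cuda::getDefaultCUDAStream();
  c10::cuda::CUDAStream engine_stream = c10::cuda::getDefaultCUDAStream();
};

void run_on_engine_stream(StreamState& s, c10::DeviceIndex device_id) {
  // Remember whatever stream the caller is currently using.
  s.caller_stream = c10::cuda::getCurrentCUDAStream(device_id);

  // Enqueue TensorRT work (and CUDA graph capture/replay) on a dedicated
  // non-default stream; capture is not allowed on the default stream.
  if (s.engine_stream == c10::cuda::getDefaultCUDAStream(device_id)) {
    s.engine_stream = c10::cuda::getStreamFromPool(/*isHighPriority=*/false, device_id);
  }

  // Order the engine stream after work already enqueued by the caller.
  at::cuda::CUDAEvent caller_ready;
  caller_ready.record(s.caller_stream);
  caller_ready.block(s.engine_stream);

  // ... enqueue engine execution / graph replay on s.engine_stream here ...

  // Hand control back: the caller's stream waits for the engine stream.
  at::cuda::CUDAEvent engine_done;
  engine_done.record(s.engine_stream);
  engine_done.block(s.caller_stream);
}

Recording an event on the caller's stream and blocking the engine stream on it (and the reverse on the way back) keeps the two streams ordered without routing work through the default stream.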

core/runtime/TRTEngine.h

+3 −1
@@ -70,7 +70,8 @@ struct TRTEngine : torch::CustomClassHolder {

   // CUDAGraph-Related Functionality
   at::cuda::CUDAGraph cudagraph = {};
-  at::cuda::CUDAStream active_stream = c10::cuda::getDefaultCUDAStream();
+  at::cuda::CUDAStream engine_stream = c10::cuda::getDefaultCUDAStream();
+  at::cuda::CUDAStream caller_stream = c10::cuda::getDefaultCUDAStream();
   std::vector<at::Tensor> input_buffers = {};
   std::vector<at::Tensor> output_buffers = {};
   std::string shape_key;
@@ -89,6 +90,7 @@
   std::string output_profile_path;
   std::string enqueue_profile_path;
   std::string trt_engine_profile_path;
+  std::string cuda_graph_debug_path;
   std::mutex mu;
   std::unique_ptr<TRTEngineProfiler> trt_engine_profiler;
 };
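
The new cuda_graph_debug_path slots in next to the other profiling paths in TRTEngine.cpp above and gives the captured graph a place to be written out as a Graphviz .dot file. A hedged sketch of how such a dump can be produced with ATen's CUDAGraph API (capture_and_dump and enqueue_trt_work are illustrative stand-ins, not Torch-TensorRT functions):

// Sketch only: one way the new cuda_graph_debug_path can be used; the function
// and its arguments are illustrative, not the Torch-TensorRT runtime itself.
#include <ATen/cuda/CUDAGraph.h>
#include <c10/cuda/CUDAGuard.h>
#include <c10/cuda/CUDAStream.h>
#include <functional>
#include <string>

void capture_and_dump(const std::function<void()>& enqueue_trt_work, const std::string& cuda_graph_debug_path) {
  at::cuda::CUDAGraph cudagraph;
  cudagraph.enable_debug_mode();  // must be on before capture so the graph can be dumped later

  // CUDA graph capture has to happen on a non-default stream.
  c10::cuda::CUDAStream capture_stream = c10::cuda::getStreamFromPool();
  c10::cuda::CUDAStreamGuard guard(capture_stream);

  cudagraph.capture_begin();
  enqueue_trt_work();  // enqueue the TensorRT engine's kernels on capture_stream
  cudagraph.capture_end();

  cudagraph.replay();                           // launch the recorded graph
  cudagraph.debug_dump(cuda_graph_debug_path);  // e.g. "<profile_dir>/<name>_cudagraph.dot"
}

debug_dump expects enable_debug_mode() to have been called before capture so the underlying graph is retained for printing rather than released after instantiation.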
