forked from NVIDIA/nccl
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdebug.cc
209 lines (193 loc) · 7.23 KB
/
debug.cc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
/*************************************************************************
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "core.h"
#include "nccl_net.h"
#include <stdlib.h>
#include <stdarg.h>
#include <sys/syscall.h>
int ncclDebugLevel = -1;
static int pid = -1;
static char hostname[1024];
thread_local int ncclDebugNoWarn = 0;
char ncclLastError[1024] = ""; // Global string for the last error in human readable form
uint64_t ncclDebugMask = NCCL_INIT|NCCL_ENV; // Default debug sub-system mask is INIT and ENV
FILE *ncclDebugFile = stdout;
pthread_mutex_t ncclDebugLock = PTHREAD_MUTEX_INITIALIZER;
std::chrono::steady_clock::time_point ncclEpoch;
static __thread int tid = -1;
void ncclDebugInit() {
pthread_mutex_lock(&ncclDebugLock);
if (ncclDebugLevel != -1) { pthread_mutex_unlock(&ncclDebugLock); return; }
const char* nccl_debug = getenv("NCCL_DEBUG");
int tempNcclDebugLevel = -1;
if (nccl_debug == NULL) {
tempNcclDebugLevel = NCCL_LOG_NONE;
} else if (strcasecmp(nccl_debug, "VERSION") == 0) {
tempNcclDebugLevel = NCCL_LOG_VERSION;
} else if (strcasecmp(nccl_debug, "WARN") == 0) {
tempNcclDebugLevel = NCCL_LOG_WARN;
} else if (strcasecmp(nccl_debug, "INFO") == 0) {
tempNcclDebugLevel = NCCL_LOG_INFO;
} else if (strcasecmp(nccl_debug, "ABORT") == 0) {
tempNcclDebugLevel = NCCL_LOG_ABORT;
} else if (strcasecmp(nccl_debug, "TRACE") == 0) {
tempNcclDebugLevel = NCCL_LOG_TRACE;
}
/* Parse the NCCL_DEBUG_SUBSYS env var
* This can be a comma separated list such as INIT,COLL
* or ^INIT,COLL etc
*/
char* ncclDebugSubsysEnv = getenv("NCCL_DEBUG_SUBSYS");
if (ncclDebugSubsysEnv != NULL) {
int invert = 0;
if (ncclDebugSubsysEnv[0] == '^') { invert = 1; ncclDebugSubsysEnv++; }
ncclDebugMask = invert ? ~0ULL : 0ULL;
char *ncclDebugSubsys = strdup(ncclDebugSubsysEnv);
char *subsys = strtok(ncclDebugSubsys, ",");
while (subsys != NULL) {
uint64_t mask = 0;
if (strcasecmp(subsys, "INIT") == 0) {
mask = NCCL_INIT;
} else if (strcasecmp(subsys, "COLL") == 0) {
mask = NCCL_COLL;
} else if (strcasecmp(subsys, "P2P") == 0) {
mask = NCCL_P2P;
} else if (strcasecmp(subsys, "SHM") == 0) {
mask = NCCL_SHM;
} else if (strcasecmp(subsys, "NET") == 0) {
mask = NCCL_NET;
} else if (strcasecmp(subsys, "GRAPH") == 0) {
mask = NCCL_GRAPH;
} else if (strcasecmp(subsys, "TUNING") == 0) {
mask = NCCL_TUNING;
} else if (strcasecmp(subsys, "ENV") == 0) {
mask = NCCL_ENV;
} else if (strcasecmp(subsys, "ALLOC") == 0) {
mask = NCCL_ALLOC;
} else if (strcasecmp(subsys, "CALL") == 0) {
mask = NCCL_CALL;
} else if (strcasecmp(subsys, "PROXY") == 0) {
mask = NCCL_PROXY;
} else if (strcasecmp(subsys, "NVLS") == 0) {
mask = NCCL_NVLS;
} else if (strcasecmp(subsys, "ALL") == 0) {
mask = NCCL_ALL;
}
if (mask) {
if (invert) ncclDebugMask &= ~mask; else ncclDebugMask |= mask;
}
subsys = strtok(NULL, ",");
}
free(ncclDebugSubsys);
}
// Cache pid and hostname
getHostName(hostname, 1024, '.');
pid = getpid();
/* Parse and expand the NCCL_DEBUG_FILE path and
* then create the debug file. But don't bother unless the
* NCCL_DEBUG level is > VERSION
*/
const char* ncclDebugFileEnv = getenv("NCCL_DEBUG_FILE");
if (tempNcclDebugLevel > NCCL_LOG_VERSION && ncclDebugFileEnv != NULL) {
int c = 0;
char debugFn[PATH_MAX+1] = "";
char *dfn = debugFn;
while (ncclDebugFileEnv[c] != '\0' && c < PATH_MAX) {
if (ncclDebugFileEnv[c++] != '%') {
*dfn++ = ncclDebugFileEnv[c-1];
continue;
}
switch (ncclDebugFileEnv[c++]) {
case '%': // Double %
*dfn++ = '%';
break;
case 'h': // %h = hostname
dfn += snprintf(dfn, PATH_MAX, "%s", hostname);
break;
case 'p': // %p = pid
dfn += snprintf(dfn, PATH_MAX, "%d", pid);
break;
default: // Echo everything we don't understand
*dfn++ = '%';
*dfn++ = ncclDebugFileEnv[c-1];
break;
}
}
*dfn = '\0';
if (debugFn[0] != '\0') {
FILE *file = fopen(debugFn, "w");
if (file != nullptr) {
setbuf(file, nullptr); // disable buffering
ncclDebugFile = file;
}
}
}
ncclEpoch = std::chrono::steady_clock::now();
__atomic_store_n(&ncclDebugLevel, tempNcclDebugLevel, __ATOMIC_RELEASE);
pthread_mutex_unlock(&ncclDebugLock);
}
/* Common logging function used by the INFO, WARN and TRACE macros
* Also exported to the dynamically loadable Net transport modules so
* they can share the debugging mechanisms and output files
*/
void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...) {
if (__atomic_load_n(&ncclDebugLevel, __ATOMIC_ACQUIRE) == -1) ncclDebugInit();
if (ncclDebugNoWarn != 0 && level == NCCL_LOG_WARN) { level = NCCL_LOG_INFO; flags = ncclDebugNoWarn; }
// Save the last error (WARN) as a human readable string
if (level == NCCL_LOG_WARN) {
pthread_mutex_lock(&ncclDebugLock);
va_list vargs;
va_start(vargs, fmt);
(void) vsnprintf(ncclLastError, sizeof(ncclLastError), fmt, vargs);
va_end(vargs);
pthread_mutex_unlock(&ncclDebugLock);
}
if (ncclDebugLevel < level || ((flags & ncclDebugMask) == 0)) return;
if (tid == -1) {
tid = syscall(SYS_gettid);
}
int cudaDev;
if (!(level == NCCL_LOG_TRACE && flags == NCCL_CALL)) {
cudaGetDevice(&cudaDev);
}
char buffer[1024];
size_t len = 0;
if (level == NCCL_LOG_WARN) {
len = snprintf(buffer, sizeof(buffer), "\n%s:%d:%d [%d] %s:%d NCCL WARN ",
hostname, pid, tid, cudaDev, filefunc, line);
} else if (level == NCCL_LOG_INFO) {
len = snprintf(buffer, sizeof(buffer), "%s:%d:%d [%d] NCCL INFO ", hostname, pid, tid, cudaDev);
} else if (level == NCCL_LOG_TRACE && flags == NCCL_CALL) {
len = snprintf(buffer, sizeof(buffer), "%s:%d:%d NCCL CALL ", hostname, pid, tid);
} else if (level == NCCL_LOG_TRACE) {
auto delta = std::chrono::steady_clock::now() - ncclEpoch;
double timestamp = std::chrono::duration_cast<std::chrono::duration<double>>(delta).count()*1000;
len = snprintf(buffer, sizeof(buffer), "%s:%d:%d [%d] %f %s:%d NCCL TRACE ",
hostname, pid, tid, cudaDev, timestamp, filefunc, line);
}
if (len) {
va_list vargs;
va_start(vargs, fmt);
len += vsnprintf(buffer+len, sizeof(buffer)-len, fmt, vargs);
va_end(vargs);
buffer[len++] = '\n';
fwrite(buffer, 1, len, ncclDebugFile);
}
}
NCCL_PARAM(SetThreadName, "SET_THREAD_NAME", 0);
void ncclSetThreadName(pthread_t thread, const char *fmt, ...) {
// pthread_setname_np is nonstandard GNU extension
// needs the following feature test macro
#ifdef _GNU_SOURCE
if (ncclParamSetThreadName() != 1) return;
char threadName[NCCL_THREAD_NAMELEN];
va_list vargs;
va_start(vargs, fmt);
vsnprintf(threadName, NCCL_THREAD_NAMELEN, fmt, vargs);
va_end(vargs);
pthread_setname_np(thread, threadName);
#endif
}