Skip to content

Commit 9578bb1

Browse files
authored
TCP Debug Logging
Differential Revision: D70682977 Pull Request resolved: #415
1 parent 1dbfa3d commit 9578bb1

10 files changed

+113
-10
lines changed

gloo/test/tcp_test.cc

+1-1
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ TEST(TcpTest, ConnectTimeout) {
2525
EXPECT_TRUE(e);
2626
EXPECT_TRUE(dynamic_cast<const TimeoutError*>(&e));
2727
};
28-
connectLoop(loop, remote, timeout, std::move(fn));
28+
connectLoop(loop, remote, 0, 5, timeout, std::move(fn));
2929

3030
std::unique_lock<std::mutex> lock(m);
3131
cv.wait(lock, [&] { return done; });

gloo/transport/tcp/CMakeLists.txt

+3
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ else()
55
"${CMAKE_CURRENT_SOURCE_DIR}/address.cc"
66
"${CMAKE_CURRENT_SOURCE_DIR}/buffer.cc"
77
"${CMAKE_CURRENT_SOURCE_DIR}/context.cc"
8+
"${CMAKE_CURRENT_SOURCE_DIR}/debug_logger.cc"
89
"${CMAKE_CURRENT_SOURCE_DIR}/device.cc"
910
"${CMAKE_CURRENT_SOURCE_DIR}/error.cc"
1011
"${CMAKE_CURRENT_SOURCE_DIR}/helpers.cc"
@@ -19,6 +20,8 @@ else()
1920
"${CMAKE_CURRENT_SOURCE_DIR}/attr.h"
2021
"${CMAKE_CURRENT_SOURCE_DIR}/buffer.h"
2122
"${CMAKE_CURRENT_SOURCE_DIR}/context.h"
23+
"${CMAKE_CURRENT_SOURCE_DIR}/debug_data.h"
24+
"${CMAKE_CURRENT_SOURCE_DIR}/debug_logger.h"
2225
"${CMAKE_CURRENT_SOURCE_DIR}/device.h"
2326
"${CMAKE_CURRENT_SOURCE_DIR}/error.h"
2427
"${CMAKE_CURRENT_SOURCE_DIR}/helpers.h"

gloo/transport/tcp/debug_data.h

+23
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
2+
3+
#include <string>
4+
#pragma once
5+
6+
namespace gloo {
7+
namespace transport {
8+
namespace tcp {
9+
10+
// Immutable snapshot describing one failed TCP connect attempt. It is filled
// in by ConnectOperation (helpers.h) and consumed by DebugLogger::log.
// All members are const, so an instance can only be populated at
// construction (the caller uses aggregate initialization, in member order).
struct ConnectDebugData {
11+
const int retryCount; // caller's retry counter at the time of the failure
12+
const int retryLimit; // caller's maximum number of retries
13+
const bool willRetry; // true when the caller is about to attempt another connect
14+
const int glooRank; // rank forwarded from Pair::connect (context_->rank)
15+
const int glooSize; // size forwarded from Pair::connect (context_->size)
16+
const std::string error; // text of the error that ended this attempt
17+
const std::string remote; // string form of the remote address
18+
const std::string local; // string form of the local socket name
19+
};
20+
21+
} // namespace tcp
22+
} // namespace transport
23+
} // namespace gloo

gloo/transport/tcp/debug_logger.cc

+30
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
#include <gloo/common/logging.h>
2+
#include <gloo/transport/tcp/debug_logger.h>
3+
4+
namespace gloo {
5+
namespace transport {
6+
namespace tcp {
7+
8+
// Reports a failed connect attempt: forwards every field of `data` to a
// single GLOO_ERROR invocation as alternating "key=" labels and values.
// The message starts with "failed to connect" and carries, in order:
// willRetry, retry, retryLimit, rank, size, local, remote, error.
void DebugLogger::log(const ConnectDebugData& data) {
9+
GLOO_ERROR(
10+
"failed to connect, willRetry=",
11+
data.willRetry,
12+
", retry=",
13+
data.retryCount,
14+
", retryLimit=",
15+
data.retryLimit,
16+
", rank=",
17+
data.glooRank,
18+
", size=",
19+
data.glooSize,
20+
", local=",
21+
data.local,
22+
", remote=",
23+
data.remote,
24+
", error=",
25+
data.error);
26+
}
27+
28+
} // namespace tcp
29+
} // namespace transport
30+
} // namespace gloo

gloo/transport/tcp/debug_logger.h

+20
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
2+
3+
#pragma once
4+
5+
#include <gloo/transport/tcp/debug_data.h>
6+
7+
namespace gloo {
8+
namespace transport {
9+
namespace tcp {
10+
11+
// Entry point for TCP transport debug logging. The class only exposes a
// static function and declares no data members.
class DebugLogger {
12+
public:
13+
// Logs the details of one failed connect attempt described by `data`.
static void log(const ConnectDebugData& data);
14+
15+
private: // intentionally empty in this revision: nothing is declared here
16+
};
17+
18+
} // namespace tcp
19+
} // namespace transport
20+
} // namespace gloo

gloo/transport/tcp/device.cc

+7-1
Original file line numberDiff line numberDiff line change
@@ -299,12 +299,14 @@ bool Device::isInitiator(const Address& local, const Address& remote) const {
299299
void Device::connect(
300300
const Address& local,
301301
const Address& remote,
302+
const int rank,
303+
const int size,
302304
std::chrono::milliseconds timeout,
303305
connect_callback_t fn) {
304306
auto initiator = isInitiator(local, remote);
305307

306308
if (initiator) {
307-
connectAsInitiator(remote, timeout, std::move(fn));
309+
connectAsInitiator(remote, rank, size, timeout, std::move(fn));
308310
return;
309311
}
310312
connectAsListener(local, timeout, std::move(fn));
@@ -335,6 +337,8 @@ void Device::connectAsListener(
335337
//
336338
void Device::connectAsInitiator(
337339
const Address& remote,
340+
const int rank,
341+
const int size,
338342
std::chrono::milliseconds timeout,
339343
connect_callback_t fn) {
340344
auto writeSeq = [loop = loop_, seq = remote.getSeq()](
@@ -357,6 +361,8 @@ void Device::connectAsInitiator(
357361
connectLoop(
358362
loop_,
359363
remote,
364+
rank,
365+
size,
360366
timeout,
361367
[loop = loop_, fn = std::move(fn), writeSeq = std::move(writeSeq)](
362368
std::shared_ptr<Socket> socket, const Error& error) {

gloo/transport/tcp/device.h

+4
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,8 @@ class Device : public ::gloo::transport::Device,
9696
void connect(
9797
const Address& local,
9898
const Address& remote,
99+
const int rank,
100+
const int size,
99101
std::chrono::milliseconds timeout,
100102
connect_callback_t fn);
101103

@@ -106,6 +108,8 @@ class Device : public ::gloo::transport::Device,
106108

107109
void connectAsInitiator(
108110
const Address& remote,
111+
const int rank,
112+
const int size,
109113
std::chrono::milliseconds timeout,
110114
connect_callback_t fn);
111115

gloo/transport/tcp/helpers.cc

+3-1
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,12 @@ namespace tcp {
77
void connectLoop(
88
std::shared_ptr<Loop> loop,
99
const Address& remote,
10+
const int rank,
11+
const int size,
1012
std::chrono::milliseconds timeout,
1113
typename ConnectOperation::callback_t fn) {
1214
auto x = std::make_shared<ConnectOperation>(
13-
std::move(loop), remote, timeout, std::move(fn));
15+
std::move(loop), remote, rank, size, timeout, std::move(fn));
1416
x->run();
1517
}
1618

gloo/transport/tcp/helpers.h

+20-7
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,11 @@
1212
#include <memory>
1313

1414
#include <gloo/common/logging.h>
15+
#include <gloo/transport/tcp/debug_data.h>
1516
#include <gloo/transport/tcp/error.h>
1617
#include <gloo/transport/tcp/loop.h>
1718
#include <gloo/transport/tcp/socket.h>
19+
#include "gloo/transport/tcp/debug_logger.h" // @manual=//gloo:debug_logger
1820

1921
namespace gloo {
2022
namespace transport {
@@ -180,9 +182,13 @@ class ConnectOperation final
180182
ConnectOperation(
181183
std::shared_ptr<Loop> loop,
182184
const Address& remote,
185+
const int rank,
186+
const int size,
183187
std::chrono::milliseconds timeout,
184188
callback_t fn)
185189
: remote_(remote),
190+
rank_(rank),
191+
size_(size),
186192
deadline_(std::chrono::steady_clock::now() + timeout),
187193
loop_(std::move(loop)),
188194
fn_(std::move(fn)) {}
@@ -230,15 +236,18 @@ class ConnectOperation final
230236
SystemError e("SO_ERROR", result, remote_);
231237
bool willRetry = std::chrono::steady_clock::now() < deadline_ &&
232238
retry_++ < maxRetries_;
233-
GLOO_ERROR(
234-
"failed to connect, willRetry=",
235-
willRetry,
236-
", retry=",
239+
240+
auto debugData = ConnectDebugData{
237241
retry_,
238-
", remote=",
242+
maxRetries_,
243+
willRetry,
244+
rank_,
245+
size_,
246+
e.what(),
239247
remote_.str(),
240-
", error=",
241-
e.what());
248+
socket_->sockName().str(),
249+
};
250+
DebugLogger::log(debugData);
242251
// check deadline
243252
if (willRetry) {
244253
run();
@@ -253,6 +262,8 @@ class ConnectOperation final
253262

254263
private:
255264
const Address remote_;
265+
const int rank_;
266+
const int size_;
256267
const std::chrono::time_point<std::chrono::steady_clock> deadline_;
257268
const int maxRetries_{3};
258269

@@ -269,6 +280,8 @@ class ConnectOperation final
269280
void connectLoop(
270281
std::shared_ptr<Loop> loop,
271282
const Address& remote,
283+
const int rank,
284+
const int size,
272285
std::chrono::milliseconds timeout,
273286
typename ConnectOperation::callback_t fn);
274287

gloo/transport/tcp/pair.cc

+2
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,8 @@ void Pair::connect(const std::vector<char>& bytes) {
123123
device_->connect(
124124
self_,
125125
peer,
126+
context_->rank,
127+
context_->size,
126128
timeout_,
127129
std::bind(
128130
&Pair::connectCallback,

0 commit comments

Comments
 (0)