@@ -22,7 +22,7 @@ internal class SiloHealthMonitor : ITestAccessor, IHealthCheckable, IDisposable,
22
22
private readonly IOptionsMonitor < ClusterMembershipOptions > _clusterMembershipOptions ;
23
23
private readonly IRemoteSiloProber _prober ;
24
24
private readonly ILocalSiloHealthMonitor _localSiloHealthMonitor ;
25
- private readonly IClusterMembershipService _membershipService ;
25
+ private readonly MembershipTableManager _membershipService ;
26
26
private readonly ILocalSiloDetails _localSiloDetails ;
27
27
private readonly CancellationTokenSource _stoppingCancellation = new ( ) ;
28
28
private readonly object _lockObj = new ( ) ;
@@ -59,10 +59,10 @@ public SiloHealthMonitor(
59
59
IRemoteSiloProber remoteSiloProber ,
60
60
IAsyncTimerFactory asyncTimerFactory ,
61
61
ILocalSiloHealthMonitor localSiloHealthMonitor ,
62
- IClusterMembershipService membershipService ,
62
+ MembershipTableManager membershipService ,
63
63
ILocalSiloDetails localSiloDetails )
64
64
{
65
- SiloAddress = siloAddress ;
65
+ TargetSiloAddress = siloAddress ;
66
66
_clusterMembershipOptions = clusterMembershipOptions ;
67
67
_prober = remoteSiloProber ;
68
68
_localSiloHealthMonitor = localSiloHealthMonitor ;
@@ -84,7 +84,7 @@ internal interface ITestAccessor
84
84
/// <summary>
85
85
/// The silo which this instance is responsible for.
86
86
/// </summary>
87
- public SiloAddress SiloAddress { get ; }
87
+ public SiloAddress TargetSiloAddress { get ; }
88
88
89
89
/// <summary>
90
90
/// Whether or not this monitor is canceled.
@@ -153,28 +153,33 @@ public async ValueTask DisposeAsync()
153
153
154
154
private async Task Run ( )
155
155
{
156
- ClusterMembershipSnapshot ? activeMembersSnapshot = default ;
156
+ MembershipTableSnapshot ? activeMembersSnapshot = default ;
157
157
SiloAddress [ ] ? otherNodes = default ;
158
- TimeSpan ? overrideDelay = RandomTimeSpan . Next ( _clusterMembershipOptions . CurrentValue . ProbeTimeout ) ;
158
+ var options = _clusterMembershipOptions . CurrentValue ;
159
+ TimeSpan ? overrideDelay = RandomTimeSpan . Next ( options . ProbeTimeout ) ;
159
160
while ( await _pingTimer . NextTick ( overrideDelay ) )
160
161
{
161
162
ProbeResult probeResult ;
162
163
overrideDelay = default ;
164
+ var now = DateTime . UtcNow ;
163
165
164
166
try
165
167
{
166
168
// Discover the other active nodes in the cluster, if there are any.
167
- var membershipSnapshot = _membershipService . CurrentSnapshot ;
169
+ var membershipSnapshot = _membershipService . MembershipTableSnapshot ;
168
170
if ( otherNodes is null || ! ReferenceEquals ( activeMembersSnapshot , membershipSnapshot ) )
169
171
{
170
172
activeMembersSnapshot = membershipSnapshot ;
171
- otherNodes = membershipSnapshot . Members . Values
172
- . Where ( v => v . Status == SiloStatus . Active && v . SiloAddress != this . SiloAddress && v . SiloAddress != _localSiloDetails . SiloAddress )
173
+ otherNodes = membershipSnapshot . Entries . Values
174
+ . Where ( v => v . Status == SiloStatus . Active
175
+ && ! v . SiloAddress . Equals ( TargetSiloAddress )
176
+ && ! v . SiloAddress . Equals ( _localSiloDetails . SiloAddress )
177
+ && ! v . HasMissedIAmAlives ( options , now ) )
173
178
. Select ( s => s . SiloAddress )
174
179
. ToArray ( ) ;
175
180
}
176
181
177
- var isDirectProbe = ! _clusterMembershipOptions . CurrentValue . EnableIndirectProbes || _failedProbes < _clusterMembershipOptions . CurrentValue . NumMissedProbesLimit - 1 || otherNodes . Length == 0 ;
182
+ var isDirectProbe = ! options . EnableIndirectProbes || _failedProbes < options . NumMissedProbesLimit - 1 || otherNodes . Length == 0 ;
178
183
var timeout = GetTimeout ( isDirectProbe ) ;
179
184
using var cancellation = new CancellationTokenSource ( timeout ) ;
180
185
@@ -198,7 +203,7 @@ private async Task Run()
198
203
if ( probeResult . Status != ProbeResultStatus . Succeeded && probeResult . IntermediaryHealthDegradationScore > 0 )
199
204
{
200
205
_log . LogInformation ( "Recusing unhealthy intermediary '{Intermediary}' and trying again with remaining nodes" , intermediary ) ;
201
- otherNodes = otherNodes . Where ( node => ! node . Equals ( intermediary ) ) . ToArray ( ) ;
206
+ otherNodes = [ .. otherNodes . Where ( node => ! node . Equals ( intermediary ) ) ] ;
202
207
overrideDelay = TimeSpan . FromMilliseconds ( 250 ) ;
203
208
}
204
209
}
@@ -210,15 +215,15 @@ private async Task Run()
210
215
}
211
216
catch ( Exception exception )
212
217
{
213
- _log . LogError ( exception , "Exception monitoring silo {SiloAddress}" , SiloAddress ) ;
218
+ _log . LogError ( exception , "Exception monitoring silo {SiloAddress}" , TargetSiloAddress ) ;
214
219
}
215
220
}
216
221
217
222
TimeSpan GetTimeout ( bool isDirectProbe )
218
223
{
219
224
var additionalTimeout = 0 ;
220
225
221
- if ( _clusterMembershipOptions . CurrentValue . ExtendProbeTimeoutDuringDegradation )
226
+ if ( options . ExtendProbeTimeoutDuringDegradation )
222
227
{
223
228
// Attempt to account for local health degradation by extending the timeout period.
224
229
var localDegradationScore = _localSiloHealthMonitor . GetLocalHealthDegradationScore ( DateTime . UtcNow ) ;
@@ -231,7 +236,7 @@ TimeSpan GetTimeout(bool isDirectProbe)
231
236
additionalTimeout += 1 ;
232
237
}
233
238
234
- return _clusterMembershipOptions . CurrentValue . ProbeTimeout . Multiply ( 1 + additionalTimeout ) ;
239
+ return options . ProbeTimeout . Multiply ( 1 + additionalTimeout ) ;
235
240
}
236
241
}
237
242
@@ -245,15 +250,15 @@ private async Task<ProbeResult> ProbeDirectly(CancellationToken cancellation)
245
250
var id = ++ _nextProbeId ;
246
251
if ( _log . IsEnabled ( LogLevel . Trace ) )
247
252
{
248
- _log . LogTrace ( "Going to send Ping #{Id} to probe silo {Silo}" , id , SiloAddress ) ;
253
+ _log . LogTrace ( "Going to send Ping #{Id} to probe silo {Silo}" , id , TargetSiloAddress ) ;
249
254
}
250
255
251
256
var roundTripTimer = ValueStopwatch . StartNew ( ) ;
252
257
ProbeResult probeResult ;
253
258
Exception ? failureException ;
254
259
try
255
260
{
256
- await _prober . Probe ( SiloAddress , id , cancellation ) . WaitAsync ( cancellation ) ;
261
+ await _prober . Probe ( TargetSiloAddress , id , cancellation ) . WaitAsync ( cancellation ) ;
257
262
failureException = null ;
258
263
}
259
264
catch ( OperationCanceledException exception )
@@ -271,14 +276,14 @@ private async Task<ProbeResult> ProbeDirectly(CancellationToken cancellation)
271
276
272
277
if ( failureException is null )
273
278
{
274
- MessagingInstruments . OnPingReplyReceived ( SiloAddress ) ;
279
+ MessagingInstruments . OnPingReplyReceived ( TargetSiloAddress ) ;
275
280
276
281
if ( _log . IsEnabled ( LogLevel . Trace ) )
277
282
{
278
283
_log . LogTrace (
279
284
"Got successful ping response for ping #{Id} from {Silo} with round trip time of {RoundTripTime}" ,
280
285
id ,
281
- SiloAddress ,
286
+ TargetSiloAddress ,
282
287
roundTripTimer . Elapsed ) ;
283
288
}
284
289
@@ -289,15 +294,15 @@ private async Task<ProbeResult> ProbeDirectly(CancellationToken cancellation)
289
294
}
290
295
else
291
296
{
292
- MessagingInstruments . OnPingReplyMissed ( SiloAddress ) ;
297
+ MessagingInstruments . OnPingReplyMissed ( TargetSiloAddress ) ;
293
298
294
299
var failedProbes = ++ _failedProbes ;
295
300
_log . LogWarning (
296
301
( int ) ErrorCode . MembershipMissedPing ,
297
302
failureException ,
298
303
"Did not get response for probe #{Id} to silo {Silo} after {Elapsed}. Current number of consecutive failed probes is {FailedProbeCount}" ,
299
304
id ,
300
- SiloAddress ,
305
+ TargetSiloAddress ,
301
306
roundTripTimer . Elapsed ,
302
307
failedProbes ) ;
303
308
@@ -319,15 +324,15 @@ private async Task<ProbeResult> ProbeIndirectly(SiloAddress intermediary, TimeSp
319
324
var id = ++ _nextProbeId ;
320
325
if ( _log . IsEnabled ( LogLevel . Trace ) )
321
326
{
322
- _log . LogTrace ( "Going to send indirect ping #{Id} to probe silo {Silo} via {Intermediary}" , id , SiloAddress , intermediary ) ;
327
+ _log . LogTrace ( "Going to send indirect ping #{Id} to probe silo {Silo} via {Intermediary}" , id , TargetSiloAddress , intermediary ) ;
323
328
}
324
329
325
330
var roundTripTimer = ValueStopwatch . StartNew ( ) ;
326
331
ProbeResult probeResult ;
327
332
try
328
333
{
329
334
using var cancellationSource = CancellationTokenSource . CreateLinkedTokenSource ( cancellation , _stoppingCancellation . Token ) ;
330
- var indirectResult = await _prober . ProbeIndirectly ( intermediary , SiloAddress , directProbeTimeout , id , cancellationSource . Token ) . WaitAsync ( cancellationSource . Token ) ;
335
+ var indirectResult = await _prober . ProbeIndirectly ( intermediary , TargetSiloAddress , directProbeTimeout , id , cancellationSource . Token ) . WaitAsync ( cancellationSource . Token ) ;
331
336
roundTripTimer . Stop ( ) ;
332
337
var roundTripTime = roundTripTimer . Elapsed - indirectResult . ProbeResponseTime ;
333
338
@@ -340,26 +345,26 @@ private async Task<ProbeResult> ProbeIndirectly(SiloAddress intermediary, TimeSp
340
345
_log . LogInformation (
341
346
"Indirect probe request #{Id} to silo {SiloAddress} via silo {IntermediarySiloAddress} succeeded after {RoundTripTime} with a direct probe response time of {ProbeResponseTime}." ,
342
347
id ,
343
- SiloAddress ,
348
+ TargetSiloAddress ,
344
349
intermediary ,
345
350
roundTripTimer . Elapsed ,
346
351
indirectResult . ProbeResponseTime ) ;
347
352
348
- MessagingInstruments . OnPingReplyReceived ( SiloAddress ) ;
353
+ MessagingInstruments . OnPingReplyReceived ( TargetSiloAddress ) ;
349
354
350
355
_failedProbes = 0 ;
351
356
probeResult = ProbeResult . CreateIndirect ( 0 , ProbeResultStatus . Succeeded , indirectResult , intermediary ) ;
352
357
}
353
358
else
354
359
{
355
- MessagingInstruments . OnPingReplyMissed ( SiloAddress ) ;
360
+ MessagingInstruments . OnPingReplyMissed ( TargetSiloAddress ) ;
356
361
357
362
if ( indirectResult . IntermediaryHealthScore > 0 )
358
363
{
359
364
_log . LogInformation (
360
365
"Ignoring failure result for ping #{Id} from {Silo} since the intermediary used to probe the silo is not healthy. Intermediary health degradation score: {IntermediaryHealthScore}." ,
361
366
id ,
362
- SiloAddress ,
367
+ TargetSiloAddress ,
363
368
indirectResult . IntermediaryHealthScore ) ;
364
369
probeResult = ProbeResult . CreateIndirect ( _failedProbes , ProbeResultStatus . Unknown , indirectResult , intermediary ) ;
365
370
}
@@ -368,7 +373,7 @@ private async Task<ProbeResult> ProbeIndirectly(SiloAddress intermediary, TimeSp
368
373
_log . LogWarning (
369
374
"Indirect probe request #{Id} to silo {SiloAddress} via silo {IntermediarySiloAddress} failed after {RoundTripTime} with a direct probe response time of {ProbeResponseTime}. Failure message: {FailureMessage}. Intermediary health score: {IntermediaryHealthScore}." ,
370
375
id ,
371
- SiloAddress ,
376
+ TargetSiloAddress ,
372
377
intermediary ,
373
378
roundTripTimer . Elapsed ,
374
379
indirectResult . ProbeResponseTime ,
@@ -382,7 +387,7 @@ private async Task<ProbeResult> ProbeIndirectly(SiloAddress intermediary, TimeSp
382
387
}
383
388
catch ( Exception exception )
384
389
{
385
- MessagingInstruments . OnPingReplyMissed ( SiloAddress ) ;
390
+ MessagingInstruments . OnPingReplyMissed ( TargetSiloAddress ) ;
386
391
_log . LogWarning ( exception , "Indirect probe request failed." ) ;
387
392
probeResult = ProbeResult . CreateIndirect ( _failedProbes , ProbeResultStatus . Unknown , default , intermediary ) ;
388
393
}
0 commit comments