@@ -2354,8 +2354,8 @@ func TestChangefeedSchemaChangeNoBackfill(t *testing.T) {
2354
2354
}
2355
2355
}
2356
2356
2357
- // Test checkpointing when the highwater does not move due to some issues with
2358
- // specific spans lagging behind
2357
+ // TestChangefeedLaggingSpanCheckpointing tests checkpointing when the highwater
2358
+ // does not advance due to specific spans lagging behind.
2359
2359
func TestChangefeedLaggingSpanCheckpointing (t * testing.T ) {
2360
2360
defer leaktest .AfterTest (t )()
2361
2361
defer log .Scope (t ).Close (t )
@@ -2369,7 +2369,7 @@ func TestChangefeedLaggingSpanCheckpointing(t *testing.T) {
2369
2369
DistSQL .(* execinfra.TestingKnobs ).
2370
2370
Changefeed .(* TestingKnobs )
2371
2371
2372
- // Initialize table with multiple ranges.
2372
+ // Initialize table with 20 ranges.
2373
2373
sqlDB .Exec (t , `
2374
2374
CREATE TABLE foo (key INT PRIMARY KEY);
2375
2375
INSERT INTO foo (key) SELECT * FROM generate_series(1, 1000);
@@ -2381,28 +2381,39 @@ func TestChangefeedLaggingSpanCheckpointing(t *testing.T) {
2381
2381
changefeedbase .SpanCheckpointInterval .Override (
2382
2382
context .Background (), & s .ClusterSettings ().SV , 10 * time .Millisecond )
2383
2383
changefeedbase .SpanCheckpointMaxBytes .Override (
2384
- context .Background (), & s .ClusterSettings ().SV , 100 << 20 )
2384
+ context .Background (), & s .ClusterSettings ().SV , 100 << 20 /* 100 MiB */ )
2385
2385
changefeedbase .SpanCheckpointLagThreshold .Override (
2386
2386
context .Background (), & s .ClusterSettings ().SV , 10 * time .Millisecond )
2387
2387
2388
- // We'll start changefeed with the cursor.
2388
+ // We'll start the changefeed with the cursor set to the current time (not insert time).
2389
+ // NB: The changefeed created in this test doesn't actually send any message events.
2389
2390
var tsStr string
2390
2391
sqlDB .QueryRow (t , `SELECT cluster_logical_timestamp() from foo` ).Scan (& tsStr )
2391
2392
cursor := parseTimeToHLC (t , tsStr )
2393
+ t .Logf ("cursor: %v" , cursor )
2392
2394
2393
2395
// Rangefeed will skip some of the checkpoints to simulate lagging spans.
2394
2396
var laggingSpans roachpb.SpanGroup
2395
- numLagging := 0
2397
+ nonLaggingSpans := make (map [string ]int )
2398
+ var numLagging , numNonLagging int
2396
2399
knobs .FeedKnobs .ShouldSkipCheckpoint = func (checkpoint * kvpb.RangeFeedCheckpoint ) bool {
2397
- // Skip spans that were skipped before; otherwise skip some spans.
2398
- seenBefore := laggingSpans .Encloses (checkpoint .Span )
2399
- if seenBefore || (numLagging < 5 && rnd .Int ()% 3 == 0 ) {
2400
- if ! seenBefore {
2401
- laggingSpans .Add (checkpoint .Span )
2402
- numLagging ++
2403
- }
2400
+ // Skip spans that we already picked to be lagging.
2401
+ if laggingSpans .Encloses (checkpoint .Span ) {
2402
+ return true /* skip */
2403
+ }
2404
+ // Skip additional updates for some non-lagging spans so that we can
2405
+ // have more than one timestamp in the checkpoint.
2406
+ if i , ok := nonLaggingSpans [checkpoint .Span .String ()]; ok {
2407
+ return i % 3 == 0
2408
+ }
2409
+ // Ensure we have a few spans that are lagging at the cursor.
2410
+ if numLagging == 0 || (numLagging < 5 && rnd .Int ()% 3 == 0 ) {
2411
+ laggingSpans .Add (checkpoint .Span )
2412
+ numLagging ++
2404
2413
return true /* skip */
2405
2414
}
2415
+ nonLaggingSpans [checkpoint .Span .String ()] = numNonLagging
2416
+ numNonLagging ++
2406
2417
return false
2407
2418
}
2408
2419
@@ -2419,64 +2430,83 @@ func TestChangefeedLaggingSpanCheckpointing(t *testing.T) {
2419
2430
return job .Progress ()
2420
2431
}
2421
2432
2422
- // Should eventually checkpoint all spans around the lagging span
2433
+ // We should eventually checkpoint some spans that are ahead of the highwater.
2434
+ // We'll wait until we have two unique timestamps.
2423
2435
testutils .SucceedsSoon (t , func () error {
2424
2436
progress := loadProgress ()
2425
- if loadCheckpoint (t , progress ) != nil {
2437
+ cp := maps .Collect (loadCheckpoint (t , progress ).All ())
2438
+ if len (cp ) >= 2 {
2426
2439
return nil
2427
2440
}
2428
- return errors .New ("waiting for checkpoint" )
2441
+ return errors .New ("waiting for checkpoint with two different timestamps " )
2429
2442
})
2430
2443
2431
2444
sqlDB .Exec (t , "PAUSE JOB $1" , jobID )
2432
2445
waitForJobState (sqlDB , t , jobID , jobs .StatePaused )
2433
2446
2434
2447
// We expect highwater to be 0 (because we skipped some spans) or exactly cursor
2435
- // (this is mostly due to racy updates sent from aggregators to the frontier.
2448
+ // (this is mostly due to racy updates sent from aggregators to the frontier) .
2436
2449
// However, the checkpoint timestamp should be at least at the cursor.
2437
2450
progress := loadProgress ()
2438
- require .True (t , progress .GetHighWater ().IsEmpty () || * progress .GetHighWater () == cursor ,
2439
- "expected empty highwater or %s, found %s" , cursor , progress .GetHighWater ())
2451
+ require .True (t , progress .GetHighWater ().IsEmpty () || progress .GetHighWater (). Equal ( cursor ) ,
2452
+ "expected empty highwater or %s, found %s" , cursor , progress .GetHighWater ())
2440
2453
spanLevelCheckpoint := loadCheckpoint (t , progress )
2441
2454
require .NotNil (t , spanLevelCheckpoint )
2442
- minCheckpointTS := spanLevelCheckpoint .MinTimestamp ()
2443
- require .True (t , cursor .LessEq (minCheckpointTS ))
2455
+ require .True (t , cursor .LessEq (spanLevelCheckpoint .MinTimestamp ()))
2456
+
2457
+ // Construct a reverse index from spans to timestamps.
2458
+ spanTimestamps := make (map [string ]hlc.Timestamp )
2459
+ for ts , spans := range spanLevelCheckpoint .All () {
2460
+ for _ , s := range spans {
2461
+ spanTimestamps [s .String ()] = ts
2462
+ }
2463
+ }
2444
2464
2465
+ var rangefeedStarted bool
2445
2466
var incorrectCheckpointErr error
2446
2467
knobs .FeedKnobs .OnRangeFeedStart = func (spans []kvcoord.SpanTimePair ) {
2468
+ rangefeedStarted = true
2469
+
2447
2470
setErr := func (stp kvcoord.SpanTimePair , expectedTS hlc.Timestamp ) {
2448
2471
incorrectCheckpointErr = errors .Newf (
2449
2472
"rangefeed for span %s expected to start @%s, started @%s instead" ,
2450
2473
stp .Span , expectedTS , stp .StartAfter )
2451
2474
}
2452
2475
2476
+ // Verify that the start time for each span is correct.
2453
2477
for _ , sp := range spans {
2454
- if laggingSpans .Encloses (sp .Span ) {
2455
- if ! sp .StartAfter .Equal (cursor ) {
2456
- setErr (sp , cursor )
2478
+ if checkpointTS := spanTimestamps [sp .Span .String ()]; checkpointTS .IsSet () {
2479
+ // Any span in the checkpoint should be resumed at its checkpoint timestamp.
2480
+ if ! sp .StartAfter .Equal (checkpointTS ) {
2481
+ setErr (sp , checkpointTS )
2457
2482
}
2458
2483
} else {
2459
- if ! sp .StartAfter .Equal (minCheckpointTS ) {
2460
- setErr (sp , minCheckpointTS )
2484
+ // Any spans not in the checkpoint should be at the cursor.
2485
+ if ! sp .StartAfter .Equal (cursor ) {
2486
+ setErr (sp , cursor )
2461
2487
}
2462
2488
}
2463
2489
}
2464
2490
}
2491
+ knobs .FeedKnobs .ShouldSkipCheckpoint = nil
2465
2492
2466
2493
sqlDB .Exec (t , "RESUME JOB $1" , jobID )
2467
2494
waitForJobState (sqlDB , t , jobID , jobs .StateRunning )
2468
2495
2469
2496
// Wait until highwater advances past cursor.
2470
2497
testutils .SucceedsSoon (t , func () error {
2471
2498
progress := loadProgress ()
2472
- if hw := progress .GetHighWater (); hw != nil && cursor .LessEq (* hw ) {
2499
+ if hw := progress .GetHighWater (); hw != nil && cursor .Less (* hw ) {
2473
2500
return nil
2474
2501
}
2475
2502
return errors .New ("waiting for checkpoint advance" )
2476
2503
})
2477
2504
2478
2505
sqlDB .Exec (t , "PAUSE JOB $1" , jobID )
2479
2506
waitForJobState (sqlDB , t , jobID , jobs .StatePaused )
2507
+ // Verify the rangefeed started. This guards against the testing knob
2508
+ // not being called, which was happening in earlier versions of the code.
2509
+ require .True (t , rangefeedStarted )
2480
2510
// Verify we didn't see incorrect timestamps when resuming.
2481
2511
require .NoError (t , incorrectCheckpointErr )
2482
2512
}
0 commit comments