
Commit ab85aff

structlogging: restructure hot range logger for testability
This change does a few things to improve the testability of the hot ranges logger. These include:

__Logger__:
* The introduction of a shouldLog function, which determines whether the system should log or not.
* The breakout of the logging action into its own function.

__Tests__:
* The addition of a setup and teardown utility for the hot range logger.
* The breakout of the default case and a timed case.

Fixes: #142995
Epic: CRDB-43150
1 parent d404b01 commit ab85aff
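To make the restructuring concrete before the diff, here is a minimal, self-contained Go sketch of the pattern the commit moves toward: a shouldLog guard, a separate logging action, and a loop that can be driven either by a timer or by a manual tick channel. All names and durations below are illustrative, not CockroachDB code.

package main

import (
	"fmt"
	"time"
)

// scheduler mirrors the shape of the refactor: a guard (shouldLog), an
// action (logHotRanges), and a loop that can be ticked by a timer or by a
// manual channel so tests don't have to wait on wall-clock intervals.
type scheduler struct {
	enabled    bool
	lastLogged time.Time
}

func (s *scheduler) shouldLog() bool { return s.enabled }

func (s *scheduler) logHotRanges() {
	s.lastLogged = time.Now()
	fmt.Println("logged hot ranges at", s.lastLogged.Format(time.RFC3339Nano))
}

func (s *scheduler) maybeLogHotRanges() {
	if s.shouldLog() {
		s.logHotRanges()
	}
}

func main() {
	manualTick := make(chan struct{})
	s := &scheduler{enabled: true}
	ticker := time.NewTicker(50 * time.Millisecond)
	defer ticker.Stop()
	done := time.After(200 * time.Millisecond)

	// A "test" forcing an immediate tick without waiting on the timer.
	go func() { manualTick <- struct{}{} }()

	for {
		select {
		case <-ticker.C:
			s.maybeLogHotRanges()
		case <-manualTick:
			s.maybeLogHotRanges()
		case <-done:
			return
		}
	}
}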

File tree

pkg/server/structlogging/BUILD.bazel
pkg/server/structlogging/hot_ranges_log.go
pkg/server/structlogging/hot_ranges_log_test.go

3 files changed: +151 -115 lines changed


pkg/server/structlogging/BUILD.bazel

-1

@@ -50,7 +50,6 @@ go_test(
         "//pkg/util/log/logpb",
         "//pkg/util/randutil",
         "//pkg/util/syncutil",
-        "@com_github_cockroachdb_errors//:errors",
         "@com_github_stretchr_testify//assert",
     ],
 )

pkg/server/structlogging/hot_ranges_log.go

+72 -38

@@ -24,6 +24,13 @@ import (
 // ReportTopHottestRanges limits the number of ranges to be reported per iteration
 const ReportTopHottestRanges = 5
 
+// HotRangeLogManualTicker is a channel that can be used to force the hot range
+// logging task to tick.
+// Within normal operation, there will only be one function listening to this
+// ticker, but in the tests there may be multiple "nodes" within the process.
+// Tests will then need to send multiple requests to trigger all the nodes.
+var HotRangeLogManualTicker = make(chan struct{}, 0)
+
 var TelemetryHotRangesStatsInterval = settings.RegisterDurationSetting(
 	settings.ApplicationLevel,
 	"server.telemetry.hot_ranges_stats.interval",
@@ -50,9 +57,10 @@ var TelemetryHotRangesStatsLoggingDelay = settings.RegisterDurationSetting(
 // hotRangesLoggingScheduler is responsible for logging index usage stats
 // on a scheduled interval.
 type hotRangesLoggingScheduler struct {
-	ie      sql.InternalExecutor
-	sServer serverpb.TenantStatusServer
-	st      *cluster.Settings
+	ie         sql.InternalExecutor
+	sServer    serverpb.TenantStatusServer
+	st         *cluster.Settings
+	lastLogged time.Time
 }
 
 // StartHotRangesLoggingScheduler starts the capture index usage statistics logging scheduler.
@@ -84,7 +92,6 @@ func (s *hotRangesLoggingScheduler) start(ctx context.Context, stopper *stop.Sto
 		})
 
 		ticker := time.NewTicker(TelemetryHotRangesStatsInterval.Get(&s.st.SV))
-		defer ticker.Stop()
 
 		for {
 			select {
@@ -93,43 +100,70 @@ func (s *hotRangesLoggingScheduler) start(ctx context.Context, stopper *stop.Sto
 			case <-ctx.Done():
 				return
 			case <-ticker.C:
-				if !TelemetryHotRangesStatsEnabled.Get(&s.st.SV) {
-					continue
-				}
-				resp, err := s.sServer.HotRangesV2(ctx,
-					&serverpb.HotRangesRequest{NodeID: "local", PageSize: ReportTopHottestRanges})
-				if err != nil {
-					log.Warningf(ctx, "failed to get hot ranges: %s", err)
-					continue
-				}
-				var events []logpb.EventPayload
-				ts := timeutil.Now().UnixNano()
-
-				for _, r := range resp.Ranges {
-					hrEvent := &eventpb.HotRangesStats{
-						RangeID:             int64(r.RangeID),
-						Qps:                 r.QPS,
-						Databases:           r.Databases,
-						Tables:              r.Tables,
-						Indexes:             r.Indexes,
-						SchemaName:          r.SchemaName,
-						CPUTimePerSecond:    r.CPUTimePerSecond,
-						ReadBytesPerSecond:  r.ReadBytesPerSecond,
-						WriteBytesPerSecond: r.WriteBytesPerSecond,
-						ReadsPerSecond:      r.ReadsPerSecond,
-						WritesPerSecond:     r.WritesPerSecond,
-						LeaseholderNodeID:   int32(r.LeaseholderNodeID),
-						CommonEventDetails: logpb.CommonEventDetails{
-							Timestamp: ts,
-						},
-					}
-					events = append(events, hrEvent)
-				}
-				logutil.LogEventsWithDelay(ctx, events, stopper, TelemetryHotRangesStatsLoggingDelay.Get(&s.st.SV))
-
+				s.maybeLogHotRanges(ctx, stopper)
+			case <-HotRangeLogManualTicker:
+				s.maybeLogHotRanges(ctx, stopper)
 			case <-intervalChangedChan:
 				ticker.Reset(TelemetryHotRangesStatsInterval.Get(&s.st.SV))
 			}
 		}
 	})
 }
+
+// maybeLogHotRanges is a small helper function which couples the
+// functionality of checking whether to log and logging, with setting the
+// lastLogged timestamp.
+func (s *hotRangesLoggingScheduler) maybeLogHotRanges(ctx context.Context, stopper *stop.Stopper) {
+	if s.shouldLog() {
+		s.logHotRanges(ctx, stopper)
+	}
+}
+
+// shouldLog checks the below conditions to see whether it should emit logs.
+//   - Is the cluster setting server.telemetry.hot_ranges_stats.enabled true?
+func (s *hotRangesLoggingScheduler) shouldLog() bool {
+	if !TelemetryHotRangesStatsEnabled.Get(&s.st.SV) {
+		return false
+	}
+	return true
+}
+
+// logHotRanges collects the hot ranges from this node's status server and
+// sends them to the TELEMETRY log channel.
+func (s *hotRangesLoggingScheduler) logHotRanges(ctx context.Context, stopper *stop.Stopper) {
+	// early exit conditions
+	if !TelemetryHotRangesStatsEnabled.Get(&s.st.SV) {
+		return
+	}
+	resp, err := s.sServer.HotRangesV2(ctx,
+		&serverpb.HotRangesRequest{NodeID: "local", PageSize: ReportTopHottestRanges})
+	if err != nil {
+		log.Warningf(ctx, "failed to get hot ranges: %s", err)
+		return
+	}
+
+	var events []logpb.EventPayload
+	ts := timeutil.Now().UnixNano()
+
+	for _, r := range resp.Ranges {
+		hrEvent := &eventpb.HotRangesStats{
+			RangeID:             int64(r.RangeID),
+			Qps:                 r.QPS,
+			Databases:           r.Databases,
+			Tables:              r.Tables,
+			Indexes:             r.Indexes,
+			SchemaName:          r.SchemaName,
+			CPUTimePerSecond:    r.CPUTimePerSecond,
+			ReadBytesPerSecond:  r.ReadBytesPerSecond,
+			WriteBytesPerSecond: r.WriteBytesPerSecond,
+			ReadsPerSecond:      r.ReadsPerSecond,
+			WritesPerSecond:     r.WritesPerSecond,
+			LeaseholderNodeID:   int32(r.LeaseholderNodeID),
+			CommonEventDetails: logpb.CommonEventDetails{
+				Timestamp: ts,
+			},
+		}
		events = append(events, hrEvent)
+	}
+	logutil.LogEventsWithDelay(ctx, events, stopper, TelemetryHotRangesStatsLoggingDelay.Get(&s.st.SV))
+}
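A side note on the manual ticker introduced above: the channel is unbuffered, so each send rendezvouses with exactly one listening scheduler loop, which is why the comment in the diff says tests with multiple in-process "nodes" must send multiple requests. A minimal standalone sketch of that behavior (illustrative names only, not CockroachDB code):

package main

import (
	"fmt"
	"sync"
)

func main() {
	manualTick := make(chan struct{})
	var wg sync.WaitGroup

	// Two in-process "nodes", each running its own loop that waits on the
	// shared, unbuffered ticker channel.
	for node := 1; node <= 2; node++ {
		wg.Add(1)
		go func(id int) {
			defer wg.Done()
			<-manualTick // rendezvous with exactly one send from the "test"
			fmt.Printf("node %d ticked\n", id)
		}(node)
	}

	// The test must send one tick per listening node.
	manualTick <- struct{}{}
	manualTick <- struct{}{}
	wg.Wait()
}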

pkg/server/structlogging/hot_ranges_log_test.go

+79 -76

@@ -8,26 +8,24 @@ package structlogging_test
 import (
 	"context"
 	"encoding/json"
+	"errors"
 	"regexp"
+	"slices"
 	"testing"
 	"time"
 
 	"github.com/cockroachdb/cockroach/pkg/base"
-	"github.com/cockroachdb/cockroach/pkg/keys"
 	"github.com/cockroachdb/cockroach/pkg/kv/kvserver"
 	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/allocator/plan"
 	"github.com/cockroachdb/cockroach/pkg/server/structlogging"
-	"github.com/cockroachdb/cockroach/pkg/sql/catalog/desctestutils"
 	"github.com/cockroachdb/cockroach/pkg/testutils"
 	"github.com/cockroachdb/cockroach/pkg/testutils/serverutils"
 	"github.com/cockroachdb/cockroach/pkg/testutils/skip"
-	"github.com/cockroachdb/cockroach/pkg/testutils/sqlutils"
 	"github.com/cockroachdb/cockroach/pkg/util/leaktest"
 	"github.com/cockroachdb/cockroach/pkg/util/log"
 	"github.com/cockroachdb/cockroach/pkg/util/log/eventpb"
 	"github.com/cockroachdb/cockroach/pkg/util/log/logpb"
 	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
-	"github.com/cockroachdb/errors"
 	"github.com/stretchr/testify/assert"
 )
@@ -74,21 +72,12 @@
 	spy.mu.logs = nil
 }
 
-// TestHotRangesStatsTenants tests that hot ranges stats are logged per node.
-// The test will ensure each node contains 5 distinct range replicas for hot
-// ranges logging. Each node should thus log 5 distinct range ids.
-func TestHotRangesStats(t *testing.T) {
-	defer leaktest.AfterTest(t)()
+func setupHotRangesLogTest(
+	t *testing.T, ctx context.Context,
+) (serverutils.ApplicationLayerInterface, *hotRangesLogSpy, func()) {
 	sc := log.ScopeWithoutShowLogs(t)
-	defer sc.Close(t)
-
-	skip.UnderRace(t)
-
-	ctx := context.Background()
-	spy := hotRangesLogSpy{t: t}
-	defer log.InterceptWith(ctx, &spy)()
-
-	tc := serverutils.StartCluster(t, 3, base.TestClusterArgs{
+	spy := &hotRangesLogSpy{t: t}
+	tc := serverutils.StartCluster(t, 1, base.TestClusterArgs{
 		ReplicationMode: base.ReplicationManual,
 		ServerArgs: base.TestServerArgs{
 			DefaultTestTenant: base.TestControlsTenantsExplicitly,
@@ -101,72 +90,86 @@ func TestHotRangesStats(t *testing.T) {
 			},
 		},
 	})
-	defer tc.Stopper().Stop(ctx)
-
-	db := tc.ServerConn(0)
-	sqlutils.CreateTable(
-		t, db, "foo",
-		"k INT PRIMARY KEY, v INT",
-		300,
-		sqlutils.ToRowFn(sqlutils.RowIdxFn, sqlutils.RowModuloFn(2)),
-	)
-
-	// Ensure both of node 1 and 2 have 5 distinct replicas from the table.
-	tableDesc := desctestutils.TestingGetPublicTableDescriptor(
-		tc.Server(0).DB(), keys.SystemSQLCodec, "test", "foo")
-	tc.SplitTable(t, tableDesc, []serverutils.SplitPoint{
-		{TargetNodeIdx: 1, Vals: []interface{}{100}},
-		{TargetNodeIdx: 1, Vals: []interface{}{120}},
-		{TargetNodeIdx: 1, Vals: []interface{}{140}},
-		{TargetNodeIdx: 1, Vals: []interface{}{160}},
-		{TargetNodeIdx: 1, Vals: []interface{}{180}},
-		{TargetNodeIdx: 2, Vals: []interface{}{200}},
-		{TargetNodeIdx: 2, Vals: []interface{}{220}},
-		{TargetNodeIdx: 2, Vals: []interface{}{240}},
-		{TargetNodeIdx: 2, Vals: []interface{}{260}},
-		{TargetNodeIdx: 2, Vals: []interface{}{280}},
-	})
 
-	// query table
-	for i := 0; i < 300; i++ {
-		db := tc.ServerConn(0)
-		sqlutils.MakeSQLRunner(db).Query(t, `SELECT * FROM test.foo`)
+	leakChecker := leaktest.AfterTest(t)
+	logInterceptor := log.InterceptWith(ctx, spy)
+	stopper := tc.Stopper()
+	teardown := func() {
+		stopper.Stop(ctx)
+		sc.Close(t)
+		logInterceptor()
+		leakChecker()
 	}
 
-	// Skip node 1 since it will contain many more replicas.
-	// We only need to check nodes 2 and 3 to see that the nodes are logging their local hot ranges.
-	rangeIDs := make(map[int64]struct{})
-	for _, i := range []int{1, 2} {
-		spy.Reset()
-		ts := tc.ApplicationLayer(i)
-		structlogging.TelemetryHotRangesStatsEnabled.Override(ctx, &ts.ClusterSettings().SV, true)
-		structlogging.TelemetryHotRangesStatsInterval.Override(ctx, &ts.ClusterSettings().SV, time.Second)
-		structlogging.TelemetryHotRangesStatsLoggingDelay.Override(ctx, &ts.ClusterSettings().SV, 0*time.Millisecond)
-
-		testutils.SucceedsSoon(t, func() error {
-			logs := spy.Logs()
-			if len(logs) < 5 {
-				return errors.New("waiting for hot ranges to be logged")
-			}
+	ts := tc.ApplicationLayer(0)
+	return ts, spy, teardown
+}
 
-			return nil
-		})
-		structlogging.TelemetryHotRangesStatsInterval.Override(ctx, &ts.ClusterSettings().SV, 1*time.Hour)
+// TestHotRangesStatsTenants tests that hot ranges stats are logged per node.
+// The test will ensure each node contains 5 distinct range replicas for hot
+// ranges logging. Each node should thus log 5 distinct range ids.
+func TestHotRangesStats(t *testing.T) {
+	skip.UnderRace(t)
+	ctx := context.Background()
+	ts, spy, teardown := setupHotRangesLogTest(t, ctx)
+	defer teardown()
+
+	structlogging.TelemetryHotRangesStatsEnabled.Override(ctx, &ts.ClusterSettings().SV, true)
+	structlogging.TelemetryHotRangesStatsInterval.Override(ctx, &ts.ClusterSettings().SV, time.Millisecond)
+	structlogging.TelemetryHotRangesStatsLoggingDelay.Override(ctx, &ts.ClusterSettings().SV, 0*time.Millisecond)
+
+	structlogging.HotRangeLogManualTicker <- struct{}{}
+	testutils.SucceedsSoon(t, func() error {
+		logs := spy.Logs()
 
-		// Get first 5 logs since the logging loop may have fired multiple times.
-		// We should have gotten 5 distinct range ids, one for each split point above.
-		logs := spy.Logs()[:5]
+		// Depend on a range which we don't expect to go anywhere.
 		for _, l := range logs {
-			assert.Equal(t, l.Databases, []string{"‹test›"})
-			assert.Equal(t, l.Tables, []string{"‹foo›"})
-			assert.Equal(t, l.Indexes, []string{"‹foo_pkey›"})
-			_, ok := rangeIDs[l.RangeID]
-			if ok {
-				t.Fatalf(`Logged ranges should be unique per node for this test.
-found range on node %d and node %d: %s %s %s %s %d`, i, l.LeaseholderNodeID, l.Databases, l.SchemaName, l.Tables, l.Indexes, l.RangeID)
+			if !slices.Equal(l.Databases, []string{"‹system›"}) {
+				continue
 			}
-			rangeIDs[l.RangeID] = struct{}{}
+			if !slices.Equal(l.Tables, []string{"‹sqlliveness›"}) {
+				continue
+			}
+			if !slices.Equal(l.Indexes, []string{"‹primary›"}) {
+				continue
+			}
+			return nil
 		}
+		return errors.New("waited too long for the synthetic data")
+	})
+}
 
+func TestHotRangeLogIntervalSetting(t *testing.T) {
+	skip.UnderRace(t)
+	ctx := context.Background()
+	ts, spy, teardown := setupHotRangesLogTest(t, ctx)
+	defer teardown()
+
+	intervalDuration := 50 * time.Millisecond
+	start := time.Now()
+	structlogging.TelemetryHotRangesStatsEnabled.Override(ctx, &ts.ClusterSettings().SV, true)
+	structlogging.TelemetryHotRangesStatsInterval.Override(ctx, &ts.ClusterSettings().SV, intervalDuration)
+	structlogging.TelemetryHotRangesStatsLoggingDelay.Override(ctx, &ts.ClusterSettings().SV, 0*time.Millisecond)
+
+	// verify that there's no logged hot ranges, despite the system ticking
+	structlogging.HotRangeLogManualTicker <- struct{}{}
+	if time.Since(start) > intervalDuration {
+		// stress tests can cause scheduling delays, so that little code takes a
+		// long time to run. we can bail if the above took longer than the expected
+		// duration.
+		return
 	}
+
+	// verify no logs were sent.
+	assert.Zero(t, len(spy.Logs()))
+
+	// sleep for the duration, retrigger the logger and verify that logs were sent.
+	time.Sleep(intervalDuration * 2)
+	testutils.SucceedsSoon(t, func() error {
+		structlogging.HotRangeLogManualTicker <- struct{}{}
+		if len(spy.Logs()) == 0 {
+			return errors.New("no logs")
+		}
+		return nil
+	})
 }
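One design choice in TestHotRangeLogIntervalSetting worth noting is the wall-clock guard: the test measures how long its own setup and manual tick took, and bails out before asserting "no logs yet" if that exceeded the logging interval, which avoids flakes on slow or overloaded machines. A minimal, self-contained sketch of the same guard (names and durations are illustrative, not the test's actual code):

package main

import (
	"fmt"
	"time"
)

// doQuickSetup stands in for work that is expected to complete well inside a
// single logging interval (settings overrides, a manual tick, etc.).
func doQuickSetup() { time.Sleep(time.Millisecond) }

func main() {
	interval := 50 * time.Millisecond
	start := time.Now()

	doQuickSetup()

	if time.Since(start) > interval {
		// Under stress or CI scheduling delays even trivial code can overrun
		// the interval; bail out rather than make a flaky assertion.
		fmt.Println("machine too slow; skipping the zero-logs assertion")
		return
	}
	fmt.Println("within the interval; safe to assert nothing was logged yet")
}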
