
Commit 94db609

craig[bot] and yuzefovich committed
142963: sql/stats: unset histogram buckets proto after decoding r=yuzefovich a=yuzefovich

**sql/stats,backup: adjust table stats handling in backup**

This commit introduces a new method to return the protos of table statistics directly from the DB (i.e. it bypasses the table stats cache) and uses this method in the backup code. The main motivation is that we want to nil out the histogram buckets in the proto that we store in the table stats cache since we don't need them outside of the backup.

The new method hardens the logic of excluding merged and forecast stats from the backup because we simply no longer include these stats in what the backup contains (they are generated on demand rather than read from the system table). It additionally ignores whether "stats usage is allowed" on a particular table since it seems beneficial to back up stats for all requested tables; once stats are restored we can decide whether to actually use them.

A notable change for the new `GetTableStatsProtosFromDB` method is that we no longer hydrate the column type nor decode upper bounds since this isn't needed for the backup use case.

**stats: unset histogram buckets proto after decoding**

I think it should be safe to unset the `HistogramData.Buckets` proto after having decoded the histogram buckets since we only use `cat.HistogramBucket` with decoded datums going forward (the only exception was the backup use case, but it was adjusted in the previous commit to have a separate code path). I noticed the proto showing up prominently in the inuse-space heap profiles on a customer cluster.

This change also adds a comment explaining why we use `*bool` as the `forecast` argument in `getTableStatsFromCache`, as opposed to just `bool`.

Epic: None

Release note: None

Co-authored-by: Yahor Yuzefovich <[email protected]>
2 parents d7d5113 + 8fd5626 commit 94db609
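
To make the memory win from the second commit concrete, here is a minimal, self-contained Go sketch of the decode-then-drop pattern it describes: decode the proto buckets into the in-memory representation, then nil out the proto slice so the encoded form can be garbage-collected. The types below (encodedBucket, histogramData, histogramBucket, tableStatistic) are simplified stand-ins for the real HistogramData proto, cat.HistogramBucket, and stats.TableStatistic, not the actual CockroachDB definitions.

package main

import "fmt"

// encodedBucket and histogramData stand in for the proto form of a histogram;
// histogramBucket stands in for the decoded cat.HistogramBucket form.
type encodedBucket struct {
	NumEq      int64
	UpperBound []byte // encoded datum bytes
}

type histogramData struct {
	Buckets []encodedBucket
}

type histogramBucket struct {
	NumEq      float64
	UpperBound string // decoded datum (stand-in)
}

type tableStatistic struct {
	HistogramData *histogramData
	Histogram     []histogramBucket
}

// decodeBuckets decodes the proto buckets into the in-memory representation
// and then nils out the proto slice so the encoded form can be GCed, which is
// the pattern the second commit applies in parseStats.
func decodeBuckets(ts *tableStatistic) {
	if ts.HistogramData == nil {
		return
	}
	ts.Histogram = make([]histogramBucket, len(ts.HistogramData.Buckets))
	for i, b := range ts.HistogramData.Buckets {
		ts.Histogram[i] = histogramBucket{
			NumEq:      float64(b.NumEq),
			UpperBound: string(b.UpperBound),
		}
	}
	// Everything downstream reads ts.Histogram, so drop the proto buckets to
	// release the memory that shows up in inuse-space heap profiles.
	ts.HistogramData.Buckets = nil
}

func main() {
	ts := &tableStatistic{
		HistogramData: &histogramData{
			Buckets: []encodedBucket{{NumEq: 3, UpperBound: []byte("a")}},
		},
	}
	decodeBuckets(ts)
	fmt.Println(len(ts.Histogram), ts.HistogramData.Buckets == nil) // 1 true
}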

File tree

5 files changed: +104 -40 lines changed

pkg/backup/backup_compaction.go (+1 -1)

@@ -879,7 +879,7 @@ func concludeBackupCompaction(
 		}
 	}
 
-	statsTable := getTableStatsForBackup(ctx, execCtx.ExecCfg().TableStatsCache, backupManifest.Descriptors)
+	statsTable := getTableStatsForBackup(ctx, execCtx.ExecCfg().InternalDB.Executor(), backupManifest.Descriptors)
 	return backupinfo.WriteTableStatistics(ctx, store, encryption, kmsEnv, &statsTable)
 }

pkg/backup/backup_job.go (+9 -13)

@@ -135,7 +135,6 @@ func backup(
 	backupManifest *backuppb.BackupManifest,
 	makeExternalStorage cloud.ExternalStorageFactory,
 	encryption *jobspb.BackupEncryptionOptions,
-	statsCache *stats.TableStatisticsCache,
 	execLocality roachpb.Locality,
 ) (_ roachpb.RowCount, numBackupInstances int, _ error) {
 	resumerSpan := tracing.SpanFromContext(ctx)

@@ -435,7 +434,7 @@ func backup(
 		}
 	}
 
-	statsTable := getTableStatsForBackup(ctx, statsCache, backupManifest.Descriptors)
+	statsTable := getTableStatsForBackup(ctx, execCtx.ExecCfg().InternalDB.Executor(), backupManifest.Descriptors)
 	if err := backupinfo.WriteTableStatistics(ctx, defaultStore, encryption, &kmsEnv, &statsTable); err != nil {
 		return roachpb.RowCount{}, 0, err
 	}

@@ -468,25 +467,24 @@ func releaseProtectedTimestamp(
 // to suboptimal performance when reading/writing to this table until
 // the stats have been recomputed.
 func getTableStatsForBackup(
-	ctx context.Context, statsCache *stats.TableStatisticsCache, descs []descpb.Descriptor,
+	ctx context.Context, executor isql.Executor, descs []descpb.Descriptor,
 ) backuppb.StatsTable {
 	var tableStatistics []*stats.TableStatisticProto
 	for i := range descs {
 		if tbl, _, _, _, _ := descpb.GetDescriptors(&descs[i]); tbl != nil {
 			tableDesc := tabledesc.NewBuilder(tbl).BuildImmutableTable()
-			// nil typeResolver means that we'll use the latest committed type
-			// metadata which is acceptable.
-			tableStatisticsAcc, err := statsCache.GetTableStats(ctx, tableDesc, nil /* typeResolver */)
+			tableStatisticsAcc, err := stats.GetTableStatsProtosFromDB(ctx, tableDesc, executor)
 			if err != nil {
-				log.Warningf(ctx, "failed to collect stats for table: %s, "+
-					"table ID: %d during a backup: %s", tableDesc.GetName(), tableDesc.GetID(),
-					err.Error())
+				log.Warningf(
+					ctx, "failed to collect stats for table: %s, table ID: %d during a backup: %s",
+					tableDesc.GetName(), tableDesc.GetID(), err,
+				)
 				continue
 			}
 
 			for _, stat := range tableStatisticsAcc {
-				if statShouldBeIncludedInBackupRestore(&stat.TableStatisticProto) {
-					tableStatistics = append(tableStatistics, &stat.TableStatisticProto)
+				if statShouldBeIncludedInBackupRestore(stat) {
+					tableStatistics = append(tableStatistics, stat)
 				}
 			}
 		}

@@ -699,7 +697,6 @@ func (b *backupResumer) Resume(ctx context.Context, execCtx interface{}) error {
 		}
 	}
 
-	statsCache := p.ExecCfg().TableStatsCache
 	// We retry on pretty generic failures -- any rpc error. If a worker node were
 	// to restart, it would produce this kind of error, but there may be other
 	// errors that are also rpc errors. Don't retry too aggressively.

@@ -735,7 +732,6 @@ func (b *backupResumer) Resume(ctx context.Context, execCtx interface{}) error {
 			backupManifest,
 			p.ExecCfg().DistSQLSrv.ExternalStorage,
 			details.EncryptionOptions,
-			statsCache,
 			details.ExecutionLocality,
 		)
 		if err == nil {
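
As a side note on the filtering described in the commit message: merged and forecast statistics are generated on demand rather than read from system.table_statistics, so they never need to reach the backup. The sketch below only restates that intent; shouldBackUp and the name constants are hypothetical stand-ins, not the real statShouldBeIncludedInBackupRestore implementation or the actual jobspb constants.

package main

import "fmt"

// tableStatisticProto is a simplified stand-in for stats.TableStatisticProto.
type tableStatisticProto struct {
	Name string
}

// Hypothetical stand-ins for the real stat-name constants.
const (
	autoStatsName     = "__auto__"
	mergedStatsName   = "__merged__"
	forecastStatsName = "__forecast__"
)

// shouldBackUp keeps only statistics that actually live in
// system.table_statistics; merged and forecast stats can be regenerated on
// demand after a restore, so they are dropped.
func shouldBackUp(stat *tableStatisticProto) bool {
	return stat.Name != mergedStatsName && stat.Name != forecastStatsName
}

func main() {
	in := []*tableStatisticProto{
		{Name: autoStatsName},
		{Name: mergedStatsName},
		{Name: forecastStatsName},
	}
	var kept []*tableStatisticProto
	for _, s := range in {
		if shouldBackUp(s) {
			kept = append(kept, s)
		}
	}
	fmt.Println(len(kept)) // 1: only the auto stat would be written to the backup
}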

pkg/sql/stats/automatic_stats.go (+5 -1)

@@ -887,7 +887,11 @@ func (r *Refresher) maybeRefreshStats(
 	partialStatsEnabled bool,
 	fullStatsEnabled bool,
 ) {
-	tableStats, err := r.cache.getTableStatsFromCache(ctx, tableID, nil /* forecast */, nil /* udtCols */, nil /* typeResolver */)
+	// NB: we pass nil boolean as 'forecast' argument in order to not invalidate
+	// the stats cache entry since we don't care whether there is a forecast or
+	// not in the stats.
+	var forecast *bool
+	tableStats, err := r.cache.getTableStatsFromCache(ctx, tableID, forecast, nil /* udtCols */, nil /* typeResolver */)
 	if err != nil {
 		log.Errorf(ctx, "failed to get table statistics: %v", err)
 		return
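
The nil *bool passed above lets the refresher say "any cached entry is fine" while other callers can still demand a specific forecast state. Here is a minimal three-state sketch of that idea under assumed semantics; cacheEntry and entryUsable are illustrative and do not mirror the actual getTableStatsFromCache internals.

package main

import "fmt"

// cacheEntry is a simplified stand-in for a stats cache entry;
// forecastsIncluded records whether forecasts were computed for the entry.
type cacheEntry struct {
	forecastsIncluded bool
}

// entryUsable sketches the three-state check implied by a *bool 'forecast'
// argument: nil means "any cached entry is usable", while a non-nil value
// requires the entry to match, which would force a refresh when it does not.
func entryUsable(e *cacheEntry, forecast *bool) bool {
	if forecast == nil {
		// The refresher path: don't care about forecasts, never invalidate.
		return true
	}
	return e.forecastsIncluded == *forecast
}

func main() {
	e := &cacheEntry{forecastsIncluded: false}
	wantForecast := true
	fmt.Println(entryUsable(e, nil))           // true: nil means "don't care"
	fmt.Println(entryUsable(e, &wantForecast)) // false: a caller needing forecasts would refresh
}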

pkg/sql/stats/stats_cache.go (+85 -25)

@@ -23,6 +23,7 @@ import (
 	"github.com/cockroachdb/cockroach/pkg/sql/catalog/catenumpb"
 	"github.com/cockroachdb/cockroach/pkg/sql/catalog/descpb"
 	"github.com/cockroachdb/cockroach/pkg/sql/catalog/descs"
+	"github.com/cockroachdb/cockroach/pkg/sql/isql"
 	"github.com/cockroachdb/cockroach/pkg/sql/opt/cat"
 	"github.com/cockroachdb/cockroach/pkg/sql/rowenc"
 	"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"

@@ -47,15 +48,9 @@ type TableStatistic struct {
 	Histogram []cat.HistogramBucket
 }
 
-// A TableStatisticsCache contains two underlying LRU caches:
-// (1) A cache of []*TableStatistic objects, keyed by table ID.
-//
-// Each entry consists of all the statistics for different columns and
-// column groups for the given table.
-//
-// (2) A cache of *HistogramData objects, keyed by
-//
-// HistogramCacheKey{table ID, statistic ID}.
+// A TableStatisticsCache contains an LRU cache of []*TableStatistic objects,
+// keyed by table ID. Each entry consists of all the statistics for different
+// columns and column groups for the given table.
 type TableStatisticsCache struct {
 	// NB: This can't be a RWMutex for lookup because UnorderedCache.Get
 	// manipulates an internal LRU list.

@@ -232,7 +227,7 @@ func decodeTableStatisticsKV(
 //
 // The function returns an error if we could not query the system table. It
 // silently ignores any statistics that can't be decoded (e.g. because
-// user-defined types don't exit).
+// user-defined types don't exist).
 //
 // The statistics are ordered by their CreatedAt time (newest-to-oldest).
 func (sc *TableStatisticsCache) GetTableStats(

@@ -247,6 +242,23 @@
 	)
 }
 
+// GetTableStatsProtosFromDB looks up statistics for the requested table in
+// system.table_statistics, by-passing the cache. Merged (based on partial and
+// latest full) and forecast stats are **not** included in the result.
+//
+// NB: the ColumnType field of HistogramData is not hydrated.
+//
+// The function returns an error if we could not query the system table. It
+// silently ignores any statistics that can't be decoded (e.g. because
+// user-defined types don't exist).
+//
+// The statistics are ordered by their CreatedAt time (newest-to-oldest).
+func GetTableStatsProtosFromDB(
+	ctx context.Context, table catalog.TableDescriptor, executor isql.Executor,
+) (statsProtos []*TableStatisticProto, err error) {
+	return getTableStatsProtosFromDB(ctx, table.GetID(), executor)
+}
+
 // DisallowedOnSystemTable returns true if this tableID belongs to a special
 // system table on which we want to disallow stats collection and stats usage.
 func DisallowedOnSystemTable(tableID descpb.ID) bool {

@@ -705,6 +717,9 @@ func (sc *TableStatisticsCache) parseStats(
 		if err = DecodeHistogramBuckets(res); err != nil {
 			return nil, nil, err
 		}
+		// Update the HistogramData proto to nil out Buckets field to allow for
+		// the memory to be GCed.
+		res.HistogramData.Buckets = nil
 	}
 	return res, udt, nil
 }

@@ -810,19 +825,9 @@ func (tsp *TableStatisticProto) IsAuto() bool {
 	return tsp.Name == jobspb.AutoStatsName
 }
 
-// getTableStatsFromDB retrieves the statistics in system.table_statistics
-// for the given table ID.
-//
-// It ignores any statistics that cannot be decoded (e.g. because a user-defined
-// type that doesn't exist) and returns the rest (with no error).
-func (sc *TableStatisticsCache) getTableStatsFromDB(
-	ctx context.Context,
-	tableID descpb.ID,
-	forecast bool,
-	st *cluster.Settings,
-	typeResolver *descs.DistSQLTypeResolver,
-) (_ []*TableStatistic, _ map[descpb.ColumnID]*types.T, err error) {
-	getTableStatisticsStmt := `
+// TODO(michae2): Add an index on system.table_statistics (tableID, createdAt,
+// columnIDs, statisticID).
+const getTableStatisticsStmt = `
 SELECT
 	"tableID",
 	"statisticID",

@@ -840,9 +845,19 @@ FROM system.table_statistics
 WHERE "tableID" = $1
 ORDER BY "createdAt" DESC, "columnIDs" DESC, "statisticID" DESC
 `
-// TODO(michae2): Add an index on system.table_statistics (tableID, createdAt,
-// columnIDs, statisticID).
 
+// getTableStatsFromDB retrieves the statistics in system.table_statistics
+// for the given table ID.
+//
+// It ignores any statistics that cannot be decoded (e.g. because of a
+// user-defined type that doesn't exist) and returns the rest (with no error).
+func (sc *TableStatisticsCache) getTableStatsFromDB(
+	ctx context.Context,
+	tableID descpb.ID,
+	forecast bool,
+	st *cluster.Settings,
+	typeResolver *descs.DistSQLTypeResolver,
+) (_ []*TableStatistic, _ map[descpb.ColumnID]*types.T, err error) {
 	it, err := sc.db.Executor().QueryIteratorEx(
 		ctx, "get-table-statistics", nil /* txn */, sessiondata.NodeUserSessionDataOverride, getTableStatisticsStmt, tableID,
 	)

@@ -912,3 +927,48 @@ ORDER BY "createdAt" DESC, "columnIDs" DESC, "statisticID" DESC
 
 	return statsList, udts, nil
 }
+
+// getTableStatsProtosFromDB retrieves the statistics in system.table_statistics
+// for the given table ID, in "un-decoded" protobuf form.
+//
+// It ignores any statistics that cannot be decoded (e.g. because of a
+// user-defined type that doesn't exist) and returns the rest (with no error).
+func getTableStatsProtosFromDB(
+	ctx context.Context, tableID descpb.ID, executor isql.Executor,
+) (statsProtos []*TableStatisticProto, err error) {
+	it, err := executor.QueryIteratorEx(
+		ctx, "get-table-statistics-protos", nil /* txn */, sessiondata.NodeUserSessionDataOverride, getTableStatisticsStmt, tableID,
+	)
+	if err != nil {
+		return nil, err
+	}
+
+	// Guard against crashes in the code below.
+	defer func() {
+		if r := recover(); r != nil {
+			// In the event of a "safe" panic, we only want to log the error and
+			// continue executing the query without stats for this table. This is only
+			// possible because the code does not update shared state and does not
+			// manipulate locks.
+			if ok, e := errorutil.ShouldCatch(r); ok {
+				err = e
+			} else {
+				// Other panic objects can't be considered "safe" and thus are
+				// propagated as crashes that terminate the session.
+				panic(r)
+			}
+		}
+	}()
+
+	var ok bool
+	for ok, err = it.Next(ctx); ok; ok, err = it.Next(ctx) {
+		var tsp *TableStatisticProto
+		tsp, err = NewTableStatisticProto(it.Cur())
+		if err != nil {
+			log.Warningf(ctx, "could not decode statistic for table %d: %v", tableID, err)
+			continue
+		}
+		statsProtos = append(statsProtos, tsp)
+	}
+	return statsProtos, nil
+}
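
The defer/recover block in getTableStatsProtosFromDB above follows a common pattern: a "safe" panic is converted into the function's error result while any other panic keeps propagating. A self-contained sketch of that pattern follows, with isSafePanic standing in for errorutil.ShouldCatch (its behavior here is an assumption, not the real implementation).

package main

import (
	"errors"
	"fmt"
)

// isSafePanic stands in for errorutil.ShouldCatch: in this sketch only panic
// values that are errors are treated as "safe" and converted into a returned
// error; anything else keeps crashing.
func isSafePanic(r interface{}) (bool, error) {
	if e, ok := r.(error); ok {
		return true, e
	}
	return false, nil
}

// withSafeRecover illustrates the defer/recover pattern used above: a safe
// panic becomes the function's error result instead of terminating the
// session, while unexpected panic values are re-propagated.
func withSafeRecover(work func()) (err error) {
	defer func() {
		if r := recover(); r != nil {
			if ok, e := isSafePanic(r); ok {
				err = e
			} else {
				panic(r)
			}
		}
	}()
	work()
	return nil
}

func main() {
	err := withSafeRecover(func() { panic(errors.New("malformed histogram row")) })
	fmt.Println(err) // malformed histogram row
}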

pkg/sql/stats/stats_cache_test.go (+4)

@@ -197,6 +197,10 @@ func initTestData(
 			return nil, err
 		}
 
+		if stat.HistogramData != nil {
+			// Match the behavior from TableStatisticsCache.parseStats.
+			stat.HistogramData.Buckets = nil
+		}
 		expectedStats[stat.TableID] = append(expectedStats[stat.TableID], stat)
 	}
