Skip to content

Commit 200ae5a

Browse files
security: change client cert expiration caching eviction policy
Previously, we cached and emitted metrics for client certificates indefinitely or until they expired. However, this strategy ignores the case where a team rotates their certificates, discontinuing to use soon-to-expire ones, in which crdb would continue to report on their expiration. This change modifies the caching behavior so that eviction from the cache is based on last usage. It does a few additional things here: - Supports the removal of child metrics by key. - Removes the `server.client_cert_expiration_cache.capacity` cluster setting. - Adds a shared utility for adding jitter. - Moves caching and metrics reporting from a single cache to two caches, one for expiration, one for ttl. Fixes: #142686 Epic: CRDB-40209 Release note(ops change): Cluster setting `server.client_cert_expiration_cache.capacity` removed. Metrics `security.certificate.expiration.client` and `security.certificate.ttl.client` now report lowest value seen for user in last 24h.
1 parent 3b1b7fe commit 200ae5a

17 files changed

+647
-472
lines changed

docs/generated/settings/settings-for-tenants.txt

-1
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,6 @@ server.auth_log.sql_sessions.enabled boolean false if set, log verbose SQL sessi
112112
server.authentication_cache.enabled boolean true enables a cache used during authentication to avoid lookups to system tables when retrieving per-user authentication-related information application
113113
server.child_metrics.enabled boolean false enables the exporting of child metrics, additional prometheus time series with extra labels application
114114
server.child_metrics.include_aggregate.enabled boolean true include the reporting of the aggregate time series when child metrics are enabled. This cluster setting has no effect if child metrics are disabled. application
115-
server.client_cert_expiration_cache.capacity integer 1000 the maximum number of client cert expirations stored application
116115
server.clock.forward_jump_check.enabled (alias: server.clock.forward_jump_check_enabled) boolean false if enabled, forward clock jumps > max_offset/2 will cause a panic application
117116
server.clock.persist_upper_bound_interval duration 0s the interval between persisting the wall time upper bound of the clock. The clock does not generate a wall time greater than the persisted timestamp and will panic if it sees a wall time greater than this value. When cockroach starts, it waits for the wall time to catch-up till this persisted timestamp. This guarantees monotonic wall time across server restarts. Not setting this or setting a value of 0 disables this feature. application
118117
server.eventlog.enabled boolean true if set, logged notable events are also stored in the table system.eventlog application

docs/generated/settings/settings.html

-1
Original file line numberDiff line numberDiff line change
@@ -143,7 +143,6 @@
143143
<tr><td><div id="setting-server-authentication-cache-enabled" class="anchored"><code>server.authentication_cache.enabled</code></div></td><td>boolean</td><td><code>true</code></td><td>enables a cache used during authentication to avoid lookups to system tables when retrieving per-user authentication-related information</td><td>Serverless/Dedicated/Self-Hosted</td></tr>
144144
<tr><td><div id="setting-server-child-metrics-enabled" class="anchored"><code>server.child_metrics.enabled</code></div></td><td>boolean</td><td><code>false</code></td><td>enables the exporting of child metrics, additional prometheus time series with extra labels</td><td>Serverless/Dedicated/Self-Hosted</td></tr>
145145
<tr><td><div id="setting-server-child-metrics-include-aggregate-enabled" class="anchored"><code>server.child_metrics.include_aggregate.enabled</code></div></td><td>boolean</td><td><code>true</code></td><td>include the reporting of the aggregate time series when child metrics are enabled. This cluster setting has no effect if child metrics are disabled.</td><td>Serverless/Dedicated/Self-Hosted</td></tr>
146-
<tr><td><div id="setting-server-client-cert-expiration-cache-capacity" class="anchored"><code>server.client_cert_expiration_cache.capacity</code></div></td><td>integer</td><td><code>1000</code></td><td>the maximum number of client cert expirations stored</td><td>Serverless/Dedicated/Self-Hosted</td></tr>
147146
<tr><td><div id="setting-server-clock-forward-jump-check-enabled" class="anchored"><code>server.clock.forward_jump_check.enabled<br />(alias: server.clock.forward_jump_check_enabled)</code></div></td><td>boolean</td><td><code>false</code></td><td>if enabled, forward clock jumps &gt; max_offset/2 will cause a panic</td><td>Serverless/Dedicated/Self-Hosted</td></tr>
148147
<tr><td><div id="setting-server-clock-persist-upper-bound-interval" class="anchored"><code>server.clock.persist_upper_bound_interval</code></div></td><td>duration</td><td><code>0s</code></td><td>the interval between persisting the wall time upper bound of the clock. The clock does not generate a wall time greater than the persisted timestamp and will panic if it sees a wall time greater than this value. When cockroach starts, it waits for the wall time to catch-up till this persisted timestamp. This guarantees monotonic wall time across server restarts. Not setting this or setting a value of 0 disables this feature.</td><td>Serverless/Dedicated/Self-Hosted</td></tr>
149148
<tr><td><div id="setting-server-consistency-check-max-rate" class="anchored"><code>server.consistency_check.max_rate</code></div></td><td>byte size</td><td><code>8.0 MiB</code></td><td>the rate limit (bytes/sec) to use for consistency checks; used in conjunction with server.consistency_check.interval to control the frequency of consistency checks. Note that setting this too high can negatively impact performance.</td><td>Dedicated/Self-Hosted</td></tr>

pkg/security/BUILD.bazel

+1
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ go_library(
4040
"//pkg/util/log/severity",
4141
"//pkg/util/metric",
4242
"//pkg/util/metric/aggmetric",
43+
"//pkg/util/mon",
4344
"//pkg/util/quotapool",
4445
"//pkg/util/randutil",
4546
"//pkg/util/stop",

pkg/security/auth.go

+5-1
Original file line numberDiff line numberDiff line change
@@ -353,11 +353,15 @@ func UserAuthCertHook(
353353

354354
if ValidateUserScope(certUserScope, systemIdentity, tenantID, tenantName, roleSubject, certSubject) {
355355
if certManager != nil {
356-
certManager.MaybeUpsertClientExpiration(
356+
err := certManager.MaybeUpsertClientExpiration(
357357
ctx,
358358
systemIdentity,
359+
peerCert.SerialNumber.String(),
359360
peerCert.NotAfter.Unix(),
360361
)
362+
if err != nil {
363+
return err
364+
}
361365
}
362366
return nil
363367
}

pkg/security/certificate_manager.go

+41-15
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ import (
1616
"github.com/cockroachdb/cockroach/pkg/util/log"
1717
"github.com/cockroachdb/cockroach/pkg/util/log/eventpb"
1818
"github.com/cockroachdb/cockroach/pkg/util/log/severity"
19+
"github.com/cockroachdb/cockroach/pkg/util/mon"
1920
"github.com/cockroachdb/cockroach/pkg/util/stop"
2021
"github.com/cockroachdb/cockroach/pkg/util/syncutil"
2122
"github.com/cockroachdb/cockroach/pkg/util/sysutil"
@@ -55,7 +56,10 @@ type CertificateManager struct {
5556
certMetrics *Metrics
5657

5758
// Client cert expiration cache.
58-
clientCertExpirationCache *clientcert.ClientCertExpirationCache
59+
clientCertExpirationCache *clientcert.Cache
60+
61+
// Client cert ttl cache.
62+
clientCertTTLCache *clientcert.Cache
5963

6064
// mu protects all remaining fields.
6165
mu syncutil.RWMutex
@@ -182,7 +186,10 @@ func (cm *CertificateManager) RegisterSignalHandler(
182186
case sig := <-ch:
183187
log.Ops.Infof(ctx, "received signal %q, triggering certificate reload", sig)
184188
if cache := cm.clientCertExpirationCache; cache != nil {
185-
cache.Clear()
189+
cache.Clear(ctx)
190+
}
191+
if cache := cm.clientCertTTLCache; cache != nil {
192+
cache.Clear(ctx)
186193
}
187194
if err := cm.LoadCertificates(); err != nil {
188195
log.Ops.Warningf(ctx, "could not reload certificates: %v", err)
@@ -195,26 +202,45 @@ func (cm *CertificateManager) RegisterSignalHandler(
195202
})
196203
}
197204

198-
// RegisterExpirationCache registers a cache for client certificate expiration.
199-
// It is called during server startup.
200-
func (cm *CertificateManager) RegisterExpirationCache(cache *clientcert.ClientCertExpirationCache) {
201-
cm.clientCertExpirationCache = cache
205+
// RegisterExpirationCache sets up and attaches caches for client certificate
206+
// expiration times and TTLs.
207+
func (cm *CertificateManager) RegisterExpirationCaches(
208+
ctx context.Context, stopper *stop.Stopper, parentMon *mon.BytesMonitor,
209+
) error {
210+
m := mon.NewMonitorInheritWithLimit(
211+
"client-expiration-caches", 0 /* limit */, parentMon, true, /* longLiving */
212+
)
213+
acc := m.MakeConcurrentBoundAccount()
214+
m.StartNoReserved(ctx, parentMon)
215+
216+
cm.clientCertExpirationCache = clientcert.NewCache(acc, stopper, cm.certMetrics.ClientExpiration, &timeutil.DefaultTimeSource{}, false)
217+
cm.clientCertTTLCache = clientcert.NewCache(acc, stopper, cm.certMetrics.ClientTTL, &timeutil.DefaultTimeSource{}, true)
218+
err := cm.clientCertExpirationCache.StartPurgeLoop(ctx)
219+
if err != nil {
220+
return err
221+
}
222+
err = cm.clientCertTTLCache.StartPurgeLoop(ctx)
223+
if err != nil {
224+
return err
225+
}
226+
return nil
202227
}
203228

204229
// MaybeUpsertClientExpiration updates or inserts the expiration time for the
205230
// given client certificate. An update is contingent on whether the old
206231
// expiration is after the new expiration.
207232
func (cm *CertificateManager) MaybeUpsertClientExpiration(
208-
ctx context.Context, identity string, expiration int64,
209-
) {
210-
if cache := cm.clientCertExpirationCache; cache != nil {
211-
cache.MaybeUpsert(ctx,
212-
identity,
213-
expiration,
214-
cm.certMetrics.ClientExpiration,
215-
cm.certMetrics.ClientTTL,
216-
)
233+
ctx context.Context, identity, serial string, expiration int64,
234+
) error {
235+
for _, cache := range []*clientcert.Cache{cm.clientCertExpirationCache, cm.clientCertTTLCache} {
236+
if cache != nil {
237+
err := cache.Upsert(ctx, identity, serial, expiration)
238+
if err != nil {
239+
return err
240+
}
241+
}
217242
}
243+
return nil
218244
}
219245

220246
// CACert returns the CA cert. May be nil.

pkg/security/clientcert/BUILD.bazel

-7
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,7 @@ go_library(
66
importpath = "github.com/cockroachdb/cockroach/pkg/security/clientcert",
77
visibility = ["//visibility:public"],
88
deps = [
9-
"//pkg/settings",
10-
"//pkg/settings/cluster",
11-
"//pkg/util/cache",
12-
"//pkg/util/log",
139
"//pkg/util/metric/aggmetric",
14-
"//pkg/util/mon",
1510
"//pkg/util/stop",
1611
"//pkg/util/syncutil",
1712
"//pkg/util/timeutil",
@@ -23,14 +18,12 @@ go_test(
2318
srcs = ["cert_expiry_cache_test.go"],
2419
deps = [
2520
":clientcert",
26-
"//pkg/settings/cluster",
2721
"//pkg/util/leaktest",
2822
"//pkg/util/metric",
2923
"//pkg/util/metric/aggmetric",
3024
"//pkg/util/mon",
3125
"//pkg/util/stop",
3226
"//pkg/util/timeutil",
33-
"@com_github_prometheus_client_model//go",
3427
"@com_github_stretchr_testify//require",
3528
],
3629
)

0 commit comments

Comments
 (0)