Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit00793cc

Browse files
authored
feat: add prometheus observability metrics fordbpurge (#21074)
Related to[`internal#1139`](coder/internal#1139)This implements some prometheus metrics for records being removed fromthe database. Currently we're tracking the following fields beingremoved from the DB by this. They're viewable in the`/api/v2/debug/metrics` endpoint.* `expired_api_keys`* `aibridge_records`* `connection_logs`* `duration````# HELP coderd_dbpurge_iteration_duration_seconds Duration of each dbpurge iteration in seconds.# TYPE coderd_dbpurge_iteration_duration_seconds histogramcoderd_dbpurge_iteration_duration_seconds_bucket{success="true",le="1"} 1coderd_dbpurge_iteration_duration_seconds_bucket{success="true",le="5"} 1coderd_dbpurge_iteration_duration_seconds_bucket{success="true",le="10"} 1coderd_dbpurge_iteration_duration_seconds_bucket{success="true",le="30"} 1coderd_dbpurge_iteration_duration_seconds_bucket{success="true",le="60"} 1coderd_dbpurge_iteration_duration_seconds_bucket{success="true",le="300"} 1coderd_dbpurge_iteration_duration_seconds_bucket{success="true",le="600"} 1coderd_dbpurge_iteration_duration_seconds_bucket{success="true",le="+Inf"} 1coderd_dbpurge_iteration_duration_seconds_sum{success="true"} 0.014787814coderd_dbpurge_iteration_duration_seconds_count{success="true"} 1# HELP coderd_dbpurge_records_purged_total Total number of records purged by type.# TYPE coderd_dbpurge_records_purged_total countercoderd_dbpurge_records_purged_total{record_type="aibridge_records"} 0coderd_dbpurge_records_purged_total{record_type="audit_logs"} 0coderd_dbpurge_records_purged_total{record_type="connection_logs"} 0coderd_dbpurge_records_purged_total{record_type="expired_api_keys"} 0coderd_dbpurge_records_purged_total{record_type="workspace_agent_logs"} 0```| Position | Pull-request || -------- | ------------ || ✅ | [feat: add prometheus observability metrics for`dbpurge`](#21074) || | [feat: add rbac specificity for`dbpurge`](#21088) |
1 parent5b3c24c commit00793cc

File tree

3 files changed

+154
-15
lines changed

3 files changed

+154
-15
lines changed

‎cli/server.go‎

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1038,7 +1038,7 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd.
10381038
defershutdownConns()
10391039

10401040
// Ensures that old database entries are cleaned up over time!
1041-
purger:=dbpurge.New(ctx,logger.Named("dbpurge"),options.Database,options.DeploymentValues,quartz.NewReal())
1041+
purger:=dbpurge.New(ctx,logger.Named("dbpurge"),options.Database,options.DeploymentValues,quartz.NewReal(),options.PrometheusRegistry)
10421042
deferpurger.Close()
10431043

10441044
// Updates workspace usage

‎coderd/database/dbpurge/dbpurge.go‎

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@ import (
77

88
"golang.org/x/xerrors"
99

10+
"github.com/prometheus/client_golang/prometheus"
11+
1012
"cdr.dev/slog"
1113

1214
"github.com/coder/coder/v2/coderd/database"
@@ -40,13 +42,30 @@ const (
4042
// It is the caller's responsibility to call Close on the returned instance.
4143
//
4244
// This is for cleaning up old, unused resources from the database that take up space.
43-
funcNew(ctx context.Context,logger slog.Logger,db database.Store,vals*codersdk.DeploymentValues,clk quartz.Clock) io.Closer {
45+
funcNew(ctx context.Context,logger slog.Logger,db database.Store,vals*codersdk.DeploymentValues,clk quartz.Clock,reg prometheus.Registerer) io.Closer {
4446
closed:=make(chanstruct{})
4547

4648
ctx,cancelFunc:=context.WithCancel(ctx)
4749
//nolint:gocritic // The system purges old db records without user input.
4850
ctx=dbauthz.AsSystemRestricted(ctx)
4951

52+
iterationDuration:=prometheus.NewHistogramVec(prometheus.HistogramOpts{
53+
Namespace:"coderd",
54+
Subsystem:"dbpurge",
55+
Name:"iteration_duration_seconds",
56+
Help:"Duration of each dbpurge iteration in seconds.",
57+
Buckets: []float64{1,5,10,30,60,300,600},// 1s to 10min
58+
}, []string{"success"})
59+
reg.MustRegister(iterationDuration)
60+
61+
recordsPurged:=prometheus.NewCounterVec(prometheus.CounterOpts{
62+
Namespace:"coderd",
63+
Subsystem:"dbpurge",
64+
Name:"records_purged_total",
65+
Help:"Total number of records purged by type.",
66+
}, []string{"record_type"})
67+
reg.MustRegister(recordsPurged)
68+
5069
// Start the ticker with the initial delay.
5170
ticker:=clk.NewTicker(delay)
5271
doTick:=func(ctx context.Context,start time.Time) {
@@ -164,9 +183,22 @@ func New(ctx context.Context, logger slog.Logger, db database.Store, vals *coder
164183
slog.F("duration",clk.Since(start)),
165184
)
166185

186+
duration:=clk.Since(start)
187+
iterationDuration.WithLabelValues("true").Observe(duration.Seconds())
188+
recordsPurged.WithLabelValues("workspace_agent_logs").Add(float64(purgedWorkspaceAgentLogs))
189+
recordsPurged.WithLabelValues("expired_api_keys").Add(float64(expiredAPIKeys))
190+
recordsPurged.WithLabelValues("aibridge_records").Add(float64(purgedAIBridgeRecords))
191+
recordsPurged.WithLabelValues("connection_logs").Add(float64(purgedConnectionLogs))
192+
recordsPurged.WithLabelValues("audit_logs").Add(float64(purgedAuditLogs))
193+
167194
returnnil
168195
},database.DefaultTXOptions().WithID("db_purge"));err!=nil {
169196
logger.Error(ctx,"failed to purge old database entries",slog.Error(err))
197+
198+
// Record metrics for failed purge iteration.
199+
duration:=clk.Since(start)
200+
iterationDuration.WithLabelValues("false").Observe(duration.Seconds())
201+
170202
return
171203
}
172204
}

‎coderd/database/dbpurge/dbpurge_test.go‎

Lines changed: 120 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -12,14 +12,17 @@ import (
1212
"time"
1313

1414
"github.com/google/uuid"
15+
"github.com/prometheus/client_golang/prometheus"
1516
"github.com/stretchr/testify/assert"
1617
"github.com/stretchr/testify/require"
1718
"go.uber.org/goleak"
1819
"go.uber.org/mock/gomock"
20+
"golang.org/x/xerrors"
1921

2022
"cdr.dev/slog"
2123
"cdr.dev/slog/sloggers/slogtest"
2224

25+
"github.com/coder/coder/v2/coderd/coderdtest/promhelp"
2326
"github.com/coder/coder/v2/coderd/database"
2427
"github.com/coder/coder/v2/coderd/database/dbgen"
2528
"github.com/coder/coder/v2/coderd/database/dbmock"
@@ -52,11 +55,114 @@ func TestPurge(t *testing.T) {
5255
done:=awaitDoTick(ctx,t,clk)
5356
mDB:=dbmock.NewMockStore(gomock.NewController(t))
5457
mDB.EXPECT().InTx(gomock.Any(),database.DefaultTXOptions().WithID("db_purge")).Return(nil).Times(2)
55-
purger:=dbpurge.New(context.Background(),testutil.Logger(t),mDB,&codersdk.DeploymentValues{},clk)
58+
purger:=dbpurge.New(context.Background(),testutil.Logger(t),mDB,&codersdk.DeploymentValues{},clk,prometheus.NewRegistry())
5659
<-done// wait for doTick() to run.
5760
require.NoError(t,purger.Close())
5861
}
5962

63+
//nolint:paralleltest // It uses LockIDDBPurge.
64+
funcTestMetrics(t*testing.T) {
65+
t.Run("SuccessfulIteration",func(t*testing.T) {
66+
ctx,cancel:=context.WithTimeout(context.Background(),testutil.WaitShort)
67+
defercancel()
68+
69+
reg:=prometheus.NewRegistry()
70+
clk:=quartz.NewMock(t)
71+
now:=time.Date(2025,1,15,7,30,0,0,time.UTC)
72+
clk.Set(now).MustWait(ctx)
73+
74+
db,_:=dbtestutil.NewDB(t)
75+
logger:=slogtest.Make(t,&slogtest.Options{IgnoreErrors:true})
76+
user:=dbgen.User(t,db, database.User{})
77+
78+
oldExpiredKey,_:=dbgen.APIKey(t,db, database.APIKey{
79+
UserID:user.ID,
80+
ExpiresAt:now.Add(-8*24*time.Hour),// Expired 8 days ago
81+
TokenName:"old-expired-key",
82+
})
83+
84+
_,err:=db.GetAPIKeyByID(ctx,oldExpiredKey.ID)
85+
require.NoError(t,err,"key should exist before purge")
86+
87+
done:=awaitDoTick(ctx,t,clk)
88+
closer:=dbpurge.New(ctx,logger,db,&codersdk.DeploymentValues{
89+
Retention: codersdk.RetentionConfig{
90+
APIKeys:serpent.Duration(7*24*time.Hour),// 7 days retention
91+
},
92+
},clk,reg)
93+
defercloser.Close()
94+
testutil.TryReceive(ctx,t,done)
95+
96+
hist:=promhelp.HistogramValue(t,reg,"coderd_dbpurge_iteration_duration_seconds", prometheus.Labels{
97+
"success":"true",
98+
})
99+
require.NotNil(t,hist)
100+
require.Greater(t,hist.GetSampleCount(),uint64(0),"should have at least one sample")
101+
102+
expiredAPIKeys:=promhelp.CounterValue(t,reg,"coderd_dbpurge_records_purged_total", prometheus.Labels{
103+
"record_type":"expired_api_keys",
104+
})
105+
require.Greater(t,expiredAPIKeys,0,"should have deleted at least one expired API key")
106+
107+
_,err=db.GetAPIKeyByID(ctx,oldExpiredKey.ID)
108+
require.Error(t,err,"key should be deleted after purge")
109+
110+
workspaceAgentLogs:=promhelp.CounterValue(t,reg,"coderd_dbpurge_records_purged_total", prometheus.Labels{
111+
"record_type":"workspace_agent_logs",
112+
})
113+
require.GreaterOrEqual(t,workspaceAgentLogs,0)
114+
115+
aibridgeRecords:=promhelp.CounterValue(t,reg,"coderd_dbpurge_records_purged_total", prometheus.Labels{
116+
"record_type":"aibridge_records",
117+
})
118+
require.GreaterOrEqual(t,aibridgeRecords,0)
119+
120+
connectionLogs:=promhelp.CounterValue(t,reg,"coderd_dbpurge_records_purged_total", prometheus.Labels{
121+
"record_type":"connection_logs",
122+
})
123+
require.GreaterOrEqual(t,connectionLogs,0)
124+
125+
auditLogs:=promhelp.CounterValue(t,reg,"coderd_dbpurge_records_purged_total", prometheus.Labels{
126+
"record_type":"audit_logs",
127+
})
128+
require.GreaterOrEqual(t,auditLogs,0)
129+
})
130+
131+
t.Run("FailedIteration",func(t*testing.T) {
132+
ctx,cancel:=context.WithTimeout(context.Background(),testutil.WaitShort)
133+
defercancel()
134+
135+
reg:=prometheus.NewRegistry()
136+
clk:=quartz.NewMock(t)
137+
now:=clk.Now()
138+
clk.Set(now).MustWait(ctx)
139+
140+
ctrl:=gomock.NewController(t)
141+
mDB:=dbmock.NewMockStore(ctrl)
142+
mDB.EXPECT().InTx(gomock.Any(),database.DefaultTXOptions().WithID("db_purge")).
143+
Return(xerrors.New("simulated database error")).
144+
MinTimes(1)
145+
146+
logger:=slogtest.Make(t,&slogtest.Options{IgnoreErrors:true})
147+
148+
done:=awaitDoTick(ctx,t,clk)
149+
closer:=dbpurge.New(ctx,logger,mDB,&codersdk.DeploymentValues{},clk,reg)
150+
defercloser.Close()
151+
testutil.TryReceive(ctx,t,done)
152+
153+
hist:=promhelp.HistogramValue(t,reg,"coderd_dbpurge_iteration_duration_seconds", prometheus.Labels{
154+
"success":"false",
155+
})
156+
require.NotNil(t,hist)
157+
require.Greater(t,hist.GetSampleCount(),uint64(0),"should have at least one sample")
158+
159+
successHist:=promhelp.MetricValue(t,reg,"coderd_dbpurge_iteration_duration_seconds", prometheus.Labels{
160+
"success":"true",
161+
})
162+
require.Nil(t,successHist,"should not have success=true metric on failure")
163+
})
164+
}
165+
60166
//nolint:paralleltest // It uses LockIDDBPurge.
61167
funcTestDeleteOldWorkspaceAgentStats(t*testing.T) {
62168
ctx,cancel:=context.WithTimeout(context.Background(),testutil.WaitLong)
@@ -130,7 +236,7 @@ func TestDeleteOldWorkspaceAgentStats(t *testing.T) {
130236
})
131237

132238
// when
133-
closer:=dbpurge.New(ctx,logger,db,&codersdk.DeploymentValues{},clk)
239+
closer:=dbpurge.New(ctx,logger,db,&codersdk.DeploymentValues{},clk,prometheus.NewRegistry())
134240
defercloser.Close()
135241

136242
// then
@@ -155,7 +261,7 @@ func TestDeleteOldWorkspaceAgentStats(t *testing.T) {
155261

156262
// Start a new purger to immediately trigger delete after rollup.
157263
_=closer.Close()
158-
closer=dbpurge.New(ctx,logger,db,&codersdk.DeploymentValues{},clk)
264+
closer=dbpurge.New(ctx,logger,db,&codersdk.DeploymentValues{},clk,prometheus.NewRegistry())
159265
defercloser.Close()
160266

161267
// then
@@ -250,7 +356,8 @@ func TestDeleteOldWorkspaceAgentLogs(t *testing.T) {
250356
Retention: codersdk.RetentionConfig{
251357
WorkspaceAgentLogs:serpent.Duration(7*24*time.Hour),
252358
},
253-
},clk)
359+
},clk,prometheus.NewRegistry())
360+
254361
defercloser.Close()
255362
<-done// doTick() has now run.
256363

@@ -464,7 +571,7 @@ func TestDeleteOldWorkspaceAgentLogsRetention(t *testing.T) {
464571
done:=awaitDoTick(ctx,t,clk)
465572
closer:=dbpurge.New(ctx,logger,db,&codersdk.DeploymentValues{
466573
Retention:tc.retentionConfig,
467-
},clk)
574+
},clk,prometheus.NewRegistry())
468575
defercloser.Close()
469576
testutil.TryReceive(ctx,t,done)
470577

@@ -555,7 +662,7 @@ func TestDeleteOldProvisionerDaemons(t *testing.T) {
555662
require.NoError(t,err)
556663

557664
// when
558-
closer:=dbpurge.New(ctx,logger,db,&codersdk.DeploymentValues{},clk)
665+
closer:=dbpurge.New(ctx,logger,db,&codersdk.DeploymentValues{},clk,prometheus.NewRegistry())
559666
defercloser.Close()
560667

561668
// then
@@ -659,7 +766,7 @@ func TestDeleteOldAuditLogConnectionEvents(t *testing.T) {
659766

660767
// Run the purge
661768
done:=awaitDoTick(ctx,t,clk)
662-
closer:=dbpurge.New(ctx,logger,db,&codersdk.DeploymentValues{},clk)
769+
closer:=dbpurge.New(ctx,logger,db,&codersdk.DeploymentValues{},clk,prometheus.NewRegistry())
663770
defercloser.Close()
664771
// Wait for tick
665772
testutil.TryReceive(ctx,t,done)
@@ -822,7 +929,7 @@ func TestDeleteOldTelemetryHeartbeats(t *testing.T) {
822929
require.NoError(t,err)
823930

824931
done:=awaitDoTick(ctx,t,clk)
825-
closer:=dbpurge.New(ctx,logger,db,&codersdk.DeploymentValues{},clk)
932+
closer:=dbpurge.New(ctx,logger,db,&codersdk.DeploymentValues{},clk,prometheus.NewRegistry())
826933
defercloser.Close()
827934
<-done// doTick() has now run.
828935

@@ -941,7 +1048,7 @@ func TestDeleteOldConnectionLogs(t *testing.T) {
9411048
done:=awaitDoTick(ctx,t,clk)
9421049
closer:=dbpurge.New(ctx,logger,db,&codersdk.DeploymentValues{
9431050
Retention:tc.retentionConfig,
944-
},clk)
1051+
},clk,prometheus.NewRegistry())
9451052
defercloser.Close()
9461053
testutil.TryReceive(ctx,t,done)
9471054

@@ -1197,7 +1304,7 @@ func TestDeleteOldAIBridgeRecords(t *testing.T) {
11971304
Retention:serpent.Duration(tc.retention),
11981305
},
11991306
},
1200-
},clk)
1307+
},clk,prometheus.NewRegistry())
12011308
defercloser.Close()
12021309
testutil.TryReceive(ctx,t,done)
12031310

@@ -1284,7 +1391,7 @@ func TestDeleteOldAuditLogs(t *testing.T) {
12841391
done:=awaitDoTick(ctx,t,clk)
12851392
closer:=dbpurge.New(ctx,logger,db,&codersdk.DeploymentValues{
12861393
Retention:tc.retentionConfig,
1287-
},clk)
1394+
},clk,prometheus.NewRegistry())
12881395
defercloser.Close()
12891396
testutil.TryReceive(ctx,t,done)
12901397

@@ -1374,7 +1481,7 @@ func TestDeleteOldAuditLogs(t *testing.T) {
13741481
Retention: codersdk.RetentionConfig{
13751482
AuditLogs:serpent.Duration(retentionPeriod),
13761483
},
1377-
},clk)
1484+
},clk,prometheus.NewRegistry())
13781485
defercloser.Close()
13791486
testutil.TryReceive(ctx,t,done)
13801487

@@ -1494,7 +1601,7 @@ func TestDeleteExpiredAPIKeys(t *testing.T) {
14941601
done:=awaitDoTick(ctx,t,clk)
14951602
closer:=dbpurge.New(ctx,logger,db,&codersdk.DeploymentValues{
14961603
Retention:tc.retentionConfig,
1497-
},clk)
1604+
},clk,prometheus.NewRegistry())
14981605
defercloser.Close()
14991606
testutil.TryReceive(ctx,t,done)
15001607

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp