Commit 45c43d4

fix: refactor agent resource monitoring API to avoid excessive calls to DB (#20430)
This should resolve coder/internal#728 by refactoring the ResourceMonitor API struct to only require querying the resource monitor once for memory and once for volumes, then using the stored monitors on the API struct from that point on. This should eliminate the vast majority of calls to `GetWorkspaceByAgentID` and `FetchVolumesResourceMonitorsUpdatedAfter`/`FetchMemoryResourceMonitorsUpdatedAfter` (millions of calls per week).

Tests passed, and I ran an instance of coder via a workspace with a template that added resource monitoring every 10s. Note that this is the default docker container, so there are other sources of `GetWorkspaceByAgentID` db queries. Note that this workspace was running for ~15 minutes at the time I gathered this data.

Over 30s for the `ResourceMonitor` calls:

```
coder@callum-coder-2:~/coder$ curl localhost:19090/metrics | grep ResourceMonitor | grep count
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
coderd_db_query_latencies_seconds_count{query="FetchMemoryResourceMonitorsByAgentID"} 2
coderd_db_query_latencies_seconds_count{query="FetchMemoryResourceMonitorsUpdatedAfter"} 2
100  288k    0  288k    0     0  58.3M      0 --:--:-- --:--:-- --:--:-- 70.4M
coderd_db_query_latencies_seconds_count{query="FetchVolumesResourceMonitorsByAgentID"} 2
coderd_db_query_latencies_seconds_count{query="FetchVolumesResourceMonitorsUpdatedAfter"} 2
coderd_db_query_latencies_seconds_count{query="UpdateMemoryResourceMonitor"} 155
coderd_db_query_latencies_seconds_count{query="UpdateVolumeResourceMonitor"} 155
coder@callum-coder-2:~/coder$ curl localhost:19090/metrics | grep ResourceMonitor | grep count
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
coderd_db_query_latencies_seconds_count{query="FetchMemoryResourceMonitorsByAgentID"} 2
coderd_db_query_latencies_seconds_count{query="FetchMemoryResourceMonitorsUpdatedAfter"} 2
100  288k    0  288k    0     0  34.7M      0 --:--:-- --:--:-- --:--:-- 40.2M
coderd_db_query_latencies_seconds_count{query="FetchVolumesResourceMonitorsByAgentID"} 2
coderd_db_query_latencies_seconds_count{query="FetchVolumesResourceMonitorsUpdatedAfter"} 2
coderd_db_query_latencies_seconds_count{query="UpdateMemoryResourceMonitor"} 158
coderd_db_query_latencies_seconds_count{query="UpdateVolumeResourceMonitor"} 158
```

And over 1m for the `GetWorkspaceByAgentID` calls; the majority of these come from the workspace metadata stats updates:

```
coder@callum-coder-2:~/coder$ curl localhost:19090/metrics | grep GetWorkspaceByAgentID | grep count
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  284k    0  284k    0     0  42.4M      0 --:--:-- --:--:-- --:--:-- 46.3M
coderd_db_query_latencies_seconds_count{query="GetWorkspaceByAgentID"} 876
coder@callum-coder-2:~/coder$ curl localhost:19090/metrics | grep GetWorkspaceByAgentID | grep count
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  284k    0  284k    0     0  75.4M      0 --:--:-- --:--:-- --:--:-- 92.7M
coderd_db_query_latencies_seconds_count{query="GetWorkspaceByAgentID"} 918
```

---------

Signed-off-by: Callum Styan <callumstyan@gmail.com>
1 parent a1e7e10 · commit 45c43d4
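
For readers skimming the diff below, the core of the change is a fetch-once-and-cache pattern: query the database one time when the agent connection is set up, keep the monitors on the API struct, and mutate that cached copy after each update instead of re-reading rows. The following is a minimal, self-contained sketch of that pattern, not the actual coder implementation; `Monitor`, `store`, `fakeStore`, and `cachedMonitors` are illustrative stand-ins for the real database models and querier.

```go
package main

import (
	"context"
	"fmt"
	"sync"
)

// Monitor is a stand-in for a resource-monitor row (memory or volume);
// the real code uses the generated database models instead.
type Monitor struct {
	Path    string
	Enabled bool
	State   string
}

// store is a stand-in for the database layer (in coder, the generated
// querier with FetchMemoryResourceMonitorsByAgentID and friends).
type store interface {
	FetchMonitors(ctx context.Context) ([]Monitor, error)
}

// cachedMonitors queries the store once and serves every later call
// from memory, which is the essence of the refactor.
type cachedMonitors struct {
	db store

	mu       sync.RWMutex
	monitors []Monitor
}

// initMonitors performs the single fetch; call it once per agent connection.
func (c *cachedMonitors) initMonitors(ctx context.Context) error {
	mons, err := c.db.FetchMonitors(ctx)
	if err != nil {
		return fmt.Errorf("fetch monitors: %w", err)
	}
	c.mu.Lock()
	defer c.mu.Unlock()
	c.monitors = mons
	return nil
}

// setState mutates the cached copy after the DB row has been updated,
// so the next push reads fresh state without another query.
func (c *cachedMonitors) setState(i int, state string) {
	c.mu.Lock()
	defer c.mu.Unlock()
	c.monitors[i].State = state
}

// fakeStore returns canned data so the example runs without a database.
type fakeStore struct{}

func (fakeStore) FetchMonitors(context.Context) ([]Monitor, error) {
	return []Monitor{{Path: "/home/coder", Enabled: true, State: "OK"}}, nil
}

func main() {
	c := &cachedMonitors{db: fakeStore{}}
	if err := c.initMonitors(context.Background()); err != nil {
		panic(err)
	}
	c.setState(0, "NOK") // later pushes touch only the cache, not the DB
	fmt.Println(c.monitors)
}
```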

File tree

3 files changed: +82 -35 lines


coderd/agentapi/api.go

Lines changed: 4 additions & 0 deletions

@@ -239,6 +239,10 @@ func (a *API) Serve(ctx context.Context, l net.Listener) error {
 		return xerrors.Errorf("create agent API server: %w", err)
 	}

+	if err := a.ResourcesMonitoringAPI.InitMonitors(ctx); err != nil {
+		return xerrors.Errorf("initialize resource monitoring: %w", err)
+	}
+
 	return server.Serve(ctx, l)
 }
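
The hunk above pins down the lifecycle: the monitor cache is primed once per agent connection, before the RPC server starts handling requests, and a failure tears the connection down. Below is a reduced sketch of that ordering only; the `monitorCache` type and `serveConnection` function are illustrative placeholders, not the real coderd wiring.

```go
package main

import (
	"context"
	"fmt"
)

// monitorCache stands in for ResourcesMonitoringAPI: it must be primed
// exactly once before the connection starts serving RPCs.
type monitorCache struct{ primed bool }

func (m *monitorCache) InitMonitors(_ context.Context) error {
	m.primed = true // real code: two DB fetches, results stored on the struct
	return nil
}

// serveConnection mirrors the ordering in (*API).Serve: prime the cache,
// then hand off to the long-running RPC server.
func serveConnection(ctx context.Context, cache *monitorCache, serve func(context.Context) error) error {
	if err := cache.InitMonitors(ctx); err != nil {
		// A failure here tears the connection down before any RPC runs.
		return fmt.Errorf("initialize resource monitoring: %w", err)
	}
	return serve(ctx)
}

func main() {
	cache := &monitorCache{}
	err := serveConnection(context.Background(), cache, func(context.Context) error {
		fmt.Println("serving RPCs with primed cache:", cache.primed)
		return nil
	})
	if err != nil {
		fmt.Println("connection error:", err)
	}
}
```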

coderd/agentapi/resources_monitoring.go

Lines changed: 52 additions & 35 deletions

@@ -5,6 +5,7 @@ import (
 	"database/sql"
 	"errors"
 	"fmt"
+	"sync"
 	"time"

 	"golang.org/x/xerrors"
@@ -33,42 +34,60 @@ type ResourcesMonitoringAPI struct {

 	Debounce time.Duration
 	Config   resourcesmonitor.Config
+
+	// Cache resource monitors on first call to avoid millions of DB queries per day.
+	memoryMonitor  database.WorkspaceAgentMemoryResourceMonitor
+	volumeMonitors []database.WorkspaceAgentVolumeResourceMonitor
+	monitorsLock   sync.RWMutex
 }

-func (a *ResourcesMonitoringAPI) GetResourcesMonitoringConfiguration(ctx context.Context, _ *proto.GetResourcesMonitoringConfigurationRequest) (*proto.GetResourcesMonitoringConfigurationResponse, error) {
-	memoryMonitor, memoryErr := a.Database.FetchMemoryResourceMonitorsByAgentID(ctx, a.AgentID)
-	if memoryErr != nil && !errors.Is(memoryErr, sql.ErrNoRows) {
-		return nil, xerrors.Errorf("failed to fetch memory resource monitor: %w", memoryErr)
+// InitMonitors fetches resource monitors from the database and caches them.
+// This must be called once after creating a ResourcesMonitoringAPI, the context should be
+// the agent per-RPC connection context. If fetching fails with a real error (not sql.ErrNoRows), the
+// connection should be torn down.
+func (a *ResourcesMonitoringAPI) InitMonitors(ctx context.Context) error {
+	memMon, err := a.Database.FetchMemoryResourceMonitorsByAgentID(ctx, a.AgentID)
+	if err != nil && !errors.Is(err, sql.ErrNoRows) {
+		return xerrors.Errorf("fetch memory resource monitor: %w", err)
+	}
+	// If sql.ErrNoRows, memoryMonitor stays as zero value (CreatedAt.IsZero() = true).
+	// Otherwise, store the fetched monitor.
+	if err == nil {
+		a.memoryMonitor = memMon
 	}

-	volumeMonitors, err := a.Database.FetchVolumesResourceMonitorsByAgentID(ctx, a.AgentID)
+	volMons, err := a.Database.FetchVolumesResourceMonitorsByAgentID(ctx, a.AgentID)
 	if err != nil {
-		return nil, xerrors.Errorf("failed to fetch volume resource monitors: %w", err)
+		return xerrors.Errorf("fetch volume resource monitors: %w", err)
 	}
+	// 0 length is valid, indicating none configured, since the volume monitors in the DB can be many.
+	a.volumeMonitors = volMons
+
+	return nil
+}

+func (a *ResourcesMonitoringAPI) GetResourcesMonitoringConfiguration(_ context.Context, _ *proto.GetResourcesMonitoringConfigurationRequest) (*proto.GetResourcesMonitoringConfigurationResponse, error) {
 	return &proto.GetResourcesMonitoringConfigurationResponse{
 		Config: &proto.GetResourcesMonitoringConfigurationResponse_Config{
 			CollectionIntervalSeconds: int32(a.Config.CollectionInterval.Seconds()),
 			NumDatapoints:             a.Config.NumDatapoints,
 		},
 		Memory: func() *proto.GetResourcesMonitoringConfigurationResponse_Memory {
-			if memoryErr != nil {
+			if a.memoryMonitor.CreatedAt.IsZero() {
 				return nil
 			}
-
 			return &proto.GetResourcesMonitoringConfigurationResponse_Memory{
-				Enabled: memoryMonitor.Enabled,
+				Enabled: a.memoryMonitor.Enabled,
 			}
 		}(),
 		Volumes: func() []*proto.GetResourcesMonitoringConfigurationResponse_Volume {
-			volumes := make([]*proto.GetResourcesMonitoringConfigurationResponse_Volume, 0, len(volumeMonitors))
-			for _, monitor := range volumeMonitors {
+			volumes := make([]*proto.GetResourcesMonitoringConfigurationResponse_Volume, 0, len(a.volumeMonitors))
+			for _, monitor := range a.volumeMonitors {
 				volumes = append(volumes, &proto.GetResourcesMonitoringConfigurationResponse_Volume{
 					Enabled: monitor.Enabled,
 					Path:    monitor.Path,
 				})
 			}
-
 			return volumes
 		}(),
 	}, nil
@@ -77,6 +96,10 @@ func (a *ResourcesMonitoringAPI) GetResourcesMonitoringConfiguration(ctx context
 func (a *ResourcesMonitoringAPI) PushResourcesMonitoringUsage(ctx context.Context, req *proto.PushResourcesMonitoringUsageRequest) (*proto.PushResourcesMonitoringUsageResponse, error) {
 	var err error

+	// Lock for the entire push operation since calls are sequential from the agent
+	a.monitorsLock.Lock()
+	defer a.monitorsLock.Unlock()
+
 	if memoryErr := a.monitorMemory(ctx, req.Datapoints); memoryErr != nil {
 		err = errors.Join(err, xerrors.Errorf("monitor memory: %w", memoryErr))
 	}
@@ -89,18 +112,7 @@ func (a *ResourcesMonitoringAPI) PushResourcesMonitoringUsage(ctx context.Contex
 }

 func (a *ResourcesMonitoringAPI) monitorMemory(ctx context.Context, datapoints []*proto.PushResourcesMonitoringUsageRequest_Datapoint) error {
-	monitor, err := a.Database.FetchMemoryResourceMonitorsByAgentID(ctx, a.AgentID)
-	if err != nil {
-		// It is valid for an agent to not have a memory monitor, so we
-		// do not want to treat it as an error.
-		if errors.Is(err, sql.ErrNoRows) {
-			return nil
-		}
-
-		return xerrors.Errorf("fetch memory resource monitor: %w", err)
-	}
-
-	if !monitor.Enabled {
+	if !a.memoryMonitor.Enabled {
 		return nil
 	}

@@ -109,15 +121,15 @@ func (a *ResourcesMonitoringAPI) monitorMemory(ctx context.Context, datapoints [
 		usageDatapoints = append(usageDatapoints, datapoint.Memory)
 	}

-	usageStates := resourcesmonitor.CalculateMemoryUsageStates(monitor, usageDatapoints)
+	usageStates := resourcesmonitor.CalculateMemoryUsageStates(a.memoryMonitor, usageDatapoints)

-	oldState := monitor.State
+	oldState := a.memoryMonitor.State
 	newState := resourcesmonitor.NextState(a.Config, oldState, usageStates)

-	debouncedUntil, shouldNotify := monitor.Debounce(a.Debounce, a.Clock.Now(), oldState, newState)
+	debouncedUntil, shouldNotify := a.memoryMonitor.Debounce(a.Debounce, a.Clock.Now(), oldState, newState)

 	//nolint:gocritic // We need to be able to update the resource monitor here.
-	err = a.Database.UpdateMemoryResourceMonitor(dbauthz.AsResourceMonitor(ctx), database.UpdateMemoryResourceMonitorParams{
+	err := a.Database.UpdateMemoryResourceMonitor(dbauthz.AsResourceMonitor(ctx), database.UpdateMemoryResourceMonitorParams{
 		AgentID:   a.AgentID,
 		State:     newState,
 		UpdatedAt: dbtime.Time(a.Clock.Now()),
@@ -127,6 +139,11 @@ func (a *ResourcesMonitoringAPI) monitorMemory(ctx context.Context, datapoints [
 		return xerrors.Errorf("update workspace monitor: %w", err)
 	}

+	// Update cached state
+	a.memoryMonitor.State = newState
+	a.memoryMonitor.DebouncedUntil = dbtime.Time(debouncedUntil)
+	a.memoryMonitor.UpdatedAt = dbtime.Time(a.Clock.Now())
+
 	if !shouldNotify {
 		return nil
 	}
@@ -143,7 +160,7 @@ func (a *ResourcesMonitoringAPI) monitorMemory(ctx context.Context, datapoints [
 		notifications.TemplateWorkspaceOutOfMemory,
 		map[string]string{
 			"workspace": workspace.Name,
-			"threshold": fmt.Sprintf("%d%%", monitor.Threshold),
+			"threshold": fmt.Sprintf("%d%%", a.memoryMonitor.Threshold),
 		},
 		map[string]any{
 			// NOTE(DanielleMaywood):
@@ -169,14 +186,9 @@ func (a *ResourcesMonitoringAPI) monitorMemory(ctx context.Context, datapoints [
 }

 func (a *ResourcesMonitoringAPI) monitorVolumes(ctx context.Context, datapoints []*proto.PushResourcesMonitoringUsageRequest_Datapoint) error {
-	volumeMonitors, err := a.Database.FetchVolumesResourceMonitorsByAgentID(ctx, a.AgentID)
-	if err != nil {
-		return xerrors.Errorf("get or insert volume monitor: %w", err)
-	}
-
 	outOfDiskVolumes := make([]map[string]any, 0)

-	for _, monitor := range volumeMonitors {
+	for i, monitor := range a.volumeMonitors {
 		if !monitor.Enabled {
 			continue
 		}
@@ -219,6 +231,11 @@ func (a *ResourcesMonitoringAPI) monitorVolumes(ctx context.Context, datapoints
 		}); err != nil {
 			return xerrors.Errorf("update workspace monitor: %w", err)
 		}
+
+		// Update cached state
+		a.volumeMonitors[i].State = newState
+		a.volumeMonitors[i].DebouncedUntil = dbtime.Time(debouncedUntil)
+		a.volumeMonitors[i].UpdatedAt = dbtime.Time(a.Clock.Now())
 	}

 	if len(outOfDiskVolumes) == 0 {
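
One detail worth calling out from the hunks above: when no memory monitor row exists, `InitMonitors` leaves the cached field at its zero value, and `GetResourcesMonitoringConfiguration` detects that via `CreatedAt.IsZero()` instead of re-querying. A minimal stand-alone illustration of that sentinel pattern follows; the `MemoryMonitor` type here is a placeholder, not the generated `database.WorkspaceAgentMemoryResourceMonitor` model.

```go
package main

import (
	"fmt"
	"time"
)

// MemoryMonitor is a placeholder with the same fields the check relies on.
type MemoryMonitor struct {
	Enabled   bool
	Threshold int32
	CreatedAt time.Time
}

// memoryConfig returns a human-readable stand-in for the proto Memory section.
func memoryConfig(cached MemoryMonitor) string {
	// The zero time acts as the "not configured" sentinel: sql.ErrNoRows left
	// the cached struct untouched, so the Memory section is omitted entirely.
	if cached.CreatedAt.IsZero() {
		return "no memory monitor configured"
	}
	return fmt.Sprintf("memory monitor enabled=%v threshold=%d%%", cached.Enabled, cached.Threshold)
}

func main() {
	var notConfigured MemoryMonitor // zero value: no row found during InitMonitors
	configured := MemoryMonitor{Enabled: true, Threshold: 80, CreatedAt: time.Now()}

	fmt.Println(memoryConfig(notConfigured))
	fmt.Println(memoryConfig(configured))
}
```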

coderd/agentapi/resources_monitoring_test.go

Lines changed: 26 additions & 0 deletions

@@ -101,6 +101,9 @@ func TestMemoryResourceMonitorDebounce(t *testing.T) {
 		Threshold: 80,
 	})

+	// Initialize API to fetch and cache the monitors
+	require.NoError(t, api.InitMonitors(context.Background()))
+
 	// When: The monitor is given a state that will trigger NOK
 	_, err := api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
 		Datapoints: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
@@ -304,6 +307,9 @@ func TestMemoryResourceMonitor(t *testing.T) {
 		Threshold: 80,
 	})

+	// Initialize API to fetch and cache the monitors
+	require.NoError(t, api.InitMonitors(context.Background()))
+
 	clock.Set(collectedAt)
 	_, err := api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
 		Datapoints: datapoints,
@@ -337,6 +343,8 @@ func TestMemoryResourceMonitorMissingData(t *testing.T) {
 		State:     database.WorkspaceAgentMonitorStateOK,
 		Threshold: 80,
 	})
+	// Initialize API to fetch and cache the monitors
+	require.NoError(t, api.InitMonitors(context.Background()))

 	// When: A datapoint is missing, surrounded by two NOK datapoints.
 	_, err := api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
@@ -387,6 +395,9 @@ func TestMemoryResourceMonitorMissingData(t *testing.T) {
 		Threshold: 80,
 	})

+	// Initialize API to fetch and cache the monitors
+	require.NoError(t, api.InitMonitors(context.Background()))
+
 	// When: A datapoint is missing, surrounded by two OK datapoints.
 	_, err := api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
 		Datapoints: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
@@ -466,6 +477,9 @@ func TestVolumeResourceMonitorDebounce(t *testing.T) {
 		Threshold: 80,
 	})

+	// Initialize API to fetch and cache the monitors
+	require.NoError(t, api.InitMonitors(context.Background()))
+
 	// When:
 	// - First monitor is in a NOK state
 	// - Second monitor is in an OK state
@@ -742,6 +756,9 @@ func TestVolumeResourceMonitor(t *testing.T) {
 		Threshold: tt.thresholdPercent,
 	})

+	// Initialize API to fetch and cache the monitors
+	require.NoError(t, api.InitMonitors(context.Background()))
+
 	clock.Set(collectedAt)
 	_, err := api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
 		Datapoints: datapoints,
@@ -780,6 +797,9 @@ func TestVolumeResourceMonitorMultiple(t *testing.T) {
 		Threshold: 80,
 	})

+	// Initialize API to fetch and cache the monitors
+	require.NoError(t, api.InitMonitors(context.Background()))
+
 	// When: both of them move to a NOK state
 	_, err := api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
 		Datapoints: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
@@ -832,6 +852,9 @@ func TestVolumeResourceMonitorMissingData(t *testing.T) {
 		Threshold: 80,
 	})

+	// Initialize API to fetch and cache the monitors
+	require.NoError(t, api.InitMonitors(context.Background()))
+
 	// When: A datapoint is missing, surrounded by two NOK datapoints.
 	_, err := api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
 		Datapoints: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
@@ -891,6 +914,9 @@ func TestVolumeResourceMonitorMissingData(t *testing.T) {
 		Threshold: 80,
 	})

+	// Initialize API to fetch and cache the monitors
+	require.NoError(t, api.InitMonitors(context.Background()))
+
 	// When: A datapoint is missing, surrounded by two OK datapoints.
 	_, err := api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
 		Datapoints: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint{

0 commit comments
