Commit 45c43d4

fix: refactor agent resource monitoring API to avoid excessive calls to DB (#20430)
This should resolve coder/internal#728 by refactoring the ResourceMonitor API struct to only require querying the resource monitor once for memory and once for volumes, then using the stored monitors on the API struct from that point on. This should eliminate the vast majority of calls to `GetWorkspaceByAgentID` and `FetchVolumesResourceMonitorsUpdatedAfter`/`FetchMemoryResourceMonitorsUpdatedAfter` (millions of calls per week).

Tests passed, and I ran an instance of coder via a workspace with a template that added resource monitoring every 10s. Note that this is the default docker container, so there are other sources of `GetWorkspaceByAgentID` db queries. Note that this workspace was running for ~15 minutes at the time I gathered this data.

Over 30s for the `ResourceMonitor` calls:

```
coder@callum-coder-2:~/coder$ curl localhost:19090/metrics | grep ResourceMonitor | grep count
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
coderd_db_query_latencies_seconds_count{query="FetchMemoryResourceMonitorsByAgentID"} 2
coderd_db_query_latencies_seconds_count{query="FetchMemoryResourceMonitorsUpdatedAfter"} 2
100  288k    0  288k    0     0  58.3M      0 --:--:-- --:--:-- --:--:-- 70.4M
coderd_db_query_latencies_seconds_count{query="FetchVolumesResourceMonitorsByAgentID"} 2
coderd_db_query_latencies_seconds_count{query="FetchVolumesResourceMonitorsUpdatedAfter"} 2
coderd_db_query_latencies_seconds_count{query="UpdateMemoryResourceMonitor"} 155
coderd_db_query_latencies_seconds_count{query="UpdateVolumeResourceMonitor"} 155
coder@callum-coder-2:~/coder$ curl localhost:19090/metrics | grep ResourceMonitor | grep count
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
coderd_db_query_latencies_seconds_count{query="FetchMemoryResourceMonitorsByAgentID"} 2
coderd_db_query_latencies_seconds_count{query="FetchMemoryResourceMonitorsUpdatedAfter"} 2
100  288k    0  288k    0     0  34.7M      0 --:--:-- --:--:-- --:--:-- 40.2M
coderd_db_query_latencies_seconds_count{query="FetchVolumesResourceMonitorsByAgentID"} 2
coderd_db_query_latencies_seconds_count{query="FetchVolumesResourceMonitorsUpdatedAfter"} 2
coderd_db_query_latencies_seconds_count{query="UpdateMemoryResourceMonitor"} 158
coderd_db_query_latencies_seconds_count{query="UpdateVolumeResourceMonitor"} 158
```

And over 1m for the `GetWorkspaceByAgentID` calls; the majority of these come from the workspace metadata stats updates:

```
coder@callum-coder-2:~/coder$ curl localhost:19090/metrics | grep GetWorkspaceByAgentID | grep count
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  284k    0  284k    0     0  42.4M      0 --:--:-- --:--:-- --:--:-- 46.3M
coderd_db_query_latencies_seconds_count{query="GetWorkspaceByAgentID"} 876
coder@callum-coder-2:~/coder$ curl localhost:19090/metrics | grep GetWorkspaceByAgentID | grep count
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  284k    0  284k    0     0  75.4M      0 --:--:-- --:--:-- --:--:-- 92.7M
coderd_db_query_latencies_seconds_count{query="GetWorkspaceByAgentID"} 918
```

---------

Signed-off-by: Callum Styan <callumstyan@gmail.com>
1 parent a1e7e10 · commit 45c43d4
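
For readers skimming the diff below, the core of the change is a fetch-once-and-cache pattern: query the database one time when the agent connection is set up, keep the monitors on the API struct, and mutate that cached copy after each update instead of re-reading rows. The following is a minimal, self-contained sketch of that pattern, not the actual coder implementation; `Monitor`, `store`, `fakeStore`, and `cachedMonitors` are illustrative stand-ins for the real database models and querier.

```go
package main

import (
	"context"
	"fmt"
	"sync"
)

// Monitor is a stand-in for a resource-monitor row (memory or volume);
// the real code uses the generated database models instead.
type Monitor struct {
	Path    string
	Enabled bool
	State   string
}

// store is a stand-in for the database layer (in coder, the generated
// querier with FetchMemoryResourceMonitorsByAgentID and friends).
type store interface {
	FetchMonitors(ctx context.Context) ([]Monitor, error)
}

// cachedMonitors queries the store once and serves every later call
// from memory, which is the essence of the refactor.
type cachedMonitors struct {
	db store

	mu       sync.RWMutex
	monitors []Monitor
}

// initMonitors performs the single fetch; call it once per agent connection.
func (c *cachedMonitors) initMonitors(ctx context.Context) error {
	mons, err := c.db.FetchMonitors(ctx)
	if err != nil {
		return fmt.Errorf("fetch monitors: %w", err)
	}
	c.mu.Lock()
	defer c.mu.Unlock()
	c.monitors = mons
	return nil
}

// setState mutates the cached copy after the DB row has been updated,
// so the next push reads fresh state without another query.
func (c *cachedMonitors) setState(i int, state string) {
	c.mu.Lock()
	defer c.mu.Unlock()
	c.monitors[i].State = state
}

// fakeStore returns canned data so the example runs without a database.
type fakeStore struct{}

func (fakeStore) FetchMonitors(context.Context) ([]Monitor, error) {
	return []Monitor{{Path: "/home/coder", Enabled: true, State: "OK"}}, nil
}

func main() {
	c := &cachedMonitors{db: fakeStore{}}
	if err := c.initMonitors(context.Background()); err != nil {
		panic(err)
	}
	c.setState(0, "NOK") // later pushes touch only the cache, not the DB
	fmt.Println(c.monitors)
}
```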

File tree

3 files changed: +82 -35 lines


coderd/agentapi/api.go

Lines changed: 4 additions & 0 deletions

@@ -239,6 +239,10 @@ func (a *API) Serve(ctx context.Context, l net.Listener) error {
 		return xerrors.Errorf("create agent API server: %w", err)
 	}

+	if err := a.ResourcesMonitoringAPI.InitMonitors(ctx); err != nil {
+		return xerrors.Errorf("initialize resource monitoring: %w", err)
+	}
+
 	return server.Serve(ctx, l)
 }
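
The hunk above pins down the lifecycle: the monitor cache is primed once per agent connection, before the RPC server starts handling requests, and a failure tears the connection down. Below is a reduced sketch of that ordering only; the `monitorCache` type and `serveConnection` function are illustrative placeholders, not the real coderd wiring.

```go
package main

import (
	"context"
	"fmt"
)

// monitorCache stands in for ResourcesMonitoringAPI: it must be primed
// exactly once before the connection starts serving RPCs.
type monitorCache struct{ primed bool }

func (m *monitorCache) InitMonitors(_ context.Context) error {
	m.primed = true // real code: two DB fetches, results stored on the struct
	return nil
}

// serveConnection mirrors the ordering in (*API).Serve: prime the cache,
// then hand off to the long-running RPC server.
func serveConnection(ctx context.Context, cache *monitorCache, serve func(context.Context) error) error {
	if err := cache.InitMonitors(ctx); err != nil {
		// A failure here tears the connection down before any RPC runs.
		return fmt.Errorf("initialize resource monitoring: %w", err)
	}
	return serve(ctx)
}

func main() {
	cache := &monitorCache{}
	err := serveConnection(context.Background(), cache, func(context.Context) error {
		fmt.Println("serving RPCs with primed cache:", cache.primed)
		return nil
	})
	if err != nil {
		fmt.Println("connection error:", err)
	}
}
```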

coderd/agentapi/resources_monitoring.go

Lines changed: 52 additions & 35 deletions

@@ -5,6 +5,7 @@ import (
 	"database/sql"
 	"errors"
 	"fmt"
+	"sync"
 	"time"

 	"golang.org/x/xerrors"
@@ -33,42 +34,60 @@ type ResourcesMonitoringAPI struct {

 	Debounce time.Duration
 	Config   resourcesmonitor.Config
+
+	// Cache resource monitors on first call to avoid millions of DB queries per day.
+	memoryMonitor  database.WorkspaceAgentMemoryResourceMonitor
+	volumeMonitors []database.WorkspaceAgentVolumeResourceMonitor
+	monitorsLock   sync.RWMutex
 }

-func (a *ResourcesMonitoringAPI) GetResourcesMonitoringConfiguration(ctx context.Context, _ *proto.GetResourcesMonitoringConfigurationRequest) (*proto.GetResourcesMonitoringConfigurationResponse, error) {
-	memoryMonitor, memoryErr := a.Database.FetchMemoryResourceMonitorsByAgentID(ctx, a.AgentID)
-	if memoryErr != nil && !errors.Is(memoryErr, sql.ErrNoRows) {
-		return nil, xerrors.Errorf("failed to fetch memory resource monitor: %w", memoryErr)
+// InitMonitors fetches resource monitors from the database and caches them.
+// This must be called once after creating a ResourcesMonitoringAPI, the context should be
+// the agent per-RPC connection context. If fetching fails with a real error (not sql.ErrNoRows), the
+// connection should be torn down.
+func (a *ResourcesMonitoringAPI) InitMonitors(ctx context.Context) error {
+	memMon, err := a.Database.FetchMemoryResourceMonitorsByAgentID(ctx, a.AgentID)
+	if err != nil && !errors.Is(err, sql.ErrNoRows) {
+		return xerrors.Errorf("fetch memory resource monitor: %w", err)
+	}
+	// If sql.ErrNoRows, memoryMonitor stays as zero value (CreatedAt.IsZero() = true).
+	// Otherwise, store the fetched monitor.
+	if err == nil {
+		a.memoryMonitor = memMon
 	}

-	volumeMonitors, err := a.Database.FetchVolumesResourceMonitorsByAgentID(ctx, a.AgentID)
+	volMons, err := a.Database.FetchVolumesResourceMonitorsByAgentID(ctx, a.AgentID)
 	if err != nil {
-		return nil, xerrors.Errorf("failed to fetch volume resource monitors: %w", err)
+		return xerrors.Errorf("fetch volume resource monitors: %w", err)
 	}
+	// 0 length is valid, indicating none configured, since the volume monitors in the DB can be many.
+	a.volumeMonitors = volMons
+
+	return nil
+}

+func (a *ResourcesMonitoringAPI) GetResourcesMonitoringConfiguration(_ context.Context, _ *proto.GetResourcesMonitoringConfigurationRequest) (*proto.GetResourcesMonitoringConfigurationResponse, error) {
 	return &proto.GetResourcesMonitoringConfigurationResponse{
 		Config: &proto.GetResourcesMonitoringConfigurationResponse_Config{
 			CollectionIntervalSeconds: int32(a.Config.CollectionInterval.Seconds()),
 			NumDatapoints:             a.Config.NumDatapoints,
 		},
 		Memory: func() *proto.GetResourcesMonitoringConfigurationResponse_Memory {
-			if memoryErr != nil {
+			if a.memoryMonitor.CreatedAt.IsZero() {
 				return nil
 			}
-
 			return &proto.GetResourcesMonitoringConfigurationResponse_Memory{
-				Enabled: memoryMonitor.Enabled,
+				Enabled: a.memoryMonitor.Enabled,
 			}
 		}(),
 		Volumes: func() []*proto.GetResourcesMonitoringConfigurationResponse_Volume {
-			volumes := make([]*proto.GetResourcesMonitoringConfigurationResponse_Volume, 0, len(volumeMonitors))
-			for _, monitor := range volumeMonitors {
+			volumes := make([]*proto.GetResourcesMonitoringConfigurationResponse_Volume, 0, len(a.volumeMonitors))
+			for _, monitor := range a.volumeMonitors {
 				volumes = append(volumes, &proto.GetResourcesMonitoringConfigurationResponse_Volume{
 					Enabled: monitor.Enabled,
 					Path:    monitor.Path,
 				})
 			}
-
 			return volumes
 		}(),
 	}, nil
@@ -77,6 +96,10 @@ func (a *ResourcesMonitoringAPI) GetResourcesMonitoringConfiguration(ctx context
 func (a *ResourcesMonitoringAPI) PushResourcesMonitoringUsage(ctx context.Context, req *proto.PushResourcesMonitoringUsageRequest) (*proto.PushResourcesMonitoringUsageResponse, error) {
 	var err error

+	// Lock for the entire push operation since calls are sequential from the agent
+	a.monitorsLock.Lock()
+	defer a.monitorsLock.Unlock()
+
 	if memoryErr := a.monitorMemory(ctx, req.Datapoints); memoryErr != nil {
 		err = errors.Join(err, xerrors.Errorf("monitor memory: %w", memoryErr))
 	}
@@ -89,18 +112,7 @@ func (a *ResourcesMonitoringAPI) PushResourcesMonitoringUsage(ctx context.Contex
 }

 func (a *ResourcesMonitoringAPI) monitorMemory(ctx context.Context, datapoints []*proto.PushResourcesMonitoringUsageRequest_Datapoint) error {
-	monitor, err := a.Database.FetchMemoryResourceMonitorsByAgentID(ctx, a.AgentID)
-	if err != nil {
-		// It is valid for an agent to not have a memory monitor, so we
-		// do not want to treat it as an error.
-		if errors.Is(err, sql.ErrNoRows) {
-			return nil
-		}
-
-		return xerrors.Errorf("fetch memory resource monitor: %w", err)
-	}
-
-	if !monitor.Enabled {
+	if !a.memoryMonitor.Enabled {
 		return nil
 	}

@@ -109,15 +121,15 @@ func (a *ResourcesMonitoringAPI) monitorMemory(ctx context.Context, datapoints [
 		usageDatapoints = append(usageDatapoints, datapoint.Memory)
 	}

-	usageStates := resourcesmonitor.CalculateMemoryUsageStates(monitor, usageDatapoints)
+	usageStates := resourcesmonitor.CalculateMemoryUsageStates(a.memoryMonitor, usageDatapoints)

-	oldState := monitor.State
+	oldState := a.memoryMonitor.State
 	newState := resourcesmonitor.NextState(a.Config, oldState, usageStates)

-	debouncedUntil, shouldNotify := monitor.Debounce(a.Debounce, a.Clock.Now(), oldState, newState)
+	debouncedUntil, shouldNotify := a.memoryMonitor.Debounce(a.Debounce, a.Clock.Now(), oldState, newState)

 	//nolint:gocritic // We need to be able to update the resource monitor here.
-	err = a.Database.UpdateMemoryResourceMonitor(dbauthz.AsResourceMonitor(ctx), database.UpdateMemoryResourceMonitorParams{
+	err := a.Database.UpdateMemoryResourceMonitor(dbauthz.AsResourceMonitor(ctx), database.UpdateMemoryResourceMonitorParams{
 		AgentID:   a.AgentID,
 		State:     newState,
 		UpdatedAt: dbtime.Time(a.Clock.Now()),
@@ -127,6 +139,11 @@ func (a *ResourcesMonitoringAPI) monitorMemory(ctx context.Context, datapoints [
 		return xerrors.Errorf("update workspace monitor: %w", err)
 	}

+	// Update cached state
+	a.memoryMonitor.State = newState
+	a.memoryMonitor.DebouncedUntil = dbtime.Time(debouncedUntil)
+	a.memoryMonitor.UpdatedAt = dbtime.Time(a.Clock.Now())
+
 	if !shouldNotify {
 		return nil
 	}
@@ -143,7 +160,7 @@ func (a *ResourcesMonitoringAPI) monitorMemory(ctx context.Context, datapoints [
 		notifications.TemplateWorkspaceOutOfMemory,
 		map[string]string{
 			"workspace": workspace.Name,
-			"threshold": fmt.Sprintf("%d%%", monitor.Threshold),
+			"threshold": fmt.Sprintf("%d%%", a.memoryMonitor.Threshold),
 		},
 		map[string]any{
 			// NOTE(DanielleMaywood):
@@ -169,14 +186,9 @@ func (a *ResourcesMonitoringAPI) monitorMemory(ctx context.Context, datapoints [
 }

 func (a *ResourcesMonitoringAPI) monitorVolumes(ctx context.Context, datapoints []*proto.PushResourcesMonitoringUsageRequest_Datapoint) error {
-	volumeMonitors, err := a.Database.FetchVolumesResourceMonitorsByAgentID(ctx, a.AgentID)
-	if err != nil {
-		return xerrors.Errorf("get or insert volume monitor: %w", err)
-	}
-
 	outOfDiskVolumes := make([]map[string]any, 0)

-	for _, monitor := range volumeMonitors {
+	for i, monitor := range a.volumeMonitors {
 		if !monitor.Enabled {
 			continue
 		}
@@ -219,6 +231,11 @@ func (a *ResourcesMonitoringAPI) monitorVolumes(ctx context.Context, datapoints
 		}); err != nil {
 			return xerrors.Errorf("update workspace monitor: %w", err)
 		}
+
+		// Update cached state
+		a.volumeMonitors[i].State = newState
+		a.volumeMonitors[i].DebouncedUntil = dbtime.Time(debouncedUntil)
+		a.volumeMonitors[i].UpdatedAt = dbtime.Time(a.Clock.Now())
 	}

 	if len(outOfDiskVolumes) == 0 {
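
One detail worth calling out from the hunks above: when no memory monitor row exists, `InitMonitors` leaves the cached field at its zero value, and `GetResourcesMonitoringConfiguration` detects that via `CreatedAt.IsZero()` instead of re-querying. A minimal stand-alone illustration of that sentinel pattern follows; the `MemoryMonitor` type here is a placeholder, not the generated `database.WorkspaceAgentMemoryResourceMonitor` model.

```go
package main

import (
	"fmt"
	"time"
)

// MemoryMonitor is a placeholder with the same fields the check relies on.
type MemoryMonitor struct {
	Enabled   bool
	Threshold int32
	CreatedAt time.Time
}

// memoryConfig returns a human-readable stand-in for the proto Memory section.
func memoryConfig(cached MemoryMonitor) string {
	// The zero time acts as the "not configured" sentinel: sql.ErrNoRows left
	// the cached struct untouched, so the Memory section is omitted entirely.
	if cached.CreatedAt.IsZero() {
		return "no memory monitor configured"
	}
	return fmt.Sprintf("memory monitor enabled=%v threshold=%d%%", cached.Enabled, cached.Threshold)
}

func main() {
	var notConfigured MemoryMonitor // zero value: no row found during InitMonitors
	configured := MemoryMonitor{Enabled: true, Threshold: 80, CreatedAt: time.Now()}

	fmt.Println(memoryConfig(notConfigured))
	fmt.Println(memoryConfig(configured))
}
```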

coderd/agentapi/resources_monitoring_test.go

Lines changed: 26 additions & 0 deletions

@@ -101,6 +101,9 @@ func TestMemoryResourceMonitorDebounce(t *testing.T) {
 		Threshold: 80,
 	})

+	// Initialize API to fetch and cache the monitors
+	require.NoError(t, api.InitMonitors(context.Background()))
+
 	// When: The monitor is given a state that will trigger NOK
 	_, err := api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
 		Datapoints: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
@@ -304,6 +307,9 @@ func TestMemoryResourceMonitor(t *testing.T) {
 		Threshold: 80,
 	})

+	// Initialize API to fetch and cache the monitors
+	require.NoError(t, api.InitMonitors(context.Background()))
+
 	clock.Set(collectedAt)
 	_, err := api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
 		Datapoints: datapoints,
@@ -337,6 +343,8 @@ func TestMemoryResourceMonitorMissingData(t *testing.T) {
 		State:     database.WorkspaceAgentMonitorStateOK,
 		Threshold: 80,
 	})
+	// Initialize API to fetch and cache the monitors
+	require.NoError(t, api.InitMonitors(context.Background()))

 	// When: A datapoint is missing, surrounded by two NOK datapoints.
 	_, err := api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
@@ -387,6 +395,9 @@ func TestMemoryResourceMonitorMissingData(t *testing.T) {
 		Threshold: 80,
 	})

+	// Initialize API to fetch and cache the monitors
+	require.NoError(t, api.InitMonitors(context.Background()))
+
 	// When: A datapoint is missing, surrounded by two OK datapoints.
 	_, err := api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
 		Datapoints: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
@@ -466,6 +477,9 @@ func TestVolumeResourceMonitorDebounce(t *testing.T) {
 		Threshold: 80,
 	})

+	// Initialize API to fetch and cache the monitors
+	require.NoError(t, api.InitMonitors(context.Background()))
+
 	// When:
 	// - First monitor is in a NOK state
 	// - Second monitor is in an OK state
@@ -742,6 +756,9 @@ func TestVolumeResourceMonitor(t *testing.T) {
 		Threshold: tt.thresholdPercent,
 	})

+	// Initialize API to fetch and cache the monitors
+	require.NoError(t, api.InitMonitors(context.Background()))
+
 	clock.Set(collectedAt)
 	_, err := api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
 		Datapoints: datapoints,
@@ -780,6 +797,9 @@ func TestVolumeResourceMonitorMultiple(t *testing.T) {
 		Threshold: 80,
 	})

+	// Initialize API to fetch and cache the monitors
+	require.NoError(t, api.InitMonitors(context.Background()))
+
 	// When: both of them move to a NOK state
 	_, err := api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
 		Datapoints: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
@@ -832,6 +852,9 @@ func TestVolumeResourceMonitorMissingData(t *testing.T) {
 		Threshold: 80,
 	})

+	// Initialize API to fetch and cache the monitors
+	require.NoError(t, api.InitMonitors(context.Background()))
+
 	// When: A datapoint is missing, surrounded by two NOK datapoints.
 	_, err := api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
 		Datapoints: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
@@ -891,6 +914,9 @@ func TestVolumeResourceMonitorMissingData(t *testing.T) {
 		Threshold: 80,
 	})

+	// Initialize API to fetch and cache the monitors
+	require.NoError(t, api.InitMonitors(context.Background()))
+
 	// When: A datapoint is missing, surrounded by two OK datapoints.
 	_, err := api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
 		Datapoints: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint{

0 commit comments
