@@ -4,20 +4,35 @@ import (
 	"context"
 	"database/sql"
 	"errors"
+	"fmt"
+	"time"
 
 	"golang.org/x/xerrors"
 
+	"cdr.dev/slog"
+
 	"github.com/google/uuid"
 
-	"cdr.dev/slog"
 	"github.com/coder/coder/v2/agent/proto"
+	"github.com/coder/coder/v2/coderd/agentapi/resourcesmonitor"
 	"github.com/coder/coder/v2/coderd/database"
+	"github.com/coder/coder/v2/coderd/database/dbauthz"
+	"github.com/coder/coder/v2/coderd/database/dbtime"
+	"github.com/coder/coder/v2/coderd/notifications"
+	"github.com/coder/quartz"
 )
 
 type ResourcesMonitoringAPI struct {
-	AgentID  uuid.UUID
-	Database database.Store
-	Log      slog.Logger
+	AgentID     uuid.UUID
+	WorkspaceID uuid.UUID
+
+	Log                   slog.Logger
+	Clock                 quartz.Clock
+	Database              database.Store
+	NotificationsEnqueuer notifications.Enqueuer
+
+	Debounce time.Duration
+	Config   resourcesmonitor.Config
 }
 
 func (a *ResourcesMonitoringAPI) GetResourcesMonitoringConfiguration(ctx context.Context, _ *proto.GetResourcesMonitoringConfigurationRequest) (*proto.GetResourcesMonitoringConfigurationResponse, error) {
@@ -33,8 +48,8 @@ func (a *ResourcesMonitoringAPI) GetResourcesMonitoringConfiguration(ctx context
 
 	return &proto.GetResourcesMonitoringConfigurationResponse{
 		Config: &proto.GetResourcesMonitoringConfigurationResponse_Config{
-			CollectionIntervalSeconds: 10,
-			NumDatapoints:             20,
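+			// The collection cadence and window size now come from the
+			// resourcesmonitor.Config on the API instead of hardcoded values.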
+			CollectionIntervalSeconds: int32(a.Config.CollectionInterval.Seconds()),
+			NumDatapoints:             a.Config.NumDatapoints,
 		},
 		Memory: func() *proto.GetResourcesMonitoringConfigurationResponse_Memory {
 			if memoryErr != nil {
@@ -60,8 +75,182 @@ func (a *ResourcesMonitoringAPI) GetResourcesMonitoringConfiguration(ctx context
 }
 
 func (a *ResourcesMonitoringAPI) PushResourcesMonitoringUsage(ctx context.Context, req *proto.PushResourcesMonitoringUsageRequest) (*proto.PushResourcesMonitoringUsageResponse, error) {
-	a.Log.Info(ctx, "resources monitoring usage received",
-		slog.F("request", req))
+	var err error
+
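+	// Evaluate memory and volume monitors independently, joining any errors so
+	// a failure in one does not prevent the other from being processed.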
+	if memoryErr := a.monitorMemory(ctx, req.Datapoints); memoryErr != nil {
+		err = errors.Join(err, xerrors.Errorf("monitor memory: %w", memoryErr))
+	}
+
+	if volumeErr := a.monitorVolumes(ctx, req.Datapoints); volumeErr != nil {
+		err = errors.Join(err, xerrors.Errorf("monitor volume: %w", volumeErr))
+	}
+
+	return &proto.PushResourcesMonitoringUsageResponse{}, err
+}
+
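+// monitorMemory evaluates the agent's memory monitor (if one exists) against
+// the pushed datapoints, persists the resulting state, and notifies the
+// workspace owner when the debouncer allows it.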
+func (a *ResourcesMonitoringAPI) monitorMemory(ctx context.Context, datapoints []*proto.PushResourcesMonitoringUsageRequest_Datapoint) error {
+	monitor, err := a.Database.FetchMemoryResourceMonitorsByAgentID(ctx, a.AgentID)
+	if err != nil {
+		// It is valid for an agent to not have a memory monitor, so we
+		// do not want to treat it as an error.
+		if errors.Is(err, sql.ErrNoRows) {
+			return nil
+		}
+
+		return xerrors.Errorf("fetch memory resource monitor: %w", err)
+	}
+
+	if !monitor.Enabled {
+		return nil
+	}
+
+	usageDatapoints := make([]*proto.PushResourcesMonitoringUsageRequest_Datapoint_MemoryUsage, 0, len(datapoints))
+	for _, datapoint := range datapoints {
+		usageDatapoints = append(usageDatapoints, datapoint.Memory)
+	}
+
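+	// Classify each datapoint as OK or NOK relative to the monitor's threshold.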
+	usageStates := resourcesmonitor.CalculateMemoryUsageStates(monitor, usageDatapoints)
+
+	oldState := monitor.State
+	newState := resourcesmonitor.NextState(a.Config, oldState, usageStates)
+
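+	// The debouncer decides whether this state transition should fire a
+	// notification, and until when further notifications are suppressed.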
+	debouncedUntil, shouldNotify := monitor.Debounce(a.Debounce, a.Clock.Now(), oldState, newState)
+
+	//nolint:gocritic // We need to be able to update the resource monitor here.
+	err = a.Database.UpdateMemoryResourceMonitor(dbauthz.AsResourceMonitor(ctx), database.UpdateMemoryResourceMonitorParams{
+		AgentID:        a.AgentID,
+		State:          newState,
+		UpdatedAt:      dbtime.Time(a.Clock.Now()),
+		DebouncedUntil: dbtime.Time(debouncedUntil),
+	})
+	if err != nil {
+		return xerrors.Errorf("update memory resource monitor: %w", err)
+	}
+
+	if !shouldNotify {
+		return nil
+	}
+
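+	// Fetch the workspace to resolve its name and owner for the notification.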
+	workspace, err := a.Database.GetWorkspaceByID(ctx, a.WorkspaceID)
+	if err != nil {
+		return xerrors.Errorf("get workspace by id: %w", err)
+	}
+
+	_, err = a.NotificationsEnqueuer.EnqueueWithData(
+		//nolint:gocritic // We need to be able to send the notification.
+		dbauthz.AsNotifier(ctx),
+		workspace.OwnerID,
+		notifications.TemplateWorkspaceOutOfMemory,
+		map[string]string{
+			"workspace": workspace.Name,
+			"threshold": fmt.Sprintf("%d%%", monitor.Threshold),
+		},
+		map[string]any{
+			// NOTE(DanielleMaywood):
+			// When notifications are enqueued, they are checked to be
+			// unique within a single day. This means that if we attempt
+			// to send two OOM notifications for the same workspace on
+			// the same day, the enqueuer will prevent us from sending
+			// a second one. We inject a timestamp to make the
+			// notifications appear different enough to circumvent this
+			// deduplication logic.
+			"timestamp": a.Clock.Now(),
+		},
+		"workspace-monitor-memory",
+	)
+	if err != nil {
+		return xerrors.Errorf("notify workspace OOM: %w", err)
+	}
+
+	return nil
+}
+
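+// monitorVolumes evaluates every enabled volume monitor for the agent,
+// persists each monitor's state, and sends a single notification covering
+// all volumes that should notify.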
+func (a *ResourcesMonitoringAPI) monitorVolumes(ctx context.Context, datapoints []*proto.PushResourcesMonitoringUsageRequest_Datapoint) error {
+	volumeMonitors, err := a.Database.FetchVolumesResourceMonitorsByAgentID(ctx, a.AgentID)
+	if err != nil {
+		return xerrors.Errorf("fetch volume resource monitors: %w", err)
+	}
+
+	outOfDiskVolumes := make([]map[string]any, 0)
+
+	for _, monitor := range volumeMonitors {
+		if !monitor.Enabled {
+			continue
+		}
+
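+		// Match each datapoint to this monitor's path. Datapoints that do not
+		// report this volume are appended as nil, leaving the gap visible to
+		// the state calculation.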
+		usageDatapoints := make([]*proto.PushResourcesMonitoringUsageRequest_Datapoint_VolumeUsage, 0, len(datapoints))
+		for _, datapoint := range datapoints {
+			var usage *proto.PushResourcesMonitoringUsageRequest_Datapoint_VolumeUsage
+
+			for _, volume := range datapoint.Volumes {
+				if volume.Volume == monitor.Path {
+					usage = volume
+					break
+				}
+			}
+
+			usageDatapoints = append(usageDatapoints, usage)
+		}
+
+		usageStates := resourcesmonitor.CalculateVolumeUsageStates(monitor, usageDatapoints)
+
+		oldState := monitor.State
+		newState := resourcesmonitor.NextState(a.Config, oldState, usageStates)
+
+		debouncedUntil, shouldNotify := monitor.Debounce(a.Debounce, a.Clock.Now(), oldState, newState)
+
+		if shouldNotify {
+			outOfDiskVolumes = append(outOfDiskVolumes, map[string]any{
+				"path":      monitor.Path,
+				"threshold": fmt.Sprintf("%d%%", monitor.Threshold),
+			})
+		}
+
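+		// Persist the new state for every monitor, not just the ones that
+		// notify, so the debounce window is tracked across pushes.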
+		//nolint:gocritic // We need to be able to update the resource monitor here.
+		if err := a.Database.UpdateVolumeResourceMonitor(dbauthz.AsResourceMonitor(ctx), database.UpdateVolumeResourceMonitorParams{
+			AgentID:        a.AgentID,
+			Path:           monitor.Path,
+			State:          newState,
+			UpdatedAt:      dbtime.Time(a.Clock.Now()),
+			DebouncedUntil: dbtime.Time(debouncedUntil),
+		}); err != nil {
+			return xerrors.Errorf("update volume resource monitor: %w", err)
+		}
+	}
+
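+	// Send one notification covering all offending volumes rather than one
+	// notification per volume.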
+	if len(outOfDiskVolumes) == 0 {
+		return nil
+	}
+
+	workspace, err := a.Database.GetWorkspaceByID(ctx, a.WorkspaceID)
+	if err != nil {
+		return xerrors.Errorf("get workspace by id: %w", err)
+	}
+
+	if _, err := a.NotificationsEnqueuer.EnqueueWithData(
+		//nolint:gocritic // We need to be able to send the notification.
+		dbauthz.AsNotifier(ctx),
+		workspace.OwnerID,
+		notifications.TemplateWorkspaceOutOfDisk,
+		map[string]string{
+			"workspace": workspace.Name,
+		},
+		map[string]any{
+			"volumes": outOfDiskVolumes,
+			// NOTE(DanielleMaywood):
+			// When notifications are enqueued, they are checked to be
+			// unique within a single day. This means that if we attempt
+			// to send two out-of-disk notifications for the same workspace
+			// on the same day, the enqueuer will prevent us from sending
+			// a second one. We inject a timestamp to make the
+			// notifications appear different enough to circumvent this
+			// deduplication logic.
+			"timestamp": a.Clock.Now(),
+		},
+		"workspace-monitor-volumes",
+	); err != nil {
+		return xerrors.Errorf("notify workspace OOD: %w", err)
+	}
 
-	return &proto.PushResourcesMonitoringUsageResponse{}, nil
+	return nil
 }