Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commitd6b9806

Browse files
chore: implement oom/ood processing component (#16436)
Implements the processing logic as set out in the OOM/OOD RFC.
1 parentb5329ae commitd6b9806

26 files changed

+1823
-113
lines changed

‎coderd/agentapi/api.go

Lines changed: 25 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,12 @@ import (
1717

1818
"cdr.dev/slog"
1919
agentproto"github.com/coder/coder/v2/agent/proto"
20+
"github.com/coder/coder/v2/coderd/agentapi/resourcesmonitor"
2021
"github.com/coder/coder/v2/coderd/appearance"
2122
"github.com/coder/coder/v2/coderd/database"
2223
"github.com/coder/coder/v2/coderd/database/pubsub"
2324
"github.com/coder/coder/v2/coderd/externalauth"
25+
"github.com/coder/coder/v2/coderd/notifications"
2426
"github.com/coder/coder/v2/coderd/prometheusmetrics"
2527
"github.com/coder/coder/v2/coderd/tracing"
2628
"github.com/coder/coder/v2/coderd/workspacestats"
@@ -29,6 +31,7 @@ import (
2931
"github.com/coder/coder/v2/codersdk/agentsdk"
3032
"github.com/coder/coder/v2/tailnet"
3133
tailnetproto"github.com/coder/coder/v2/tailnet/proto"
34+
"github.com/coder/quartz"
3235
)
3336

3437
// API implements the DRPC agent API interface from agent/proto. This struct is
@@ -59,7 +62,9 @@ type Options struct {
5962

6063
Ctx context.Context
6164
Log slog.Logger
65+
Clock quartz.Clock
6266
Database database.Store
67+
NotificationsEnqueuer notifications.Enqueuer
6368
Pubsub pubsub.Pubsub
6469
DerpMapFnfunc()*tailcfg.DERPMap
6570
TailnetCoordinator*atomic.Pointer[tailnet.Coordinator]
@@ -82,6 +87,10 @@ type Options struct {
8287
}
8388

8489
funcNew(optsOptions)*API {
90+
ifopts.Clock==nil {
91+
opts.Clock=quartz.NewReal()
92+
}
93+
8594
api:=&API{
8695
opts:opts,
8796
mu: sync.Mutex{},
@@ -104,9 +113,22 @@ func New(opts Options) *API {
104113
}
105114

106115
api.ResourcesMonitoringAPI=&ResourcesMonitoringAPI{
107-
Log:opts.Log,
108-
AgentID:opts.AgentID,
109-
Database:opts.Database,
116+
AgentID:opts.AgentID,
117+
WorkspaceID:opts.WorkspaceID,
118+
Clock:opts.Clock,
119+
Database:opts.Database,
120+
NotificationsEnqueuer:opts.NotificationsEnqueuer,
121+
Debounce:5*time.Minute,
122+
123+
Config: resourcesmonitor.Config{
124+
NumDatapoints:20,
125+
CollectionInterval:10*time.Second,
126+
127+
Alert: resourcesmonitor.AlertConfig{
128+
MinimumNOKsPercent:20,
129+
ConsecutiveNOKsPercent:50,
130+
},
131+
},
110132
}
111133

112134
api.StatsAPI=&StatsAPI{

‎coderd/agentapi/resources_monitoring.go

Lines changed: 198 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -4,20 +4,35 @@ import (
44
"context"
55
"database/sql"
66
"errors"
7+
"fmt"
8+
"time"
79

810
"golang.org/x/xerrors"
911

12+
"cdr.dev/slog"
13+
1014
"github.com/google/uuid"
1115

12-
"cdr.dev/slog"
1316
"github.com/coder/coder/v2/agent/proto"
17+
"github.com/coder/coder/v2/coderd/agentapi/resourcesmonitor"
1418
"github.com/coder/coder/v2/coderd/database"
19+
"github.com/coder/coder/v2/coderd/database/dbauthz"
20+
"github.com/coder/coder/v2/coderd/database/dbtime"
21+
"github.com/coder/coder/v2/coderd/notifications"
22+
"github.com/coder/quartz"
1523
)
1624

1725
typeResourcesMonitoringAPIstruct {
18-
AgentID uuid.UUID
19-
Database database.Store
20-
Log slog.Logger
26+
AgentID uuid.UUID
27+
WorkspaceID uuid.UUID
28+
29+
Log slog.Logger
30+
Clock quartz.Clock
31+
Database database.Store
32+
NotificationsEnqueuer notifications.Enqueuer
33+
34+
Debounce time.Duration
35+
Config resourcesmonitor.Config
2136
}
2237

2338
func (a*ResourcesMonitoringAPI)GetResourcesMonitoringConfiguration(ctx context.Context,_*proto.GetResourcesMonitoringConfigurationRequest) (*proto.GetResourcesMonitoringConfigurationResponse,error) {
@@ -33,8 +48,8 @@ func (a *ResourcesMonitoringAPI) GetResourcesMonitoringConfiguration(ctx context
3348

3449
return&proto.GetResourcesMonitoringConfigurationResponse{
3550
Config:&proto.GetResourcesMonitoringConfigurationResponse_Config{
36-
CollectionIntervalSeconds:10,
37-
NumDatapoints:20,
51+
CollectionIntervalSeconds:int32(a.Config.CollectionInterval.Seconds()),
52+
NumDatapoints:a.Config.NumDatapoints,
3853
},
3954
Memory:func()*proto.GetResourcesMonitoringConfigurationResponse_Memory {
4055
ifmemoryErr!=nil {
@@ -60,8 +75,182 @@ func (a *ResourcesMonitoringAPI) GetResourcesMonitoringConfiguration(ctx context
6075
}
6176

6277
func (a*ResourcesMonitoringAPI)PushResourcesMonitoringUsage(ctx context.Context,req*proto.PushResourcesMonitoringUsageRequest) (*proto.PushResourcesMonitoringUsageResponse,error) {
63-
a.Log.Info(ctx,"resources monitoring usage received",
64-
slog.F("request",req))
78+
varerrerror
79+
80+
ifmemoryErr:=a.monitorMemory(ctx,req.Datapoints);memoryErr!=nil {
81+
err=errors.Join(err,xerrors.Errorf("monitor memory: %w",memoryErr))
82+
}
83+
84+
ifvolumeErr:=a.monitorVolumes(ctx,req.Datapoints);volumeErr!=nil {
85+
err=errors.Join(err,xerrors.Errorf("monitor volume: %w",volumeErr))
86+
}
87+
88+
return&proto.PushResourcesMonitoringUsageResponse{},err
89+
}
90+
91+
func (a*ResourcesMonitoringAPI)monitorMemory(ctx context.Context,datapoints []*proto.PushResourcesMonitoringUsageRequest_Datapoint)error {
92+
monitor,err:=a.Database.FetchMemoryResourceMonitorsByAgentID(ctx,a.AgentID)
93+
iferr!=nil {
94+
// It is valid for an agent to not have a memory monitor, so we
95+
// do not want to treat it as an error.
96+
iferrors.Is(err,sql.ErrNoRows) {
97+
returnnil
98+
}
99+
100+
returnxerrors.Errorf("fetch memory resource monitor: %w",err)
101+
}
102+
103+
if!monitor.Enabled {
104+
returnnil
105+
}
106+
107+
usageDatapoints:=make([]*proto.PushResourcesMonitoringUsageRequest_Datapoint_MemoryUsage,0,len(datapoints))
108+
for_,datapoint:=rangedatapoints {
109+
usageDatapoints=append(usageDatapoints,datapoint.Memory)
110+
}
111+
112+
usageStates:=resourcesmonitor.CalculateMemoryUsageStates(monitor,usageDatapoints)
113+
114+
oldState:=monitor.State
115+
newState:=resourcesmonitor.NextState(a.Config,oldState,usageStates)
116+
117+
debouncedUntil,shouldNotify:=monitor.Debounce(a.Debounce,a.Clock.Now(),oldState,newState)
118+
119+
//nolint:gocritic // We need to be able to update the resource monitor here.
120+
err=a.Database.UpdateMemoryResourceMonitor(dbauthz.AsResourceMonitor(ctx), database.UpdateMemoryResourceMonitorParams{
121+
AgentID:a.AgentID,
122+
State:newState,
123+
UpdatedAt:dbtime.Time(a.Clock.Now()),
124+
DebouncedUntil:dbtime.Time(debouncedUntil),
125+
})
126+
iferr!=nil {
127+
returnxerrors.Errorf("update workspace monitor: %w",err)
128+
}
129+
130+
if!shouldNotify {
131+
returnnil
132+
}
133+
134+
workspace,err:=a.Database.GetWorkspaceByID(ctx,a.WorkspaceID)
135+
iferr!=nil {
136+
returnxerrors.Errorf("get workspace by id: %w",err)
137+
}
138+
139+
_,err=a.NotificationsEnqueuer.EnqueueWithData(
140+
// nolint:gocritic // We need to be able to send the notification.
141+
dbauthz.AsNotifier(ctx),
142+
workspace.OwnerID,
143+
notifications.TemplateWorkspaceOutOfMemory,
144+
map[string]string{
145+
"workspace":workspace.Name,
146+
"threshold":fmt.Sprintf("%d%%",monitor.Threshold),
147+
},
148+
map[string]any{
149+
// NOTE(DanielleMaywood):
150+
// When notifications are enqueued, they are checked to be
151+
// unique within a single day. This means that if we attempt
152+
// to send two OOM notifications for the same workspace on
153+
// the same day, the enqueuer will prevent us from sending
154+
// a second one. We inject a timestamp to make the
155+
// notifications appear different enough to circumvent this
156+
// deduplication logic.
157+
"timestamp":a.Clock.Now(),
158+
},
159+
"workspace-monitor-memory",
160+
)
161+
iferr!=nil {
162+
returnxerrors.Errorf("notify workspace OOM: %w",err)
163+
}
164+
165+
returnnil
166+
}
167+
168+
func (a*ResourcesMonitoringAPI)monitorVolumes(ctx context.Context,datapoints []*proto.PushResourcesMonitoringUsageRequest_Datapoint)error {
169+
volumeMonitors,err:=a.Database.FetchVolumesResourceMonitorsByAgentID(ctx,a.AgentID)
170+
iferr!=nil {
171+
returnxerrors.Errorf("get or insert volume monitor: %w",err)
172+
}
173+
174+
outOfDiskVolumes:=make([]map[string]any,0)
175+
176+
for_,monitor:=rangevolumeMonitors {
177+
if!monitor.Enabled {
178+
continue
179+
}
180+
181+
usageDatapoints:=make([]*proto.PushResourcesMonitoringUsageRequest_Datapoint_VolumeUsage,0,len(datapoints))
182+
for_,datapoint:=rangedatapoints {
183+
varusage*proto.PushResourcesMonitoringUsageRequest_Datapoint_VolumeUsage
184+
185+
for_,volume:=rangedatapoint.Volumes {
186+
ifvolume.Volume==monitor.Path {
187+
usage=volume
188+
break
189+
}
190+
}
191+
192+
usageDatapoints=append(usageDatapoints,usage)
193+
}
194+
195+
usageStates:=resourcesmonitor.CalculateVolumeUsageStates(monitor,usageDatapoints)
196+
197+
oldState:=monitor.State
198+
newState:=resourcesmonitor.NextState(a.Config,oldState,usageStates)
199+
200+
debouncedUntil,shouldNotify:=monitor.Debounce(a.Debounce,a.Clock.Now(),oldState,newState)
201+
202+
ifshouldNotify {
203+
outOfDiskVolumes=append(outOfDiskVolumes,map[string]any{
204+
"path":monitor.Path,
205+
"threshold":fmt.Sprintf("%d%%",monitor.Threshold),
206+
})
207+
}
208+
209+
//nolint:gocritic // We need to be able to update the resource monitor here.
210+
iferr:=a.Database.UpdateVolumeResourceMonitor(dbauthz.AsResourceMonitor(ctx), database.UpdateVolumeResourceMonitorParams{
211+
AgentID:a.AgentID,
212+
Path:monitor.Path,
213+
State:newState,
214+
UpdatedAt:dbtime.Time(a.Clock.Now()),
215+
DebouncedUntil:dbtime.Time(debouncedUntil),
216+
});err!=nil {
217+
returnxerrors.Errorf("update workspace monitor: %w",err)
218+
}
219+
}
220+
221+
iflen(outOfDiskVolumes)==0 {
222+
returnnil
223+
}
224+
225+
workspace,err:=a.Database.GetWorkspaceByID(ctx,a.WorkspaceID)
226+
iferr!=nil {
227+
returnxerrors.Errorf("get workspace by id: %w",err)
228+
}
229+
230+
if_,err:=a.NotificationsEnqueuer.EnqueueWithData(
231+
// nolint:gocritic // We need to be able to send the notification.
232+
dbauthz.AsNotifier(ctx),
233+
workspace.OwnerID,
234+
notifications.TemplateWorkspaceOutOfDisk,
235+
map[string]string{
236+
"workspace":workspace.Name,
237+
},
238+
map[string]any{
239+
"volumes":outOfDiskVolumes,
240+
// NOTE(DanielleMaywood):
241+
// When notifications are enqueued, they are checked to be
242+
// unique within a single day. This means that if we attempt
243+
// to send two OOD notifications for the same workspace on
244+
// the same day, the enqueuer will prevent us from sending
245+
// a second one. We are inject a timestamp to make the
246+
// notifications appear different enough to circumvent this
247+
// deduplication logic.
248+
"timestamp":a.Clock.Now(),
249+
},
250+
"workspace-monitor-volumes",
251+
);err!=nil {
252+
returnxerrors.Errorf("notify workspace OOD: %w",err)
253+
}
65254

66-
return&proto.PushResourcesMonitoringUsageResponse{},nil
255+
returnnil
67256
}

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp