Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commitae522c5

Browse files
feat: add agent timings (#14713)
* feat: begin impl of agent script timings* feat: add job_id and display_name to script timings* fix: increment migration number* fix: rename migrations from 251 to 254* test: get tests compiling* fix: appease the linter* fix: get tests passing again* fix: drop column from correct table* test: add fixture for agent script timings* fix: typo* fix: use job id used in provisioner job timings* fix: increment migration number* test: behaviour of script runner* test: rewrite test* test: does exit 1 script break things?* test: rewrite test again* fix: revert changeNot sure how this came to be, I do not recall manually changingthese files.* fix: let code breathe* fix: wrap errors* fix: justify nolint* fix: swap require.Equal argument order* fix: add mutex operations* feat: add 'ran_on_start' and 'blocked_login' fields* fix: update testdata fixture* fix: refer to agent_id instead of job_id in timings* fix: JobID -> AgentID in dbauthz_test* fix: add 'id' to scripts, make timing refer to script id* fix: fix broken tests and convert bug* fix: update testdata fixtures* fix: update testdata fixtures again* feat: capture stage and if script timed out* fix: update migration number* test: add test for script api* fix: fake db query* fix: use UTC time* fix: ensure r.scriptComplete is not nil* fix: move err check to right after call* fix: uppercase sql* fix: use dbtime.Now()* fix: debug log on r.scriptCompleted being nil* fix: ensure correct rbac permissions* chore: remove DisplayName* fix: get tests passing* fix: remove space in sql up* docs: document ExecuteOption* fix: drop 'RETURNING' from sql* chore: remove 'display_name' from timing table* fix: testdata fixture* fix: put r.scriptCompleted call in goroutine* fix: track goroutine for test + use separate context for reporting* fix: appease linter, handle trackCommandGoroutine error* fix: resolve race condition* feat: replace timed_out column with status column* test: update testdata fixture* fix: apply suggestions from review* revert: linter changes
1 parentb894407 commitae522c5

File tree

43 files changed

+1367
-232
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

43 files changed

+1367
-232
lines changed

‎agent/agent.go

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -941,17 +941,15 @@ func (a *agent) handleManifest(manifestOK *checkpoint) func(ctx context.Context,
941941
}
942942
}
943943

944-
err=a.scriptRunner.Init(manifest.Scripts)
944+
err=a.scriptRunner.Init(manifest.Scripts,aAPI.ScriptCompleted)
945945
iferr!=nil {
946946
returnxerrors.Errorf("init script runner: %w",err)
947947
}
948948
err=a.trackGoroutine(func() {
949949
start:=time.Now()
950950
// here we use the graceful context because the script runner is not directly tied
951951
// to the agent API.
952-
err:=a.scriptRunner.Execute(a.gracefulCtx,func(script codersdk.WorkspaceAgentScript)bool {
953-
returnscript.RunOnStart
954-
})
952+
err:=a.scriptRunner.Execute(a.gracefulCtx,agentscripts.ExecuteStartScripts)
955953
// Measure the time immediately after the script has finished
956954
dur:=time.Since(start).Seconds()
957955
iferr!=nil {
@@ -1844,9 +1842,7 @@ func (a *agent) Close() error {
18441842
a.gracefulCancel()
18451843

18461844
lifecycleState:=codersdk.WorkspaceAgentLifecycleOff
1847-
err=a.scriptRunner.Execute(a.hardCtx,func(script codersdk.WorkspaceAgentScript)bool {
1848-
returnscript.RunOnStop
1849-
})
1845+
err=a.scriptRunner.Execute(a.hardCtx,agentscripts.ExecuteStopScripts)
18501846
iferr!=nil {
18511847
a.logger.Warn(a.hardCtx,"shutdown script(s) failed",slog.Error(err))
18521848
iferrors.Is(err,agentscripts.ErrTimeout) {

‎agent/agent_test.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1517,10 +1517,12 @@ func TestAgent_Lifecycle(t *testing.T) {
15171517
agentsdk.Manifest{
15181518
DERPMap:derpMap,
15191519
Scripts: []codersdk.WorkspaceAgentScript{{
1520+
ID:uuid.New(),
15201521
LogPath:"coder-startup-script.log",
15211522
Script:"echo 1",
15221523
RunOnStart:true,
15231524
}, {
1525+
ID:uuid.New(),
15241526
LogPath:"coder-shutdown-script.log",
15251527
Script:"echo "+expected,
15261528
RunOnStop:true,

‎agent/agentscripts/agentscripts.go

Lines changed: 97 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -19,10 +19,13 @@ import (
1919
"github.com/spf13/afero"
2020
"golang.org/x/sync/errgroup"
2121
"golang.org/x/xerrors"
22+
"google.golang.org/protobuf/types/known/timestamppb"
2223

2324
"cdr.dev/slog"
2425

2526
"github.com/coder/coder/v2/agent/agentssh"
27+
"github.com/coder/coder/v2/agent/proto"
28+
"github.com/coder/coder/v2/coderd/database/dbtime"
2629
"github.com/coder/coder/v2/codersdk"
2730
"github.com/coder/coder/v2/codersdk/agentsdk"
2831
)
@@ -75,18 +78,21 @@ func New(opts Options) *Runner {
7578
}
7679
}
7780

81+
typeScriptCompletedFuncfunc(context.Context,*proto.WorkspaceAgentScriptCompletedRequest) (*proto.WorkspaceAgentScriptCompletedResponse,error)
82+
7883
typeRunnerstruct {
7984
Options
8085

81-
cronCtx context.Context
82-
cronCtxCancel context.CancelFunc
83-
cmdCloseWait sync.WaitGroup
84-
closedchanstruct{}
85-
closeMutex sync.Mutex
86-
cron*cron.Cron
87-
initialized atomic.Bool
88-
scripts []codersdk.WorkspaceAgentScript
89-
dataDirstring
86+
cronCtx context.Context
87+
cronCtxCancel context.CancelFunc
88+
cmdCloseWait sync.WaitGroup
89+
closedchanstruct{}
90+
closeMutex sync.Mutex
91+
cron*cron.Cron
92+
initialized atomic.Bool
93+
scripts []codersdk.WorkspaceAgentScript
94+
dataDirstring
95+
scriptCompletedScriptCompletedFunc
9096

9197
// scriptsExecuted includes all scripts executed by the workspace agent. Agents
9298
// execute startup scripts, and scripts on a cron schedule. Both will increment
@@ -116,12 +122,13 @@ func (r *Runner) RegisterMetrics(reg prometheus.Registerer) {
116122
// Init initializes the runner with the provided scripts.
117123
// It also schedules any scripts that have a schedule.
118124
// This function must be called before Execute.
119-
func (r*Runner)Init(scripts []codersdk.WorkspaceAgentScript)error {
125+
func (r*Runner)Init(scripts []codersdk.WorkspaceAgentScript,scriptCompletedScriptCompletedFunc)error {
120126
ifr.initialized.Load() {
121127
returnxerrors.New("init: already initialized")
122128
}
123129
r.initialized.Store(true)
124130
r.scripts=scripts
131+
r.scriptCompleted=scriptCompleted
125132
r.Logger.Info(r.cronCtx,"initializing agent scripts",slog.F("script_count",len(scripts)),slog.F("log_dir",r.LogDir))
126133

127134
err:=r.Filesystem.MkdirAll(r.ScriptBinDir(),0o700)
@@ -135,7 +142,7 @@ func (r *Runner) Init(scripts []codersdk.WorkspaceAgentScript) error {
135142
}
136143
script:=script
137144
_,err:=r.cron.AddFunc(script.Cron,func() {
138-
err:=r.trackRun(r.cronCtx,script)
145+
err:=r.trackRun(r.cronCtx,script,ExecuteCronScripts)
139146
iferr!=nil {
140147
r.Logger.Warn(context.Background(),"run agent script on schedule",slog.Error(err))
141148
}
@@ -172,22 +179,33 @@ func (r *Runner) StartCron() {
172179
}
173180
}
174181

182+
// ExecuteOption describes what scripts we want to execute.
183+
typeExecuteOptionint
184+
185+
// ExecuteOption enums.
186+
const (
187+
ExecuteAllScriptsExecuteOption=iota
188+
ExecuteStartScripts
189+
ExecuteStopScripts
190+
ExecuteCronScripts
191+
)
192+
175193
// Execute runs a set of scripts according to a filter.
176-
func (r*Runner)Execute(ctx context.Context,filterfunc(script codersdk.WorkspaceAgentScript)bool)error {
177-
iffilter==nil {
178-
// Execute em' all!
179-
filter=func(script codersdk.WorkspaceAgentScript)bool {
180-
returntrue
181-
}
182-
}
194+
func (r*Runner)Execute(ctx context.Context,optionExecuteOption)error {
183195
vareg errgroup.Group
184196
for_,script:=ranger.scripts {
185-
if!filter(script) {
197+
runScript:= (option==ExecuteStartScripts&&script.RunOnStart)||
198+
(option==ExecuteStopScripts&&script.RunOnStop)||
199+
(option==ExecuteCronScripts&&script.Cron!="")||
200+
option==ExecuteAllScripts
201+
202+
if!runScript {
186203
continue
187204
}
205+
188206
script:=script
189207
eg.Go(func()error {
190-
err:=r.trackRun(ctx,script)
208+
err:=r.trackRun(ctx,script,option)
191209
iferr!=nil {
192210
returnxerrors.Errorf("run agent script %q: %w",script.LogSourceID,err)
193211
}
@@ -198,8 +216,8 @@ func (r *Runner) Execute(ctx context.Context, filter func(script codersdk.Worksp
198216
}
199217

200218
// trackRun wraps "run" with metrics.
201-
func (r*Runner)trackRun(ctx context.Context,script codersdk.WorkspaceAgentScript)error {
202-
err:=r.run(ctx,script)
219+
func (r*Runner)trackRun(ctx context.Context,script codersdk.WorkspaceAgentScript,optionExecuteOption)error {
220+
err:=r.run(ctx,script,option)
203221
iferr!=nil {
204222
r.scriptsExecuted.WithLabelValues("false").Add(1)
205223
}else {
@@ -212,7 +230,7 @@ func (r *Runner) trackRun(ctx context.Context, script codersdk.WorkspaceAgentScr
212230
// If the timeout is exceeded, the process is sent an interrupt signal.
213231
// If the process does not exit after a few seconds, it is forcefully killed.
214232
// This function immediately returns after a timeout, and does not wait for the process to exit.
215-
func (r*Runner)run(ctx context.Context,script codersdk.WorkspaceAgentScript)error {
233+
func (r*Runner)run(ctx context.Context,script codersdk.WorkspaceAgentScript,optionExecuteOption)error {
216234
logPath:=script.LogPath
217235
iflogPath=="" {
218236
logPath=fmt.Sprintf("coder-script-%s.log",script.LogSourceID)
@@ -299,9 +317,9 @@ func (r *Runner) run(ctx context.Context, script codersdk.WorkspaceAgentScript)
299317
cmd.Stdout=io.MultiWriter(fileWriter,infoW)
300318
cmd.Stderr=io.MultiWriter(fileWriter,errW)
301319

302-
start:=time.Now()
320+
start:=dbtime.Now()
303321
deferfunc() {
304-
end:=time.Now()
322+
end:=dbtime.Now()
305323
execTime:=end.Sub(start)
306324
exitCode:=0
307325
iferr!=nil {
@@ -314,6 +332,60 @@ func (r *Runner) run(ctx context.Context, script codersdk.WorkspaceAgentScript)
314332
}else {
315333
logger.Info(ctx,fmt.Sprintf("%s script completed",logPath),slog.F("execution_time",execTime),slog.F("exit_code",exitCode))
316334
}
335+
336+
ifr.scriptCompleted==nil {
337+
logger.Debug(ctx,"r.scriptCompleted unexpectedly nil")
338+
return
339+
}
340+
341+
// We want to check this outside of the goroutine to avoid a race condition
342+
timedOut:=errors.Is(err,ErrTimeout)
343+
pipesLeftOpen:=errors.Is(err,ErrOutputPipesOpen)
344+
345+
err=r.trackCommandGoroutine(func() {
346+
varstage proto.Timing_Stage
347+
switchoption {
348+
caseExecuteStartScripts:
349+
stage=proto.Timing_START
350+
caseExecuteStopScripts:
351+
stage=proto.Timing_STOP
352+
caseExecuteCronScripts:
353+
stage=proto.Timing_CRON
354+
}
355+
356+
varstatus proto.Timing_Status
357+
switch {
358+
casetimedOut:
359+
status=proto.Timing_TIMED_OUT
360+
casepipesLeftOpen:
361+
status=proto.Timing_PIPES_LEFT_OPEN
362+
caseexitCode!=0:
363+
status=proto.Timing_EXIT_FAILURE
364+
default:
365+
status=proto.Timing_OK
366+
}
367+
368+
reportTimeout:=30*time.Second
369+
reportCtx,cancel:=context.WithTimeout(context.Background(),reportTimeout)
370+
defercancel()
371+
372+
_,err:=r.scriptCompleted(reportCtx,&proto.WorkspaceAgentScriptCompletedRequest{
373+
Timing:&proto.Timing{
374+
ScriptId:script.ID[:],
375+
Start:timestamppb.New(start),
376+
End:timestamppb.New(end),
377+
ExitCode:int32(exitCode),
378+
Stage:stage,
379+
Status:status,
380+
},
381+
})
382+
iferr!=nil {
383+
logger.Error(ctx,fmt.Sprintf("reporting script completed: %s",err.Error()))
384+
}
385+
})
386+
iferr!=nil {
387+
logger.Error(ctx,fmt.Sprintf("reporting script completed: track command goroutine: %s",err.Error()))
388+
}
317389
}()
318390

319391
err=cmd.Start()

‎agent/agentscripts/agentscripts_test.go

Lines changed: 40 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ import (
1717
"cdr.dev/slog/sloggers/slogtest"
1818
"github.com/coder/coder/v2/agent/agentscripts"
1919
"github.com/coder/coder/v2/agent/agentssh"
20+
"github.com/coder/coder/v2/agent/agenttest"
2021
"github.com/coder/coder/v2/codersdk"
2122
"github.com/coder/coder/v2/codersdk/agentsdk"
2223
"github.com/coder/coder/v2/testutil"
@@ -34,14 +35,13 @@ func TestExecuteBasic(t *testing.T) {
3435
returnfLogger
3536
})
3637
deferrunner.Close()
38+
aAPI:=agenttest.NewFakeAgentAPI(t,slogtest.Make(t,nil),nil,nil)
3739
err:=runner.Init([]codersdk.WorkspaceAgentScript{{
3840
LogSourceID:uuid.New(),
3941
Script:"echo hello",
40-
}})
42+
}},aAPI.ScriptCompleted)
4143
require.NoError(t,err)
42-
require.NoError(t,runner.Execute(context.Background(),func(script codersdk.WorkspaceAgentScript)bool {
43-
returntrue
44-
}))
44+
require.NoError(t,runner.Execute(context.Background(),agentscripts.ExecuteAllScripts))
4545
log:=testutil.RequireRecvCtx(ctx,t,fLogger.logs)
4646
require.Equal(t,"hello",log.Output)
4747
}
@@ -61,18 +61,17 @@ func TestEnv(t *testing.T) {
6161
cmd.exe /c echo %CODER_SCRIPT_BIN_DIR%
6262
`
6363
}
64+
aAPI:=agenttest.NewFakeAgentAPI(t,slogtest.Make(t,nil),nil,nil)
6465
err:=runner.Init([]codersdk.WorkspaceAgentScript{{
6566
LogSourceID:id,
6667
Script:script,
67-
}})
68+
}},aAPI.ScriptCompleted)
6869
require.NoError(t,err)
6970

7071
ctx:=testutil.Context(t,testutil.WaitLong)
7172

7273
done:=testutil.Go(t,func() {
73-
err:=runner.Execute(ctx,func(script codersdk.WorkspaceAgentScript)bool {
74-
returntrue
75-
})
74+
err:=runner.Execute(ctx,agentscripts.ExecuteAllScripts)
7675
assert.NoError(t,err)
7776
})
7877
deferfunc() {
@@ -103,13 +102,44 @@ func TestTimeout(t *testing.T) {
103102
t.Parallel()
104103
runner:=setup(t,nil)
105104
deferrunner.Close()
105+
aAPI:=agenttest.NewFakeAgentAPI(t,slogtest.Make(t,nil),nil,nil)
106106
err:=runner.Init([]codersdk.WorkspaceAgentScript{{
107107
LogSourceID:uuid.New(),
108108
Script:"sleep infinity",
109109
Timeout:time.Millisecond,
110-
}})
110+
}},aAPI.ScriptCompleted)
111111
require.NoError(t,err)
112-
require.ErrorIs(t,runner.Execute(context.Background(),nil),agentscripts.ErrTimeout)
112+
require.ErrorIs(t,runner.Execute(context.Background(),agentscripts.ExecuteAllScripts),agentscripts.ErrTimeout)
113+
}
114+
115+
funcTestScriptReportsTiming(t*testing.T) {
116+
t.Parallel()
117+
118+
ctx:=testutil.Context(t,testutil.WaitShort)
119+
fLogger:=newFakeScriptLogger()
120+
runner:=setup(t,func(uuid2 uuid.UUID) agentscripts.ScriptLogger {
121+
returnfLogger
122+
})
123+
124+
aAPI:=agenttest.NewFakeAgentAPI(t,slogtest.Make(t,nil),nil,nil)
125+
err:=runner.Init([]codersdk.WorkspaceAgentScript{{
126+
DisplayName:"say-hello",
127+
LogSourceID:uuid.New(),
128+
Script:"echo hello",
129+
}},aAPI.ScriptCompleted)
130+
require.NoError(t,err)
131+
require.NoError(t,runner.Execute(ctx,agentscripts.ExecuteAllScripts))
132+
runner.Close()
133+
134+
log:=testutil.RequireRecvCtx(ctx,t,fLogger.logs)
135+
require.Equal(t,"hello",log.Output)
136+
137+
timings:=aAPI.GetTimings()
138+
require.Equal(t,1,len(timings))
139+
140+
timing:=timings[0]
141+
require.Equal(t,int32(0),timing.ExitCode)
142+
require.GreaterOrEqual(t,timing.End.AsTime(),timing.Start.AsTime())
113143
}
114144

115145
// TestCronClose exists because cron.Run() can happen after cron.Close().

‎agent/agenttest/client.go

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,7 @@ type FakeAgentAPI struct {
170170
logsChchan<-*agentproto.BatchCreateLogsRequest
171171
lifecycleStates []codersdk.WorkspaceAgentLifecycle
172172
metadatamap[string]agentsdk.Metadata
173+
timings []*agentproto.Timing
173174

174175
getAnnouncementBannersFuncfunc() ([]codersdk.BannerConfig,error)
175176
}
@@ -182,6 +183,12 @@ func (*FakeAgentAPI) GetServiceBanner(context.Context, *agentproto.GetServiceBan
182183
return&agentproto.ServiceBanner{},nil
183184
}
184185

186+
func (f*FakeAgentAPI)GetTimings() []*agentproto.Timing {
187+
f.Lock()
188+
deferf.Unlock()
189+
returnslices.Clone(f.timings)
190+
}
191+
185192
func (f*FakeAgentAPI)SetAnnouncementBannersFunc(fnfunc() ([]codersdk.BannerConfig,error)) {
186193
f.Lock()
187194
deferf.Unlock()
@@ -301,6 +308,14 @@ func (f *FakeAgentAPI) BatchCreateLogs(ctx context.Context, req *agentproto.Batc
301308
return&agentproto.BatchCreateLogsResponse{},nil
302309
}
303310

311+
func (f*FakeAgentAPI)ScriptCompleted(_ context.Context,req*agentproto.WorkspaceAgentScriptCompletedRequest) (*agentproto.WorkspaceAgentScriptCompletedResponse,error) {
312+
f.Lock()
313+
f.timings=append(f.timings,req.Timing)
314+
f.Unlock()
315+
316+
return&agentproto.WorkspaceAgentScriptCompletedResponse{},nil
317+
}
318+
304319
funcNewFakeAgentAPI(t testing.TB,logger slog.Logger,manifest*agentproto.Manifest,statsChchan*agentproto.Stats)*FakeAgentAPI {
305320
return&FakeAgentAPI{
306321
t:t,

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp