Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit8dac035

Browse files
authored
refactor: replace startup script logs EOF with starting/ready time (#8082)
This commit reverts some of the changes in#8029 and implements analternative method of keeping track of when the startup script has endedand there will be no more logs.This is achieved by adding new agent fields for tracking when the agententers the "starting" and "ready"/"start_error" lifecycle states. Thetimestamps simplify logic since we don't need understand if the currentstate is before or after the state we're interested in. They can also beused to show data like how long the startup script took to execute. Thisalso allowed us to remove the EOF field from the logs as theimplementation was problematic when we returned the EOF log entry in theresponse since requesting _after_ that ID would give no logs and the APIwould thus lose track of EOF.
1 parentb1d1b63 commit8dac035

29 files changed

+462
-540
lines changed

‎agent/agent.go

Lines changed: 73 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ import (
1414
"net/http"
1515
"net/netip"
1616
"os"
17+
"os/exec"
1718
"os/user"
1819
"path/filepath"
1920
"sort"
@@ -126,6 +127,7 @@ func New(options Options) Agent {
126127
tempDir:options.TempDir,
127128
lifecycleUpdate:make(chanstruct{},1),
128129
lifecycleReported:make(chan codersdk.WorkspaceAgentLifecycle,1),
130+
lifecycleStates: []agentsdk.PostLifecycleRequest{{State:codersdk.WorkspaceAgentLifecycleCreated}},
129131
ignorePorts:options.IgnorePorts,
130132
connStatsChan:make(chan*agentsdk.Stats,1),
131133
sshMaxTimeout:options.SSHMaxTimeout,
@@ -170,7 +172,7 @@ type agent struct {
170172
lifecycleUpdatechanstruct{}
171173
lifecycleReportedchan codersdk.WorkspaceAgentLifecycle
172174
lifecycleMu sync.RWMutex// Protects following.
173-
lifecycleState codersdk.WorkspaceAgentLifecycle
175+
lifecycleStates[]agentsdk.PostLifecycleRequest
174176

175177
network*tailnet.Conn
176178
connStatsChanchan*agentsdk.Stats
@@ -414,11 +416,10 @@ func (a *agent) reportMetadataLoop(ctx context.Context) {
414416
}
415417
}
416418

417-
// reportLifecycleLoop reports the current lifecycle state once.
418-
// Only the latest state is reported, intermediate states may be
419-
// lost if the agent can't communicate with the API.
419+
// reportLifecycleLoop reports the current lifecycle state once. All state
420+
// changes are reported in order.
420421
func (a*agent)reportLifecycleLoop(ctx context.Context) {
421-
varlastReported codersdk.WorkspaceAgentLifecycle
422+
lastReportedIndex:=0// Start off with the created state without reporting it.
422423
for {
423424
select {
424425
case<-a.lifecycleUpdate:
@@ -428,24 +429,32 @@ func (a *agent) reportLifecycleLoop(ctx context.Context) {
428429

429430
forr:=retry.New(time.Second,15*time.Second);r.Wait(ctx); {
430431
a.lifecycleMu.RLock()
431-
state:=a.lifecycleState
432+
lastIndex:=len(a.lifecycleStates)-1
433+
report:=a.lifecycleStates[lastReportedIndex]
434+
iflen(a.lifecycleStates)>lastReportedIndex+1 {
435+
report=a.lifecycleStates[lastReportedIndex+1]
436+
}
432437
a.lifecycleMu.RUnlock()
433438

434-
ifstate==lastReported {
439+
iflastIndex==lastReportedIndex {
435440
break
436441
}
437442

438-
a.logger.Debug(ctx,"reporting lifecycle state",slog.F("state",state))
443+
a.logger.Debug(ctx,"reporting lifecycle state",slog.F("payload",report))
439444

440-
err:=a.client.PostLifecycle(ctx, agentsdk.PostLifecycleRequest{
441-
State:state,
442-
})
445+
err:=a.client.PostLifecycle(ctx,report)
443446
iferr==nil {
444-
lastReported=state
447+
lastReportedIndex++
445448
select {
446-
casea.lifecycleReported<-state:
449+
casea.lifecycleReported<-report.State:
447450
case<-a.lifecycleReported:
448-
a.lifecycleReported<-state
451+
a.lifecycleReported<-report.State
452+
}
453+
iflastReportedIndex<lastIndex {
454+
// Keep reporting until we've sent all messages, we can't
455+
// rely on the channel triggering us before the backlog is
456+
// consumed.
457+
continue
449458
}
450459
break
451460
}
@@ -461,15 +470,20 @@ func (a *agent) reportLifecycleLoop(ctx context.Context) {
461470
// setLifecycle sets the lifecycle state and notifies the lifecycle loop.
462471
// The state is only updated if it's a valid state transition.
463472
func (a*agent)setLifecycle(ctx context.Context,state codersdk.WorkspaceAgentLifecycle) {
473+
report:= agentsdk.PostLifecycleRequest{
474+
State:state,
475+
ChangedAt:database.Now(),
476+
}
477+
464478
a.lifecycleMu.Lock()
465-
lastState:=a.lifecycleState
466-
ifslices.Index(codersdk.WorkspaceAgentLifecycleOrder,lastState)>slices.Index(codersdk.WorkspaceAgentLifecycleOrder,state) {
467-
a.logger.Warn(ctx,"attempted to set lifecycle state to a previous state",slog.F("last",lastState),slog.F("state",state))
479+
lastReport:=a.lifecycleStates[len(a.lifecycleStates)-1]
480+
ifslices.Index(codersdk.WorkspaceAgentLifecycleOrder,lastReport.State)>=slices.Index(codersdk.WorkspaceAgentLifecycleOrder,report.State) {
481+
a.logger.Warn(ctx,"attempted to set lifecycle state to a previous state",slog.F("last",lastReport),slog.F("current",report))
468482
a.lifecycleMu.Unlock()
469483
return
470484
}
471-
a.lifecycleState=state
472-
a.logger.Debug(ctx,"set lifecycle state",slog.F("state",state),slog.F("last",lastState))
485+
a.lifecycleStates=append(a.lifecycleStates,report)
486+
a.logger.Debug(ctx,"set lifecycle state",slog.F("current",report),slog.F("last",lastReport))
473487
a.lifecycleMu.Unlock()
474488

475489
select {
@@ -534,7 +548,6 @@ func (a *agent) run(ctx context.Context) error {
534548

535549
lifecycleState:=codersdk.WorkspaceAgentLifecycleReady
536550
scriptDone:=make(chanerror,1)
537-
scriptStart:=time.Now()
538551
err=a.trackConnGoroutine(func() {
539552
deferclose(scriptDone)
540553
scriptDone<-a.runStartupScript(ctx,manifest.StartupScript)
@@ -556,22 +569,15 @@ func (a *agent) run(ctx context.Context) error {
556569
select {
557570
caseerr=<-scriptDone:
558571
case<-timeout:
559-
a.logger.Warn(ctx,"startupscript timed out")
572+
a.logger.Warn(ctx,"script timed out",slog.F("lifecycle","startup"),slog.F("timeout",manifest.ShutdownScriptTimeout))
560573
a.setLifecycle(ctx,codersdk.WorkspaceAgentLifecycleStartTimeout)
561574
err=<-scriptDone// The script can still complete after a timeout.
562575
}
563-
iferrors.Is(err,context.Canceled) {
564-
return
565-
}
566-
// Only log if there was a startup script.
567-
ifmanifest.StartupScript!="" {
568-
execTime:=time.Since(scriptStart)
569-
iferr!=nil {
570-
a.logger.Warn(ctx,"startup script failed",slog.F("execution_time",execTime),slog.Error(err))
571-
lifecycleState=codersdk.WorkspaceAgentLifecycleStartError
572-
}else {
573-
a.logger.Info(ctx,"startup script completed",slog.F("execution_time",execTime))
576+
iferr!=nil {
577+
iferrors.Is(err,context.Canceled) {
578+
return
574579
}
580+
lifecycleState=codersdk.WorkspaceAgentLifecycleStartError
575581
}
576582
a.setLifecycle(ctx,lifecycleState)
577583
}()
@@ -830,20 +836,31 @@ func (a *agent) runShutdownScript(ctx context.Context, script string) error {
830836
returna.runScript(ctx,"shutdown",script)
831837
}
832838

833-
func (a*agent)runScript(ctx context.Context,lifecycle,scriptstring)error {
839+
func (a*agent)runScript(ctx context.Context,lifecycle,scriptstring)(errerror) {
834840
ifscript=="" {
835841
returnnil
836842
}
837843

838-
a.logger.Info(ctx,"running script",slog.F("lifecycle",lifecycle),slog.F("script",script))
844+
logger:=a.logger.With(slog.F("lifecycle",lifecycle))
845+
846+
logger.Info(ctx,fmt.Sprintf("running %s script",lifecycle),slog.F("script",script))
839847
fileWriter,err:=a.filesystem.OpenFile(filepath.Join(a.logDir,fmt.Sprintf("coder-%s-script.log",lifecycle)),os.O_CREATE|os.O_RDWR,0o600)
840848
iferr!=nil {
841849
returnxerrors.Errorf("open %s script log file: %w",lifecycle,err)
842850
}
843851
deferfunc() {
844-
_=fileWriter.Close()
852+
err:=fileWriter.Close()
853+
iferr!=nil {
854+
logger.Warn(ctx,fmt.Sprintf("close %s script log file",lifecycle),slog.Error(err))
855+
}
845856
}()
846857

858+
cmdPty,err:=a.sshServer.CreateCommand(ctx,script,nil)
859+
iferr!=nil {
860+
returnxerrors.Errorf("%s script: create command: %w",lifecycle,err)
861+
}
862+
cmd:=cmdPty.AsExec()
863+
847864
varwriter io.Writer=fileWriter
848865
iflifecycle=="startup" {
849866
// Create pipes for startup logs reader and writer
@@ -854,29 +871,42 @@ func (a *agent) runScript(ctx context.Context, lifecycle, script string) error {
854871
writer=io.MultiWriter(fileWriter,logsWriter)
855872
flushedLogs,err:=a.trackScriptLogs(ctx,logsReader)
856873
iferr!=nil {
857-
returnxerrors.Errorf("track script logs: %w",err)
874+
returnxerrors.Errorf("track%sscript logs: %w",lifecycle,err)
858875
}
859876
deferfunc() {
860877
_=logsWriter.Close()
861878
<-flushedLogs
862879
}()
863880
}
864881

865-
cmdPty,err:=a.sshServer.CreateCommand(ctx,script,nil)
866-
iferr!=nil {
867-
returnxerrors.Errorf("create command: %w",err)
868-
}
869-
cmd:=cmdPty.AsExec()
870882
cmd.Stdout=writer
871883
cmd.Stderr=writer
884+
885+
start:=time.Now()
886+
deferfunc() {
887+
end:=time.Now()
888+
execTime:=end.Sub(start)
889+
exitCode:=0
890+
iferr!=nil {
891+
exitCode=255// Unknown status.
892+
varexitError*exec.ExitError
893+
ifxerrors.As(err,&exitError) {
894+
exitCode=exitError.ExitCode()
895+
}
896+
logger.Warn(ctx,fmt.Sprintf("%s script failed",lifecycle),slog.F("execution_time",execTime),slog.F("exit_code",exitCode),slog.Error(err))
897+
}else {
898+
logger.Info(ctx,fmt.Sprintf("%s script completed",lifecycle),slog.F("execution_time",execTime),slog.F("exit_code",exitCode))
899+
}
900+
}()
901+
872902
err=cmd.Run()
873903
iferr!=nil {
874904
// cmd.Run does not return a context canceled error, it returns "signal: killed".
875905
ifctx.Err()!=nil {
876906
returnctx.Err()
877907
}
878908

879-
returnxerrors.Errorf("run: %w",err)
909+
returnxerrors.Errorf("%s script:run: %w",lifecycle,err)
880910
}
881911
returnnil
882912
}
@@ -929,11 +959,6 @@ func (a *agent) trackScriptLogs(ctx context.Context, reader io.Reader) (chan str
929959
iferrors.As(err,&sdkErr) {
930960
ifsdkErr.StatusCode()==http.StatusRequestEntityTooLarge {
931961
a.logger.Warn(ctx,"startup logs too large, dropping logs")
932-
// Always send the EOF even if logs overflow.
933-
iflen(logsToSend)>1&&logsToSend[len(logsToSend)-1].EOF {
934-
logsToSend=logsToSend[len(logsToSend)-1:]
935-
continue
936-
}
937962
break
938963
}
939964
}
@@ -986,11 +1011,6 @@ func (a *agent) trackScriptLogs(ctx context.Context, reader io.Reader) (chan str
9861011
iferr:=scanner.Err();err!=nil {
9871012
a.logger.Error(ctx,"scan startup logs",slog.Error(err))
9881013
}
989-
queueLog(agentsdk.StartupLog{
990-
CreatedAt:database.Now(),
991-
Output:"",
992-
EOF:true,
993-
})
9941014
deferclose(logsFinished)
9951015
logsFlushed.L.Lock()
9961016
for {
@@ -1392,7 +1412,6 @@ func (a *agent) Close() error {
13921412
lifecycleState:=codersdk.WorkspaceAgentLifecycleOff
13931413
ifmanifest:=a.manifest.Load();manifest!=nil&&manifest.ShutdownScript!="" {
13941414
scriptDone:=make(chanerror,1)
1395-
scriptStart:=time.Now()
13961415
gofunc() {
13971416
deferclose(scriptDone)
13981417
scriptDone<-a.runShutdownScript(ctx,manifest.ShutdownScript)
@@ -1411,16 +1430,12 @@ func (a *agent) Close() error {
14111430
select {
14121431
caseerr=<-scriptDone:
14131432
case<-timeout:
1414-
a.logger.Warn(ctx,"shutdownscript timed out")
1433+
a.logger.Warn(ctx,"script timed out",slog.F("lifecycle","shutdown"),slog.F("timeout",manifest.ShutdownScriptTimeout))
14151434
a.setLifecycle(ctx,codersdk.WorkspaceAgentLifecycleShutdownTimeout)
14161435
err=<-scriptDone// The script can still complete after a timeout.
14171436
}
1418-
execTime:=time.Since(scriptStart)
14191437
iferr!=nil {
1420-
a.logger.Warn(ctx,"shutdown script failed",slog.F("execution_time",execTime),slog.Error(err))
14211438
lifecycleState=codersdk.WorkspaceAgentLifecycleShutdownError
1422-
}else {
1423-
a.logger.Info(ctx,"shutdown script completed",slog.F("execution_time",execTime))
14241439
}
14251440
}
14261441

‎agent/agent_test.go

Lines changed: 9 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -905,10 +905,8 @@ func TestAgent_StartupScript(t *testing.T) {
905905
returnlen(got)>0&&got[len(got)-1]==codersdk.WorkspaceAgentLifecycleReady
906906
},testutil.WaitShort,testutil.IntervalMedium)
907907

908-
require.Len(t,client.getStartupLogs(),2)
908+
require.Len(t,client.getStartupLogs(),1)
909909
require.Equal(t,output,client.getStartupLogs()[0].Output)
910-
require.False(t,client.getStartupLogs()[0].EOF)
911-
require.True(t,client.getStartupLogs()[1].EOF)
912910
})
913911
// This ensures that even when coderd sends back that the startup
914912
// script has written too many lines it will still succeed!
@@ -1115,15 +1113,8 @@ func TestAgent_Lifecycle(t *testing.T) {
11151113
got=client.getLifecycleStates()
11161114
returnlen(got)>0&&got[len(got)-1]==want[len(want)-1]
11171115
},testutil.WaitShort,testutil.IntervalMedium)
1118-
switchlen(got) {
1119-
case1:
1120-
// This can happen if lifecycle state updates are
1121-
// too fast, only the latest one is reported.
1122-
require.Equal(t,want[1:],got)
1123-
default:
1124-
// This is the expected case.
1125-
require.Equal(t,want,got)
1126-
}
1116+
1117+
require.Equal(t,want,got)
11271118
})
11281119

11291120
t.Run("StartError",func(t*testing.T) {
@@ -1144,15 +1135,8 @@ func TestAgent_Lifecycle(t *testing.T) {
11441135
got=client.getLifecycleStates()
11451136
returnlen(got)>0&&got[len(got)-1]==want[len(want)-1]
11461137
},testutil.WaitShort,testutil.IntervalMedium)
1147-
switchlen(got) {
1148-
case1:
1149-
// This can happen if lifecycle state updates are
1150-
// too fast, only the latest one is reported.
1151-
require.Equal(t,want[1:],got)
1152-
default:
1153-
// This is the expected case.
1154-
require.Equal(t,want,got)
1155-
}
1138+
1139+
require.Equal(t,want,got)
11561140
})
11571141

11581142
t.Run("Ready",func(t*testing.T) {
@@ -1173,15 +1157,8 @@ func TestAgent_Lifecycle(t *testing.T) {
11731157
got=client.getLifecycleStates()
11741158
returnlen(got)>0&&got[len(got)-1]==want[len(want)-1]
11751159
},testutil.WaitShort,testutil.IntervalMedium)
1176-
switchlen(got) {
1177-
case1:
1178-
// This can happen if lifecycle state updates are
1179-
// too fast, only the latest one is reported.
1180-
require.Equal(t,want[1:],got)
1181-
default:
1182-
// This is the expected case.
1183-
require.Equal(t,want,got)
1184-
}
1160+
1161+
require.Equal(t,want,got)
11851162
})
11861163

11871164
t.Run("ShuttingDown",func(t*testing.T) {
@@ -1258,15 +1235,7 @@ func TestAgent_Lifecycle(t *testing.T) {
12581235
returnlen(got)>0&&got[len(got)-1]==want[len(want)-1]
12591236
},testutil.WaitShort,testutil.IntervalMedium)
12601237

1261-
switchlen(got) {
1262-
case1:
1263-
// This can happen if lifecycle state updates are
1264-
// too fast, only the latest one is reported.
1265-
require.Equal(t,want[1:],got)
1266-
default:
1267-
// This is the expected case.
1268-
require.Equal(t,want,got)
1269-
}
1238+
require.Equal(t,want,got)
12701239
})
12711240

12721241
t.Run("ShutdownError",func(t*testing.T) {
@@ -1305,15 +1274,7 @@ func TestAgent_Lifecycle(t *testing.T) {
13051274
returnlen(got)>0&&got[len(got)-1]==want[len(want)-1]
13061275
},testutil.WaitShort,testutil.IntervalMedium)
13071276

1308-
switchlen(got) {
1309-
case1:
1310-
// This can happen if lifecycle state updates are
1311-
// too fast, only the latest one is reported.
1312-
require.Equal(t,want[1:],got)
1313-
default:
1314-
// This is the expected case.
1315-
require.Equal(t,want,got)
1316-
}
1277+
require.Equal(t,want,got)
13171278
})
13181279

13191280
t.Run("ShutdownScriptOnce",func(t*testing.T) {

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp