Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit1334344

Browse files
committed
refactor: Replace startup script logs EOF with starting/ready time
This commit reverts some of the changes in#8029 and implements analternative method of keeping track of when the startup script has endedand there will be no more logs.This is achieved by adding new agent fields for tracking when the agententers the "starting" and "ready"/"start_error" lifecycle states. Thetimestamps simplify logic since we don't need understand if the currentstate is before or after the state we're interested in. They can also beused to show data like how long the startup script took to execute. Thisalso allowed us to remove the EOF field from the logs as theimplementation was problematic when we returned the EOF log entry in theresponse since requesting _after_ that ID would give no logs and the APIwould thus lose track of EOF.
1 parent5b9c378 commit1334344

29 files changed

+462
-536
lines changed

‎agent/agent.go

Lines changed: 72 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ import (
1414
"net/http"
1515
"net/netip"
1616
"os"
17+
"os/exec"
1718
"os/user"
1819
"path/filepath"
1920
"sort"
@@ -126,6 +127,7 @@ func New(options Options) Agent {
126127
tempDir:options.TempDir,
127128
lifecycleUpdate:make(chanstruct{},1),
128129
lifecycleReported:make(chan codersdk.WorkspaceAgentLifecycle,1),
130+
lifecycleStates: []agentsdk.PostLifecycleRequest{{State:codersdk.WorkspaceAgentLifecycleCreated}},
129131
ignorePorts:options.IgnorePorts,
130132
connStatsChan:make(chan*agentsdk.Stats,1),
131133
sshMaxTimeout:options.SSHMaxTimeout,
@@ -170,7 +172,7 @@ type agent struct {
170172
lifecycleUpdatechanstruct{}
171173
lifecycleReportedchan codersdk.WorkspaceAgentLifecycle
172174
lifecycleMu sync.RWMutex// Protects following.
173-
lifecycleState codersdk.WorkspaceAgentLifecycle
175+
lifecycleStates[]agentsdk.PostLifecycleRequest
174176

175177
network*tailnet.Conn
176178
connStatsChanchan*agentsdk.Stats
@@ -414,11 +416,10 @@ func (a *agent) reportMetadataLoop(ctx context.Context) {
414416
}
415417
}
416418

417-
// reportLifecycleLoop reports the current lifecycle state once.
418-
// Only the latest state is reported, intermediate states may be
419-
// lost if the agent can't communicate with the API.
419+
// reportLifecycleLoop reports the current lifecycle state once. All state
420+
// changes are reported in order.
420421
func (a*agent)reportLifecycleLoop(ctx context.Context) {
421-
varlastReported codersdk.WorkspaceAgentLifecycle
422+
lastReportedIndex:=0// Start off with the created state without reporting it.
422423
for {
423424
select {
424425
case<-a.lifecycleUpdate:
@@ -428,24 +429,32 @@ func (a *agent) reportLifecycleLoop(ctx context.Context) {
428429

429430
forr:=retry.New(time.Second,15*time.Second);r.Wait(ctx); {
430431
a.lifecycleMu.RLock()
431-
state:=a.lifecycleState
432+
lastIndex:=len(a.lifecycleStates)-1
433+
report:=a.lifecycleStates[lastReportedIndex]
434+
iflen(a.lifecycleStates)>lastReportedIndex+1 {
435+
report=a.lifecycleStates[lastReportedIndex+1]
436+
}
432437
a.lifecycleMu.RUnlock()
433438

434-
ifstate==lastReported {
439+
iflastIndex==lastReportedIndex {
435440
break
436441
}
437442

438-
a.logger.Debug(ctx,"reporting lifecycle state",slog.F("state",state))
443+
a.logger.Debug(ctx,"reporting lifecycle state",slog.F("payload",report))
439444

440-
err:=a.client.PostLifecycle(ctx, agentsdk.PostLifecycleRequest{
441-
State:state,
442-
})
445+
err:=a.client.PostLifecycle(ctx,report)
443446
iferr==nil {
444-
lastReported=state
447+
lastReportedIndex++
445448
select {
446-
casea.lifecycleReported<-state:
449+
casea.lifecycleReported<-report.State:
447450
case<-a.lifecycleReported:
448-
a.lifecycleReported<-state
451+
a.lifecycleReported<-report.State
452+
}
453+
iflastReportedIndex<lastIndex {
454+
// Keep reporting until we've sent all messages, we can't
455+
// rely on the channel triggering us before the backlog is
456+
// consumed.
457+
continue
449458
}
450459
break
451460
}
@@ -461,15 +470,20 @@ func (a *agent) reportLifecycleLoop(ctx context.Context) {
461470
// setLifecycle sets the lifecycle state and notifies the lifecycle loop.
462471
// The state is only updated if it's a valid state transition.
463472
func (a*agent)setLifecycle(ctx context.Context,state codersdk.WorkspaceAgentLifecycle) {
473+
report:= agentsdk.PostLifecycleRequest{
474+
State:state,
475+
ChangedAt:database.Now(),
476+
}
477+
464478
a.lifecycleMu.Lock()
465-
lastState:=a.lifecycleState
466-
ifslices.Index(codersdk.WorkspaceAgentLifecycleOrder,lastState)>slices.Index(codersdk.WorkspaceAgentLifecycleOrder,state) {
467-
a.logger.Warn(ctx,"attempted to set lifecycle state to a previous state",slog.F("last",lastState),slog.F("state",state))
479+
lastReport:=a.lifecycleStates[len(a.lifecycleStates)-1]
480+
ifslices.Index(codersdk.WorkspaceAgentLifecycleOrder,lastReport.State)>=slices.Index(codersdk.WorkspaceAgentLifecycleOrder,report.State) {
481+
a.logger.Warn(ctx,"attempted to set lifecycle state to a previous state",slog.F("last",lastReport),slog.F("current",report))
468482
a.lifecycleMu.Unlock()
469483
return
470484
}
471-
a.lifecycleState=state
472-
a.logger.Debug(ctx,"set lifecycle state",slog.F("state",state),slog.F("last",lastState))
485+
a.lifecycleStates=append(a.lifecycleStates,report)
486+
a.logger.Debug(ctx,"set lifecycle state",slog.F("current",report),slog.F("last",lastReport))
473487
a.lifecycleMu.Unlock()
474488

475489
select {
@@ -534,7 +548,6 @@ func (a *agent) run(ctx context.Context) error {
534548

535549
lifecycleState:=codersdk.WorkspaceAgentLifecycleReady
536550
scriptDone:=make(chanerror,1)
537-
scriptStart:=time.Now()
538551
err=a.trackConnGoroutine(func() {
539552
deferclose(scriptDone)
540553
scriptDone<-a.runStartupScript(ctx,manifest.StartupScript)
@@ -556,22 +569,15 @@ func (a *agent) run(ctx context.Context) error {
556569
select {
557570
caseerr=<-scriptDone:
558571
case<-timeout:
559-
a.logger.Warn(ctx,"startupscript timed out")
572+
a.logger.Warn(ctx,"script timed out",slog.F("lifecycle","startup"),slog.F("timeout",manifest.ShutdownScriptTimeout))
560573
a.setLifecycle(ctx,codersdk.WorkspaceAgentLifecycleStartTimeout)
561574
err=<-scriptDone// The script can still complete after a timeout.
562575
}
563-
iferrors.Is(err,context.Canceled) {
564-
return
565-
}
566-
// Only log if there was a startup script.
567-
ifmanifest.StartupScript!="" {
568-
execTime:=time.Since(scriptStart)
569-
iferr!=nil {
570-
a.logger.Warn(ctx,"startup script failed",slog.F("execution_time",execTime),slog.Error(err))
571-
lifecycleState=codersdk.WorkspaceAgentLifecycleStartError
572-
}else {
573-
a.logger.Info(ctx,"startup script completed",slog.F("execution_time",execTime))
576+
iferr!=nil {
577+
iferrors.Is(err,context.Canceled) {
578+
return
574579
}
580+
lifecycleState=codersdk.WorkspaceAgentLifecycleStartError
575581
}
576582
a.setLifecycle(ctx,lifecycleState)
577583
}()
@@ -830,20 +836,32 @@ func (a *agent) runShutdownScript(ctx context.Context, script string) error {
830836
returna.runScript(ctx,"shutdown",script)
831837
}
832838

833-
func (a*agent)runScript(ctx context.Context,lifecycle,scriptstring)error {
839+
func (a*agent)runScript(ctx context.Context,lifecycle,scriptstring) (errerror) {
840+
logger:=a.logger.With(slog.F("lifecycle",lifecycle))
841+
834842
ifscript=="" {
843+
logger.Info(ctx,"not running empty script")
835844
returnnil
836845
}
837846

838-
a.logger.Info(ctx,"running script",slog.F("lifecycle",lifecycle),slog.F("script",script))
847+
logger.Info(ctx,"running script",slog.F("script",script))
839848
fileWriter,err:=a.filesystem.OpenFile(filepath.Join(a.logDir,fmt.Sprintf("coder-%s-script.log",lifecycle)),os.O_CREATE|os.O_RDWR,0o600)
840849
iferr!=nil {
841850
returnxerrors.Errorf("open %s script log file: %w",lifecycle,err)
842851
}
843852
deferfunc() {
844-
_=fileWriter.Close()
853+
err:=fileWriter.Close()
854+
iferr!=nil {
855+
logger.Warn(ctx,"close script log file",slog.Error(err))
856+
}
845857
}()
846858

859+
cmdPty,err:=a.sshServer.CreateCommand(ctx,script,nil)
860+
iferr!=nil {
861+
returnxerrors.Errorf("create command: %w",err)
862+
}
863+
cmd:=cmdPty.AsExec()
864+
847865
varwriter io.Writer=fileWriter
848866
iflifecycle=="startup" {
849867
// Create pipes for startup logs reader and writer
@@ -862,13 +880,26 @@ func (a *agent) runScript(ctx context.Context, lifecycle, script string) error {
862880
}()
863881
}
864882

865-
cmdPty,err:=a.sshServer.CreateCommand(ctx,script,nil)
866-
iferr!=nil {
867-
returnxerrors.Errorf("create command: %w",err)
868-
}
869-
cmd:=cmdPty.AsExec()
870883
cmd.Stdout=writer
871884
cmd.Stderr=writer
885+
886+
start:=time.Now()
887+
deferfunc() {
888+
end:=time.Now()
889+
execTime:=end.Sub(start)
890+
exitCode:=0
891+
iferr!=nil {
892+
exitCode=255// Unknown status.
893+
varexitError*exec.ExitError
894+
ifxerrors.As(err,&exitError) {
895+
exitCode=exitError.ExitCode()
896+
}
897+
logger.Warn(ctx,"script failed",slog.F("execution_time",execTime),slog.F("exit_code",exitCode),slog.Error(err))
898+
}else {
899+
logger.Info(ctx,"script completed",slog.F("execution_time",execTime),slog.F("exit_code",exitCode))
900+
}
901+
}()
902+
872903
err=cmd.Run()
873904
iferr!=nil {
874905
// cmd.Run does not return a context canceled error, it returns "signal: killed".
@@ -929,11 +960,6 @@ func (a *agent) trackScriptLogs(ctx context.Context, reader io.Reader) (chan str
929960
iferrors.As(err,&sdkErr) {
930961
ifsdkErr.StatusCode()==http.StatusRequestEntityTooLarge {
931962
a.logger.Warn(ctx,"startup logs too large, dropping logs")
932-
// Always send the EOF even if logs overflow.
933-
iflen(logsToSend)>1&&logsToSend[len(logsToSend)-1].EOF {
934-
logsToSend=logsToSend[len(logsToSend)-1:]
935-
continue
936-
}
937963
break
938964
}
939965
}
@@ -986,11 +1012,6 @@ func (a *agent) trackScriptLogs(ctx context.Context, reader io.Reader) (chan str
9861012
iferr:=scanner.Err();err!=nil {
9871013
a.logger.Error(ctx,"scan startup logs",slog.Error(err))
9881014
}
989-
queueLog(agentsdk.StartupLog{
990-
CreatedAt:database.Now(),
991-
Output:"",
992-
EOF:true,
993-
})
9941015
deferclose(logsFinished)
9951016
logsFlushed.L.Lock()
9961017
for {
@@ -1392,7 +1413,6 @@ func (a *agent) Close() error {
13921413
lifecycleState:=codersdk.WorkspaceAgentLifecycleOff
13931414
ifmanifest:=a.manifest.Load();manifest!=nil&&manifest.ShutdownScript!="" {
13941415
scriptDone:=make(chanerror,1)
1395-
scriptStart:=time.Now()
13961416
gofunc() {
13971417
deferclose(scriptDone)
13981418
scriptDone<-a.runShutdownScript(ctx,manifest.ShutdownScript)
@@ -1411,16 +1431,12 @@ func (a *agent) Close() error {
14111431
select {
14121432
caseerr=<-scriptDone:
14131433
case<-timeout:
1414-
a.logger.Warn(ctx,"shutdownscript timed out")
1434+
a.logger.Warn(ctx,"script timed out",slog.F("lifecycle","shutdown"),slog.F("timeout",manifest.ShutdownScriptTimeout))
14151435
a.setLifecycle(ctx,codersdk.WorkspaceAgentLifecycleShutdownTimeout)
14161436
err=<-scriptDone// The script can still complete after a timeout.
14171437
}
1418-
execTime:=time.Since(scriptStart)
14191438
iferr!=nil {
1420-
a.logger.Warn(ctx,"shutdown script failed",slog.F("execution_time",execTime),slog.Error(err))
14211439
lifecycleState=codersdk.WorkspaceAgentLifecycleShutdownError
1422-
}else {
1423-
a.logger.Info(ctx,"shutdown script completed",slog.F("execution_time",execTime))
14241440
}
14251441
}
14261442

‎agent/agent_test.go

Lines changed: 9 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -905,10 +905,8 @@ func TestAgent_StartupScript(t *testing.T) {
905905
returnlen(got)>0&&got[len(got)-1]==codersdk.WorkspaceAgentLifecycleReady
906906
},testutil.WaitShort,testutil.IntervalMedium)
907907

908-
require.Len(t,client.getStartupLogs(),2)
908+
require.Len(t,client.getStartupLogs(),1)
909909
require.Equal(t,output,client.getStartupLogs()[0].Output)
910-
require.False(t,client.getStartupLogs()[0].EOF)
911-
require.True(t,client.getStartupLogs()[1].EOF)
912910
})
913911
// This ensures that even when coderd sends back that the startup
914912
// script has written too many lines it will still succeed!
@@ -1115,15 +1113,8 @@ func TestAgent_Lifecycle(t *testing.T) {
11151113
got=client.getLifecycleStates()
11161114
returnlen(got)>0&&got[len(got)-1]==want[len(want)-1]
11171115
},testutil.WaitShort,testutil.IntervalMedium)
1118-
switchlen(got) {
1119-
case1:
1120-
// This can happen if lifecycle state updates are
1121-
// too fast, only the latest one is reported.
1122-
require.Equal(t,want[1:],got)
1123-
default:
1124-
// This is the expected case.
1125-
require.Equal(t,want,got)
1126-
}
1116+
1117+
require.Equal(t,want,got)
11271118
})
11281119

11291120
t.Run("StartError",func(t*testing.T) {
@@ -1144,15 +1135,8 @@ func TestAgent_Lifecycle(t *testing.T) {
11441135
got=client.getLifecycleStates()
11451136
returnlen(got)>0&&got[len(got)-1]==want[len(want)-1]
11461137
},testutil.WaitShort,testutil.IntervalMedium)
1147-
switchlen(got) {
1148-
case1:
1149-
// This can happen if lifecycle state updates are
1150-
// too fast, only the latest one is reported.
1151-
require.Equal(t,want[1:],got)
1152-
default:
1153-
// This is the expected case.
1154-
require.Equal(t,want,got)
1155-
}
1138+
1139+
require.Equal(t,want,got)
11561140
})
11571141

11581142
t.Run("Ready",func(t*testing.T) {
@@ -1173,15 +1157,8 @@ func TestAgent_Lifecycle(t *testing.T) {
11731157
got=client.getLifecycleStates()
11741158
returnlen(got)>0&&got[len(got)-1]==want[len(want)-1]
11751159
},testutil.WaitShort,testutil.IntervalMedium)
1176-
switchlen(got) {
1177-
case1:
1178-
// This can happen if lifecycle state updates are
1179-
// too fast, only the latest one is reported.
1180-
require.Equal(t,want[1:],got)
1181-
default:
1182-
// This is the expected case.
1183-
require.Equal(t,want,got)
1184-
}
1160+
1161+
require.Equal(t,want,got)
11851162
})
11861163

11871164
t.Run("ShuttingDown",func(t*testing.T) {
@@ -1258,15 +1235,7 @@ func TestAgent_Lifecycle(t *testing.T) {
12581235
returnlen(got)>0&&got[len(got)-1]==want[len(want)-1]
12591236
},testutil.WaitShort,testutil.IntervalMedium)
12601237

1261-
switchlen(got) {
1262-
case1:
1263-
// This can happen if lifecycle state updates are
1264-
// too fast, only the latest one is reported.
1265-
require.Equal(t,want[1:],got)
1266-
default:
1267-
// This is the expected case.
1268-
require.Equal(t,want,got)
1269-
}
1238+
require.Equal(t,want,got)
12701239
})
12711240

12721241
t.Run("ShutdownError",func(t*testing.T) {
@@ -1305,15 +1274,7 @@ func TestAgent_Lifecycle(t *testing.T) {
13051274
returnlen(got)>0&&got[len(got)-1]==want[len(want)-1]
13061275
},testutil.WaitShort,testutil.IntervalMedium)
13071276

1308-
switchlen(got) {
1309-
case1:
1310-
// This can happen if lifecycle state updates are
1311-
// too fast, only the latest one is reported.
1312-
require.Equal(t,want[1:],got)
1313-
default:
1314-
// This is the expected case.
1315-
require.Equal(t,want,got)
1316-
}
1277+
require.Equal(t,want,got)
13171278
})
13181279

13191280
t.Run("ShutdownScriptOnce",func(t*testing.T) {

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp