@@ -14,6 +14,7 @@ import (
14
14
"net/http"
15
15
"net/netip"
16
16
"os"
17
+ "os/exec"
17
18
"os/user"
18
19
"path/filepath"
19
20
"sort"
@@ -126,6 +127,7 @@ func New(options Options) Agent {
126
127
tempDir :options .TempDir ,
127
128
lifecycleUpdate :make (chan struct {},1 ),
128
129
lifecycleReported :make (chan codersdk.WorkspaceAgentLifecycle ,1 ),
130
+ lifecycleStates : []agentsdk.PostLifecycleRequest {{State :codersdk .WorkspaceAgentLifecycleCreated }},
129
131
ignorePorts :options .IgnorePorts ,
130
132
connStatsChan :make (chan * agentsdk.Stats ,1 ),
131
133
sshMaxTimeout :options .SSHMaxTimeout ,
@@ -170,7 +172,7 @@ type agent struct {
170
172
lifecycleUpdate chan struct {}
171
173
lifecycleReported chan codersdk.WorkspaceAgentLifecycle
172
174
lifecycleMu sync.RWMutex // Protects following.
173
- lifecycleState codersdk. WorkspaceAgentLifecycle
175
+ lifecycleStates []agentsdk. PostLifecycleRequest
174
176
175
177
network * tailnet.Conn
176
178
connStatsChan chan * agentsdk.Stats
@@ -414,11 +416,10 @@ func (a *agent) reportMetadataLoop(ctx context.Context) {
414
416
}
415
417
}
416
418
417
- // reportLifecycleLoop reports the current lifecycle state once.
418
- // Only the latest state is reported, intermediate states may be
419
- // lost if the agent can't communicate with the API.
419
+ // reportLifecycleLoop reports the current lifecycle state once. All state
420
+ // changes are reported in order.
420
421
func (a * agent )reportLifecycleLoop (ctx context.Context ) {
421
- var lastReported codersdk. WorkspaceAgentLifecycle
422
+ lastReportedIndex := 0 // Start off with the created state without reporting it.
422
423
for {
423
424
select {
424
425
case <- a .lifecycleUpdate :
@@ -428,24 +429,32 @@ func (a *agent) reportLifecycleLoop(ctx context.Context) {
428
429
429
430
for r := retry .New (time .Second ,15 * time .Second );r .Wait (ctx ); {
430
431
a .lifecycleMu .RLock ()
431
- state := a .lifecycleState
432
+ lastIndex := len (a .lifecycleStates )- 1
433
+ report := a .lifecycleStates [lastReportedIndex ]
434
+ if len (a .lifecycleStates )> lastReportedIndex + 1 {
435
+ report = a .lifecycleStates [lastReportedIndex + 1 ]
436
+ }
432
437
a .lifecycleMu .RUnlock ()
433
438
434
- if state == lastReported {
439
+ if lastIndex == lastReportedIndex {
435
440
break
436
441
}
437
442
438
- a .logger .Debug (ctx ,"reporting lifecycle state" ,slog .F ("state " ,state ))
443
+ a .logger .Debug (ctx ,"reporting lifecycle state" ,slog .F ("payload " ,report ))
439
444
440
- err := a .client .PostLifecycle (ctx , agentsdk.PostLifecycleRequest {
441
- State :state ,
442
- })
445
+ err := a .client .PostLifecycle (ctx ,report )
443
446
if err == nil {
444
- lastReported = state
447
+ lastReportedIndex ++
445
448
select {
446
- case a .lifecycleReported <- state :
449
+ case a .lifecycleReported <- report . State :
447
450
case <- a .lifecycleReported :
448
- a .lifecycleReported <- state
451
+ a .lifecycleReported <- report .State
452
+ }
453
+ if lastReportedIndex < lastIndex {
454
+ // Keep reporting until we've sent all messages, we can't
455
+ // rely on the channel triggering us before the backlog is
456
+ // consumed.
457
+ continue
449
458
}
450
459
break
451
460
}
@@ -461,15 +470,20 @@ func (a *agent) reportLifecycleLoop(ctx context.Context) {
461
470
// setLifecycle sets the lifecycle state and notifies the lifecycle loop.
462
471
// The state is only updated if it's a valid state transition.
463
472
func (a * agent )setLifecycle (ctx context.Context ,state codersdk.WorkspaceAgentLifecycle ) {
473
+ report := agentsdk.PostLifecycleRequest {
474
+ State :state ,
475
+ ChangedAt :database .Now (),
476
+ }
477
+
464
478
a .lifecycleMu .Lock ()
465
- lastState := a .lifecycleState
466
- if slices .Index (codersdk .WorkspaceAgentLifecycleOrder ,lastState )> slices .Index (codersdk .WorkspaceAgentLifecycleOrder ,state ) {
467
- a .logger .Warn (ctx ,"attempted to set lifecycle state to a previous state" ,slog .F ("last" ,lastState ),slog .F ("state " ,state ))
479
+ lastReport := a .lifecycleStates [ len ( a . lifecycleStates ) - 1 ]
480
+ if slices .Index (codersdk .WorkspaceAgentLifecycleOrder ,lastReport . State )>= slices .Index (codersdk .WorkspaceAgentLifecycleOrder ,report . State ) {
481
+ a .logger .Warn (ctx ,"attempted to set lifecycle state to a previous state" ,slog .F ("last" ,lastReport ),slog .F ("current " ,report ))
468
482
a .lifecycleMu .Unlock ()
469
483
return
470
484
}
471
- a .lifecycleState = state
472
- a .logger .Debug (ctx ,"set lifecycle state" ,slog .F ("state " ,state ),slog .F ("last" ,lastState ))
485
+ a .lifecycleStates = append ( a . lifecycleStates , report )
486
+ a .logger .Debug (ctx ,"set lifecycle state" ,slog .F ("current " ,report ),slog .F ("last" ,lastReport ))
473
487
a .lifecycleMu .Unlock ()
474
488
475
489
select {
@@ -534,7 +548,6 @@ func (a *agent) run(ctx context.Context) error {
534
548
535
549
lifecycleState := codersdk .WorkspaceAgentLifecycleReady
536
550
scriptDone := make (chan error ,1 )
537
- scriptStart := time .Now ()
538
551
err = a .trackConnGoroutine (func () {
539
552
defer close (scriptDone )
540
553
scriptDone <- a .runStartupScript (ctx ,manifest .StartupScript )
@@ -556,22 +569,15 @@ func (a *agent) run(ctx context.Context) error {
556
569
select {
557
570
case err = <- scriptDone :
558
571
case <- timeout :
559
- a .logger .Warn (ctx ,"startup script timed out" )
572
+ a .logger .Warn (ctx ,"script timed out" , slog . F ( "lifecycle" , "startup" ), slog . F ( "timeout" , manifest . ShutdownScriptTimeout ) )
560
573
a .setLifecycle (ctx ,codersdk .WorkspaceAgentLifecycleStartTimeout )
561
574
err = <- scriptDone // The script can still complete after a timeout.
562
575
}
563
- if errors .Is (err ,context .Canceled ) {
564
- return
565
- }
566
- // Only log if there was a startup script.
567
- if manifest .StartupScript != "" {
568
- execTime := time .Since (scriptStart )
569
- if err != nil {
570
- a .logger .Warn (ctx ,"startup script failed" ,slog .F ("execution_time" ,execTime ),slog .Error (err ))
571
- lifecycleState = codersdk .WorkspaceAgentLifecycleStartError
572
- }else {
573
- a .logger .Info (ctx ,"startup script completed" ,slog .F ("execution_time" ,execTime ))
576
+ if err != nil {
577
+ if errors .Is (err ,context .Canceled ) {
578
+ return
574
579
}
580
+ lifecycleState = codersdk .WorkspaceAgentLifecycleStartError
575
581
}
576
582
a .setLifecycle (ctx ,lifecycleState )
577
583
}()
@@ -830,20 +836,31 @@ func (a *agent) runShutdownScript(ctx context.Context, script string) error {
830
836
return a .runScript (ctx ,"shutdown" ,script )
831
837
}
832
838
833
- func (a * agent )runScript (ctx context.Context ,lifecycle ,script string )error {
839
+ func (a * agent )runScript (ctx context.Context ,lifecycle ,script string )( err error ) {
834
840
if script == "" {
835
841
return nil
836
842
}
837
843
838
- a .logger .Info (ctx ,"running script" ,slog .F ("lifecycle" ,lifecycle ),slog .F ("script" ,script ))
844
+ logger := a .logger .With (slog .F ("lifecycle" ,lifecycle ))
845
+
846
+ logger .Info (ctx ,fmt .Sprintf ("running %s script" ,lifecycle ),slog .F ("script" ,script ))
839
847
fileWriter ,err := a .filesystem .OpenFile (filepath .Join (a .logDir ,fmt .Sprintf ("coder-%s-script.log" ,lifecycle )),os .O_CREATE | os .O_RDWR ,0o600 )
840
848
if err != nil {
841
849
return xerrors .Errorf ("open %s script log file: %w" ,lifecycle ,err )
842
850
}
843
851
defer func () {
844
- _ = fileWriter .Close ()
852
+ err := fileWriter .Close ()
853
+ if err != nil {
854
+ logger .Warn (ctx ,fmt .Sprintf ("close %s script log file" ,lifecycle ),slog .Error (err ))
855
+ }
845
856
}()
846
857
858
+ cmdPty ,err := a .sshServer .CreateCommand (ctx ,script ,nil )
859
+ if err != nil {
860
+ return xerrors .Errorf ("%s script: create command: %w" ,lifecycle ,err )
861
+ }
862
+ cmd := cmdPty .AsExec ()
863
+
847
864
var writer io.Writer = fileWriter
848
865
if lifecycle == "startup" {
849
866
// Create pipes for startup logs reader and writer
@@ -854,29 +871,42 @@ func (a *agent) runScript(ctx context.Context, lifecycle, script string) error {
854
871
writer = io .MultiWriter (fileWriter ,logsWriter )
855
872
flushedLogs ,err := a .trackScriptLogs (ctx ,logsReader )
856
873
if err != nil {
857
- return xerrors .Errorf ("track script logs: %w" ,err )
874
+ return xerrors .Errorf ("track%s script logs: %w" , lifecycle ,err )
858
875
}
859
876
defer func () {
860
877
_ = logsWriter .Close ()
861
878
<- flushedLogs
862
879
}()
863
880
}
864
881
865
- cmdPty ,err := a .sshServer .CreateCommand (ctx ,script ,nil )
866
- if err != nil {
867
- return xerrors .Errorf ("create command: %w" ,err )
868
- }
869
- cmd := cmdPty .AsExec ()
870
882
cmd .Stdout = writer
871
883
cmd .Stderr = writer
884
+
885
+ start := time .Now ()
886
+ defer func () {
887
+ end := time .Now ()
888
+ execTime := end .Sub (start )
889
+ exitCode := 0
890
+ if err != nil {
891
+ exitCode = 255 // Unknown status.
892
+ var exitError * exec.ExitError
893
+ if xerrors .As (err ,& exitError ) {
894
+ exitCode = exitError .ExitCode ()
895
+ }
896
+ logger .Warn (ctx ,fmt .Sprintf ("%s script failed" ,lifecycle ),slog .F ("execution_time" ,execTime ),slog .F ("exit_code" ,exitCode ),slog .Error (err ))
897
+ }else {
898
+ logger .Info (ctx ,fmt .Sprintf ("%s script completed" ,lifecycle ),slog .F ("execution_time" ,execTime ),slog .F ("exit_code" ,exitCode ))
899
+ }
900
+ }()
901
+
872
902
err = cmd .Run ()
873
903
if err != nil {
874
904
// cmd.Run does not return a context canceled error, it returns "signal: killed".
875
905
if ctx .Err ()!= nil {
876
906
return ctx .Err ()
877
907
}
878
908
879
- return xerrors .Errorf ("run: %w" ,err )
909
+ return xerrors .Errorf ("%s script: run: %w" , lifecycle ,err )
880
910
}
881
911
return nil
882
912
}
@@ -929,11 +959,6 @@ func (a *agent) trackScriptLogs(ctx context.Context, reader io.Reader) (chan str
929
959
if errors .As (err ,& sdkErr ) {
930
960
if sdkErr .StatusCode ()== http .StatusRequestEntityTooLarge {
931
961
a .logger .Warn (ctx ,"startup logs too large, dropping logs" )
932
- // Always send the EOF even if logs overflow.
933
- if len (logsToSend )> 1 && logsToSend [len (logsToSend )- 1 ].EOF {
934
- logsToSend = logsToSend [len (logsToSend )- 1 :]
935
- continue
936
- }
937
962
break
938
963
}
939
964
}
@@ -986,11 +1011,6 @@ func (a *agent) trackScriptLogs(ctx context.Context, reader io.Reader) (chan str
986
1011
if err := scanner .Err ();err != nil {
987
1012
a .logger .Error (ctx ,"scan startup logs" ,slog .Error (err ))
988
1013
}
989
- queueLog (agentsdk.StartupLog {
990
- CreatedAt :database .Now (),
991
- Output :"" ,
992
- EOF :true ,
993
- })
994
1014
defer close (logsFinished )
995
1015
logsFlushed .L .Lock ()
996
1016
for {
@@ -1392,7 +1412,6 @@ func (a *agent) Close() error {
1392
1412
lifecycleState := codersdk .WorkspaceAgentLifecycleOff
1393
1413
if manifest := a .manifest .Load ();manifest != nil && manifest .ShutdownScript != "" {
1394
1414
scriptDone := make (chan error ,1 )
1395
- scriptStart := time .Now ()
1396
1415
go func () {
1397
1416
defer close (scriptDone )
1398
1417
scriptDone <- a .runShutdownScript (ctx ,manifest .ShutdownScript )
@@ -1411,16 +1430,12 @@ func (a *agent) Close() error {
1411
1430
select {
1412
1431
case err = <- scriptDone :
1413
1432
case <- timeout :
1414
- a .logger .Warn (ctx ,"shutdown script timed out" )
1433
+ a .logger .Warn (ctx ,"script timed out" , slog . F ( "lifecycle" , "shutdown" ), slog . F ( "timeout" , manifest . ShutdownScriptTimeout ) )
1415
1434
a .setLifecycle (ctx ,codersdk .WorkspaceAgentLifecycleShutdownTimeout )
1416
1435
err = <- scriptDone // The script can still complete after a timeout.
1417
1436
}
1418
- execTime := time .Since (scriptStart )
1419
1437
if err != nil {
1420
- a .logger .Warn (ctx ,"shutdown script failed" ,slog .F ("execution_time" ,execTime ),slog .Error (err ))
1421
1438
lifecycleState = codersdk .WorkspaceAgentLifecycleShutdownError
1422
- }else {
1423
- a .logger .Info (ctx ,"shutdown script completed" ,slog .F ("execution_time" ,execTime ))
1424
1439
}
1425
1440
}
1426
1441