- Notifications
You must be signed in to change notification settings - Fork928
refactor: replace startup script logs EOF with starting/ready time#8082
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to ourterms of service andprivacy statement. We’ll occasionally send you account related emails.
Already on GitHub?Sign in to your account
Uh oh!
There was an error while loading.Please reload this page.
Changes fromall commits
73f19d6
43a0f87
8ff8c02
b4f78eb
File filter
Filter by extension
Conversations
Uh oh!
There was an error while loading.Please reload this page.
Jump to
Uh oh!
There was an error while loading.Please reload this page.
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -14,6 +14,7 @@ import ( | ||
"net/http" | ||
"net/netip" | ||
"os" | ||
"os/exec" | ||
"os/user" | ||
"path/filepath" | ||
"sort" | ||
@@ -126,6 +127,7 @@ func New(options Options) Agent { | ||
tempDir: options.TempDir, | ||
lifecycleUpdate: make(chan struct{}, 1), | ||
lifecycleReported: make(chan codersdk.WorkspaceAgentLifecycle, 1), | ||
lifecycleStates: []agentsdk.PostLifecycleRequest{{State: codersdk.WorkspaceAgentLifecycleCreated}}, | ||
ignorePorts: options.IgnorePorts, | ||
connStatsChan: make(chan *agentsdk.Stats, 1), | ||
sshMaxTimeout: options.SSHMaxTimeout, | ||
@@ -170,7 +172,7 @@ type agent struct { | ||
lifecycleUpdate chan struct{} | ||
lifecycleReported chan codersdk.WorkspaceAgentLifecycle | ||
lifecycleMu sync.RWMutex // Protects following. | ||
lifecycleStates[]agentsdk.PostLifecycleRequest | ||
network *tailnet.Conn | ||
connStatsChan chan *agentsdk.Stats | ||
@@ -414,11 +416,10 @@ func (a *agent) reportMetadataLoop(ctx context.Context) { | ||
} | ||
} | ||
// reportLifecycleLoop reports the current lifecycle state once. All state | ||
// changes are reported in order. | ||
MemberAuthor There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others.Learn more. Review: Originally, this was a weird/performance choice to only submit the latest status which simply resulted in needing to document the behavior increasing the complexity. We now change it so that the agent always reports all states (still non-blocking), we achieve this by adding the timestamp for the event in the payload. | ||
func (a *agent) reportLifecycleLoop(ctx context.Context) { | ||
lastReportedIndex := 0 // Start off with the created state without reporting it. | ||
for { | ||
select { | ||
case <-a.lifecycleUpdate: | ||
@@ -428,24 +429,32 @@ func (a *agent) reportLifecycleLoop(ctx context.Context) { | ||
for r := retry.New(time.Second, 15*time.Second); r.Wait(ctx); { | ||
a.lifecycleMu.RLock() | ||
lastIndex := len(a.lifecycleStates) - 1 | ||
report := a.lifecycleStates[lastReportedIndex] | ||
if len(a.lifecycleStates) > lastReportedIndex+1 { | ||
report = a.lifecycleStates[lastReportedIndex+1] | ||
} | ||
a.lifecycleMu.RUnlock() | ||
iflastIndex ==lastReportedIndex { | ||
break | ||
} | ||
a.logger.Debug(ctx, "reporting lifecycle state", slog.F("payload",report)) | ||
err := a.client.PostLifecycle(ctx, report) | ||
if err == nil { | ||
lastReportedIndex++ | ||
select { | ||
case a.lifecycleReported <-report.State: | ||
case <-a.lifecycleReported: | ||
a.lifecycleReported <- report.State | ||
} | ||
if lastReportedIndex < lastIndex { | ||
// Keep reporting until we've sent all messages, we can't | ||
// rely on the channel triggering us before the backlog is | ||
// consumed. | ||
continue | ||
} | ||
break | ||
} | ||
@@ -461,15 +470,20 @@ func (a *agent) reportLifecycleLoop(ctx context.Context) { | ||
// setLifecycle sets the lifecycle state and notifies the lifecycle loop. | ||
// The state is only updated if it's a valid state transition. | ||
func (a *agent) setLifecycle(ctx context.Context, state codersdk.WorkspaceAgentLifecycle) { | ||
report := agentsdk.PostLifecycleRequest{ | ||
State: state, | ||
ChangedAt: database.Now(), | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others.Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others.Learn more. Intentionally used Thoughts@kylecarbs? For all intents and purposes, this shouldn't matter since the DB fields are There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others.Learn more.
| ||
} | ||
a.lifecycleMu.Lock() | ||
lastReport := a.lifecycleStates[len(a.lifecycleStates)-1] | ||
if slices.Index(codersdk.WorkspaceAgentLifecycleOrder,lastReport.State) >= slices.Index(codersdk.WorkspaceAgentLifecycleOrder,report.State) { | ||
a.logger.Warn(ctx, "attempted to set lifecycle state to a previous state", slog.F("last",lastReport), slog.F("current",report)) | ||
a.lifecycleMu.Unlock() | ||
return | ||
} | ||
a.lifecycleStates =append(a.lifecycleStates, report) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others.Learn more. Review: Guaranteed to not grow unboundedly ( | ||
a.logger.Debug(ctx, "set lifecycle state", slog.F("current",report), slog.F("last",lastReport)) | ||
a.lifecycleMu.Unlock() | ||
select { | ||
@@ -534,7 +548,6 @@ func (a *agent) run(ctx context.Context) error { | ||
lifecycleState := codersdk.WorkspaceAgentLifecycleReady | ||
scriptDone := make(chan error, 1) | ||
err = a.trackConnGoroutine(func() { | ||
defer close(scriptDone) | ||
scriptDone <- a.runStartupScript(ctx, manifest.StartupScript) | ||
@@ -556,22 +569,15 @@ func (a *agent) run(ctx context.Context) error { | ||
select { | ||
case err = <-scriptDone: | ||
case <-timeout: | ||
a.logger.Warn(ctx, "script timed out", slog.F("lifecycle", "startup"), slog.F("timeout", manifest.ShutdownScriptTimeout)) | ||
a.setLifecycle(ctx, codersdk.WorkspaceAgentLifecycleStartTimeout) | ||
err = <-scriptDone // The script can still complete after a timeout. | ||
} | ||
if err != nil { | ||
if errors.Is(err, context.Canceled) { | ||
return | ||
} | ||
lifecycleState = codersdk.WorkspaceAgentLifecycleStartError | ||
} | ||
a.setLifecycle(ctx, lifecycleState) | ||
}() | ||
@@ -830,20 +836,31 @@ func (a *agent) runShutdownScript(ctx context.Context, script string) error { | ||
return a.runScript(ctx, "shutdown", script) | ||
} | ||
func (a *agent) runScript(ctx context.Context, lifecycle, script string)(errerror) { | ||
if script == "" { | ||
return nil | ||
} | ||
logger := a.logger.With(slog.F("lifecycle", lifecycle)) | ||
logger.Info(ctx, fmt.Sprintf("running %s script", lifecycle), slog.F("script", script)) | ||
fileWriter, err := a.filesystem.OpenFile(filepath.Join(a.logDir, fmt.Sprintf("coder-%s-script.log", lifecycle)), os.O_CREATE|os.O_RDWR, 0o600) | ||
if err != nil { | ||
return xerrors.Errorf("open %s script log file: %w", lifecycle, err) | ||
} | ||
defer func() { | ||
err := fileWriter.Close() | ||
if err != nil { | ||
logger.Warn(ctx, fmt.Sprintf("close %s script log file", lifecycle), slog.Error(err)) | ||
} | ||
}() | ||
cmdPty, err := a.sshServer.CreateCommand(ctx, script, nil) | ||
if err != nil { | ||
return xerrors.Errorf("%s script: create command: %w", lifecycle, err) | ||
} | ||
cmd := cmdPty.AsExec() | ||
var writer io.Writer = fileWriter | ||
if lifecycle == "startup" { | ||
// Create pipes for startup logs reader and writer | ||
@@ -854,29 +871,42 @@ func (a *agent) runScript(ctx context.Context, lifecycle, script string) error { | ||
writer = io.MultiWriter(fileWriter, logsWriter) | ||
flushedLogs, err := a.trackScriptLogs(ctx, logsReader) | ||
if err != nil { | ||
return xerrors.Errorf("track%sscript logs: %w", lifecycle, err) | ||
} | ||
defer func() { | ||
_ = logsWriter.Close() | ||
<-flushedLogs | ||
}() | ||
} | ||
cmd.Stdout = writer | ||
cmd.Stderr = writer | ||
start := time.Now() | ||
defer func() { | ||
end := time.Now() | ||
execTime := end.Sub(start) | ||
exitCode := 0 | ||
if err != nil { | ||
exitCode = 255 // Unknown status. | ||
var exitError *exec.ExitError | ||
if xerrors.As(err, &exitError) { | ||
exitCode = exitError.ExitCode() | ||
} | ||
logger.Warn(ctx, fmt.Sprintf("%s script failed", lifecycle), slog.F("execution_time", execTime), slog.F("exit_code", exitCode), slog.Error(err)) | ||
} else { | ||
logger.Info(ctx, fmt.Sprintf("%s script completed", lifecycle), slog.F("execution_time", execTime), slog.F("exit_code", exitCode)) | ||
} | ||
}() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others.Learn more. Review: Small cleanup to unify logging between startup and shutdown scripts. | ||
err = cmd.Run() | ||
if err != nil { | ||
// cmd.Run does not return a context canceled error, it returns "signal: killed". | ||
if ctx.Err() != nil { | ||
return ctx.Err() | ||
} | ||
return xerrors.Errorf("%s script:run: %w", lifecycle, err) | ||
} | ||
return nil | ||
} | ||
@@ -929,11 +959,6 @@ func (a *agent) trackScriptLogs(ctx context.Context, reader io.Reader) (chan str | ||
if errors.As(err, &sdkErr) { | ||
if sdkErr.StatusCode() == http.StatusRequestEntityTooLarge { | ||
a.logger.Warn(ctx, "startup logs too large, dropping logs") | ||
break | ||
} | ||
} | ||
@@ -986,11 +1011,6 @@ func (a *agent) trackScriptLogs(ctx context.Context, reader io.Reader) (chan str | ||
if err := scanner.Err(); err != nil { | ||
a.logger.Error(ctx, "scan startup logs", slog.Error(err)) | ||
} | ||
defer close(logsFinished) | ||
logsFlushed.L.Lock() | ||
for { | ||
@@ -1392,7 +1412,6 @@ func (a *agent) Close() error { | ||
lifecycleState := codersdk.WorkspaceAgentLifecycleOff | ||
if manifest := a.manifest.Load(); manifest != nil && manifest.ShutdownScript != "" { | ||
scriptDone := make(chan error, 1) | ||
go func() { | ||
defer close(scriptDone) | ||
scriptDone <- a.runShutdownScript(ctx, manifest.ShutdownScript) | ||
@@ -1411,16 +1430,12 @@ func (a *agent) Close() error { | ||
select { | ||
case err = <-scriptDone: | ||
case <-timeout: | ||
a.logger.Warn(ctx, "script timed out", slog.F("lifecycle", "shutdown"), slog.F("timeout", manifest.ShutdownScriptTimeout)) | ||
a.setLifecycle(ctx, codersdk.WorkspaceAgentLifecycleShutdownTimeout) | ||
err = <-scriptDone // The script can still complete after a timeout. | ||
} | ||
if err != nil { | ||
lifecycleState = codersdk.WorkspaceAgentLifecycleShutdownError | ||
} | ||
} | ||
Uh oh!
There was an error while loading.Please reload this page.