Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

refactor: replace startup script logs EOF with starting/ready time#8082

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to ourterms of service andprivacy statement. We’ll occasionally send you account related emails.

Already on GitHub?Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes fromall commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
131 changes: 73 additions & 58 deletionsagent/agent.go
View file
Open in desktop
Original file line numberDiff line numberDiff line change
Expand Up@@ -14,6 +14,7 @@ import (
"net/http"
"net/netip"
"os"
"os/exec"
"os/user"
"path/filepath"
"sort"
Expand DownExpand Up@@ -126,6 +127,7 @@ func New(options Options) Agent {
tempDir: options.TempDir,
lifecycleUpdate: make(chan struct{}, 1),
lifecycleReported: make(chan codersdk.WorkspaceAgentLifecycle, 1),
lifecycleStates: []agentsdk.PostLifecycleRequest{{State: codersdk.WorkspaceAgentLifecycleCreated}},
ignorePorts: options.IgnorePorts,
connStatsChan: make(chan *agentsdk.Stats, 1),
sshMaxTimeout: options.SSHMaxTimeout,
Expand DownExpand Up@@ -170,7 +172,7 @@ type agent struct {
lifecycleUpdate chan struct{}
lifecycleReported chan codersdk.WorkspaceAgentLifecycle
lifecycleMu sync.RWMutex // Protects following.
lifecycleState codersdk.WorkspaceAgentLifecycle
lifecycleStates[]agentsdk.PostLifecycleRequest

network *tailnet.Conn
connStatsChan chan *agentsdk.Stats
Expand DownExpand Up@@ -414,11 +416,10 @@ func (a *agent) reportMetadataLoop(ctx context.Context) {
}
}

// reportLifecycleLoop reports the current lifecycle state once.
// Only the latest state is reported, intermediate states may be
// lost if the agent can't communicate with the API.
// reportLifecycleLoop reports the current lifecycle state once. All state
// changes are reported in order.
Copy link
MemberAuthor

@mafredrimafredriJun 19, 2023
edited
Loading

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others.Learn more.

Review: Originally, this was a weird/performance choice to only submit the latest status which simply resulted in needing to document the behavior increasing the complexity. We now change it so that the agent always reports all states (still non-blocking), we achieve this by adding the timestamp for the event in the payload.

mtojek reacted with thumbs up emoji
func (a *agent) reportLifecycleLoop(ctx context.Context) {
var lastReported codersdk.WorkspaceAgentLifecycle
lastReportedIndex := 0 // Start off with the created state without reporting it.
for {
select {
case <-a.lifecycleUpdate:
Expand All@@ -428,24 +429,32 @@ func (a *agent) reportLifecycleLoop(ctx context.Context) {

for r := retry.New(time.Second, 15*time.Second); r.Wait(ctx); {
a.lifecycleMu.RLock()
state := a.lifecycleState
lastIndex := len(a.lifecycleStates) - 1
report := a.lifecycleStates[lastReportedIndex]
if len(a.lifecycleStates) > lastReportedIndex+1 {
report = a.lifecycleStates[lastReportedIndex+1]
}
a.lifecycleMu.RUnlock()

ifstate ==lastReported {
iflastIndex ==lastReportedIndex {
break
}

a.logger.Debug(ctx, "reporting lifecycle state", slog.F("state",state))
a.logger.Debug(ctx, "reporting lifecycle state", slog.F("payload",report))

err := a.client.PostLifecycle(ctx, agentsdk.PostLifecycleRequest{
State: state,
})
err := a.client.PostLifecycle(ctx, report)
if err == nil {
lastReported = state
lastReportedIndex++
select {
case a.lifecycleReported <-state:
case a.lifecycleReported <-report.State:
case <-a.lifecycleReported:
a.lifecycleReported <- state
a.lifecycleReported <- report.State
}
if lastReportedIndex < lastIndex {
// Keep reporting until we've sent all messages, we can't
// rely on the channel triggering us before the backlog is
// consumed.
continue
}
break
}
Expand All@@ -461,15 +470,20 @@ func (a *agent) reportLifecycleLoop(ctx context.Context) {
// setLifecycle sets the lifecycle state and notifies the lifecycle loop.
// The state is only updated if it's a valid state transition.
func (a *agent) setLifecycle(ctx context.Context, state codersdk.WorkspaceAgentLifecycle) {
report := agentsdk.PostLifecycleRequest{
State: state,
ChangedAt: database.Now(),
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others.Learn more.

database.Now() ortime.Now()?

Copy link
MemberAuthor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others.Learn more.

Intentionally useddatabase.Now() here for consistency since it's a value that will be stored in the DB. Startup logs also usedatabase.Now() but we do seem to have some mixed use in the agent and some other places may be wrong?

Thoughts@kylecarbs? For all intents and purposes, this shouldn't matter since the DB fields aretimestamptz, what's the motivation for usingdatabase.Now() that always uses UTC? Logging purposes?

mtojek reacted with eyes emoji
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others.Learn more.

what's the motivation for using database.Now() that always uses UTC? Logging purposes?

  • It might be a good candidate for the linter rule.

}

a.lifecycleMu.Lock()
lastState := a.lifecycleState
if slices.Index(codersdk.WorkspaceAgentLifecycleOrder,lastState) > slices.Index(codersdk.WorkspaceAgentLifecycleOrder,state) {
a.logger.Warn(ctx, "attempted to set lifecycle state to a previous state", slog.F("last",lastState), slog.F("state",state))
lastReport := a.lifecycleStates[len(a.lifecycleStates)-1]
if slices.Index(codersdk.WorkspaceAgentLifecycleOrder,lastReport.State) >= slices.Index(codersdk.WorkspaceAgentLifecycleOrder,report.State) {
a.logger.Warn(ctx, "attempted to set lifecycle state to a previous state", slog.F("last",lastReport), slog.F("current",report))
a.lifecycleMu.Unlock()
return
}
a.lifecycleState =state
a.logger.Debug(ctx, "set lifecycle state", slog.F("state",state), slog.F("last",lastState))
a.lifecycleStates =append(a.lifecycleStates, report)
Copy link
MemberAuthor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others.Learn more.

Review: Guaranteed to not grow unboundedly (len(a.lifecycleStates) <= enum entries).

a.logger.Debug(ctx, "set lifecycle state", slog.F("current",report), slog.F("last",lastReport))
a.lifecycleMu.Unlock()

select {
Expand DownExpand Up@@ -534,7 +548,6 @@ func (a *agent) run(ctx context.Context) error {

lifecycleState := codersdk.WorkspaceAgentLifecycleReady
scriptDone := make(chan error, 1)
scriptStart := time.Now()
err = a.trackConnGoroutine(func() {
defer close(scriptDone)
scriptDone <- a.runStartupScript(ctx, manifest.StartupScript)
Expand All@@ -556,22 +569,15 @@ func (a *agent) run(ctx context.Context) error {
select {
case err = <-scriptDone:
case <-timeout:
a.logger.Warn(ctx, "startupscript timed out")
a.logger.Warn(ctx, "script timed out", slog.F("lifecycle", "startup"), slog.F("timeout", manifest.ShutdownScriptTimeout))
a.setLifecycle(ctx, codersdk.WorkspaceAgentLifecycleStartTimeout)
err = <-scriptDone // The script can still complete after a timeout.
}
if errors.Is(err, context.Canceled) {
return
}
// Only log if there was a startup script.
if manifest.StartupScript != "" {
execTime := time.Since(scriptStart)
if err != nil {
a.logger.Warn(ctx, "startup script failed", slog.F("execution_time", execTime), slog.Error(err))
lifecycleState = codersdk.WorkspaceAgentLifecycleStartError
} else {
a.logger.Info(ctx, "startup script completed", slog.F("execution_time", execTime))
if err != nil {
if errors.Is(err, context.Canceled) {
return
}
lifecycleState = codersdk.WorkspaceAgentLifecycleStartError
}
a.setLifecycle(ctx, lifecycleState)
}()
Expand DownExpand Up@@ -830,20 +836,31 @@ func (a *agent) runShutdownScript(ctx context.Context, script string) error {
return a.runScript(ctx, "shutdown", script)
}

func (a *agent) runScript(ctx context.Context, lifecycle, script string) error {
func (a *agent) runScript(ctx context.Context, lifecycle, script string)(errerror) {
if script == "" {
return nil
}

a.logger.Info(ctx, "running script", slog.F("lifecycle", lifecycle), slog.F("script", script))
logger := a.logger.With(slog.F("lifecycle", lifecycle))

logger.Info(ctx, fmt.Sprintf("running %s script", lifecycle), slog.F("script", script))
fileWriter, err := a.filesystem.OpenFile(filepath.Join(a.logDir, fmt.Sprintf("coder-%s-script.log", lifecycle)), os.O_CREATE|os.O_RDWR, 0o600)
if err != nil {
return xerrors.Errorf("open %s script log file: %w", lifecycle, err)
}
defer func() {
_ = fileWriter.Close()
err := fileWriter.Close()
if err != nil {
logger.Warn(ctx, fmt.Sprintf("close %s script log file", lifecycle), slog.Error(err))
}
}()

cmdPty, err := a.sshServer.CreateCommand(ctx, script, nil)
if err != nil {
return xerrors.Errorf("%s script: create command: %w", lifecycle, err)
}
cmd := cmdPty.AsExec()

var writer io.Writer = fileWriter
if lifecycle == "startup" {
// Create pipes for startup logs reader and writer
Expand All@@ -854,29 +871,42 @@ func (a *agent) runScript(ctx context.Context, lifecycle, script string) error {
writer = io.MultiWriter(fileWriter, logsWriter)
flushedLogs, err := a.trackScriptLogs(ctx, logsReader)
if err != nil {
return xerrors.Errorf("track script logs: %w", err)
return xerrors.Errorf("track%sscript logs: %w", lifecycle, err)
}
defer func() {
_ = logsWriter.Close()
<-flushedLogs
}()
}

cmdPty, err := a.sshServer.CreateCommand(ctx, script, nil)
if err != nil {
return xerrors.Errorf("create command: %w", err)
}
cmd := cmdPty.AsExec()
cmd.Stdout = writer
cmd.Stderr = writer

start := time.Now()
defer func() {
end := time.Now()
execTime := end.Sub(start)
exitCode := 0
if err != nil {
exitCode = 255 // Unknown status.
var exitError *exec.ExitError
if xerrors.As(err, &exitError) {
exitCode = exitError.ExitCode()
}
logger.Warn(ctx, fmt.Sprintf("%s script failed", lifecycle), slog.F("execution_time", execTime), slog.F("exit_code", exitCode), slog.Error(err))
} else {
logger.Info(ctx, fmt.Sprintf("%s script completed", lifecycle), slog.F("execution_time", execTime), slog.F("exit_code", exitCode))
}
}()
Copy link
MemberAuthor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others.Learn more.

Review: Small cleanup to unify logging between startup and shutdown scripts.

mtojek reacted with thumbs up emoji

err = cmd.Run()
if err != nil {
// cmd.Run does not return a context canceled error, it returns "signal: killed".
if ctx.Err() != nil {
return ctx.Err()
}

return xerrors.Errorf("run: %w", err)
return xerrors.Errorf("%s script:run: %w", lifecycle, err)
}
return nil
}
Expand DownExpand Up@@ -929,11 +959,6 @@ func (a *agent) trackScriptLogs(ctx context.Context, reader io.Reader) (chan str
if errors.As(err, &sdkErr) {
if sdkErr.StatusCode() == http.StatusRequestEntityTooLarge {
a.logger.Warn(ctx, "startup logs too large, dropping logs")
// Always send the EOF even if logs overflow.
if len(logsToSend) > 1 && logsToSend[len(logsToSend)-1].EOF {
logsToSend = logsToSend[len(logsToSend)-1:]
continue
}
break
}
}
Expand DownExpand Up@@ -986,11 +1011,6 @@ func (a *agent) trackScriptLogs(ctx context.Context, reader io.Reader) (chan str
if err := scanner.Err(); err != nil {
a.logger.Error(ctx, "scan startup logs", slog.Error(err))
}
queueLog(agentsdk.StartupLog{
CreatedAt: database.Now(),
Output: "",
EOF: true,
})
defer close(logsFinished)
logsFlushed.L.Lock()
for {
Expand DownExpand Up@@ -1392,7 +1412,6 @@ func (a *agent) Close() error {
lifecycleState := codersdk.WorkspaceAgentLifecycleOff
if manifest := a.manifest.Load(); manifest != nil && manifest.ShutdownScript != "" {
scriptDone := make(chan error, 1)
scriptStart := time.Now()
go func() {
defer close(scriptDone)
scriptDone <- a.runShutdownScript(ctx, manifest.ShutdownScript)
Expand All@@ -1411,16 +1430,12 @@ func (a *agent) Close() error {
select {
case err = <-scriptDone:
case <-timeout:
a.logger.Warn(ctx, "shutdownscript timed out")
a.logger.Warn(ctx, "script timed out", slog.F("lifecycle", "shutdown"), slog.F("timeout", manifest.ShutdownScriptTimeout))
a.setLifecycle(ctx, codersdk.WorkspaceAgentLifecycleShutdownTimeout)
err = <-scriptDone // The script can still complete after a timeout.
}
execTime := time.Since(scriptStart)
if err != nil {
a.logger.Warn(ctx, "shutdown script failed", slog.F("execution_time", execTime), slog.Error(err))
lifecycleState = codersdk.WorkspaceAgentLifecycleShutdownError
} else {
a.logger.Info(ctx, "shutdown script completed", slog.F("execution_time", execTime))
}
}

Expand Down
57 changes: 9 additions & 48 deletionsagent/agent_test.go
View file
Open in desktop
Original file line numberDiff line numberDiff line change
Expand Up@@ -905,10 +905,8 @@ func TestAgent_StartupScript(t *testing.T) {
return len(got) > 0 && got[len(got)-1] == codersdk.WorkspaceAgentLifecycleReady
}, testutil.WaitShort, testutil.IntervalMedium)

require.Len(t, client.getStartupLogs(),2)
require.Len(t, client.getStartupLogs(),1)
require.Equal(t, output, client.getStartupLogs()[0].Output)
require.False(t, client.getStartupLogs()[0].EOF)
require.True(t, client.getStartupLogs()[1].EOF)
})
// This ensures that even when coderd sends back that the startup
// script has written too many lines it will still succeed!
Expand DownExpand Up@@ -1115,15 +1113,8 @@ func TestAgent_Lifecycle(t *testing.T) {
got = client.getLifecycleStates()
return len(got) > 0 && got[len(got)-1] == want[len(want)-1]
}, testutil.WaitShort, testutil.IntervalMedium)
switch len(got) {
case 1:
// This can happen if lifecycle state updates are
// too fast, only the latest one is reported.
require.Equal(t, want[1:], got)
default:
// This is the expected case.
require.Equal(t, want, got)
}

require.Equal(t, want, got)
})

t.Run("StartError", func(t *testing.T) {
Expand All@@ -1144,15 +1135,8 @@ func TestAgent_Lifecycle(t *testing.T) {
got = client.getLifecycleStates()
return len(got) > 0 && got[len(got)-1] == want[len(want)-1]
}, testutil.WaitShort, testutil.IntervalMedium)
switch len(got) {
case 1:
// This can happen if lifecycle state updates are
// too fast, only the latest one is reported.
require.Equal(t, want[1:], got)
default:
// This is the expected case.
require.Equal(t, want, got)
}

require.Equal(t, want, got)
})

t.Run("Ready", func(t *testing.T) {
Expand All@@ -1173,15 +1157,8 @@ func TestAgent_Lifecycle(t *testing.T) {
got = client.getLifecycleStates()
return len(got) > 0 && got[len(got)-1] == want[len(want)-1]
}, testutil.WaitShort, testutil.IntervalMedium)
switch len(got) {
case 1:
// This can happen if lifecycle state updates are
// too fast, only the latest one is reported.
require.Equal(t, want[1:], got)
default:
// This is the expected case.
require.Equal(t, want, got)
}

require.Equal(t, want, got)
})

t.Run("ShuttingDown", func(t *testing.T) {
Expand DownExpand Up@@ -1258,15 +1235,7 @@ func TestAgent_Lifecycle(t *testing.T) {
return len(got) > 0 && got[len(got)-1] == want[len(want)-1]
}, testutil.WaitShort, testutil.IntervalMedium)

switch len(got) {
case 1:
// This can happen if lifecycle state updates are
// too fast, only the latest one is reported.
require.Equal(t, want[1:], got)
default:
// This is the expected case.
require.Equal(t, want, got)
}
require.Equal(t, want, got)
})

t.Run("ShutdownError", func(t *testing.T) {
Expand DownExpand Up@@ -1305,15 +1274,7 @@ func TestAgent_Lifecycle(t *testing.T) {
return len(got) > 0 && got[len(got)-1] == want[len(want)-1]
}, testutil.WaitShort, testutil.IntervalMedium)

switch len(got) {
case 1:
// This can happen if lifecycle state updates are
// too fast, only the latest one is reported.
require.Equal(t, want[1:], got)
default:
// This is the expected case.
require.Equal(t, want, got)
}
require.Equal(t, want, got)
})

t.Run("ShutdownScriptOnce", func(t *testing.T) {
Expand Down
Loading

[8]ページ先頭

©2009-2025 Movatter.jp