Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

feat: add agent timings#14713

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to ourterms of service andprivacy statement. We’ll occasionally send you account related emails.

Already on GitHub?Sign in to your account

Merged
DanielleMaywood merged 57 commits intomainfromdm-add-agent-timings
Sep 24, 2024
Merged
Show file tree
Hide file tree
Changes fromall commits
Commits
Show all changes
57 commits
Select commitHold shift + click to select a range
bcfacf7
feat: begin impl of agent script timings
DanielleMaywoodSep 17, 2024
d55b2e6
feat: add job_id and display_name to script timings
DanielleMaywoodSep 17, 2024
05d8a8b
fix: increment migration number
DanielleMaywoodSep 17, 2024
d54f6c3
fix: rename migrations from 251 to 254
DanielleMaywoodSep 18, 2024
d573c67
test: get tests compiling
DanielleMaywoodSep 18, 2024
09a1a42
fix: appease the linter
DanielleMaywoodSep 18, 2024
6d829f3
fix: get tests passing again
DanielleMaywoodSep 18, 2024
4df7831
fix: drop column from correct table
DanielleMaywoodSep 18, 2024
2565f0a
test: add fixture for agent script timings
DanielleMaywoodSep 18, 2024
669b837
fix: typo
DanielleMaywoodSep 18, 2024
df51673
fix: use job id used in provisioner job timings
DanielleMaywoodSep 18, 2024
c6d71d1
fix: increment migration number
DanielleMaywoodSep 18, 2024
a306e48
test: behaviour of script runner
DanielleMaywoodSep 18, 2024
4b0056c
test: rewrite test
DanielleMaywoodSep 18, 2024
14b68a1
test: does exit 1 script break things?
DanielleMaywoodSep 18, 2024
b22cb57
test: rewrite test again
DanielleMaywoodSep 18, 2024
ef5e2fe
fix: revert change
DanielleMaywoodSep 19, 2024
a5b412b
fix: let code breathe
DanielleMaywoodSep 19, 2024
57ebf30
fix: wrap errors
DanielleMaywoodSep 19, 2024
2a49f67
fix: justify nolint
DanielleMaywoodSep 19, 2024
0698584
fix: swap require.Equal argument order
DanielleMaywoodSep 19, 2024
3580069
fix: add mutex operations
DanielleMaywoodSep 19, 2024
9cb1252
feat: add 'ran_on_start' and 'blocked_login' fields
DanielleMaywoodSep 19, 2024
0b0d1ef
fix: update testdata fixture
DanielleMaywoodSep 19, 2024
eb857c1
fix: refer to agent_id instead of job_id in timings
DanielleMaywoodSep 19, 2024
586d88f
fix: JobID -> AgentID in dbauthz_test
DanielleMaywoodSep 19, 2024
b6289bb
fix: add 'id' to scripts, make timing refer to script id
DanielleMaywoodSep 19, 2024
863c3dc
fix: fix broken tests and convert bug
DanielleMaywoodSep 19, 2024
b8d5d1d
fix: update testdata fixtures
DanielleMaywoodSep 19, 2024
61c26ea
fix: update testdata fixtures again
DanielleMaywoodSep 19, 2024
7f8b6f9
feat: capture stage and if script timed out
DanielleMaywoodSep 20, 2024
c20ac32
fix: update migration number
DanielleMaywoodSep 20, 2024
c652133
test: add test for script api
DanielleMaywoodSep 20, 2024
1199b64
fix: fake db query
DanielleMaywoodSep 20, 2024
afa61eb
fix: use UTC time
DanielleMaywoodSep 20, 2024
8d325e2
fix: ensure r.scriptComplete is not nil
DanielleMaywoodSep 20, 2024
aeee582
fix: move err check to right after call
DanielleMaywoodSep 20, 2024
424069c
fix: uppercase sql
DanielleMaywoodSep 20, 2024
9b43a94
fix: use dbtime.Now()
DanielleMaywoodSep 20, 2024
ed31199
fix: debug log on r.scriptCompleted being nil
DanielleMaywoodSep 20, 2024
8814711
fix: ensure correct rbac permissions
DanielleMaywoodSep 20, 2024
0414623
chore: remove DisplayName
DanielleMaywoodSep 20, 2024
a8192a5
Merge branch 'main' into dm-add-agent-timings
DanielleMaywoodSep 20, 2024
08a466a
fix: get tests passing
DanielleMaywoodSep 20, 2024
8e7c757
fix: remove space in sql up
DanielleMaywoodSep 20, 2024
621071e
docs: document ExecuteOption
DanielleMaywoodSep 20, 2024
3b4df92
fix: drop 'RETURNING' from sql
DanielleMaywoodSep 20, 2024
44127b8
chore: remove 'display_name' from timing table
DanielleMaywoodSep 20, 2024
5846216
fix: testdata fixture
DanielleMaywoodSep 20, 2024
c43a143
fix: put r.scriptCompleted call in goroutine
DanielleMaywoodSep 23, 2024
13889bf
fix: track goroutine for test + use separate context for reporting
DanielleMaywoodSep 23, 2024
bf32b89
fix: appease linter, handle trackCommandGoroutine error
DanielleMaywoodSep 23, 2024
7fe6d8c
fix: resolve race condition
DanielleMaywoodSep 23, 2024
d7e86c6
feat: replace timed_out column with status column
DanielleMaywoodSep 23, 2024
6e338f2
test: update testdata fixture
DanielleMaywoodSep 23, 2024
79a620b
fix: apply suggestions from review
DanielleMaywoodSep 24, 2024
180307f
revert: linter changes
DanielleMaywoodSep 24, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 3 additions & 7 deletionsagent/agent.go
View file
Open in desktop
Original file line numberDiff line numberDiff line change
Expand Up@@ -941,17 +941,15 @@ func (a *agent) handleManifest(manifestOK *checkpoint) func(ctx context.Context,
}
}

err = a.scriptRunner.Init(manifest.Scripts)
err = a.scriptRunner.Init(manifest.Scripts, aAPI.ScriptCompleted)
if err != nil {
return xerrors.Errorf("init script runner: %w", err)
}
err = a.trackGoroutine(func() {
start := time.Now()
// here we use the graceful context because the script runner is not directly tied
// to the agent API.
err := a.scriptRunner.Execute(a.gracefulCtx, func(script codersdk.WorkspaceAgentScript) bool {
return script.RunOnStart
})
err := a.scriptRunner.Execute(a.gracefulCtx, agentscripts.ExecuteStartScripts)
// Measure the time immediately after the script has finished
dur := time.Since(start).Seconds()
if err != nil {
Expand DownExpand Up@@ -1844,9 +1842,7 @@ func (a *agent) Close() error {
a.gracefulCancel()

lifecycleState := codersdk.WorkspaceAgentLifecycleOff
err = a.scriptRunner.Execute(a.hardCtx, func(script codersdk.WorkspaceAgentScript) bool {
return script.RunOnStop
})
err = a.scriptRunner.Execute(a.hardCtx, agentscripts.ExecuteStopScripts)
if err != nil {
a.logger.Warn(a.hardCtx, "shutdown script(s) failed", slog.Error(err))
if errors.Is(err, agentscripts.ErrTimeout) {
Expand Down
2 changes: 2 additions & 0 deletionsagent/agent_test.go
View file
Open in desktop
Original file line numberDiff line numberDiff line change
Expand Up@@ -1517,10 +1517,12 @@ func TestAgent_Lifecycle(t *testing.T) {
agentsdk.Manifest{
DERPMap: derpMap,
Scripts: []codersdk.WorkspaceAgentScript{{
ID: uuid.New(),
LogPath: "coder-startup-script.log",
Script: "echo 1",
RunOnStart: true,
}, {
ID: uuid.New(),
LogPath: "coder-shutdown-script.log",
Script: "echo " + expected,
RunOnStop: true,
Expand Down
122 changes: 97 additions & 25 deletionsagent/agentscripts/agentscripts.go
View file
Open in desktop
Original file line numberDiff line numberDiff line change
Expand Up@@ -19,10 +19,13 @@ import (
"github.com/spf13/afero"
"golang.org/x/sync/errgroup"
"golang.org/x/xerrors"
"google.golang.org/protobuf/types/known/timestamppb"

"cdr.dev/slog"

"github.com/coder/coder/v2/agent/agentssh"
"github.com/coder/coder/v2/agent/proto"
"github.com/coder/coder/v2/coderd/database/dbtime"
"github.com/coder/coder/v2/codersdk"
"github.com/coder/coder/v2/codersdk/agentsdk"
)
Expand DownExpand Up@@ -75,18 +78,21 @@ func New(opts Options) *Runner {
}
}

type ScriptCompletedFunc func(context.Context, *proto.WorkspaceAgentScriptCompletedRequest) (*proto.WorkspaceAgentScriptCompletedResponse, error)

type Runner struct {
Options

cronCtx context.Context
cronCtxCancel context.CancelFunc
cmdCloseWait sync.WaitGroup
closed chan struct{}
closeMutex sync.Mutex
cron *cron.Cron
initialized atomic.Bool
scripts []codersdk.WorkspaceAgentScript
dataDir string
cronCtx context.Context
cronCtxCancel context.CancelFunc
cmdCloseWait sync.WaitGroup
closed chan struct{}
closeMutex sync.Mutex
cron *cron.Cron
initialized atomic.Bool
scripts []codersdk.WorkspaceAgentScript
dataDir string
scriptCompleted ScriptCompletedFunc

// scriptsExecuted includes all scripts executed by the workspace agent. Agents
// execute startup scripts, and scripts on a cron schedule. Both will increment
Expand DownExpand Up@@ -116,12 +122,13 @@ func (r *Runner) RegisterMetrics(reg prometheus.Registerer) {
// Init initializes the runner with the provided scripts.
// It also schedules any scripts that have a schedule.
// This function must be called before Execute.
func (r *Runner) Init(scripts []codersdk.WorkspaceAgentScript) error {
func (r *Runner) Init(scripts []codersdk.WorkspaceAgentScript, scriptCompleted ScriptCompletedFunc) error {
if r.initialized.Load() {
return xerrors.New("init: already initialized")
}
r.initialized.Store(true)
r.scripts = scripts
r.scriptCompleted = scriptCompleted
r.Logger.Info(r.cronCtx, "initializing agent scripts", slog.F("script_count", len(scripts)), slog.F("log_dir", r.LogDir))

err := r.Filesystem.MkdirAll(r.ScriptBinDir(), 0o700)
Expand All@@ -135,7 +142,7 @@ func (r *Runner) Init(scripts []codersdk.WorkspaceAgentScript) error {
}
script := script
_, err := r.cron.AddFunc(script.Cron, func() {
err := r.trackRun(r.cronCtx, script)
err := r.trackRun(r.cronCtx, script, ExecuteCronScripts)
if err != nil {
r.Logger.Warn(context.Background(), "run agent script on schedule", slog.Error(err))
}
Expand DownExpand Up@@ -172,22 +179,33 @@ func (r *Runner) StartCron() {
}
}

// ExecuteOption describes what scripts we want to execute.
type ExecuteOption int

// ExecuteOption enums.
const (
ExecuteAllScripts ExecuteOption = iota
ExecuteStartScripts
ExecuteStopScripts
ExecuteCronScripts
)

// Execute runs a set of scripts according to a filter.
func (r *Runner) Execute(ctx context.Context, filter func(script codersdk.WorkspaceAgentScript) bool) error {
if filter == nil {
// Execute em' all!
filter = func(script codersdk.WorkspaceAgentScript) bool {
return true
}
}
func (r *Runner) Execute(ctx context.Context, option ExecuteOption) error {
var eg errgroup.Group
for _, script := range r.scripts {
if !filter(script) {
runScript := (option == ExecuteStartScripts && script.RunOnStart) ||
(option == ExecuteStopScripts && script.RunOnStop) ||
(option == ExecuteCronScripts && script.Cron != "") ||
option == ExecuteAllScripts

if !runScript {
continue
}

script := script
eg.Go(func() error {
err := r.trackRun(ctx, script)
err := r.trackRun(ctx, script, option)
if err != nil {
return xerrors.Errorf("run agent script %q: %w", script.LogSourceID, err)
}
Expand All@@ -198,8 +216,8 @@ func (r *Runner) Execute(ctx context.Context, filter func(script codersdk.Worksp
}

// trackRun wraps "run" with metrics.
func (r *Runner) trackRun(ctx context.Context, script codersdk.WorkspaceAgentScript) error {
err := r.run(ctx, script)
func (r *Runner) trackRun(ctx context.Context, script codersdk.WorkspaceAgentScript, option ExecuteOption) error {
err := r.run(ctx, script, option)
if err != nil {
r.scriptsExecuted.WithLabelValues("false").Add(1)
} else {
Expand All@@ -212,7 +230,7 @@ func (r *Runner) trackRun(ctx context.Context, script codersdk.WorkspaceAgentScr
// If the timeout is exceeded, the process is sent an interrupt signal.
// If the process does not exit after a few seconds, it is forcefully killed.
// This function immediately returns after a timeout, and does not wait for the process to exit.
func (r *Runner) run(ctx context.Context, script codersdk.WorkspaceAgentScript) error {
func (r *Runner) run(ctx context.Context, script codersdk.WorkspaceAgentScript, option ExecuteOption) error {
logPath := script.LogPath
if logPath == "" {
logPath = fmt.Sprintf("coder-script-%s.log", script.LogSourceID)
Expand DownExpand Up@@ -299,9 +317,9 @@ func (r *Runner) run(ctx context.Context, script codersdk.WorkspaceAgentScript)
cmd.Stdout = io.MultiWriter(fileWriter, infoW)
cmd.Stderr = io.MultiWriter(fileWriter, errW)

start :=time.Now()
start :=dbtime.Now()
defer func() {
end :=time.Now()
end :=dbtime.Now()
execTime := end.Sub(start)
exitCode := 0
if err != nil {
Expand All@@ -314,6 +332,60 @@ func (r *Runner) run(ctx context.Context, script codersdk.WorkspaceAgentScript)
} else {
logger.Info(ctx, fmt.Sprintf("%s script completed", logPath), slog.F("execution_time", execTime), slog.F("exit_code", exitCode))
}

if r.scriptCompleted == nil {
logger.Debug(ctx, "r.scriptCompleted unexpectedly nil")
return
}

// We want to check this outside of the goroutine to avoid a race condition
timedOut := errors.Is(err, ErrTimeout)
pipesLeftOpen := errors.Is(err, ErrOutputPipesOpen)

err = r.trackCommandGoroutine(func() {
var stage proto.Timing_Stage
switch option {
case ExecuteStartScripts:
stage = proto.Timing_START
case ExecuteStopScripts:
stage = proto.Timing_STOP
case ExecuteCronScripts:
stage = proto.Timing_CRON
}

var status proto.Timing_Status
switch {
case timedOut:
status = proto.Timing_TIMED_OUT
case pipesLeftOpen:
status = proto.Timing_PIPES_LEFT_OPEN
case exitCode != 0:
status = proto.Timing_EXIT_FAILURE
default:
status = proto.Timing_OK
}

reportTimeout := 30 * time.Second
reportCtx, cancel := context.WithTimeout(context.Background(), reportTimeout)
defer cancel()

_, err := r.scriptCompleted(reportCtx, &proto.WorkspaceAgentScriptCompletedRequest{
Timing: &proto.Timing{
ScriptId: script.ID[:],
Start: timestamppb.New(start),
End: timestamppb.New(end),
ExitCode: int32(exitCode),
Stage: stage,
Status: status,
},
})
if err != nil {
logger.Error(ctx, fmt.Sprintf("reporting script completed: %s", err.Error()))
}
})
if err != nil {
logger.Error(ctx, fmt.Sprintf("reporting script completed: track command goroutine: %s", err.Error()))
}
}()

err = cmd.Start()
Expand Down
50 changes: 40 additions & 10 deletionsagent/agentscripts/agentscripts_test.go
View file
Open in desktop
Original file line numberDiff line numberDiff line change
Expand Up@@ -17,6 +17,7 @@ import (
"cdr.dev/slog/sloggers/slogtest"
"github.com/coder/coder/v2/agent/agentscripts"
"github.com/coder/coder/v2/agent/agentssh"
"github.com/coder/coder/v2/agent/agenttest"
"github.com/coder/coder/v2/codersdk"
"github.com/coder/coder/v2/codersdk/agentsdk"
"github.com/coder/coder/v2/testutil"
Expand All@@ -34,14 +35,13 @@ func TestExecuteBasic(t *testing.T) {
return fLogger
})
defer runner.Close()
aAPI := agenttest.NewFakeAgentAPI(t, slogtest.Make(t, nil), nil, nil)
err := runner.Init([]codersdk.WorkspaceAgentScript{{
LogSourceID: uuid.New(),
Script: "echo hello",
}})
}}, aAPI.ScriptCompleted)
require.NoError(t, err)
require.NoError(t, runner.Execute(context.Background(), func(script codersdk.WorkspaceAgentScript) bool {
return true
}))
require.NoError(t, runner.Execute(context.Background(), agentscripts.ExecuteAllScripts))
log := testutil.RequireRecvCtx(ctx, t, fLogger.logs)
require.Equal(t, "hello", log.Output)
}
Expand All@@ -61,18 +61,17 @@ func TestEnv(t *testing.T) {
cmd.exe /c echo %CODER_SCRIPT_BIN_DIR%
`
}
aAPI := agenttest.NewFakeAgentAPI(t, slogtest.Make(t, nil), nil, nil)
err := runner.Init([]codersdk.WorkspaceAgentScript{{
LogSourceID: id,
Script: script,
}})
}}, aAPI.ScriptCompleted)
require.NoError(t, err)

ctx := testutil.Context(t, testutil.WaitLong)

done := testutil.Go(t, func() {
err := runner.Execute(ctx, func(script codersdk.WorkspaceAgentScript) bool {
return true
})
err := runner.Execute(ctx, agentscripts.ExecuteAllScripts)
assert.NoError(t, err)
})
defer func() {
Expand DownExpand Up@@ -103,13 +102,44 @@ func TestTimeout(t *testing.T) {
t.Parallel()
runner := setup(t, nil)
defer runner.Close()
aAPI := agenttest.NewFakeAgentAPI(t, slogtest.Make(t, nil), nil, nil)
err := runner.Init([]codersdk.WorkspaceAgentScript{{
LogSourceID: uuid.New(),
Script: "sleep infinity",
Timeout: time.Millisecond,
}})
}}, aAPI.ScriptCompleted)
require.NoError(t, err)
require.ErrorIs(t, runner.Execute(context.Background(), nil), agentscripts.ErrTimeout)
require.ErrorIs(t, runner.Execute(context.Background(), agentscripts.ExecuteAllScripts), agentscripts.ErrTimeout)
}

func TestScriptReportsTiming(t *testing.T) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others.Learn more.

I think we need to expand this a bit to cover the timeout and non-zero exit status use-cases.

mafredri reacted with thumbs up emoji
Copy link
ContributorAuthor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others.Learn more.

Yeah, definitely a good idea.

I had tried a non-zero exit code but it appeared to cause the test to fail so I'll investigate that again to make sure it wasn't me doing something wrong.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others.Learn more.

Potentially a follow-up PR?

t.Parallel()

ctx := testutil.Context(t, testutil.WaitShort)
fLogger := newFakeScriptLogger()
runner := setup(t, func(uuid2 uuid.UUID) agentscripts.ScriptLogger {
return fLogger
})

aAPI := agenttest.NewFakeAgentAPI(t, slogtest.Make(t, nil), nil, nil)
err := runner.Init([]codersdk.WorkspaceAgentScript{{
DisplayName: "say-hello",
LogSourceID: uuid.New(),
Script: "echo hello",
}}, aAPI.ScriptCompleted)
require.NoError(t, err)
require.NoError(t, runner.Execute(ctx, agentscripts.ExecuteAllScripts))
runner.Close()

log := testutil.RequireRecvCtx(ctx, t, fLogger.logs)
require.Equal(t, "hello", log.Output)

timings := aAPI.GetTimings()
require.Equal(t, 1, len(timings))

timing := timings[0]
require.Equal(t, int32(0), timing.ExitCode)
require.GreaterOrEqual(t, timing.End.AsTime(), timing.Start.AsTime())
}

// TestCronClose exists because cron.Run() can happen after cron.Close().
Expand Down
15 changes: 15 additions & 0 deletionsagent/agenttest/client.go
View file
Open in desktop
Original file line numberDiff line numberDiff line change
Expand Up@@ -170,6 +170,7 @@ type FakeAgentAPI struct {
logsCh chan<- *agentproto.BatchCreateLogsRequest
lifecycleStates []codersdk.WorkspaceAgentLifecycle
metadata map[string]agentsdk.Metadata
timings []*agentproto.Timing

getAnnouncementBannersFunc func() ([]codersdk.BannerConfig, error)
}
Expand All@@ -182,6 +183,12 @@ func (*FakeAgentAPI) GetServiceBanner(context.Context, *agentproto.GetServiceBan
return &agentproto.ServiceBanner{}, nil
}

func (f *FakeAgentAPI) GetTimings() []*agentproto.Timing {
f.Lock()
defer f.Unlock()
return slices.Clone(f.timings)
}

func (f *FakeAgentAPI) SetAnnouncementBannersFunc(fn func() ([]codersdk.BannerConfig, error)) {
f.Lock()
defer f.Unlock()
Expand DownExpand Up@@ -301,6 +308,14 @@ func (f *FakeAgentAPI) BatchCreateLogs(ctx context.Context, req *agentproto.Batc
return &agentproto.BatchCreateLogsResponse{}, nil
}

func (f *FakeAgentAPI) ScriptCompleted(_ context.Context, req *agentproto.WorkspaceAgentScriptCompletedRequest) (*agentproto.WorkspaceAgentScriptCompletedResponse, error) {
f.Lock()
f.timings = append(f.timings, req.Timing)
f.Unlock()

return &agentproto.WorkspaceAgentScriptCompletedResponse{}, nil
}

func NewFakeAgentAPI(t testing.TB, logger slog.Logger, manifest *agentproto.Manifest, statsCh chan *agentproto.Stats) *FakeAgentAPI {
return &FakeAgentAPI{
t: t,
Expand Down
Loading
Loading

[8]ページ先頭

©2009-2025 Movatter.jp