Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

feat: reinitialize agents when a prebuilt workspace is claimed#17475

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
SasSwart merged 49 commits intomainfromjjs/prebuilds-agent-reinit
May 14, 2025
Merged
Show file tree
Hide file tree
Changes from1 commit
Commits
Show all changes
49 commits
Select commitHold shift + click to select a range
c09c9b9
WIP: agent reinitialization
SasSwartApr 21, 2025
476fe71
fix assignment to nil map
SasSwartApr 21, 2025
8c8bca6
fix: ensure prebuilt workspace agent tokens are reused when a prebuil…
SasSwartApr 23, 2025
7ce4eea
test agent reinitialization
SasSwartApr 24, 2025
52ac64e
remove defunct metric
SasSwartApr 24, 2025
362db7c
Remove todo
SasSwartApr 25, 2025
dcc7379
test that we trigger workspace agent reinitialization under the right…
SasSwartApr 28, 2025
ff66b3f
slight improvements to a test
SasSwartApr 28, 2025
efff5d9
review notes to improve legibility
SasSwartApr 28, 2025
cebd5db
add an integration test for prebuilt workspace agent reinitialization
SasSwartApr 29, 2025
2679138
Merge remote-tracking branch 'origin/main' into jjs/prebuilds-agent-r…
SasSwartApr 29, 2025
9feebef
enable the premium license in a prebuilds integration test
SasSwartApr 29, 2025
b117b5c
encapsulate WaitForReinitLoop for easier testing
SasSwartApr 30, 2025
a22b414
introduce unit testable abstraction layers
SasSwartApr 30, 2025
9bbd2c7
test workspace claim pubsub
SasSwartMay 1, 2025
5804201
add tests for agent reinitialization
SasSwartMay 1, 2025
7e8dcee
review notes
SasSwartMay 1, 2025
725f97b
Merge remote-tracking branch 'origin/main' into jjs/prebuilds-agent-r…
SasSwartMay 1, 2025
a9b1567
make fmt lint
SasSwartMay 1, 2025
21ee970
remove go mod replace
SasSwartMay 1, 2025
e54d7e7
remove defunct logging
SasSwartMay 1, 2025
2799858
update dependency on terraform-provider-coder
SasSwartMay 2, 2025
1d93003
update dependency on terraform-provider-coder
SasSwartMay 2, 2025
763fc12
go mod tidy
SasSwartMay 2, 2025
0f879c7
make -B gen
SasSwartMay 2, 2025
61784c9
dont require ids to InsertPresetParameters
SasSwartMay 2, 2025
604eb27
dont require ids to InsertPresetParameters
SasSwartMay 2, 2025
bf4d2cf
fix: set the running agent token
dannykoppingMay 2, 2025
38b4f0d
fix: use http client without timeout like we do in connectRPCVersion
dannykoppingMay 5, 2025
20df538
review notes
SasSwartMay 6, 2025
4bb3b68
Merge remote-tracking branch 'origin/main' into jjs/prebuilds-agent-r…
SasSwartMay 7, 2025
83972db
bump provisionerd proto version
SasSwartMay 7, 2025
146b158
fix: fetch the previous agent when we need its token for prebuilt wor…
SasSwartMay 12, 2025
5eb16cd
Merge remote-tracking branch 'origin/main' into jjs/prebuilds-agent-r…
SasSwartMay 12, 2025
730d803
make -B lint
SasSwartMay 12, 2025
150adc0
Test GetWorkspaceAgentsByBuildID
SasSwartMay 12, 2025
b4ecf10
Rename GetWorkspaceAgentsByWorkspaceAndBuildNumber
SasSwartMay 12, 2025
3fa3edf
make gen
SasSwartMay 12, 2025
7e45919
fix a race condition
SasSwartMay 12, 2025
a632508
Merge remote-tracking branch 'origin/main' into jjs/prebuilds-agent-r…
SasSwartMay 12, 2025
72125ec
Merge remote-tracking branch 'origin/main' into jjs/prebuilds-agent-r…
SasSwartMay 13, 2025
b65eea7
fix provisionerdserver test for prebuild claims
SasSwartMay 13, 2025
e1339f3
fix race conditions
SasSwartMay 13, 2025
c1a8ba6
Merge remote-tracking branch 'origin/main' into jjs/prebuilds-agent-r…
SasSwartMay 13, 2025
5363dcc
Make TestReinitializeAgent more robust
SasSwartMay 13, 2025
7ad9b6d
fix tests
SasSwartMay 14, 2025
394571d
make -B gen
SasSwartMay 14, 2025
890747b
remove a potential race in reinitialization testing in TestCompleteJob
SasSwartMay 14, 2025
b3870db
fix a potential race in TestReinit
SasSwartMay 14, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
PrevPrevious commit
NextNext commit
add tests for agent reinitialization
  • Loading branch information
@SasSwart
SasSwart committedMay 1, 2025
commit580420197f6a736e784dcd3c2f14360c9a716fc6
66 changes: 6 additions & 60 deletionscoderd/prebuilds/claim.go
View file
Open in desktop
Original file line numberDiff line numberDiff line change
Expand Up@@ -2,23 +2,16 @@ package prebuilds

import (
"context"
"net/http"
"sync"

"github.com/google/uuid"
"golang.org/x/xerrors"

"cdr.dev/slog"
"github.com/coder/coder/v2/coderd/database/pubsub"
"github.com/coder/coder/v2/coderd/httpapi"
"github.com/coder/coder/v2/codersdk"
"github.com/coder/coder/v2/codersdk/agentsdk"
)

// WorkspaceClaimPublisher is implemented by types that can publish a
// workspace claim, described by an agentsdk.ReinitializationEvent.
type WorkspaceClaimPublisher interface {
// PublishWorkspaceClaim publishes the given reinitialization event.
// NOTE(review): this signature has no return value, while the
// PubsubWorkspaceClaimPublisher method of the same name appears to return
// a value — confirm the interface matches its intended implementation.
PublishWorkspaceClaim(agentsdk.ReinitializationEvent)
}

// NewPubsubWorkspaceClaimPublisher returns a PubsubWorkspaceClaimPublisher
// that holds ps as its pubsub.
func NewPubsubWorkspaceClaimPublisher(ps pubsub.Pubsub) *PubsubWorkspaceClaimPublisher {
	publisher := PubsubWorkspaceClaimPublisher{ps: ps}
	return &publisher
}
Expand All@@ -35,10 +28,6 @@ func (p PubsubWorkspaceClaimPublisher) PublishWorkspaceClaim(claim agentsdk.Rein
return nil
}

// WorkspaceClaimListener is implemented by types that can listen for claims
// of a specific workspace.
type WorkspaceClaimListener interface {
// ListenForWorkspaceClaims takes a context and a workspace ID and returns a
// func(), a receive-only channel of agentsdk.ReinitializationEvent, and an
// error.
// NOTE(review): the returned func() is presumably the cancel function for
// the subscription — confirm against the implementation.
ListenForWorkspaceClaims(ctx context.Context, workspaceID uuid.UUID) (func(), <-chan agentsdk.ReinitializationEvent, error)
}

// NewPubsubWorkspaceClaimListener returns a PubsubWorkspaceClaimListener that
// holds the given pubsub and logger.
func NewPubsubWorkspaceClaimListener(ps pubsub.Pubsub, logger slog.Logger) *PubsubWorkspaceClaimListener {
	listener := PubsubWorkspaceClaimListener{
		logger: logger,
		ps:     ps,
	}
	return &listener
}
Expand All@@ -49,6 +38,12 @@ type PubsubWorkspaceClaimListener struct {
}

func (p PubsubWorkspaceClaimListener) ListenForWorkspaceClaims(ctx context.Context, workspaceID uuid.UUID) (func(), <-chan agentsdk.ReinitializationEvent, error) {
select {
case <-ctx.Done():
return func() {}, nil, ctx.Err()
default:
}

workspaceClaims := make(chan agentsdk.ReinitializationEvent, 1)
cancelSub, err := p.ps.Subscribe(agentsdk.PrebuildClaimedChannel(workspaceID), func(inner context.Context, id []byte) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others.Learn more.

As we talked about on a call --- the pubsub is not considered a reliable transport, and we can miss events.

This can lead to a situation where the agent misses the reinit signal and never reinitializes, even though it has been claimed.

The deep problem here is that we are using the PubSub to send material information (who the new owner is), rather than just a kick that there is new information available (the workspace has a new owner). In the latter case, when there is an error, we can recover by re-querying the database to find the owner, and then decide whether we need to signal the agent with new information. This requires the handler to keep track of the last owner it sent, but that's a trivial amount of memory to keep.

I don't necessarily think this needs to be fixed in this PR, since the plan is to move to a new "stream of manifest" architecture, but as we implement that, we need to keep error handling in mind on both sides: coderd recovers from a pubsub error by querying the database (or closing the connection if the database query fails), and then deciding whether there is something new to send. The agent recovers from a dropped connection by redialing, and then checking the new manifest against its existing one to see if it needs to take any action.

Copy link
ContributorAuthor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others.Learn more.

Thanks for identifying this. Let's defer it beyond this PR to the next release if there are no objections. I'd like to get to this as part of the manifest streaming work.

claimantID, err := uuid.ParseBytes(id)
Expand DownExpand Up@@ -91,52 +86,3 @@ func (p PubsubWorkspaceClaimListener) ListenForWorkspaceClaims(ctx context.Conte

return cancel, workspaceClaims, nil
}

// StreamAgentReinitEvents forwards every event received on reinitEvents to
// the requester as a server-sent data event.
//
// If the server-sent event sender cannot be set up, an internal server error
// response is written to rw and the function returns. Otherwise an initial
// ping event is sent, and events are forwarded until ctx is done or
// reinitEvents is closed. Failures to send an individual event are logged as
// warnings and do not stop the stream. Once the sender has been set up, the
// function does not return until the sender reports that it is closed.
func StreamAgentReinitEvents(ctx context.Context, logger slog.Logger, rw http.ResponseWriter, r *http.Request, reinitEvents <-chan agentsdk.ReinitializationEvent) {
	sseSendEvent, sseSenderClosed, err := httpapi.ServerSentEventSender(rw, r)
	if err != nil {
		httpapi.Write(ctx, rw, http.StatusInternalServerError, codersdk.Response{
			Message: "Internal error setting up server-sent events.",
			Detail:  err.Error(),
		})
		return
	}
	// Prevent handler from returning until the sender is closed.
	defer func() {
		<-sseSenderClosed
	}()

	// An initial ping signals to the requester that the server is now ready
	// and the client can begin servicing a channel with data.
	_ = sseSendEvent(codersdk.ServerSentEvent{
		Type: codersdk.ServerSentEventTypePing,
	})

	for {
		select {
		case <-ctx.Done():
			return
		case reinitEvent, ok := <-reinitEvents:
			if !ok {
				// A closed channel yields zero values forever. Stop here
				// instead of spinning and sending empty reinitialization
				// events to the agent.
				return
			}
			err = sseSendEvent(codersdk.ServerSentEvent{
				Type: codersdk.ServerSentEventTypeData,
				Data: reinitEvent,
			})
			if err != nil {
				logger.Warn(ctx, "failed to send SSE response to trigger reinit", slog.Error(err))
			}
		}
	}
}

// MockClaimCoordinator is an empty interface; it declares no methods.
type MockClaimCoordinator interface{}

// ClaimListener is an empty interface; it declares no methods.
type ClaimListener interface{}
// PostgresClaimListener is an empty struct; it declares no fields.
type PostgresClaimListener struct{}

// AgentReinitializer is an empty interface; it declares no methods.
type AgentReinitializer interface{}
// SSEAgentReinitializer is an empty struct; it declares no fields.
type SSEAgentReinitializer struct{}

// ClaimCoordinator embeds ClaimListener and AgentReinitializer.
// NOTE(review): all of these types are empty and look like unused
// scaffolding — confirm before relying on them.
type ClaimCoordinator interface {
ClaimListener
AgentReinitializer
}
2 changes: 1 addition & 1 deletioncoderd/prebuilds/claim_test.go
View file
Open in desktop
Original file line numberDiff line numberDiff line change
Expand Up@@ -79,12 +79,12 @@ func TestPubsubWorkspaceClaimListener(t *testing.T) {
listener := prebuilds.NewPubsubWorkspaceClaimListener(ps, slogtest.Make(t, nil))

ctx, cancel := context.WithCancel(context.Background())
cancel()

cancelFunc, claims, err := listener.ListenForWorkspaceClaims(ctx, uuid.New())
require.NoError(t, err)
defer cancelFunc()

cancel()
// Channel should be closed immediately due to context cancellation
select {
case _, ok := <-claims:
Expand Down
8 changes: 7 additions & 1 deletioncoderd/provisionerdserver/provisionerdserver.go
View file
Open in desktop
Original file line numberDiff line numberDiff line change
Expand Up@@ -44,6 +44,7 @@ import (
"github.com/coder/coder/v2/coderd/tracing"
"github.com/coder/coder/v2/coderd/wspubsub"
"github.com/coder/coder/v2/codersdk"
"github.com/coder/coder/v2/codersdk/agentsdk"
"github.com/coder/coder/v2/codersdk/drpc"
"github.com/coder/coder/v2/provisioner"
"github.com/coder/coder/v2/provisionerd/proto"
Expand DownExpand Up@@ -1749,7 +1750,12 @@ func (s *server) CompleteJob(ctx context.Context, completed *proto.CompletedJob)
slog.F("user", input.PrebuildClaimedByUser.String()),
slog.F("workspace_id", workspace.ID))

if err := prebuilds.PublishWorkspaceClaim(ctx, s.Pubsub, workspace.ID, input.PrebuildClaimedByUser); err != nil {
err = prebuilds.NewPubsubWorkspaceClaimPublisher(s.Pubsub).PublishWorkspaceClaim(agentsdk.ReinitializationEvent{
UserID: input.PrebuildClaimedByUser,
WorkspaceID: workspace.ID,
Reason: agentsdk.ReinitializeReasonPrebuildClaimed,
})
if err != nil {
s.Logger.Error(ctx, "failed to publish workspace claim event", slog.Error(err))
}
}
Expand Down
14 changes: 12 additions & 2 deletionscoderd/workspaceagents.go
View file
Open in desktop
Original file line numberDiff line numberDiff line change
Expand Up@@ -1181,14 +1181,24 @@ func (api *API) workspaceAgentReinit(rw http.ResponseWriter, r *http.Request) {

log.Info(ctx, "agent waiting for reinit instruction")

cancel, reinitEvents, err := prebuilds.ListenForWorkspaceClaims(ctx, log, api.Pubsub, workspace.ID)
cancel, reinitEvents, err := prebuilds.NewPubsubWorkspaceClaimListener(api.Pubsub, log).ListenForWorkspaceClaims(ctx, workspace.ID)
if err != nil {
log.Error(ctx, "failed to subscribe to prebuild claimed channel", slog.Error(err))
httpapi.InternalServerError(rw, xerrors.New("failed to subscribe to prebuild claimed channel"))
return
}
defer cancel()
prebuilds.StreamAgentReinitEvents(ctx, log, rw, r, reinitEvents)

transmitter := agentsdk.NewSSEAgentReinitTransmitter(log, rw, r)

err = transmitter.Transmit(ctx, reinitEvents)
if err != nil {
log.Error(ctx, "failed to stream agent reinit events", slog.Error(err))
httpapi.Write(ctx, rw, http.StatusInternalServerError, codersdk.Response{
Message: "Internal error streaming agent reinitialization events.",
Detail: err.Error(),
})
}
}

// convertProvisionedApps converts applications that are in the middle of provisioning process.
Expand Down
104 changes: 80 additions & 24 deletionscodersdk/agentsdk/agentsdk.go
View file
Open in desktop
Original file line numberDiff line numberDiff line change
Expand Up@@ -24,6 +24,7 @@ import (

"github.com/coder/coder/v2/agent/proto"
"github.com/coder/coder/v2/apiversion"
"github.com/coder/coder/v2/coderd/httpapi"
"github.com/coder/coder/v2/codersdk"
drpcsdk "github.com/coder/coder/v2/codersdk/drpc"
tailnetproto "github.com/coder/coder/v2/tailnet/proto"
Expand DownExpand Up@@ -730,8 +731,86 @@ func (c *Client) WaitForReinit(ctx context.Context) (*ReinitializationEvent, err
return nil, codersdk.ReadBodyAsError(res)
}

nextEvent := codersdk.ServerSentEventReader(ctx, res.Body)
reinitEvent, err := NewSSEAgentReinitReceiver(res.Body).Receive(ctx)
if err != nil {
return nil, xerrors.Errorf("listening for reinitialization events: %w", err)
}
return reinitEvent, nil
}

// WaitForReinitLoop starts a goroutine that repeatedly calls
// client.WaitForReinit and delivers each received event on the returned
// channel.
//
// Attempts are paced by a retrier created with a 100ms and a 10s duration.
// A failed attempt is logged as an error and retried. The returned channel is
// unbuffered, so each event is handed over only when a receiver is ready or
// ctx is done. The channel is closed when the goroutine exits, so receivers
// can detect the end of the loop.
func WaitForReinitLoop(ctx context.Context, logger slog.Logger, client *Client) <-chan ReinitializationEvent {
	reinitEvents := make(chan ReinitializationEvent)

	go func() {
		// Close on every exit path. Previously the channel was closed only
		// in the ctx.Done() branch below, so when the retrier ended the loop
		// the channel stayed open and a receiver could block forever.
		defer close(reinitEvents)

		for retrier := retry.New(100*time.Millisecond, 10*time.Second); retrier.Wait(ctx); {
			logger.Debug(ctx, "waiting for agent reinitialization instructions")
			reinitEvent, err := client.WaitForReinit(ctx)
			if err != nil {
				logger.Error(ctx, "failed to wait for agent reinitialization instructions", slog.Error(err))
				continue
			}
			select {
			case <-ctx.Done():
				return
			case reinitEvents <- *reinitEvent:
			}
		}
	}()

	return reinitEvents
}

// NewSSEAgentReinitTransmitter returns an SSEAgentReinitTransmitter that
// holds the given logger, response writer and request.
func NewSSEAgentReinitTransmitter(logger slog.Logger, rw http.ResponseWriter, r *http.Request) *SSEAgentReinitTransmitter {
	transmitter := SSEAgentReinitTransmitter{
		rw:     rw,
		r:      r,
		logger: logger,
	}
	return &transmitter
}

// SSEAgentReinitTransmitter holds the response writer, request and logger
// that its Transmit method uses to send reinitialization events as
// server-sent events.
type SSEAgentReinitTransmitter struct {
// rw and r are passed to httpapi.ServerSentEventSender by Transmit.
rw http.ResponseWriter
r *http.Request
// logger records failures to send an individual event.
logger slog.Logger
}

// Transmit forwards every event received on reinitEvents to the client as a
// server-sent data event.
//
// It returns ctx.Err() if ctx is already done on entry or becomes done while
// transmitting, a wrapped error if the server-sent event sender cannot be
// created, and an error if the sender reports that it has closed. If
// reinitEvents is closed, Transmit stops and returns ctx.Err(), which is nil
// when ctx is still live. Failures to send an individual event are logged as
// warnings and do not stop the transmission.
func (s *SSEAgentReinitTransmitter) Transmit(ctx context.Context, reinitEvents <-chan ReinitializationEvent) error {
	// Bail out before setting up the SSE sender if there is nothing to do.
	select {
	case <-ctx.Done():
		return ctx.Err()
	default:
	}

	sseSendEvent, sseSenderClosed, err := httpapi.ServerSentEventSender(s.rw, s.r)
	if err != nil {
		return xerrors.Errorf("failed to create sse transmitter: %w", err)
	}

	for {
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-sseSenderClosed:
			return xerrors.New("sse connection closed")
		case reinitEvent, ok := <-reinitEvents:
			if !ok {
				// A closed channel yields zero values forever. Stop here
				// instead of spinning and sending empty reinitialization
				// events to the agent.
				return ctx.Err()
			}
			sendErr := sseSendEvent(codersdk.ServerSentEvent{
				Type: codersdk.ServerSentEventTypeData,
				Data: reinitEvent,
			})
			if sendErr != nil {
				s.logger.Warn(ctx, "failed to send SSE response to trigger reinit", slog.Error(sendErr))
			}
		}
	}
}

// NewSSEAgentReinitReceiver returns an SSEAgentReinitReceiver that holds r as
// its reader.
func NewSSEAgentReinitReceiver(r io.ReadCloser) *SSEAgentReinitReceiver {
	receiver := SSEAgentReinitReceiver{r: r}
	return &receiver
}

// SSEAgentReinitReceiver holds the reader that its Receive method passes to
// codersdk.ServerSentEventReader.
type SSEAgentReinitReceiver struct {
// r is the stream the server-sent events are read from.
r io.ReadCloser
}

func (s *SSEAgentReinitReceiver) Receive(ctx context.Context) (*ReinitializationEvent, error) {
nextEvent := codersdk.ServerSentEventReader(ctx, s.r)
for {
select {
case <-ctx.Done():
Expand DownExpand Up@@ -763,26 +842,3 @@ func (c *Client) WaitForReinit(ctx context.Context) (*ReinitializationEvent, err
}
}
}

// WaitForReinitLoop starts a goroutine that repeatedly calls
// client.WaitForReinit and delivers each received event on the returned
// channel.
//
// Attempts are paced by a retrier created with a 100ms and a 10s duration.
// A failed attempt is logged as an error and retried. The returned channel is
// unbuffered, so each event is handed over only when a receiver is ready or
// ctx is done. The channel is closed when the goroutine exits, so receivers
// can detect the end of the loop.
func WaitForReinitLoop(ctx context.Context, logger slog.Logger, client *Client) <-chan ReinitializationEvent {
	reinitEvents := make(chan ReinitializationEvent)

	go func() {
		// Close on every exit path. Previously the channel was closed only
		// in the ctx.Done() branch below, so when the retrier ended the loop
		// the channel stayed open and a receiver could block forever.
		defer close(reinitEvents)

		for retrier := retry.New(100*time.Millisecond, 10*time.Second); retrier.Wait(ctx); {
			logger.Debug(ctx, "waiting for agent reinitialization instructions")
			reinitEvent, err := client.WaitForReinit(ctx)
			if err != nil {
				logger.Error(ctx, "failed to wait for agent reinitialization instructions", slog.Error(err))
				continue
			}
			select {
			case <-ctx.Done():
				return
			case reinitEvents <- *reinitEvent:
			}
		}
	}()

	return reinitEvents
}
Loading

[8]ページ先頭

©2009-2025 Movatter.jp