chore: acquire lock for individual workspace transition #15859
Changes from all commits: 1213769, c45817b, 46fa216, ae7fca9, 6b133e7
```diff
@@ -3,6 +3,7 @@ package autobuild
 import (
 	"context"
 	"database/sql"
+	"fmt"
 	"net/http"
 	"sync"
 	"sync/atomic"
@@ -177,6 +178,15 @@ func (e *Executor) runOnce(t time.Time) Stats {
 	err := e.db.InTx(func(tx database.Store) error {
 		var err error
+		ok, err := tx.TryAcquireLock(e.ctx, database.GenLockID(fmt.Sprintf("lifecycle-executor:%s", wsID)))
+		if err != nil {
+			return xerrors.Errorf("try acquire lifecycle executor lock: %w", err)
+		}
+		if !ok {
+			log.Debug(e.ctx, "unable to acquire lock for workspace, skipping")
+			return nil
+		}
+
 		// Re-check eligibility since the first check was outside the
 		// transaction and the workspace settings may have changed.
 		ws, err = tx.GetWorkspaceByID(e.ctx, wsID)
```
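Postgres advisory locks key on a 64-bit integer rather than a string, so the lock name built with `fmt.Sprintf` has to be hashed down to an `int64` first. Below is a minimal sketch of what `GenLockID` could look like, assuming an FNV-1 hash (an assumption for illustration; the real helper in coder/coder may differ):

```go
package main

import (
	"fmt"
	"hash/fnv"
)

// genLockID is a hypothetical stand-in for database.GenLockID: it hashes a
// human-readable lock name down to the int64 key that Postgres advisory
// locks operate on. The real implementation may hash differently.
func genLockID(name string) int64 {
	h := fnv.New64()
	_, _ = h.Write([]byte(name)) // fnv's Write never returns an error
	return int64(h.Sum64())
}

func main() {
	// Example workspace UUID; the executor uses the real wsID here.
	wsID := "0b8c9d8e-0000-4000-8000-000000000000"
	fmt.Println(genLockID(fmt.Sprintf("lifecycle-executor:%s", wsID)))
}
```

Because the key is derived from the workspace ID, executors on different coderd replicas contend only when they target the same workspace.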
```diff
@@ -389,7 +399,7 @@ func (e *Executor) runOnce(t time.Time) Stats {
 		}
 		return nil
 	}()
 	if err != nil && !xerrors.Is(err, context.Canceled) {
 		log.Error(e.ctx, "failed to transition workspace", slog.Error(err))
 		statsMu.Lock()
 		stats.Errors[wsID] = err
```
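The skip-on-`!ok` pattern in the lock hunk above works because the lock is scoped to the transaction: assuming `TryAcquireLock` wraps `pg_try_advisory_xact_lock` (an assumption; the store method is not shown in this diff), the lock releases automatically on commit or rollback, so no explicit unlock path is needed. A sketch of that pattern with plain `database/sql`:

```go
package lockexample

import (
	"database/sql"
	"fmt"
)

// tryTransition sketches the take-lock-or-skip pattern, assuming the
// underlying lock is pg_try_advisory_xact_lock (an assumption; Coder's
// real store method may differ). The advisory lock lives only as long as
// the transaction, so Commit or Rollback releases it implicitly.
func tryTransition(db *sql.DB, lockID int64) error {
	tx, err := db.Begin()
	if err != nil {
		return err
	}
	defer func() { _ = tx.Rollback() }() // no-op once Commit has succeeded

	var acquired bool
	if err := tx.QueryRow(
		"SELECT pg_try_advisory_xact_lock($1)", lockID,
	).Scan(&acquired); err != nil {
		return fmt.Errorf("try acquire lock: %w", err)
	}
	if !acquired {
		// Another replica is already transitioning this workspace; skip
		// without error, exactly as the executor does.
		return nil
	}

	// ... perform the workspace transition inside the same transaction ...
	return tx.Commit()
}
```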
```diff
@@ -18,6 +18,7 @@ import (
 	"github.com/coder/coder/v2/coderd/coderdtest"
 	"github.com/coder/coder/v2/coderd/database"
 	"github.com/coder/coder/v2/coderd/database/dbauthz"
+	"github.com/coder/coder/v2/coderd/database/dbtestutil"
 	"github.com/coder/coder/v2/coderd/notifications"
 	"github.com/coder/coder/v2/coderd/notifications/notificationstest"
 	"github.com/coder/coder/v2/coderd/schedule"
```
```diff
@@ -72,6 +73,76 @@ func TestExecutorAutostartOK(t *testing.T) {
 	require.Equal(t, template.AutostartRequirement.DaysOfWeek, []string{"monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday"})
 }
 
+func TestMultipleLifecycleExecutors(t *testing.T) {
+	t.Parallel()
+
+	db, ps := dbtestutil.NewDB(t)
+
+	var (
+		sched = mustSchedule(t, "CRON_TZ=UTC 0 * * * *")
+		// Create our first client
+		tickCh   = make(chan time.Time, 2)
+		statsChA = make(chan autobuild.Stats)
+		clientA  = coderdtest.New(t, &coderdtest.Options{
+			IncludeProvisionerDaemon: true,
+			AutobuildTicker:          tickCh,
+			AutobuildStats:           statsChA,
+			Database:                 db,
+			Pubsub:                   ps,
+		})
+		// ... And then our second client
+		statsChB = make(chan autobuild.Stats)
+		_        = coderdtest.New(t, &coderdtest.Options{
+			IncludeProvisionerDaemon: true,
+			AutobuildTicker:          tickCh,
+			AutobuildStats:           statsChB,
+			Database:                 db,
+			Pubsub:                   ps,
+		})
+		// Now create a workspace (we can use either client, it doesn't matter)
+		workspace = mustProvisionWorkspace(t, clientA, func(cwr *codersdk.CreateWorkspaceRequest) {
+			cwr.AutostartSchedule = ptr.Ref(sched.String())
+		})
+	)
+
+	// Have the workspace stopped so we can perform an autostart
+	workspace = coderdtest.MustTransitionWorkspace(t, clientA, workspace.ID, database.WorkspaceTransitionStart, database.WorkspaceTransitionStop)
+
+	// Get both clients to perform a lifecycle execution tick
+	next := sched.Next(workspace.LatestBuild.CreatedAt)
+
+	startCh := make(chan struct{})
+	go func() {
+		<-startCh
+		tickCh <- next
+	}()
+	go func() {
+		<-startCh
+		tickCh <- next
+	}()
```
Review thread on this hunk:

> Is this potentially racy? We're testing that the lock acquire works, but theoretically that might not happen if the first coderd grabs the job, completes it, and then the second one does. I doubt it matters, as I suppose we're happy even if the try-acquire is hit only a fraction of the time, but thought I'd flag it anyway.

> Looking again, you're probably right. I ran the test with verbose logging and it looks like this all occurs within […]. If the test doesn't hit the lock, then we are likely to hit a flake. I'll have a go at increasing this time buffer.

> (Member) I think you might be able to reduce (but not eliminate) raciness by having a second […]. You might also be able to get both of them to tick very closely in time by sharing the same tick channel, and making it buffered with size 2. (Of course then you'd want to avoid closing the channel twice, to avoid a panic.)

> I've gone with your proposal @johnstcn. It looks like for testing we just use an echo provisioner job, so getting that to take artificially longer for this specific test may not be a trivial task.
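As an aside on the double-close caveat raised above: a shared channel that multiple teardown paths might close can be guarded with `sync.Once`. A small sketch (names illustrative, not taken from the PR):

```go
package main

import (
	"sync"
	"time"
)

func main() {
	// Two executors share one buffered tick channel, as in the test.
	tickCh := make(chan time.Time, 2)

	// close panics on an already-closed channel, so funnel every
	// teardown path through a sync.Once.
	var closeOnce sync.Once
	closeTick := func() { closeOnce.Do(func() { close(tickCh) }) }

	closeTick()
	closeTick() // safe: the close body runs only once
}
```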
```diff
+	close(startCh)
+
+	// Now we want to check the stats for both clients
+	statsA := <-statsChA
+	statsB := <-statsChB
+
+	// We expect there to be no errors
+	assert.Len(t, statsA.Errors, 0)
+	assert.Len(t, statsB.Errors, 0)
+
+	// We also expect there to have been only one transition
+	require.Equal(t, 1, len(statsA.Transitions)+len(statsB.Transitions))
+
+	stats := statsA
+	if len(statsB.Transitions) == 1 {
+		stats = statsB
+	}
+
+	// And we expect this transition to have been a start transition
+	assert.Contains(t, stats.Transitions, workspace.ID)
+	assert.Equal(t, database.WorkspaceTransitionStart, stats.Transitions[workspace.ID])
+}
+
 func TestExecutorAutostartTemplateUpdated(t *testing.T) {
 	t.Parallel()
```