chore: acquire lock for individual workspace transition #15859
Changes from all commits: 1213769, c45817b, 46fa216, ae7fca9, 6b133e7
```diff
@@ -3,6 +3,7 @@ package autobuild
 import (
 	"context"
 	"database/sql"
+	"fmt"
 	"net/http"
 	"sync"
 	"sync/atomic"
@@ -177,6 +178,15 @@ func (e *Executor) runOnce(t time.Time) Stats {
 	err := e.db.InTx(func(tx database.Store) error {
 		var err error
+		ok, err := tx.TryAcquireLock(e.ctx, database.GenLockID(fmt.Sprintf("lifecycle-executor:%s", wsID)))
+		if err != nil {
+			return xerrors.Errorf("try acquire lifecycle executor lock: %w", err)
+		}
+		if !ok {
+			log.Debug(e.ctx, "unable to acquire lock for workspace, skipping")
+			return nil
+		}
+
 		// Re-check eligibility since the first check was outside the
 		// transaction and the workspace settings may have changed.
 		ws, err = tx.GetWorkspaceByID(e.ctx, wsID)
```
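Postgres advisory locks key on a 64-bit integer rather than a string, so the lock name built with `fmt.Sprintf` has to be hashed down to an `int64` first. Below is a minimal sketch of what `GenLockID` could look like, assuming an FNV-1 hash (an assumption for illustration; the real helper in coder/coder may differ):

```go
package main

import (
	"fmt"
	"hash/fnv"
)

// genLockID is a hypothetical stand-in for database.GenLockID: it hashes a
// human-readable lock name down to the int64 key that Postgres advisory
// locks operate on. The real implementation may hash differently.
func genLockID(name string) int64 {
	h := fnv.New64()
	_, _ = h.Write([]byte(name)) // fnv's Write never returns an error
	return int64(h.Sum64())
}

func main() {
	// Example workspace UUID; the executor uses the real wsID here.
	wsID := "0b8c9d8e-0000-4000-8000-000000000000"
	fmt.Println(genLockID(fmt.Sprintf("lifecycle-executor:%s", wsID)))
}
```

Because the key is derived from the workspace ID, executors on different coderd replicas contend only when they target the same workspace.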
```diff
@@ -389,7 +399,7 @@ func (e *Executor) runOnce(t time.Time) Stats {
 		}
 		return nil
 	}()
 	if err != nil && !xerrors.Is(err, context.Canceled) {
 		log.Error(e.ctx, "failed to transition workspace", slog.Error(err))
 		statsMu.Lock()
 		stats.Errors[wsID] = err
```
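The skip-on-`!ok` pattern in the lock hunk above works because the lock is scoped to the transaction: assuming `TryAcquireLock` wraps `pg_try_advisory_xact_lock` (an assumption; the store method is not shown in this diff), the lock releases automatically on commit or rollback, so no explicit unlock path is needed. A sketch of that pattern with plain `database/sql`:

```go
package lockexample

import (
	"database/sql"
	"fmt"
)

// tryTransition sketches the take-lock-or-skip pattern, assuming the
// underlying lock is pg_try_advisory_xact_lock (an assumption; Coder's
// real store method may differ). The advisory lock lives only as long as
// the transaction, so Commit or Rollback releases it implicitly.
func tryTransition(db *sql.DB, lockID int64) error {
	tx, err := db.Begin()
	if err != nil {
		return err
	}
	defer func() { _ = tx.Rollback() }() // no-op once Commit has succeeded

	var acquired bool
	if err := tx.QueryRow(
		"SELECT pg_try_advisory_xact_lock($1)", lockID,
	).Scan(&acquired); err != nil {
		return fmt.Errorf("try acquire lock: %w", err)
	}
	if !acquired {
		// Another replica is already transitioning this workspace; skip
		// without error, exactly as the executor does.
		return nil
	}

	// ... perform the workspace transition inside the same transaction ...
	return tx.Commit()
}
```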
```diff
@@ -18,6 +18,7 @@ import (
 	"github.com/coder/coder/v2/coderd/coderdtest"
 	"github.com/coder/coder/v2/coderd/database"
 	"github.com/coder/coder/v2/coderd/database/dbauthz"
+	"github.com/coder/coder/v2/coderd/database/dbtestutil"
 	"github.com/coder/coder/v2/coderd/notifications"
 	"github.com/coder/coder/v2/coderd/notifications/notificationstest"
 	"github.com/coder/coder/v2/coderd/schedule"
```
```diff
@@ -72,6 +73,76 @@ func TestExecutorAutostartOK(t *testing.T) {
 	require.Equal(t, template.AutostartRequirement.DaysOfWeek, []string{"monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday"})
 }
 
+func TestMultipleLifecycleExecutors(t *testing.T) {
+	t.Parallel()
+
+	db, ps := dbtestutil.NewDB(t)
+
+	var (
+		sched = mustSchedule(t, "CRON_TZ=UTC 0 * * * *")
+		// Create our first client
+		tickCh   = make(chan time.Time, 2)
+		statsChA = make(chan autobuild.Stats)
+		clientA  = coderdtest.New(t, &coderdtest.Options{
+			IncludeProvisionerDaemon: true,
+			AutobuildTicker:          tickCh,
+			AutobuildStats:           statsChA,
+			Database:                 db,
+			Pubsub:                   ps,
+		})
+		// ... And then our second client
+		statsChB = make(chan autobuild.Stats)
+		_        = coderdtest.New(t, &coderdtest.Options{
+			IncludeProvisionerDaemon: true,
+			AutobuildTicker:          tickCh,
+			AutobuildStats:           statsChB,
+			Database:                 db,
+			Pubsub:                   ps,
+		})
+		// Now create a workspace (we can use either client, it doesn't matter)
+		workspace = mustProvisionWorkspace(t, clientA, func(cwr *codersdk.CreateWorkspaceRequest) {
+			cwr.AutostartSchedule = ptr.Ref(sched.String())
+		})
+	)
+
+	// Have the workspace stopped so we can perform an autostart
+	workspace = coderdtest.MustTransitionWorkspace(t, clientA, workspace.ID, database.WorkspaceTransitionStart, database.WorkspaceTransitionStop)
+
+	// Get both clients to perform a lifecycle execution tick
+	next := sched.Next(workspace.LatestBuild.CreatedAt)
+
+	startCh := make(chan struct{})
+	go func() {
+		<-startCh
+		tickCh <- next
+	}()
+	go func() {
+		<-startCh
+		tickCh <- next
+	}()
```
Review thread on this hunk:

> Is this potentially racy? We're testing that the lock acquire works, but theoretically that might not happen if the first coderd grabs the job, completes it, and then the second one does. I doubt it matters, as I suppose we're happy even if the try-acquire is hit only a fraction of the time, but thought I'd flag it anyway.

> Looking again, you're probably right. I ran the test with verbose logging and it looks like this all occurs within […]. If the test doesn't hit the lock, then we are likely to hit a flake. I'll have a go at increasing this time buffer.

> (Member) I think you might be able to reduce (but not eliminate) raciness by having a second […]. You might also be able to get both of them to tick very closely in time by sharing the same tick channel, and making it buffered with size 2. (Of course then you'd want to avoid closing the channel twice, to avoid a panic.)

> I've gone with your proposal @johnstcn. It looks like for testing we just use an echo provisioner job, so getting that to take artificially longer for this specific test may not be a trivial task.
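As an aside on the double-close caveat raised above: a shared channel that multiple teardown paths might close can be guarded with `sync.Once`. A small sketch (names illustrative, not taken from the PR):

```go
package main

import (
	"sync"
	"time"
)

func main() {
	// Two executors share one buffered tick channel, as in the test.
	tickCh := make(chan time.Time, 2)

	// close panics on an already-closed channel, so funnel every
	// teardown path through a sync.Once.
	var closeOnce sync.Once
	closeTick := func() { closeOnce.Do(func() { close(tickCh) }) }

	closeTick()
	closeTick() // safe: the close body runs only once
}
```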
```diff
+	close(startCh)
+
+	// Now we want to check the stats for both clients
+	statsA := <-statsChA
+	statsB := <-statsChB
+
+	// We expect there to be no errors
+	assert.Len(t, statsA.Errors, 0)
+	assert.Len(t, statsB.Errors, 0)
+
+	// We also expect there to have been only one transition
+	require.Equal(t, 1, len(statsA.Transitions)+len(statsB.Transitions))
+
+	stats := statsA
+	if len(statsB.Transitions) == 1 {
+		stats = statsB
+	}
+
+	// And we expect this transition to have been a start transition
+	assert.Contains(t, stats.Transitions, workspace.ID)
+	assert.Equal(t, database.WorkspaceTransitionStart, stats.Transitions[workspace.ID])
+}
+
 func TestExecutorAutostartTemplateUpdated(t *testing.T) {
 	t.Parallel()
```