WIP feat: support retries/hard failure limit for prebuilds in reconcile prior to provisioner #21326

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to ourterms of service andprivacy statement. We’ll occasionally send you account related emails.

Already on GitHub?Sign in to your account

Open

cstyan wants to merge 5 commits into main from callum/prebuild-fail-interval

Changes from all commits
225 changes: 224 additions & 1 deletion enterprise/coderd/prebuilds/reconcile.go
@@ -7,6 +7,7 @@ import (
"errors"
"fmt"
"math"
"net/http"
"strings"
"sync"
"sync/atomic"
@@ -16,6 +17,7 @@ import (
"github.com/hashicorp/go-multierror"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
"github.com/sqlc-dev/pqtype"
"golang.org/x/sync/errgroup"
"golang.org/x/xerrors"

@@ -37,6 +39,72 @@ import (
"github.com/coder/quartz"
)

// presetFailuresTracker tracks creation failures for presets to implement incremental backoff.
type presetFailuresTracker struct {
	failures map[uuid.UUID]*presetCreationFailure
	mu sync.RWMutex
	clock quartz.Clock
}

// presetCreationFailure records the consecutive failure count and the time of the most recent failure for a single preset.
type presetCreationFailure struct {
	consecutiveFailures int
	lastFailureAt time.Time
}

func newPresetFailuresTracker(clock quartz.Clock) *presetFailuresTracker {
	return &presetFailuresTracker{
		failures: make(map[uuid.UUID]*presetCreationFailure),
		clock: clock,
	}
}

// RecordFailure records a prebuild creation failure for a preset and increments the consecutive failure count.
func (t *presetFailuresTracker) RecordFailure(presetID uuid.UUID) {
	t.mu.Lock()
	defer t.mu.Unlock()

	failure, exists := t.failures[presetID]
	if !exists {
		failure = &presetCreationFailure{}
		t.failures[presetID] = failure
	}

	failure.consecutiveFailures++
	failure.lastFailureAt = t.clock.Now()
}

// RecordSuccess clears the failure tracking for a preset after a successful creation.
func (t *presetFailuresTracker) RecordSuccess(presetID uuid.UUID) {
	t.mu.Lock()
	defer t.mu.Unlock()

	delete(t.failures, presetID)
}

// ShouldBackoff checks if we should delay creation attempts for a preset based on recent failures.
// It returns true and the backoff time if we should delay, false and zero time otherwise.
func (t *presetFailuresTracker) ShouldBackoff(presetID uuid.UUID, backoffInterval time.Duration) (bool, time.Time) {
	t.mu.RLock()
	defer t.mu.RUnlock()

	failure, exists := t.failures[presetID]
	if !exists || failure.consecutiveFailures == 0 {
		return false, time.Time{}
	}

	// Calculate the backoff as backoffInterval * consecutiveFailures.
	// This gives a linear backoff that increases with each consecutive failure.
	backoffDuration := backoffInterval * time.Duration(failure.consecutiveFailures)
	backoffUntil := failure.lastFailureAt.Add(backoffDuration)

	if t.clock.Now().Before(backoffUntil) {
		return true, backoffUntil
	}

	return false, time.Time{}
}
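As a reviewer aid (not part of this diff), the following is a minimal test-style sketch of how the linear backoff behaves. It assumes the tracker is exercised from within the prebuilds package, and that the quartz mock clock and testify's require helpers are available as test dependencies; the test name and structure are illustrative only.

package prebuilds

import (
	"testing"
	"time"

	"github.com/google/uuid"
	"github.com/stretchr/testify/require"

	"github.com/coder/quartz"
)

func TestPresetFailuresTrackerLinearBackoff(t *testing.T) {
	t.Parallel()

	clock := quartz.NewMock(t)
	tracker := newPresetFailuresTracker(clock)
	presetID := uuid.New()
	interval := time.Minute

	// No failures recorded yet: no backoff.
	shouldBackoff, _ := tracker.ShouldBackoff(presetID, interval)
	require.False(t, shouldBackoff)

	// Two consecutive failures: the backoff window is 2 * interval from the last failure.
	tracker.RecordFailure(presetID)
	tracker.RecordFailure(presetID)
	shouldBackoff, until := tracker.ShouldBackoff(presetID, interval)
	require.True(t, shouldBackoff)
	require.Equal(t, clock.Now().Add(2*time.Minute), until)

	// Once the window has elapsed, creation may be attempted again.
	clock.Advance(2 * time.Minute)
	shouldBackoff, _ = tracker.ShouldBackoff(presetID, interval)
	require.False(t, shouldBackoff)

	// A recorded success clears the tracking entirely.
	tracker.RecordFailure(presetID)
	tracker.RecordSuccess(presetID)
	shouldBackoff, _ = tracker.ShouldBackoff(presetID, interval)
	require.False(t, shouldBackoff)
}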

type StoreReconciler struct {
	store database.Store
	cfg codersdk.PrebuildsConfig
@@ -58,6 +126,9 @@ type StoreReconciler struct {
	metrics *MetricsCollector
	// Operational metrics
	reconciliationDuration prometheus.Histogram

	// Per-preset creation failure tracking for incremental backoff
	failureTracker *presetFailuresTracker
}

var _ prebuilds.ReconciliationOrchestrator = &StoreReconciler{}
@@ -102,6 +173,7 @@ func NewStoreReconciler(store database.Store,
		buildUsageChecker: buildUsageChecker,
		done: make(chan struct{}, 1),
		provisionNotifyCh: make(chan database.ProvisionerJob, 10),
		failureTracker: newPresetFailuresTracker(clock),
	}

	if registerer != nil {
@@ -124,6 +196,22 @@
	return reconciler
}

// RecordCreationFailure records a prebuild creation failure for a preset and increments the consecutive failure count.
func (c *StoreReconciler) RecordCreationFailure(presetID uuid.UUID) {
	c.failureTracker.RecordFailure(presetID)
}

// RecordCreationSuccess clears the failure tracking for a preset after a successful creation.
func (c *StoreReconciler) RecordCreationSuccess(presetID uuid.UUID) {
	c.failureTracker.RecordSuccess(presetID)
}

// ShouldBackoffCreation checks if we should delay creation attempts for a preset based on recent failures.
// It returns true and the backoff time if we should delay, false and zero time otherwise.
func (c *StoreReconciler) ShouldBackoffCreation(presetID uuid.UUID) (bool, time.Time) {
	return c.failureTracker.ShouldBackoff(presetID, c.cfg.ReconciliationBackoffInterval.Value())
}
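The exported wrappers above make the failure tracker reachable from outside the reconciliation loop. The sketch below is hypothetical wiring only, not code from this PR: the PrebuildFailureReporter interface and the onPrebuildJobComplete hook are invented names used to illustrate how a job-completion path could feed results back into the backoff tracking.

// Hypothetical illustration; not part of this PR.
type PrebuildFailureReporter interface {
	RecordCreationFailure(presetID uuid.UUID)
	RecordCreationSuccess(presetID uuid.UUID)
}

func onPrebuildJobComplete(reporter PrebuildFailureReporter, presetID uuid.UUID, jobSucceeded bool) {
	if jobSucceeded {
		// A success clears the consecutive-failure count, so the next
		// reconciliation loop is not delayed for this preset.
		reporter.RecordCreationSuccess(presetID)
		return
	}
	// A failure increments the consecutive-failure count, lengthening the
	// linear backoff applied by subsequent reconciliation loops.
	reporter.RecordCreationFailure(presetID)
}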

func (c *StoreReconciler) Run(ctx context.Context) {
	reconciliationInterval := c.cfg.ReconciliationInterval.Value()
	if reconciliationInterval <= 0 { // avoids a panic
@@ -643,6 +731,16 @@ func (c *StoreReconciler) executeReconciliationAction(ctx context.Context, logge
		return nil

	case prebuilds.ActionTypeCreate:
		// Check if we should back off on this preset due to recent creation failures.
		if shouldBackoff, backoffUntil := c.failureTracker.ShouldBackoff(ps.Preset.ID, c.cfg.ReconciliationBackoffInterval.Value()); shouldBackoff {
			logger.Warn(ctx, "backing off prebuild creation due to recent failures",
				slog.F("preset_id", ps.Preset.ID.String()),
				slog.F("backoff_until", backoffUntil.Format(time.RFC3339)),
				slog.F("backoff_secs", math.Round(backoffUntil.Sub(c.clock.Now()).Seconds())),
			)
			return nil
		}

		// Unexpected things happen (i.e. bugs or bitflips); let's defend against disastrous outcomes.
		// See https://blog.robertelder.org/causes-of-bit-flips-in-computer-memory/.
		// This is obviously not comprehensive protection against this sort of problem, but this is one essential check.
@@ -666,7 +764,18 @@
		for range action.Create {
			if err := c.createPrebuiltWorkspace(prebuildsCtx, uuid.New(), ps.Preset.TemplateID, ps.Preset.ID); err != nil {
				logger.Error(ctx, "failed to create prebuild", slog.Error(err))

				// Only apply backoff for transient errors (500-level).
				// Config errors (400-level) should fail immediately and count toward the hard limit.
				var buildErr wsbuilder.BuildError
				if errors.As(err, &buildErr) && buildErr.Status == http.StatusInternalServerError {
					c.failureTracker.RecordFailure(ps.Preset.ID)
				}

				multiErr.Errors = append(multiErr.Errors, err)
			} else {
				// Only clear failure tracking if we successfully created at least one prebuild
				c.failureTracker.RecordSuccess(ps.Preset.ID)
			}
		}
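The status check above is the heart of the retry policy: a wsbuilder.BuildError with a 500-level status is treated as transient and only triggers backoff, while any other BuildError is treated as a configuration problem that should count toward the hard failure limit instead. A small illustrative helper, assuming only the BuildError Status field visible in this diff, could express the same split; the reconciler performs this check inline rather than through such a helper.

// classifyPrebuildError reports whether a prebuild creation error should be
// retried with backoff (transient) or recorded as a failed build that counts
// toward the hard failure limit (config error). Illustrative only.
func classifyPrebuildError(err error) (transient bool, configError bool) {
	var buildErr wsbuilder.BuildError
	if !errors.As(err, &buildErr) {
		// Not a wsbuilder error: the inline logic applies neither backoff
		// nor a failed build record in this case.
		return false, false
	}
	if buildErr.Status == http.StatusInternalServerError {
		// 500-level: likely transient, so back off and retry later.
		return true, false
	}
	// Anything else (typically 400-level): configuration problem; fail fast
	// and let it count toward the hard limit.
	return false, true
}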

@@ -734,7 +843,22 @@ func (c *StoreReconciler) createPrebuiltWorkspace(ctx context.Context, prebuiltW
slog.F("workspace_id", prebuiltWorkspaceID.String()), slog.F("preset_id", presetID.String()))

provisionerJob, err = c.provision(ctx, db, prebuiltWorkspaceID, template, presetID, database.WorkspaceTransitionStart, workspace, DeprovisionModeNormal)
return err
if err != nil {
// Check if this is a config error (non-transient) from wsbuilder.
// If so, create a failed build record so it counts toward the hard limit.
var buildErr wsbuilder.BuildError
if errors.As(err, &buildErr) && buildErr.Status != http.StatusInternalServerError {
// This is a config error (400-level). Create a failed build record
// so it counts toward the hard failure limit.
if failErr := c.createFailedBuildRecord(ctx, db, workspace, template, presetID, now, buildErr); failErr != nil {
c.logger.Warn(ctx, "failed to create failed build record for config error",
slog.Error(failErr),
slog.F("original_error", err.Error()))
}
}
return err
}
return nil
}, &database.TxOptions{
Isolation: sql.LevelRepeatableRead,
ReadOnly: false,
@@ -749,6 +873,105 @@ func (c *StoreReconciler) createPrebuiltWorkspace(ctx context.Context, prebuiltW
	return nil
}

// createFailedBuildRecord creates a workspace build and provisioner job record marked as failed.
// This allows config errors that fail at wsbuilder.Build() time to count toward the hard failure limit.
// The hard limit query checks workspace_latest_builds.job_status, which is derived from the provisioner job.
//
// IMPORTANT: This function must be called within a database transaction.
func (c *StoreReconciler) createFailedBuildRecord(
	ctx context.Context,
	db database.Store,
	workspace database.Workspace,
	template database.Template,
	presetID uuid.UUID,
	now time.Time,
	buildErr wsbuilder.BuildError,
) error {
	// Get template version job to populate provisioner job fields
	templateVersion, err := db.GetTemplateVersionByID(ctx, template.ActiveVersionID)
	if err != nil {
		return xerrors.Errorf("get template version: %w", err)
	}

	templateVersionJob, err := db.GetProvisionerJobByID(ctx, templateVersion.JobID)
	if err != nil {
		return xerrors.Errorf("get template version job: %w", err)
	}

	// Create a provisioner job marked as failed
	provisionerJobID := uuid.New()
	_, err = db.InsertProvisionerJob(ctx, database.InsertProvisionerJobParams{
		ID: provisionerJobID,
		CreatedAt: now,
		UpdatedAt: now,
		InitiatorID: database.PrebuildsSystemUserID,
		OrganizationID: template.OrganizationID,
		Provisioner: template.Provisioner,
		Type: database.ProvisionerJobTypeWorkspaceBuild,
		StorageMethod: templateVersionJob.StorageMethod,
		FileID: templateVersionJob.FileID,
		Input: []byte("{}"), // Empty input since we never got to build
		Tags: database.StringMap{},
		TraceMetadata: pqtype.NullRawMessage{Valid: false},
		LogsOverflowed: false,
	})
	if err != nil {
		return xerrors.Errorf("insert provisioner job: %w", err)
	}

	// Mark the job as failed immediately
	// nolint: gocritic // At this moment, we are pretending to be provisionerd.
	err = db.UpdateProvisionerJobWithCompleteWithStartedAtByID(dbauthz.AsProvisionerd(ctx), database.UpdateProvisionerJobWithCompleteWithStartedAtByIDParams{
		ID: provisionerJobID,
		UpdatedAt: now,
		CompletedAt: sql.NullTime{Valid: true, Time: now},
		StartedAt: sql.NullTime{Valid: true, Time: now},
		Error: sql.NullString{Valid: true, String: buildErr.Message},
		ErrorCode: sql.NullString{Valid: false},
	})
	if err != nil {
		return xerrors.Errorf("mark provisioner job as failed: %w", err)
	}

	// Create workspace build linking to the failed job
	workspaceBuildID := uuid.New()
	buildNumber := int32(1) // This will be the first build for this workspace
	if latestBuild, err := db.GetLatestWorkspaceBuildByWorkspaceID(ctx, workspace.ID); err == nil {
		buildNumber = latestBuild.BuildNumber + 1
	}

	err = db.InsertWorkspaceBuild(ctx, database.InsertWorkspaceBuildParams{
		ID: workspaceBuildID,
		CreatedAt: now,
		UpdatedAt: now,
		WorkspaceID: workspace.ID,
		TemplateVersionID: template.ActiveVersionID,
		BuildNumber: buildNumber,
		ProvisionerState: []byte("[]"), // Empty state since we never provisioned
		InitiatorID: database.PrebuildsSystemUserID,
		Transition: database.WorkspaceTransitionStart,
		JobID: provisionerJobID,
		Reason: database.BuildReasonInitiator,
		Deadline: time.Time{},
		MaxDeadline: time.Time{},
		TemplateVersionPresetID: uuid.NullUUID{
			UUID: presetID,
			Valid: true,
		},
	})
	if err != nil {
		return xerrors.Errorf("insert workspace build: %w", err)
	}

	c.logger.Info(ctx, "created failed build record for config error",
		slog.F("workspace_id", workspace.ID.String()),
		slog.F("build_id", workspaceBuildID.String()),
		slog.F("preset_id", presetID.String()),
		slog.F("error", buildErr.Message))

	return nil
}

// provisionDelete provisions a delete transition for a prebuilt workspace.
//
// If mode is DeprovisionModeOrphan, the builder will not send Terraform state to the provisioner.