- Notifications
You must be signed in to change notification settings - Fork909
fix: reduce excessive logging when database is unreachable#17363
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to ourterms of service andprivacy statement. We’ll occasionally send you account related emails.
Already on GitHub?Sign in to your account
Uh oh!
There was an error while loading.Please reload this page.
Changes fromall commits
3f95841
0448a74
0136b70
8d94c3c
f92e852
4b0fd6a
68867da
6f60cbc
File filter
Filter by extension
Conversations
Uh oh!
There was an error while loading.Please reload this page.
Jump to
Uh oh!
There was an error while loading.Please reload this page.
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -675,10 +675,11 @@ func New(options *Options) *API { | ||
api.Auditor.Store(&options.Auditor) | ||
api.TailnetCoordinator.Store(&options.TailnetCoordinator) | ||
dialer := &InmemTailnetDialer{ | ||
CoordPtr: &api.TailnetCoordinator, | ||
DERPFn: api.DERPMap, | ||
Logger: options.Logger, | ||
ClientID: uuid.New(), | ||
DatabaseHealthCheck: api.Database, | ||
dannykopping marked this conversation as resolved. Show resolvedHide resolvedUh oh!There was an error while loading.Please reload this page. | ||
} | ||
stn, err := NewServerTailnet(api.ctx, | ||
options.Logger, | ||
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -24,9 +24,11 @@ import ( | ||
"tailscale.com/tailcfg" | ||
"cdr.dev/slog" | ||
"github.com/coder/coder/v2/coderd/tracing" | ||
"github.com/coder/coder/v2/coderd/workspaceapps" | ||
"github.com/coder/coder/v2/coderd/workspaceapps/appurl" | ||
"github.com/coder/coder/v2/codersdk" | ||
"github.com/coder/coder/v2/codersdk/workspacesdk" | ||
"github.com/coder/coder/v2/site" | ||
"github.com/coder/coder/v2/tailnet" | ||
@@ -534,16 +536,28 @@ func NewMultiAgentController(ctx context.Context, logger slog.Logger, tracer tra | ||
return m | ||
} | ||
type Pinger interface { | ||
Ping(context.Context) (time.Duration, error) | ||
} | ||
// InmemTailnetDialer is a tailnet.ControlProtocolDialer that connects to a Coordinator and DERPMap | ||
// service running in the same memory space. | ||
type InmemTailnetDialer struct { | ||
CoordPtr *atomic.Pointer[tailnet.Coordinator] | ||
DERPFn func() *tailcfg.DERPMap | ||
Logger slog.Logger | ||
ClientID uuid.UUID | ||
// DatabaseHealthCheck is used to validate that the store is reachable. | ||
DatabaseHealthCheck Pinger | ||
} | ||
func (a *InmemTailnetDialer) Dial(ctx context.Context, _ tailnet.ResumeTokenController) (tailnet.ControlProtocolClients, error) { | ||
if a.DatabaseHealthCheck != nil { | ||
if _, err := a.DatabaseHealthCheck.Ping(ctx); err != nil { | ||
return tailnet.ControlProtocolClients{}, xerrors.Errorf("%w: %v", codersdk.ErrDatabaseNotReachable, err) | ||
} | ||
dannykopping marked this conversation as resolved. Show resolvedHide resolvedUh oh!There was an error while loading.Please reload this page. | ||
} | ||
coord := a.CoordPtr.Load() | ||
if coord == nil { | ||
return tailnet.ControlProtocolClients{}, xerrors.Errorf("tailnet coordinator not initialized") | ||
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -11,20 +11,23 @@ import ( | ||
"strconv" | ||
"sync/atomic" | ||
"testing" | ||
"time" | ||
"github.com/google/uuid" | ||
"github.com/prometheus/client_golang/prometheus" | ||
"github.com/spf13/afero" | ||
"github.com/stretchr/testify/assert" | ||
"github.com/stretchr/testify/require" | ||
"go.opentelemetry.io/otel/trace" | ||
"golang.org/x/xerrors" | ||
"tailscale.com/tailcfg" | ||
"github.com/coder/coder/v2/agent" | ||
"github.com/coder/coder/v2/agent/agenttest" | ||
"github.com/coder/coder/v2/agent/proto" | ||
"github.com/coder/coder/v2/coderd" | ||
"github.com/coder/coder/v2/coderd/workspaceapps/appurl" | ||
"github.com/coder/coder/v2/codersdk" | ||
"github.com/coder/coder/v2/codersdk/agentsdk" | ||
"github.com/coder/coder/v2/codersdk/workspacesdk" | ||
"github.com/coder/coder/v2/tailnet" | ||
@@ -365,6 +368,44 @@ func TestServerTailnet_ReverseProxy(t *testing.T) { | ||
}) | ||
} | ||
func TestDialFailure(t *testing.T) { | ||
t.Parallel() | ||
// Setup. | ||
ctx := testutil.Context(t, testutil.WaitShort) | ||
logger := testutil.Logger(t) | ||
// Given: a tailnet coordinator. | ||
coord := tailnet.NewCoordinator(logger) | ||
t.Cleanup(func() { | ||
_ = coord.Close() | ||
}) | ||
coordPtr := atomic.Pointer[tailnet.Coordinator]{} | ||
coordPtr.Store(&coord) | ||
// Given: a fake DB healthchecker which will always fail. | ||
fch := &failingHealthcheck{} | ||
// When: dialing the in-memory coordinator. | ||
dialer := &coderd.InmemTailnetDialer{ | ||
CoordPtr: &coordPtr, | ||
Logger: logger, | ||
ClientID: uuid.UUID{5}, | ||
DatabaseHealthCheck: fch, | ||
} | ||
_, err := dialer.Dial(ctx, nil) | ||
// Then: the error returned reflects the database has failed its healthcheck. | ||
require.ErrorIs(t, err, codersdk.ErrDatabaseNotReachable) | ||
dannykopping marked this conversation as resolved. Show resolvedHide resolvedUh oh!There was an error while loading.Please reload this page. | ||
} | ||
type failingHealthcheck struct{} | ||
func (failingHealthcheck) Ping(context.Context) (time.Duration, error) { | ||
// Simulate a database connection error. | ||
return 0, xerrors.New("oops") | ||
} | ||
type wrappedListener struct { | ||
net.Listener | ||
dials int32 | ||
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
package codersdk | ||
import "golang.org/x/xerrors" | ||
const DatabaseNotReachable = "database not reachable" | ||
var ErrDatabaseNotReachable = xerrors.New(DatabaseNotReachable) |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -20,12 +20,13 @@ import ( | ||
"golang.org/x/xerrors" | ||
"cdr.dev/slog" | ||
"github.com/coder/retry" | ||
"github.com/coder/coder/v2/coderd/tracing" | ||
"github.com/coder/coder/v2/codersdk" | ||
"github.com/coder/coder/v2/provisionerd/proto" | ||
"github.com/coder/coder/v2/provisionerd/runner" | ||
sdkproto "github.com/coder/coder/v2/provisionersdk/proto" | ||
) | ||
// Dialer represents the function to create a daemon client connection. | ||
@@ -290,7 +291,7 @@ func (p *Server) acquireLoop() { | ||
defer p.wg.Done() | ||
defer func() { close(p.acquireDoneCh) }() | ||
ctx := p.closeContext | ||
forretrier := retry.New(10*time.Millisecond, 1*time.Second); retrier.Wait(ctx);{ | ||
dannykopping marked this conversation as resolved. Show resolvedHide resolvedUh oh!There was an error while loading.Please reload this page. | ||
if p.acquireExit() { | ||
return | ||
} | ||
@@ -299,7 +300,17 @@ func (p *Server) acquireLoop() { | ||
p.opts.Logger.Debug(ctx, "shut down before client (re) connected") | ||
return | ||
} | ||
err := p.acquireAndRunOne(client) | ||
if err != nil && ctx.Err() == nil { // Only log if context is not done. | ||
// Short-circuit: don't wait for the retry delay to exit, if required. | ||
if p.acquireExit() { | ||
return | ||
} | ||
p.opts.Logger.Warn(ctx, "failed to acquire job, retrying", slog.F("delay", fmt.Sprintf("%vms", retrier.Delay.Milliseconds())), slog.Error(err)) | ||
} else { | ||
// Reset the retrier after each successful acquisition. | ||
retrier.Reset() | ||
} | ||
dannykopping marked this conversation as resolved. Show resolvedHide resolvedUh oh!There was an error while loading.Please reload this page. | ||
} | ||
} | ||
@@ -318,7 +329,7 @@ func (p *Server) acquireExit() bool { | ||
return false | ||
} | ||
func (p *Server) acquireAndRunOne(client proto.DRPCProvisionerDaemonClient)error{ | ||
ctx := p.closeContext | ||
p.opts.Logger.Debug(ctx, "start of acquireAndRunOne") | ||
job, err := p.acquireGraceful(client) | ||
@@ -327,15 +338,15 @@ func (p *Server) acquireAndRunOne(client proto.DRPCProvisionerDaemonClient) { | ||
if errors.Is(err, context.Canceled) || | ||
errors.Is(err, yamux.ErrSessionShutdown) || | ||
errors.Is(err, fasthttputil.ErrInmemoryListenerClosed) { | ||
return err | ||
} | ||
p.opts.Logger.Warn(ctx, "provisionerd was unable to acquire job", slog.Error(err)) | ||
return xerrors.Errorf("failed to acquire job: %w", err) | ||
} | ||
if job.JobId == "" { | ||
p.opts.Logger.Debug(ctx, "acquire job successfully canceled") | ||
return nil | ||
} | ||
if len(job.TraceMetadata) > 0 { | ||
@@ -390,9 +401,9 @@ func (p *Server) acquireAndRunOne(client proto.DRPCProvisionerDaemonClient) { | ||
Error: fmt.Sprintf("failed to connect to provisioner: %s", resp.Error), | ||
}) | ||
if err != nil { | ||
p.opts.Logger.Error(ctx, "failed to reportprovisioner job failed", slog.F("job_id", job.JobId), slog.Error(err)) | ||
} | ||
return xerrors.Errorf("failed to report provisioner job failed: %w", err) | ||
} | ||
p.mutex.Lock() | ||
@@ -416,6 +427,7 @@ func (p *Server) acquireAndRunOne(client proto.DRPCProvisionerDaemonClient) { | ||
p.mutex.Lock() | ||
p.activeJob = nil | ||
p.mutex.Unlock() | ||
return nil | ||
} | ||
// acquireGraceful attempts to acquire a job from the server, handling canceling the acquisition if we gracefully shut | ||
Some generated files are not rendered by default. Learn more abouthow customized files appear on GitHub.
Uh oh!
There was an error while loading.Please reload this page.
Uh oh!
There was an error while loading.Please reload this page.