Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit9ea3910

Browse files
fix: reduce excessive logging when database is unreachable (cherry-pick#17363) (#17411)
Co-authored-by: Danny Kopping <danny@coder.com>Fixes#17045
1 parent8b5adaa commit9ea3910

File tree

10 files changed

+160
-20
lines changed

10 files changed

+160
-20
lines changed

‎coderd/coderd.go

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -665,10 +665,11 @@ func New(options *Options) *API {
665665
api.Auditor.Store(&options.Auditor)
666666
api.TailnetCoordinator.Store(&options.TailnetCoordinator)
667667
dialer:=&InmemTailnetDialer{
668-
CoordPtr:&api.TailnetCoordinator,
669-
DERPFn:api.DERPMap,
670-
Logger:options.Logger,
671-
ClientID:uuid.New(),
668+
CoordPtr:&api.TailnetCoordinator,
669+
DERPFn:api.DERPMap,
670+
Logger:options.Logger,
671+
ClientID:uuid.New(),
672+
DatabaseHealthCheck:api.Database,
672673
}
673674
stn,err:=NewServerTailnet(api.ctx,
674675
options.Logger,

‎coderd/tailnet.go

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,9 +24,11 @@ import (
2424
"tailscale.com/tailcfg"
2525

2626
"cdr.dev/slog"
27+
2728
"github.com/coder/coder/v2/coderd/tracing"
2829
"github.com/coder/coder/v2/coderd/workspaceapps"
2930
"github.com/coder/coder/v2/coderd/workspaceapps/appurl"
31+
"github.com/coder/coder/v2/codersdk"
3032
"github.com/coder/coder/v2/codersdk/workspacesdk"
3133
"github.com/coder/coder/v2/site"
3234
"github.com/coder/coder/v2/tailnet"
@@ -534,16 +536,28 @@ func NewMultiAgentController(ctx context.Context, logger slog.Logger, tracer tra
534536
returnm
535537
}
536538

539+
typePingerinterface {
540+
Ping(context.Context) (time.Duration,error)
541+
}
542+
537543
// InmemTailnetDialer is a tailnet.ControlProtocolDialer that connects to a Coordinator and DERPMap
538544
// service running in the same memory space.
539545
typeInmemTailnetDialerstruct {
540546
CoordPtr*atomic.Pointer[tailnet.Coordinator]
541547
DERPFnfunc()*tailcfg.DERPMap
542548
Logger slog.Logger
543549
ClientID uuid.UUID
550+
// DatabaseHealthCheck is used to validate that the store is reachable.
551+
DatabaseHealthCheckPinger
544552
}
545553

546-
func (a*InmemTailnetDialer)Dial(_ context.Context,_ tailnet.ResumeTokenController) (tailnet.ControlProtocolClients,error) {
554+
func (a*InmemTailnetDialer)Dial(ctx context.Context,_ tailnet.ResumeTokenController) (tailnet.ControlProtocolClients,error) {
555+
ifa.DatabaseHealthCheck!=nil {
556+
if_,err:=a.DatabaseHealthCheck.Ping(ctx);err!=nil {
557+
return tailnet.ControlProtocolClients{},xerrors.Errorf("%w: %v",codersdk.ErrDatabaseNotReachable,err)
558+
}
559+
}
560+
547561
coord:=a.CoordPtr.Load()
548562
ifcoord==nil {
549563
return tailnet.ControlProtocolClients{},xerrors.Errorf("tailnet coordinator not initialized")

‎coderd/tailnet_test.go

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,20 +11,23 @@ import (
1111
"strconv"
1212
"sync/atomic"
1313
"testing"
14+
"time"
1415

1516
"github.com/google/uuid"
1617
"github.com/prometheus/client_golang/prometheus"
1718
"github.com/spf13/afero"
1819
"github.com/stretchr/testify/assert"
1920
"github.com/stretchr/testify/require"
2021
"go.opentelemetry.io/otel/trace"
22+
"golang.org/x/xerrors"
2123
"tailscale.com/tailcfg"
2224

2325
"github.com/coder/coder/v2/agent"
2426
"github.com/coder/coder/v2/agent/agenttest"
2527
"github.com/coder/coder/v2/agent/proto"
2628
"github.com/coder/coder/v2/coderd"
2729
"github.com/coder/coder/v2/coderd/workspaceapps/appurl"
30+
"github.com/coder/coder/v2/codersdk"
2831
"github.com/coder/coder/v2/codersdk/agentsdk"
2932
"github.com/coder/coder/v2/codersdk/workspacesdk"
3033
"github.com/coder/coder/v2/tailnet"
@@ -365,6 +368,44 @@ func TestServerTailnet_ReverseProxy(t *testing.T) {
365368
})
366369
}
367370

371+
funcTestDialFailure(t*testing.T) {
372+
t.Parallel()
373+
374+
// Setup.
375+
ctx:=testutil.Context(t,testutil.WaitShort)
376+
logger:=testutil.Logger(t)
377+
378+
// Given: a tailnet coordinator.
379+
coord:=tailnet.NewCoordinator(logger)
380+
t.Cleanup(func() {
381+
_=coord.Close()
382+
})
383+
coordPtr:= atomic.Pointer[tailnet.Coordinator]{}
384+
coordPtr.Store(&coord)
385+
386+
// Given: a fake DB healthchecker which will always fail.
387+
fch:=&failingHealthcheck{}
388+
389+
// When: dialing the in-memory coordinator.
390+
dialer:=&coderd.InmemTailnetDialer{
391+
CoordPtr:&coordPtr,
392+
Logger:logger,
393+
ClientID: uuid.UUID{5},
394+
DatabaseHealthCheck:fch,
395+
}
396+
_,err:=dialer.Dial(ctx,nil)
397+
398+
// Then: the error returned reflects the database has failed its healthcheck.
399+
require.ErrorIs(t,err,codersdk.ErrDatabaseNotReachable)
400+
}
401+
402+
typefailingHealthcheckstruct{}
403+
404+
func (failingHealthcheck)Ping(context.Context) (time.Duration,error) {
405+
// Simulate a database connection error.
406+
return0,xerrors.New("oops")
407+
}
408+
368409
typewrappedListenerstruct {
369410
net.Listener
370411
dialsint32

‎coderd/workspaceagents.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -989,6 +989,16 @@ func (api *API) derpMapUpdates(rw http.ResponseWriter, r *http.Request) {
989989
func (api*API)workspaceAgentClientCoordinate(rw http.ResponseWriter,r*http.Request) {
990990
ctx:=r.Context()
991991

992+
// Ensure the database is reachable before proceeding.
993+
_,err:=api.Database.Ping(ctx)
994+
iferr!=nil {
995+
httpapi.Write(ctx,rw,http.StatusInternalServerError, codersdk.Response{
996+
Message:codersdk.DatabaseNotReachable,
997+
Detail:err.Error(),
998+
})
999+
return
1000+
}
1001+
9921002
// This route accepts user API key auth and workspace proxy auth. The moon actor has
9931003
// full permissions so should be able to pass this authz check.
9941004
workspace:=httpmw.WorkspaceParam(r)

‎codersdk/database.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
package codersdk
2+
3+
import"golang.org/x/xerrors"
4+
5+
constDatabaseNotReachable="database not reachable"
6+
7+
varErrDatabaseNotReachable=xerrors.New(DatabaseNotReachable)

‎codersdk/workspacesdk/dialer.go

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,17 +11,19 @@ import (
1111
"golang.org/x/xerrors"
1212

1313
"cdr.dev/slog"
14+
"github.com/coder/websocket"
15+
1416
"github.com/coder/coder/v2/buildinfo"
1517
"github.com/coder/coder/v2/codersdk"
1618
"github.com/coder/coder/v2/tailnet"
1719
"github.com/coder/coder/v2/tailnet/proto"
18-
"github.com/coder/websocket"
1920
)
2021

2122
varpermanentErrorStatuses= []int{
22-
http.StatusConflict,// returned if client/agent connections disabled (browser only)
23-
http.StatusBadRequest,// returned if API mismatch
24-
http.StatusNotFound,// returned if user doesn't have permission or agent doesn't exist
23+
http.StatusConflict,// returned if client/agent connections disabled (browser only)
24+
http.StatusBadRequest,// returned if API mismatch
25+
http.StatusNotFound,// returned if user doesn't have permission or agent doesn't exist
26+
http.StatusInternalServerError,// returned if database is not reachable,
2527
}
2628

2729
typeWebsocketDialerstruct {
@@ -89,6 +91,11 @@ func (w *WebsocketDialer) Dial(ctx context.Context, r tailnet.ResumeTokenControl
8991
"Ensure your client release version (%s, different than the API version) matches the server release version",
9092
buildinfo.Version())
9193
}
94+
95+
ifsdkErr.Message==codersdk.DatabaseNotReachable&&
96+
sdkErr.StatusCode()==http.StatusInternalServerError {
97+
err=xerrors.Errorf("%w: %v",codersdk.ErrDatabaseNotReachable,err)
98+
}
9299
}
93100
w.connected<-err
94101
return tailnet.ControlProtocolClients{},err

‎codersdk/workspacesdk/workspacesdk_test.go

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,21 @@
11
package workspacesdk_test
22

33
import (
4+
"net/http"
5+
"net/http/httptest"
46
"net/url"
57
"testing"
68

79
"github.com/stretchr/testify/require"
810
"tailscale.com/tailcfg"
911

12+
"github.com/coder/websocket"
13+
14+
"github.com/coder/coder/v2/coderd/httpapi"
15+
"github.com/coder/coder/v2/codersdk"
1016
"github.com/coder/coder/v2/codersdk/agentsdk"
17+
"github.com/coder/coder/v2/codersdk/workspacesdk"
18+
"github.com/coder/coder/v2/testutil"
1119
)
1220

1321
funcTestWorkspaceRewriteDERPMap(t*testing.T) {
@@ -37,3 +45,30 @@ func TestWorkspaceRewriteDERPMap(t *testing.T) {
3745
require.Equal(t,"coconuts.org",node.HostName)
3846
require.Equal(t,44558,node.DERPPort)
3947
}
48+
49+
funcTestWorkspaceDialerFailure(t*testing.T) {
50+
t.Parallel()
51+
52+
// Setup.
53+
ctx:=testutil.Context(t,testutil.WaitShort)
54+
logger:=testutil.Logger(t)
55+
56+
// Given: a mock HTTP server which mimicks an unreachable database when calling the coordination endpoint.
57+
srv:=httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter,r*http.Request) {
58+
httpapi.Write(ctx,w,http.StatusInternalServerError, codersdk.Response{
59+
Message:codersdk.DatabaseNotReachable,
60+
Detail:"oops",
61+
})
62+
}))
63+
t.Cleanup(srv.Close)
64+
65+
u,err:=url.Parse(srv.URL)
66+
require.NoError(t,err)
67+
68+
// When: calling the coordination endpoint.
69+
dialer:=workspacesdk.NewWebsocketDialer(logger,u,&websocket.DialOptions{})
70+
_,err=dialer.Dial(ctx,nil)
71+
72+
// Then: an error indicating a database issue is returned, to conditionalize the behavior of the caller.
73+
require.ErrorIs(t,err,codersdk.ErrDatabaseNotReachable)
74+
}

‎provisionerd/provisionerd.go

Lines changed: 21 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -20,12 +20,13 @@ import (
2020
"golang.org/x/xerrors"
2121

2222
"cdr.dev/slog"
23+
"github.com/coder/retry"
24+
2325
"github.com/coder/coder/v2/coderd/tracing"
2426
"github.com/coder/coder/v2/codersdk"
2527
"github.com/coder/coder/v2/provisionerd/proto"
2628
"github.com/coder/coder/v2/provisionerd/runner"
2729
sdkproto"github.com/coder/coder/v2/provisionersdk/proto"
28-
"github.com/coder/retry"
2930
)
3031

3132
// Dialer represents the function to create a daemon client connection.
@@ -290,7 +291,7 @@ func (p *Server) acquireLoop() {
290291
deferp.wg.Done()
291292
deferfunc() {close(p.acquireDoneCh) }()
292293
ctx:=p.closeContext
293-
for {
294+
forretrier:=retry.New(10*time.Millisecond,1*time.Second);retrier.Wait(ctx);{
294295
ifp.acquireExit() {
295296
return
296297
}
@@ -299,7 +300,17 @@ func (p *Server) acquireLoop() {
299300
p.opts.Logger.Debug(ctx,"shut down before client (re) connected")
300301
return
301302
}
302-
p.acquireAndRunOne(client)
303+
err:=p.acquireAndRunOne(client)
304+
iferr!=nil&&ctx.Err()==nil {// Only log if context is not done.
305+
// Short-circuit: don't wait for the retry delay to exit, if required.
306+
ifp.acquireExit() {
307+
return
308+
}
309+
p.opts.Logger.Warn(ctx,"failed to acquire job, retrying",slog.F("delay",fmt.Sprintf("%vms",retrier.Delay.Milliseconds())),slog.Error(err))
310+
}else {
311+
// Reset the retrier after each successful acquisition.
312+
retrier.Reset()
313+
}
303314
}
304315
}
305316

@@ -318,7 +329,7 @@ func (p *Server) acquireExit() bool {
318329
returnfalse
319330
}
320331

321-
func (p*Server)acquireAndRunOne(client proto.DRPCProvisionerDaemonClient) {
332+
func (p*Server)acquireAndRunOne(client proto.DRPCProvisionerDaemonClient)error{
322333
ctx:=p.closeContext
323334
p.opts.Logger.Debug(ctx,"start of acquireAndRunOne")
324335
job,err:=p.acquireGraceful(client)
@@ -327,15 +338,15 @@ func (p *Server) acquireAndRunOne(client proto.DRPCProvisionerDaemonClient) {
327338
iferrors.Is(err,context.Canceled)||
328339
errors.Is(err,yamux.ErrSessionShutdown)||
329340
errors.Is(err,fasthttputil.ErrInmemoryListenerClosed) {
330-
return
341+
returnerr
331342
}
332343

333344
p.opts.Logger.Warn(ctx,"provisionerd was unable to acquire job",slog.Error(err))
334-
return
345+
returnxerrors.Errorf("failed to acquire job: %w",err)
335346
}
336347
ifjob.JobId=="" {
337348
p.opts.Logger.Debug(ctx,"acquire job successfully canceled")
338-
return
349+
returnnil
339350
}
340351

341352
iflen(job.TraceMetadata)>0 {
@@ -390,9 +401,9 @@ func (p *Server) acquireAndRunOne(client proto.DRPCProvisionerDaemonClient) {
390401
Error:fmt.Sprintf("failed to connect to provisioner: %s",resp.Error),
391402
})
392403
iferr!=nil {
393-
p.opts.Logger.Error(ctx,"provisioner job failed",slog.F("job_id",job.JobId),slog.Error(err))
404+
p.opts.Logger.Error(ctx,"failed to reportprovisioner job failed",slog.F("job_id",job.JobId),slog.Error(err))
394405
}
395-
return
406+
returnxerrors.Errorf("failed to report provisioner job failed: %w",err)
396407
}
397408

398409
p.mutex.Lock()
@@ -416,6 +427,7 @@ func (p *Server) acquireAndRunOne(client proto.DRPCProvisionerDaemonClient) {
416427
p.mutex.Lock()
417428
p.activeJob=nil
418429
p.mutex.Unlock()
430+
returnnil
419431
}
420432

421433
// acquireGraceful attempts to acquire a job from the server, handling canceling the acquisition if we gracefully shut

‎site/src/api/typesGenerated.ts

Lines changed: 3 additions & 0 deletions
Some generated files are not rendered by default. Learn more aboutcustomizing how changed files appear on GitHub.

‎tailnet/controllers.go

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package tailnet
22

33
import (
44
"context"
5+
"errors"
56
"fmt"
67
"io"
78
"maps"
@@ -20,11 +21,12 @@ import (
2021
"tailscale.com/util/dnsname"
2122

2223
"cdr.dev/slog"
24+
"github.com/coder/quartz"
25+
"github.com/coder/retry"
26+
2327
"github.com/coder/coder/v2/coderd/util/ptr"
2428
"github.com/coder/coder/v2/codersdk"
2529
"github.com/coder/coder/v2/tailnet/proto"
26-
"github.com/coder/quartz"
27-
"github.com/coder/retry"
2830
)
2931

3032
// A Controller connects to the tailnet control plane, and then uses the control protocols to
@@ -1362,6 +1364,14 @@ func (c *Controller) Run(ctx context.Context) {
13621364
ifxerrors.Is(err,context.Canceled)||xerrors.Is(err,context.DeadlineExceeded) {
13631365
return
13641366
}
1367+
1368+
// If the database is unreachable by the control plane, there's not much we can do, so we'll just retry later.
1369+
iferrors.Is(err,codersdk.ErrDatabaseNotReachable) {
1370+
c.logger.Warn(c.ctx,"control plane lost connection to database, retrying",
1371+
slog.Error(err),slog.F("delay",fmt.Sprintf("%vms",retrier.Delay.Milliseconds())))
1372+
continue
1373+
}
1374+
13651375
errF:=slog.Error(err)
13661376
varsdkErr*codersdk.Error
13671377
ifxerrors.As(err,&sdkErr) {

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp