Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit0b18e45

Browse files
authored
fix: reduce excessive logging when database is unreachable (#17363)
Fixes#17045---------Signed-off-by: Danny Kopping <dannykopping@gmail.com>
1 parent2f99d70 commit0b18e45

File tree

10 files changed

+160
-20
lines changed

10 files changed

+160
-20
lines changed

‎coderd/coderd.go

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -675,10 +675,11 @@ func New(options *Options) *API {
675675
api.Auditor.Store(&options.Auditor)
676676
api.TailnetCoordinator.Store(&options.TailnetCoordinator)
677677
dialer:=&InmemTailnetDialer{
678-
CoordPtr:&api.TailnetCoordinator,
679-
DERPFn:api.DERPMap,
680-
Logger:options.Logger,
681-
ClientID:uuid.New(),
678+
CoordPtr:&api.TailnetCoordinator,
679+
DERPFn:api.DERPMap,
680+
Logger:options.Logger,
681+
ClientID:uuid.New(),
682+
DatabaseHealthCheck:api.Database,
682683
}
683684
stn,err:=NewServerTailnet(api.ctx,
684685
options.Logger,

‎coderd/tailnet.go

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,9 +24,11 @@ import (
2424
"tailscale.com/tailcfg"
2525

2626
"cdr.dev/slog"
27+
2728
"github.com/coder/coder/v2/coderd/tracing"
2829
"github.com/coder/coder/v2/coderd/workspaceapps"
2930
"github.com/coder/coder/v2/coderd/workspaceapps/appurl"
31+
"github.com/coder/coder/v2/codersdk"
3032
"github.com/coder/coder/v2/codersdk/workspacesdk"
3133
"github.com/coder/coder/v2/site"
3234
"github.com/coder/coder/v2/tailnet"
@@ -534,16 +536,28 @@ func NewMultiAgentController(ctx context.Context, logger slog.Logger, tracer tra
534536
returnm
535537
}
536538

539+
typePingerinterface {
540+
Ping(context.Context) (time.Duration,error)
541+
}
542+
537543
// InmemTailnetDialer is a tailnet.ControlProtocolDialer that connects to a Coordinator and DERPMap
538544
// service running in the same memory space.
539545
typeInmemTailnetDialerstruct {
540546
CoordPtr*atomic.Pointer[tailnet.Coordinator]
541547
DERPFnfunc()*tailcfg.DERPMap
542548
Logger slog.Logger
543549
ClientID uuid.UUID
550+
// DatabaseHealthCheck is used to validate that the store is reachable.
551+
DatabaseHealthCheckPinger
544552
}
545553

546-
func (a*InmemTailnetDialer)Dial(_ context.Context,_ tailnet.ResumeTokenController) (tailnet.ControlProtocolClients,error) {
554+
func (a*InmemTailnetDialer)Dial(ctx context.Context,_ tailnet.ResumeTokenController) (tailnet.ControlProtocolClients,error) {
555+
ifa.DatabaseHealthCheck!=nil {
556+
if_,err:=a.DatabaseHealthCheck.Ping(ctx);err!=nil {
557+
return tailnet.ControlProtocolClients{},xerrors.Errorf("%w: %v",codersdk.ErrDatabaseNotReachable,err)
558+
}
559+
}
560+
547561
coord:=a.CoordPtr.Load()
548562
ifcoord==nil {
549563
return tailnet.ControlProtocolClients{},xerrors.Errorf("tailnet coordinator not initialized")

‎coderd/tailnet_test.go

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,20 +11,23 @@ import (
1111
"strconv"
1212
"sync/atomic"
1313
"testing"
14+
"time"
1415

1516
"github.com/google/uuid"
1617
"github.com/prometheus/client_golang/prometheus"
1718
"github.com/spf13/afero"
1819
"github.com/stretchr/testify/assert"
1920
"github.com/stretchr/testify/require"
2021
"go.opentelemetry.io/otel/trace"
22+
"golang.org/x/xerrors"
2123
"tailscale.com/tailcfg"
2224

2325
"github.com/coder/coder/v2/agent"
2426
"github.com/coder/coder/v2/agent/agenttest"
2527
"github.com/coder/coder/v2/agent/proto"
2628
"github.com/coder/coder/v2/coderd"
2729
"github.com/coder/coder/v2/coderd/workspaceapps/appurl"
30+
"github.com/coder/coder/v2/codersdk"
2831
"github.com/coder/coder/v2/codersdk/agentsdk"
2932
"github.com/coder/coder/v2/codersdk/workspacesdk"
3033
"github.com/coder/coder/v2/tailnet"
@@ -365,6 +368,44 @@ func TestServerTailnet_ReverseProxy(t *testing.T) {
365368
})
366369
}
367370

371+
funcTestDialFailure(t*testing.T) {
372+
t.Parallel()
373+
374+
// Setup.
375+
ctx:=testutil.Context(t,testutil.WaitShort)
376+
logger:=testutil.Logger(t)
377+
378+
// Given: a tailnet coordinator.
379+
coord:=tailnet.NewCoordinator(logger)
380+
t.Cleanup(func() {
381+
_=coord.Close()
382+
})
383+
coordPtr:= atomic.Pointer[tailnet.Coordinator]{}
384+
coordPtr.Store(&coord)
385+
386+
// Given: a fake DB healthchecker which will always fail.
387+
fch:=&failingHealthcheck{}
388+
389+
// When: dialing the in-memory coordinator.
390+
dialer:=&coderd.InmemTailnetDialer{
391+
CoordPtr:&coordPtr,
392+
Logger:logger,
393+
ClientID: uuid.UUID{5},
394+
DatabaseHealthCheck:fch,
395+
}
396+
_,err:=dialer.Dial(ctx,nil)
397+
398+
// Then: the error returned reflects the database has failed its healthcheck.
399+
require.ErrorIs(t,err,codersdk.ErrDatabaseNotReachable)
400+
}
401+
402+
typefailingHealthcheckstruct{}
403+
404+
func (failingHealthcheck)Ping(context.Context) (time.Duration,error) {
405+
// Simulate a database connection error.
406+
return0,xerrors.New("oops")
407+
}
408+
368409
typewrappedListenerstruct {
369410
net.Listener
370411
dialsint32

‎coderd/workspaceagents.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -997,6 +997,16 @@ func (api *API) derpMapUpdates(rw http.ResponseWriter, r *http.Request) {
997997
func (api*API)workspaceAgentClientCoordinate(rw http.ResponseWriter,r*http.Request) {
998998
ctx:=r.Context()
999999

1000+
// Ensure the database is reachable before proceeding.
1001+
_,err:=api.Database.Ping(ctx)
1002+
iferr!=nil {
1003+
httpapi.Write(ctx,rw,http.StatusInternalServerError, codersdk.Response{
1004+
Message:codersdk.DatabaseNotReachable,
1005+
Detail:err.Error(),
1006+
})
1007+
return
1008+
}
1009+
10001010
// This route accepts user API key auth and workspace proxy auth. The moon actor has
10011011
// full permissions so should be able to pass this authz check.
10021012
workspace:=httpmw.WorkspaceParam(r)

‎codersdk/database.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
package codersdk
2+
3+
import"golang.org/x/xerrors"
4+
5+
constDatabaseNotReachable="database not reachable"
6+
7+
varErrDatabaseNotReachable=xerrors.New(DatabaseNotReachable)

‎codersdk/workspacesdk/dialer.go

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,17 +11,19 @@ import (
1111
"golang.org/x/xerrors"
1212

1313
"cdr.dev/slog"
14+
"github.com/coder/websocket"
15+
1416
"github.com/coder/coder/v2/buildinfo"
1517
"github.com/coder/coder/v2/codersdk"
1618
"github.com/coder/coder/v2/tailnet"
1719
"github.com/coder/coder/v2/tailnet/proto"
18-
"github.com/coder/websocket"
1920
)
2021

2122
varpermanentErrorStatuses= []int{
22-
http.StatusConflict,// returned if client/agent connections disabled (browser only)
23-
http.StatusBadRequest,// returned if API mismatch
24-
http.StatusNotFound,// returned if user doesn't have permission or agent doesn't exist
23+
http.StatusConflict,// returned if client/agent connections disabled (browser only)
24+
http.StatusBadRequest,// returned if API mismatch
25+
http.StatusNotFound,// returned if user doesn't have permission or agent doesn't exist
26+
http.StatusInternalServerError,// returned if database is not reachable,
2527
}
2628

2729
typeWebsocketDialerstruct {
@@ -89,6 +91,11 @@ func (w *WebsocketDialer) Dial(ctx context.Context, r tailnet.ResumeTokenControl
8991
"Ensure your client release version (%s, different than the API version) matches the server release version",
9092
buildinfo.Version())
9193
}
94+
95+
ifsdkErr.Message==codersdk.DatabaseNotReachable&&
96+
sdkErr.StatusCode()==http.StatusInternalServerError {
97+
err=xerrors.Errorf("%w: %v",codersdk.ErrDatabaseNotReachable,err)
98+
}
9299
}
93100
w.connected<-err
94101
return tailnet.ControlProtocolClients{},err

‎codersdk/workspacesdk/workspacesdk_test.go

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,21 @@
11
package workspacesdk_test
22

33
import (
4+
"net/http"
5+
"net/http/httptest"
46
"net/url"
57
"testing"
68

79
"github.com/stretchr/testify/require"
810
"tailscale.com/tailcfg"
911

12+
"github.com/coder/websocket"
13+
14+
"github.com/coder/coder/v2/coderd/httpapi"
15+
"github.com/coder/coder/v2/codersdk"
1016
"github.com/coder/coder/v2/codersdk/agentsdk"
17+
"github.com/coder/coder/v2/codersdk/workspacesdk"
18+
"github.com/coder/coder/v2/testutil"
1119
)
1220

1321
funcTestWorkspaceRewriteDERPMap(t*testing.T) {
@@ -37,3 +45,30 @@ func TestWorkspaceRewriteDERPMap(t *testing.T) {
3745
require.Equal(t,"coconuts.org",node.HostName)
3846
require.Equal(t,44558,node.DERPPort)
3947
}
48+
49+
funcTestWorkspaceDialerFailure(t*testing.T) {
50+
t.Parallel()
51+
52+
// Setup.
53+
ctx:=testutil.Context(t,testutil.WaitShort)
54+
logger:=testutil.Logger(t)
55+
56+
// Given: a mock HTTP server which mimicks an unreachable database when calling the coordination endpoint.
57+
srv:=httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter,r*http.Request) {
58+
httpapi.Write(ctx,w,http.StatusInternalServerError, codersdk.Response{
59+
Message:codersdk.DatabaseNotReachable,
60+
Detail:"oops",
61+
})
62+
}))
63+
t.Cleanup(srv.Close)
64+
65+
u,err:=url.Parse(srv.URL)
66+
require.NoError(t,err)
67+
68+
// When: calling the coordination endpoint.
69+
dialer:=workspacesdk.NewWebsocketDialer(logger,u,&websocket.DialOptions{})
70+
_,err=dialer.Dial(ctx,nil)
71+
72+
// Then: an error indicating a database issue is returned, to conditionalize the behavior of the caller.
73+
require.ErrorIs(t,err,codersdk.ErrDatabaseNotReachable)
74+
}

‎provisionerd/provisionerd.go

Lines changed: 21 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -20,12 +20,13 @@ import (
2020
"golang.org/x/xerrors"
2121

2222
"cdr.dev/slog"
23+
"github.com/coder/retry"
24+
2325
"github.com/coder/coder/v2/coderd/tracing"
2426
"github.com/coder/coder/v2/codersdk"
2527
"github.com/coder/coder/v2/provisionerd/proto"
2628
"github.com/coder/coder/v2/provisionerd/runner"
2729
sdkproto"github.com/coder/coder/v2/provisionersdk/proto"
28-
"github.com/coder/retry"
2930
)
3031

3132
// Dialer represents the function to create a daemon client connection.
@@ -290,7 +291,7 @@ func (p *Server) acquireLoop() {
290291
deferp.wg.Done()
291292
deferfunc() {close(p.acquireDoneCh) }()
292293
ctx:=p.closeContext
293-
for {
294+
forretrier:=retry.New(10*time.Millisecond,1*time.Second);retrier.Wait(ctx);{
294295
ifp.acquireExit() {
295296
return
296297
}
@@ -299,7 +300,17 @@ func (p *Server) acquireLoop() {
299300
p.opts.Logger.Debug(ctx,"shut down before client (re) connected")
300301
return
301302
}
302-
p.acquireAndRunOne(client)
303+
err:=p.acquireAndRunOne(client)
304+
iferr!=nil&&ctx.Err()==nil {// Only log if context is not done.
305+
// Short-circuit: don't wait for the retry delay to exit, if required.
306+
ifp.acquireExit() {
307+
return
308+
}
309+
p.opts.Logger.Warn(ctx,"failed to acquire job, retrying",slog.F("delay",fmt.Sprintf("%vms",retrier.Delay.Milliseconds())),slog.Error(err))
310+
}else {
311+
// Reset the retrier after each successful acquisition.
312+
retrier.Reset()
313+
}
303314
}
304315
}
305316

@@ -318,7 +329,7 @@ func (p *Server) acquireExit() bool {
318329
returnfalse
319330
}
320331

321-
func (p*Server)acquireAndRunOne(client proto.DRPCProvisionerDaemonClient) {
332+
func (p*Server)acquireAndRunOne(client proto.DRPCProvisionerDaemonClient)error{
322333
ctx:=p.closeContext
323334
p.opts.Logger.Debug(ctx,"start of acquireAndRunOne")
324335
job,err:=p.acquireGraceful(client)
@@ -327,15 +338,15 @@ func (p *Server) acquireAndRunOne(client proto.DRPCProvisionerDaemonClient) {
327338
iferrors.Is(err,context.Canceled)||
328339
errors.Is(err,yamux.ErrSessionShutdown)||
329340
errors.Is(err,fasthttputil.ErrInmemoryListenerClosed) {
330-
return
341+
returnerr
331342
}
332343

333344
p.opts.Logger.Warn(ctx,"provisionerd was unable to acquire job",slog.Error(err))
334-
return
345+
returnxerrors.Errorf("failed to acquire job: %w",err)
335346
}
336347
ifjob.JobId=="" {
337348
p.opts.Logger.Debug(ctx,"acquire job successfully canceled")
338-
return
349+
returnnil
339350
}
340351

341352
iflen(job.TraceMetadata)>0 {
@@ -392,9 +403,9 @@ func (p *Server) acquireAndRunOne(client proto.DRPCProvisionerDaemonClient) {
392403
Error:fmt.Sprintf("failed to connect to provisioner: %s",resp.Error),
393404
})
394405
iferr!=nil {
395-
p.opts.Logger.Error(ctx,"provisioner job failed",slog.F("job_id",job.JobId),slog.Error(err))
406+
p.opts.Logger.Error(ctx,"failed to reportprovisioner job failed",slog.F("job_id",job.JobId),slog.Error(err))
396407
}
397-
return
408+
returnxerrors.Errorf("failed to report provisioner job failed: %w",err)
398409
}
399410

400411
p.mutex.Lock()
@@ -418,6 +429,7 @@ func (p *Server) acquireAndRunOne(client proto.DRPCProvisionerDaemonClient) {
418429
p.mutex.Lock()
419430
p.activeJob=nil
420431
p.mutex.Unlock()
432+
returnnil
421433
}
422434

423435
// acquireGraceful attempts to acquire a job from the server, handling canceling the acquisition if we gracefully shut

‎site/src/api/typesGenerated.ts

Lines changed: 3 additions & 0 deletions
Some generated files are not rendered by default. Learn more aboutcustomizing how changed files appear on GitHub.

‎tailnet/controllers.go

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package tailnet
22

33
import (
44
"context"
5+
"errors"
56
"fmt"
67
"io"
78
"maps"
@@ -21,11 +22,12 @@ import (
2122
"tailscale.com/util/dnsname"
2223

2324
"cdr.dev/slog"
25+
"github.com/coder/quartz"
26+
"github.com/coder/retry"
27+
2428
"github.com/coder/coder/v2/coderd/util/ptr"
2529
"github.com/coder/coder/v2/codersdk"
2630
"github.com/coder/coder/v2/tailnet/proto"
27-
"github.com/coder/quartz"
28-
"github.com/coder/retry"
2931
)
3032

3133
// A Controller connects to the tailnet control plane, and then uses the control protocols to
@@ -1381,6 +1383,14 @@ func (c *Controller) Run(ctx context.Context) {
13811383
ifxerrors.Is(err,context.Canceled)||xerrors.Is(err,context.DeadlineExceeded) {
13821384
return
13831385
}
1386+
1387+
// If the database is unreachable by the control plane, there's not much we can do, so we'll just retry later.
1388+
iferrors.Is(err,codersdk.ErrDatabaseNotReachable) {
1389+
c.logger.Warn(c.ctx,"control plane lost connection to database, retrying",
1390+
slog.Error(err),slog.F("delay",fmt.Sprintf("%vms",retrier.Delay.Milliseconds())))
1391+
continue
1392+
}
1393+
13841394
errF:=slog.Error(err)
13851395
varsdkErr*codersdk.Error
13861396
ifxerrors.As(err,&sdkErr) {

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp