Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit15588ef

Browse files
committed
Checking for, and specifically handling, database unreachability in tailnet control protocol dialer
Signed-off-by: Danny Kopping <dannykopping@gmail.com>
1 parent6df164e commit15588ef

File tree

9 files changed

+194
-20
lines changed

9 files changed

+194
-20
lines changed

‎coderd/coderd.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -679,6 +679,10 @@ func New(options *Options) *API {
679679
DERPFn:api.DERPMap,
680680
Logger:options.Logger,
681681
ClientID:uuid.New(),
682+
DatabaseHealthcheckFn:func(ctx context.Context)error {
683+
_,err:=api.Database.Ping(ctx)
684+
returnerr
685+
},
682686
}
683687
stn,err:=NewServerTailnet(api.ctx,
684688
options.Logger,

‎coderd/tailnet.go

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -24,9 +24,11 @@ import (
2424
"tailscale.com/tailcfg"
2525

2626
"cdr.dev/slog"
27+
2728
"github.com/coder/coder/v2/coderd/tracing"
2829
"github.com/coder/coder/v2/coderd/workspaceapps"
2930
"github.com/coder/coder/v2/coderd/workspaceapps/appurl"
31+
"github.com/coder/coder/v2/codersdk"
3032
"github.com/coder/coder/v2/codersdk/workspacesdk"
3133
"github.com/coder/coder/v2/site"
3234
"github.com/coder/coder/v2/tailnet"
@@ -537,13 +539,20 @@ func NewMultiAgentController(ctx context.Context, logger slog.Logger, tracer tra
537539
// InmemTailnetDialer is a tailnet.ControlProtocolDialer that connects to a Coordinator and DERPMap
538540
// service running in the same memory space.
539541
typeInmemTailnetDialerstruct {
540-
CoordPtr*atomic.Pointer[tailnet.Coordinator]
541-
DERPFnfunc()*tailcfg.DERPMap
542-
Logger slog.Logger
543-
ClientID uuid.UUID
542+
CoordPtr*atomic.Pointer[tailnet.Coordinator]
543+
DERPFnfunc()*tailcfg.DERPMap
544+
Logger slog.Logger
545+
ClientID uuid.UUID
546+
DatabaseHealthcheckFnfunc(ctx context.Context)error
544547
}
545548

546-
func (a*InmemTailnetDialer)Dial(_ context.Context,_ tailnet.ResumeTokenController) (tailnet.ControlProtocolClients,error) {
549+
func (a*InmemTailnetDialer)Dial(ctx context.Context,_ tailnet.ResumeTokenController) (tailnet.ControlProtocolClients,error) {
550+
ifa.DatabaseHealthcheckFn!=nil {
551+
iferr:=a.DatabaseHealthcheckFn(ctx);err!=nil {
552+
return tailnet.ControlProtocolClients{},xerrors.Errorf("%s: %w",codersdk.DatabaseNotReachable,err)
553+
}
554+
}
555+
547556
coord:=a.CoordPtr.Load()
548557
ifcoord==nil {
549558
return tailnet.ControlProtocolClients{},xerrors.Errorf("tailnet coordinator not initialized")

‎coderd/tailnet_test.go

Lines changed: 74 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ import (
1818
"github.com/stretchr/testify/assert"
1919
"github.com/stretchr/testify/require"
2020
"go.opentelemetry.io/otel/trace"
21+
"golang.org/x/xerrors"
2122
"tailscale.com/tailcfg"
2223

2324
"github.com/coder/coder/v2/agent"
@@ -56,8 +57,7 @@ func TestServerTailnet_AgentConn_NoSTUN(t *testing.T) {
5657
defercancel()
5758

5859
// Connect through the ServerTailnet
59-
agents,serverTailnet:=setupServerTailnetAgent(t,1,
60-
tailnettest.DisableSTUN,tailnettest.DERPIsEmbedded)
60+
agents,serverTailnet:=setupServerTailnetAgent(t,1,withDERPAndStunOptions(tailnettest.DisableSTUN,tailnettest.DERPIsEmbedded))
6161
a:=agents[0]
6262

6363
conn,release,err:=serverTailnet.AgentConn(ctx,a.id)
@@ -340,7 +340,7 @@ func TestServerTailnet_ReverseProxy(t *testing.T) {
340340
ctx,cancel:=context.WithTimeout(context.Background(),testutil.WaitLong)
341341
defercancel()
342342

343-
agents,serverTailnet:=setupServerTailnetAgent(t,1,tailnettest.DisableSTUN)
343+
agents,serverTailnet:=setupServerTailnetAgent(t,1,withDERPAndStunOptions(tailnettest.DisableSTUN))
344344
a:=agents[0]
345345

346346
require.True(t,serverTailnet.Conn().GetBlockEndpoints(),"expected BlockEndpoints to be set")
@@ -365,6 +365,43 @@ func TestServerTailnet_ReverseProxy(t *testing.T) {
365365
})
366366
}
367367

368+
funcTestServerTailnet_Healthcheck(t*testing.T) {
369+
t.Parallel()
370+
371+
// Verifies that a non-nil healthcheck which returns a non-error response behaves as expected.
372+
t.Run("Passing",func(t*testing.T) {
373+
t.Parallel()
374+
375+
ctx:=testutil.Context(t,testutil.WaitMedium)
376+
fn:=func(ctx context.Context)error {returnnil }
377+
378+
agents,serverTailnet:=setupServerTailnetAgent(t,1,withHealthcheckFn(fn))
379+
380+
a:=agents[0]
381+
conn,release,err:=serverTailnet.AgentConn(ctx,a.id)
382+
t.Cleanup(release)
383+
require.NoError(t,err)
384+
assert.True(t,conn.AwaitReachable(ctx))
385+
})
386+
387+
// If the healthcheck fails, we have no insight into this at this level.
388+
// The dial against the control plane is retried, so we wait for the context to timeout as an indication that the
389+
// healthcheck is performing as expected.
390+
t.Run("Failing",func(t*testing.T) {
391+
t.Parallel()
392+
393+
ctx:=testutil.Context(t,testutil.WaitMedium)
394+
fn:=func(ctx context.Context)error {returnxerrors.Errorf("oops, db gone") }
395+
396+
agents,serverTailnet:=setupServerTailnetAgent(t,1,withHealthcheckFn(fn))
397+
398+
a:=agents[0]
399+
_,release,err:=serverTailnet.AgentConn(ctx,a.id)
400+
require.Nil(t,release)
401+
require.ErrorContains(t,err,"agent is unreachable")
402+
})
403+
}
404+
368405
typewrappedListenerstruct {
369406
net.Listener
370407
dialsint32
@@ -389,9 +426,36 @@ type agentWithID struct {
389426
agent.Agent
390427
}
391428

392-
funcsetupServerTailnetAgent(t*testing.T,agentNumint,opts...tailnettest.DERPAndStunOption) ([]agentWithID,*coderd.ServerTailnet) {
429+
typeserverOptionstruct {
430+
HealthcheckFnfunc(ctx context.Context)error
431+
DERPAndStunOptions []tailnettest.DERPAndStunOption
432+
}
433+
434+
funcwithHealthcheckFn(fnfunc(ctx context.Context)error)serverOption {
435+
returnserverOption{
436+
HealthcheckFn:fn,
437+
}
438+
}
439+
440+
funcwithDERPAndStunOptions(opts...tailnettest.DERPAndStunOption)serverOption {
441+
returnserverOption{
442+
DERPAndStunOptions:opts,
443+
}
444+
}
445+
446+
funcsetupServerTailnetAgent(t*testing.T,agentNumint,opts...serverOption) ([]agentWithID,*coderd.ServerTailnet) {
393447
logger:=testutil.Logger(t)
394-
derpMap,derpServer:=tailnettest.RunDERPAndSTUN(t,opts...)
448+
449+
varhealthcheckFnfunc(ctx context.Context)error
450+
varderpAndStunOptions []tailnettest.DERPAndStunOption
451+
for_,opt:=rangeopts {
452+
derpAndStunOptions=append(derpAndStunOptions,opt.DERPAndStunOptions...)
453+
ifopt.HealthcheckFn!=nil {
454+
healthcheckFn=opt.HealthcheckFn
455+
}
456+
}
457+
458+
derpMap,derpServer:=tailnettest.RunDERPAndSTUN(t,derpAndStunOptions...)
395459

396460
coord:=tailnet.NewCoordinator(logger)
397461
t.Cleanup(func() {
@@ -431,10 +495,11 @@ func setupServerTailnetAgent(t *testing.T, agentNum int, opts ...tailnettest.DER
431495
}
432496

433497
dialer:=&coderd.InmemTailnetDialer{
434-
CoordPtr:&coordPtr,
435-
DERPFn:func()*tailcfg.DERPMap {returnderpMap },
436-
Logger:logger,
437-
ClientID: uuid.UUID{5},
498+
CoordPtr:&coordPtr,
499+
DERPFn:func()*tailcfg.DERPMap {returnderpMap },
500+
Logger:logger,
501+
ClientID: uuid.UUID{5},
502+
DatabaseHealthcheckFn:healthcheckFn,
438503
}
439504
serverTailnet,err:=coderd.NewServerTailnet(
440505
context.Background(),

‎coderd/workspaceagents.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -995,6 +995,16 @@ func (api *API) derpMapUpdates(rw http.ResponseWriter, r *http.Request) {
995995
func (api*API)workspaceAgentClientCoordinate(rw http.ResponseWriter,r*http.Request) {
996996
ctx:=r.Context()
997997

998+
// Ensure the database is reachable before proceeding.
999+
_,err:=api.Database.Ping(ctx)
1000+
iferr!=nil {
1001+
httpapi.Write(ctx,rw,http.StatusInternalServerError, codersdk.Response{
1002+
Message:codersdk.DatabaseNotReachable,
1003+
Detail:err.Error(),
1004+
})
1005+
return
1006+
}
1007+
9981008
// This route accepts user API key auth and workspace proxy auth. The moon actor has
9991009
// full permissions so should be able to pass this authz check.
10001010
workspace:=httpmw.WorkspaceParam(r)

‎coderd/workspaceagents_test.go

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ import (
4545
"github.com/coder/coder/v2/coderd/database/dbfake"
4646
"github.com/coder/coder/v2/coderd/database/dbgen"
4747
"github.com/coder/coder/v2/coderd/database/dbmem"
48+
"github.com/coder/coder/v2/coderd/database/dbtestutil"
4849
"github.com/coder/coder/v2/coderd/database/dbtime"
4950
"github.com/coder/coder/v2/coderd/database/pubsub"
5051
"github.com/coder/coder/v2/coderd/externalauth"
@@ -55,6 +56,7 @@ import (
5556
"github.com/coder/coder/v2/codersdk"
5657
"github.com/coder/coder/v2/codersdk/agentsdk"
5758
"github.com/coder/coder/v2/codersdk/workspacesdk"
59+
"github.com/coder/coder/v2/enterprise/coderd/coderdenttest"
5860
"github.com/coder/coder/v2/provisioner/echo"
5961
"github.com/coder/coder/v2/provisionersdk/proto"
6062
"github.com/coder/coder/v2/tailnet"
@@ -495,6 +497,45 @@ func TestWorkspaceAgentConnectRPC(t *testing.T) {
495497
// Then: we should get a 401 Unauthorized response
496498
require.Equal(t,http.StatusUnauthorized,sdkErr.StatusCode())
497499
})
500+
501+
// This test validates that the tailnet controller will retry connecting to the control plane until context timeout
502+
// when the dialer fails its healthcheck.
503+
t.Run("DatabaseUnreachable",func(t*testing.T) {
504+
t.Parallel()
505+
506+
store,ps:=dbtestutil.NewDB(t)
507+
508+
// Given: a database which will fail its Ping(ctx) call.
509+
// NOTE: The Ping(ctx) call is made by the Dialer.
510+
pdb:=&pingFailingDB{
511+
Store:store,
512+
}
513+
client,user:=coderdenttest.New(t,&coderdenttest.Options{
514+
Options:&coderdtest.Options{
515+
Database:pdb,
516+
Pubsub:ps,
517+
IncludeProvisionerDaemon:true,
518+
},
519+
})
520+
521+
// When: a workspace agent is set up and we try to dial it.
522+
r:=dbfake.WorkspaceBuild(t,pdb, database.WorkspaceTable{
523+
OrganizationID:user.OrganizationID,
524+
OwnerID:user.UserID,
525+
}).WithAgent().Do()
526+
_=agenttest.New(t,client.URL,r.AgentToken)
527+
resources:=coderdtest.AwaitWorkspaceAgents(t,client,r.Workspace.ID)
528+
529+
// When: the db is marked as unhealthy (i.e. will fail its Ping).
530+
// This needs to be done *after* the server "starts" otherwise it'll fail straight away when trying to initialize.
531+
pdb.MarkUnhealthy()
532+
533+
// Then: the tailnet controller will continually try to dial the coordination endpoint, exceeding its context timeout.
534+
ctx:=testutil.Context(t,testutil.WaitMedium)
535+
conn,err:=workspacesdk.New(client).DialAgent(ctx,resources[0].Agents[0].ID,nil)
536+
require.ErrorContains(t,err,codersdk.DatabaseNotReachable)
537+
require.Nil(t,conn)
538+
})
498539
}
499540

500541
funcTestWorkspaceAgentTailnet(t*testing.T) {
@@ -2560,3 +2601,22 @@ func requireEqualOrBothNil[T any](t testing.TB, a, b *T) {
25602601
}
25612602
require.Equal(t,a,b)
25622603
}
2604+
2605+
typepingFailingDBstruct {
2606+
database.Store
2607+
2608+
unhealthybool
2609+
}
2610+
2611+
func (p*pingFailingDB)Ping(context.Context) (time.Duration,error) {
2612+
if!p.unhealthy {
2613+
returntime.Nanosecond,nil
2614+
}
2615+
2616+
// Simulate a database connection error.
2617+
return0,xerrors.New("oops")
2618+
}
2619+
2620+
func (p*pingFailingDB)MarkUnhealthy() {
2621+
p.unhealthy=true
2622+
}

‎codersdk/database.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
package codersdk
2+
3+
import"errors"
4+
5+
// DatabaseNotReachable is the message used to signal that the database could
// not be reached. It is kept as a plain string constant so it can be used as a
// response message and matched by text as well as through the sentinel error.
const DatabaseNotReachable = "database not reachable"

// ErrDatabaseNotReachable is a sentinel error whose text is
// DatabaseNotReachable.
var ErrDatabaseNotReachable = errors.New(DatabaseNotReachable)

‎codersdk/workspacesdk/dialer.go

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,17 +11,19 @@ import (
1111
"golang.org/x/xerrors"
1212

1313
"cdr.dev/slog"
14+
"github.com/coder/websocket"
15+
1416
"github.com/coder/coder/v2/buildinfo"
1517
"github.com/coder/coder/v2/codersdk"
1618
"github.com/coder/coder/v2/tailnet"
1719
"github.com/coder/coder/v2/tailnet/proto"
18-
"github.com/coder/websocket"
1920
)
2021

2122
// permanentErrorStatuses enumerates HTTP status codes treated as permanent
// errors. NOTE(review): presumably the dialer consults this to decide a
// failure is not worth retrying — confirm against the code that reads it.
var permanentErrorStatuses = []int{
	// returned if client/agent connections disabled (browser only)
	http.StatusConflict,
	// returned if API mismatch
	http.StatusBadRequest,
	// returned if user doesn't have permission or agent doesn't exist
	http.StatusNotFound,
	// returned if database is not reachable
	http.StatusInternalServerError,
}
2628

2729
typeWebsocketDialerstruct {
@@ -89,6 +91,11 @@ func (w *WebsocketDialer) Dial(ctx context.Context, r tailnet.ResumeTokenControl
8991
"Ensure your client release version (%s, different than the API version) matches the server release version",
9092
buildinfo.Version())
9193
}
94+
95+
ifsdkErr.Message==codersdk.DatabaseNotReachable&&
96+
sdkErr.StatusCode()==http.StatusInternalServerError {
97+
err=xerrors.Errorf("%s: %w",codersdk.DatabaseNotReachable,err)
98+
}
9299
}
93100
w.connected<-err
94101
return tailnet.ControlProtocolClients{},err

‎site/src/api/typesGenerated.ts

Lines changed: 3 additions & 0 deletions
Some generated files are not rendered by default. Learn more aboutcustomizing how changed files appear on GitHub.

‎tailnet/controllers.go

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,11 +21,12 @@ import (
2121
"tailscale.com/util/dnsname"
2222

2323
"cdr.dev/slog"
24+
"github.com/coder/quartz"
25+
"github.com/coder/retry"
26+
2427
"github.com/coder/coder/v2/coderd/util/ptr"
2528
"github.com/coder/coder/v2/codersdk"
2629
"github.com/coder/coder/v2/tailnet/proto"
27-
"github.com/coder/quartz"
28-
"github.com/coder/retry"
2930
)
3031

3132
// A Controller connects to the tailnet control plane, and then uses the control protocols to
@@ -1364,6 +1365,14 @@ func (c *Controller) Run(ctx context.Context) {
13641365
ifxerrors.Is(err,context.Canceled)||xerrors.Is(err,context.DeadlineExceeded) {
13651366
return
13661367
}
1368+
1369+
// If the database is unreachable by the control plane, there's not much we can do, so we'll just retry later.
1370+
ifstrings.Contains(err.Error(),codersdk.DatabaseNotReachable) {
1371+
c.logger.Warn(c.ctx,"control plane lost connection to database, retrying",
1372+
slog.Error(err),slog.F("retry_in_ms",retrier.Delay.Milliseconds()))
1373+
continue
1374+
}
1375+
13671376
errF:=slog.Error(err)
13681377
varsdkErr*codersdk.Error
13691378
ifxerrors.As(err,&sdkErr) {

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp