Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit205c43d

Browse files
authored
fix(enterprise): mark nodes from unhealthy coordinators as lost (#13123)
Instead of removing the mappings of unhealthy coordinators entirely,mark them as lost instead. This prevents peers from disappearing fromother peers if a coordinator misses a heartbeat.
1 parenta3c23ed commit205c43d

File tree

3 files changed

+104
-15
lines changed

3 files changed

+104
-15
lines changed

‎enterprise/tailnet/pgcoord.go

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1485,10 +1485,17 @@ func (h *heartbeats) filter(mappings []mapping) []mapping {
14851485
ok:=m.coordinator==h.self
14861486
if!ok {
14871487
_,ok=h.coordinators[m.coordinator]
1488+
if!ok {
1489+
// If a mapping exists to a coordinator lost to heartbeats,
1490+
// still add the mapping as LOST. If a coordinator misses
1491+
// heartbeats but a client is still connected to it, this may be
1492+
// the only mapping available for it. Newer mappings will take
1493+
// precedence.
1494+
m.kind=proto.CoordinateResponse_PeerUpdate_LOST
1495+
}
14881496
}
1489-
ifok {
1490-
out=append(out,m)
1491-
}
1497+
1498+
out=append(out,m)
14921499
}
14931500
returnout
14941501
}

‎enterprise/tailnet/pgcoord_internal_test.go

Lines changed: 38 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ import (
1111
"time"
1212

1313
"github.com/google/uuid"
14+
"github.com/stretchr/testify/assert"
1415
"github.com/stretchr/testify/require"
1516
"go.uber.org/mock/gomock"
1617
"golang.org/x/xerrors"
@@ -33,9 +34,9 @@ import (
3334
// make update-golden-files
3435
varUpdateGoldenFiles=flag.Bool("update",false,"update .golden files")
3536

36-
//TestHeartbeat_Cleanup is internal so that we can overwrite the cleanup period and not wait an hour for the timed
37+
//TestHeartbeats_Cleanup is internal so that we can overwrite the cleanup period and not wait an hour for the timed
3738
// cleanup.
38-
funcTestHeartbeat_Cleanup(t*testing.T) {
39+
funcTestHeartbeats_Cleanup(t*testing.T) {
3940
t.Parallel()
4041

4142
ctrl:=gomock.NewController(t)
@@ -78,6 +79,41 @@ func TestHeartbeat_Cleanup(t *testing.T) {
7879
close(waitForCleanup)
7980
}
8081

82+
funcTestHeartbeats_LostCoordinator_MarkLost(t*testing.T) {
83+
t.Parallel()
84+
85+
ctrl:=gomock.NewController(t)
86+
mStore:=dbmock.NewMockStore(ctrl)
87+
88+
ctx,cancel:=context.WithTimeout(context.Background(),testutil.WaitShort)
89+
defercancel()
90+
logger:=slogtest.Make(t,nil).Leveled(slog.LevelDebug)
91+
92+
uut:=&heartbeats{
93+
ctx:ctx,
94+
logger:logger,
95+
store:mStore,
96+
cleanupPeriod:time.Millisecond,
97+
coordinators:map[uuid.UUID]time.Time{
98+
uuid.New():time.Now(),
99+
},
100+
}
101+
102+
mpngs:= []mapping{{
103+
peer:uuid.New(),
104+
coordinator:uuid.New(),
105+
updatedAt:time.Now(),
106+
node:&proto.Node{},
107+
kind:proto.CoordinateResponse_PeerUpdate_NODE,
108+
}}
109+
110+
// Filter should still return the mapping without a coordinator, but marked
111+
// as LOST.
112+
got:=uut.filter(mpngs)
113+
require.Len(t,got,1)
114+
assert.Equal(t,proto.CoordinateResponse_PeerUpdate_LOST,got[0].kind)
115+
}
116+
81117
// TestLostPeerCleanupQueries tests that our SQL queries to clean up lost peers do what we expect,
82118
// that is, clean up peers and associated tunnels that have been lost for over 24 hours.
83119
funcTestLostPeerCleanupQueries(t*testing.T) {

‎enterprise/tailnet/pgcoord_test.go

Lines changed: 56 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -415,6 +415,52 @@ func TestPGCoordinatorSingle_MissedHeartbeats(t *testing.T) {
415415
assertEventuallyLost(ctx,t,store,client.id)
416416
}
417417

418+
funcTestPGCoordinatorSingle_MissedHeartbeats_NoDrop(t*testing.T) {
419+
t.Parallel()
420+
if!dbtestutil.WillUsePostgres() {
421+
t.Skip("test only with postgres")
422+
}
423+
store,ps:=dbtestutil.NewDB(t)
424+
ctx,cancel:=context.WithTimeout(context.Background(),testutil.WaitSuperLong)
425+
defercancel()
426+
logger:=slogtest.Make(t,nil).Leveled(slog.LevelDebug)
427+
428+
coordinator,err:=tailnet.NewPGCoord(ctx,logger,ps,store)
429+
require.NoError(t,err)
430+
defercoordinator.Close()
431+
432+
agentID:=uuid.New()
433+
434+
client:=agpltest.NewPeer(ctx,t,coordinator,"client")
435+
deferclient.Close(ctx)
436+
client.AddTunnel(agentID)
437+
438+
client.UpdateDERP(11)
439+
440+
// simulate a second coordinator via DB calls only --- our goal is to test
441+
// broken heart-beating, so we can't use a real coordinator
442+
fCoord2:=&fakeCoordinator{
443+
ctx:ctx,
444+
t:t,
445+
store:store,
446+
id:uuid.New(),
447+
}
448+
// simulate a single heartbeat, the coordinator is healthy
449+
fCoord2.heartbeat()
450+
451+
fCoord2.agentNode(agentID,&agpl.Node{PreferredDERP:12})
452+
// since it's healthy the client should get the new node.
453+
client.AssertEventuallyHasDERP(agentID,12)
454+
455+
// the heartbeat should then timeout and we'll get sent a LOST update, NOT a
456+
// disconnect.
457+
client.AssertEventuallyLost(agentID)
458+
459+
client.Close(ctx)
460+
461+
assertEventuallyLost(ctx,t,store,client.ID)
462+
}
463+
418464
funcTestPGCoordinatorSingle_SendsHeartbeats(t*testing.T) {
419465
t.Parallel()
420466
if!dbtestutil.WillUsePostgres() {
@@ -857,6 +903,16 @@ func newTestAgent(t *testing.T, coord agpl.CoordinatorV1, name string, id ...uui
857903
returna
858904
}
859905

906+
funcnewTestClient(t*testing.T,coord agpl.CoordinatorV1,agentID uuid.UUID,id...uuid.UUID)*testConn {
907+
c:=newTestConn(id)
908+
gofunc() {
909+
err:=coord.ServeClient(c.serverWS,c.id,agentID)
910+
assert.NoError(t,err)
911+
close(c.closeChan)
912+
}()
913+
returnc
914+
}
915+
860916
func (c*testConn)close()error {
861917
returnc.ws.Close()
862918
}
@@ -902,16 +958,6 @@ func (c *testConn) waitForClose(ctx context.Context, t *testing.T) {
902958
}
903959
}
904960

905-
funcnewTestClient(t*testing.T,coord agpl.CoordinatorV1,agentID uuid.UUID,id...uuid.UUID)*testConn {
906-
c:=newTestConn(id)
907-
gofunc() {
908-
err:=coord.ServeClient(c.serverWS,c.id,agentID)
909-
assert.NoError(t,err)
910-
close(c.closeChan)
911-
}()
912-
returnc
913-
}
914-
915961
funcassertEventuallyHasDERPs(ctx context.Context,t*testing.T,c*testConn,expected...int) {
916962
t.Helper()
917963
for {

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp