Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commitc1816e3

Browse files
authored
fix(agent): fix deadlock if closed while starting listeners (#17329)
fixes#17328Fixes a deadlock if we close the Agent in the middle of starting listeners on the tailnet.
1 parent8faaa14 commitc1816e3

File tree

2 files changed

+78
-19
lines changed

2 files changed

+78
-19
lines changed

‎agent/agent.go‎

Lines changed: 30 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -229,13 +229,21 @@ type agent struct {
229229
// we track 2 contexts and associated cancel functions: "graceful" which is Done when it is time
230230
// to start gracefully shutting down and "hard" which is Done when it is time to close
231231
// everything down (regardless of whether graceful shutdown completed).
232-
gracefulCtx context.Context
233-
gracefulCancel context.CancelFunc
234-
hardCtx context.Context
235-
hardCancel context.CancelFunc
236-
closeWaitGroup sync.WaitGroup
232+
gracefulCtx context.Context
233+
gracefulCancel context.CancelFunc
234+
hardCtx context.Context
235+
hardCancel context.CancelFunc
236+
237+
// closeMutex protects the following:
237238
closeMutex sync.Mutex
239+
closeWaitGroup sync.WaitGroup
238240
coordDisconnectedchanstruct{}
241+
closingbool
242+
// note that once the network is set to non-nil, it is never modified, as with the statsReporter. So, routines
243+
// that run after createOrUpdateNetwork and check the networkOK checkpoint do not need to hold the lock to use them.
244+
network*tailnet.Conn
245+
statsReporter*statsReporter
246+
// end fields protected by closeMutex
239247

240248
environmentVariablesmap[string]string
241249

@@ -259,9 +267,7 @@ type agent struct {
259267
reportConnectionsMu sync.Mutex
260268
reportConnections []*proto.ReportConnectionRequest
261269

262-
network*tailnet.Conn
263-
statsReporter*statsReporter
264-
logSender*agentsdk.LogSender
270+
logSender*agentsdk.LogSender
265271

266272
prometheusRegistry*prometheus.Registry
267273
// metrics are prometheus registered metrics that will be collected and
@@ -274,6 +280,8 @@ type agent struct {
274280
}
275281

276282
func (a*agent)TailnetConn()*tailnet.Conn {
283+
a.closeMutex.Lock()
284+
defera.closeMutex.Unlock()
277285
returna.network
278286
}
279287

@@ -1205,15 +1213,15 @@ func (a *agent) createOrUpdateNetwork(manifestOK, networkOK *checkpoint) func(co
12051213
}
12061214
a.closeMutex.Lock()
12071215
// Re-check if agent was closed while initializing the network.
1208-
closed:=a.isClosed()
1209-
if!closed {
1216+
closing:=a.closing
1217+
if!closing {
12101218
a.network=network
12111219
a.statsReporter=newStatsReporter(a.logger,network,a)
12121220
}
12131221
a.closeMutex.Unlock()
1214-
ifclosed {
1222+
ifclosing {
12151223
_=network.Close()
1216-
returnxerrors.New("agent isclosed")
1224+
returnxerrors.New("agent isclosing")
12171225
}
12181226
}else {
12191227
// Update the wireguard IPs if the agent ID changed.
@@ -1328,8 +1336,8 @@ func (*agent) wireguardAddresses(agentID uuid.UUID) []netip.Prefix {
13281336
func (a*agent)trackGoroutine(fnfunc())error {
13291337
a.closeMutex.Lock()
13301338
defera.closeMutex.Unlock()
1331-
ifa.isClosed() {
1332-
returnxerrors.New("track conn goroutine: agent isclosed")
1339+
ifa.closing {
1340+
returnxerrors.New("track conn goroutine: agent isclosing")
13331341
}
13341342
a.closeWaitGroup.Add(1)
13351343
gofunc() {
@@ -1547,7 +1555,7 @@ func (a *agent) runCoordinator(ctx context.Context, tClient tailnetproto.DRPCTai
15471555
func (a*agent)setCoordDisconnected()chanstruct{} {
15481556
a.closeMutex.Lock()
15491557
defera.closeMutex.Unlock()
1550-
ifa.isClosed() {
1558+
ifa.closing {
15511559
returnnil
15521560
}
15531561
disconnected:=make(chanstruct{})
@@ -1772,7 +1780,10 @@ func (a *agent) HTTPDebug() http.Handler {
17721780

17731781
func (a*agent)Close()error {
17741782
a.closeMutex.Lock()
1775-
defera.closeMutex.Unlock()
1783+
network:=a.network
1784+
coordDisconnected:=a.coordDisconnected
1785+
a.closing=true
1786+
a.closeMutex.Unlock()
17761787
ifa.isClosed() {
17771788
returnnil
17781789
}
@@ -1849,7 +1860,7 @@ lifecycleWaitLoop:
18491860
select {
18501861
case<-a.hardCtx.Done():
18511862
a.logger.Warn(context.Background(),"timed out waiting for Coordinator RPC disconnect")
1852-
case<-a.coordDisconnected:
1863+
case<-coordDisconnected:
18531864
a.logger.Debug(context.Background(),"coordinator RPC disconnected")
18541865
}
18551866

@@ -1860,8 +1871,8 @@ lifecycleWaitLoop:
18601871
}
18611872

18621873
a.hardCancel()
1863-
ifa.network!=nil {
1864-
_=a.network.Close()
1874+
ifnetwork!=nil {
1875+
_=network.Close()
18651876
}
18661877
a.closeWaitGroup.Wait()
18671878

‎agent/agent_test.go‎

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,54 @@ func TestMain(m *testing.M) {
6868

6969
varsshPorts= []uint16{workspacesdk.AgentSSHPort,workspacesdk.AgentStandardSSHPort}
7070

71+
// TestAgent_CloseWhileStarting is a regression test for https://github.com/coder/coder/issues/17328
72+
funcTestAgent_ImmediateClose(t*testing.T) {
73+
t.Parallel()
74+
75+
ctx,cancel:=context.WithTimeout(context.Background(),testutil.WaitLong)
76+
defercancel()
77+
78+
logger:=slogtest.Make(t,&slogtest.Options{
79+
// Agent can drop errors when shutting down, and some, like the
80+
// fasthttplistener connection closed error, are unexported.
81+
IgnoreErrors:true,
82+
}).Leveled(slog.LevelDebug)
83+
manifest:= agentsdk.Manifest{
84+
AgentID:uuid.New(),
85+
AgentName:"test-agent",
86+
WorkspaceName:"test-workspace",
87+
WorkspaceID:uuid.New(),
88+
}
89+
90+
coordinator:=tailnet.NewCoordinator(logger)
91+
t.Cleanup(func() {
92+
_=coordinator.Close()
93+
})
94+
statsCh:=make(chan*proto.Stats,50)
95+
fs:=afero.NewMemMapFs()
96+
client:=agenttest.NewClient(t,logger.Named("agenttest"),manifest.AgentID,manifest,statsCh,coordinator)
97+
t.Cleanup(client.Close)
98+
99+
options:= agent.Options{
100+
Client:client,
101+
Filesystem:fs,
102+
Logger:logger.Named("agent"),
103+
ReconnectingPTYTimeout:0,
104+
EnvironmentVariables:map[string]string{},
105+
}
106+
107+
agentUnderTest:=agent.New(options)
108+
t.Cleanup(func() {
109+
_=agentUnderTest.Close()
110+
})
111+
112+
// wait until the agent has connected and is starting to find races in the startup code
113+
_=testutil.RequireRecvCtx(ctx,t,client.GetStartup())
114+
t.Log("Closing Agent")
115+
err:=agentUnderTest.Close()
116+
require.NoError(t,err)
117+
}
118+
71119
// NOTE: These tests only work when your default shell is bash for some reason.
72120

73121
funcTestAgent_Stats_SSH(t*testing.T) {

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp