Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit374127e

Browse files
committed
fix: prevent single replica proxies from staying unhealthy (#12641)
In the peer healthcheck code, when an error pinging peers is detected wewrite a "replicaErr" string with the error reason. However, if there areno peer replicas to ping we returned early without setting the string toempty. This would cause replicas that had peers (which were failing) andthen the peers left to permanently show an error until a new peerappeared.Also demotes DERP replica checking to a "warning" rather than an "error"which should prevent the primary from removing the proxy from the regionmap if DERP meshing is non-functional. This can happen without causingproblems if the peer is shutting down so we don't want to disrupteverything if there isn't an issue.(cherry picked from commitcf50461)
1 parent793df2e commit374127e

File tree

2 files changed

+145
-30
lines changed

2 files changed

+145
-30
lines changed

‎enterprise/wsproxy/wsproxy.go‎

Lines changed: 26 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -449,8 +449,21 @@ func (s *Server) handleRegister(res wsproxysdk.RegisterWorkspaceProxyResponse) e
449449
}
450450

451451
func (s*Server)pingSiblingReplicas(replicas []codersdk.Replica) {
452+
ctx,cancel:=context.WithTimeout(s.ctx,10*time.Second)
453+
defercancel()
454+
455+
errStr:=pingSiblingReplicas(ctx,s.Logger,&s.replicaPingSingleflight,s.derpMeshTLSConfig,replicas)
456+
s.replicaErrMut.Lock()
457+
s.replicaErr=errStr
458+
s.replicaErrMut.Unlock()
459+
ifs.Options.ReplicaErrCallback!=nil {
460+
s.Options.ReplicaErrCallback(replicas,s.replicaErr)
461+
}
462+
}
463+
464+
funcpingSiblingReplicas(ctx context.Context,logger slog.Logger,sf*singleflight.Group,derpMeshTLSConfig*tls.Config,replicas []codersdk.Replica)string {
452465
iflen(replicas)==0 {
453-
return
466+
return""
454467
}
455468

456469
// Avoid pinging multiple times at once if the list hasn't changed.
@@ -462,18 +475,11 @@ func (s *Server) pingSiblingReplicas(replicas []codersdk.Replica) {
462475
singleflightStr:=strings.Join(relayURLs," ")// URLs can't contain spaces.
463476

464477
//nolint:dogsled
465-
_,_,_=s.replicaPingSingleflight.Do(singleflightStr,func() (any,error) {
466-
const (
467-
perReplicaTimeout=3*time.Second
468-
fullTimeout=10*time.Second
469-
)
470-
ctx,cancel:=context.WithTimeout(s.ctx,fullTimeout)
471-
defercancel()
472-
478+
errStrInterface,_,_:=sf.Do(singleflightStr,func() (any,error) {
473479
client:= http.Client{
474-
Timeout:perReplicaTimeout,
480+
Timeout:3*time.Second,
475481
Transport:&http.Transport{
476-
TLSClientConfig:s.derpMeshTLSConfig,
482+
TLSClientConfig:derpMeshTLSConfig,
477483
DisableKeepAlives:true,
478484
},
479485
}
@@ -485,7 +491,7 @@ func (s *Server) pingSiblingReplicas(replicas []codersdk.Replica) {
485491
err:=replicasync.PingPeerReplica(ctx,client,peer.RelayAddress)
486492
iferr!=nil {
487493
errs<-xerrors.Errorf("ping sibling replica %s (%s): %w",peer.Hostname,peer.RelayAddress,err)
488-
s.Logger.Warn(ctx,"failed to ping sibling replica, this could happen if the replica has shutdown",
494+
logger.Warn(ctx,"failed to ping sibling replica, this could happen if the replica has shutdown",
489495
slog.F("replica_hostname",peer.Hostname),
490496
slog.F("replica_relay_address",peer.RelayAddress),
491497
slog.Error(err),
@@ -504,20 +510,14 @@ func (s *Server) pingSiblingReplicas(replicas []codersdk.Replica) {
504510
}
505511
}
506512

507-
s.replicaErrMut.Lock()
508-
defers.replicaErrMut.Unlock()
509-
s.replicaErr=""
510-
iflen(replicaErrs)>0 {
511-
s.replicaErr=fmt.Sprintf("Failed to dial peers: %s",strings.Join(replicaErrs,", "))
512-
}
513-
ifs.Options.ReplicaErrCallback!=nil {
514-
s.Options.ReplicaErrCallback(replicas,s.replicaErr)
513+
iflen(replicaErrs)==0 {
514+
return"",nil
515515
}
516-
517-
//nolint:nilnil // we don't actually use the return value of the
518-
// singleflight here
519-
returnnil,nil
516+
returnfmt.Sprintf("Failed to dial peers: %s",strings.Join(replicaErrs,", ")),nil
520517
})
518+
519+
//nolint:forcetypeassert
520+
returnerrStrInterface.(string)
521521
}
522522

523523
func (s*Server)handleRegisterFailure(errerror) {
@@ -590,7 +590,8 @@ func (s *Server) healthReport(rw http.ResponseWriter, r *http.Request) {
590590

591591
s.replicaErrMut.Lock()
592592
ifs.replicaErr!="" {
593-
report.Errors=append(report.Errors,"High availability networking: it appears you are running more than one replica of the proxy, but the replicas are unable to establish a mesh for networking: "+s.replicaErr)
593+
report.Warnings=append(report.Warnings,
594+
"High availability networking: it appears you are running more than one replica of the proxy, but the replicas are unable to establish a mesh for networking: "+s.replicaErr)
594595
}
595596
s.replicaErrMut.Unlock()
596597

‎enterprise/wsproxy/wsproxy_test.go‎

Lines changed: 119 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -563,7 +563,7 @@ func TestWorkspaceProxyDERPMeshProbe(t *testing.T) {
563563
returnproxyRes
564564
}
565565

566-
registerBrokenProxy:=func(ctx context.Context,t*testing.T,primaryAccessURL*url.URL,accessURL,tokenstring) {
566+
registerBrokenProxy:=func(ctx context.Context,t*testing.T,primaryAccessURL*url.URL,accessURL,tokenstring)uuid.UUID{
567567
t.Helper()
568568
// Create a HTTP server that always replies with 500.
569569
srv:=httptest.NewServer(http.HandlerFunc(func(rw http.ResponseWriter,r*http.Request) {
@@ -574,21 +574,23 @@ func TestWorkspaceProxyDERPMeshProbe(t *testing.T) {
574574
// Register a proxy.
575575
wsproxyClient:=wsproxysdk.New(primaryAccessURL)
576576
wsproxyClient.SetSessionToken(token)
577-
578577
hostname,err:=cryptorand.String(6)
579578
require.NoError(t,err)
579+
replicaID:=uuid.New()
580580
_,err=wsproxyClient.RegisterWorkspaceProxy(ctx, wsproxysdk.RegisterWorkspaceProxyRequest{
581581
AccessURL:accessURL,
582582
WildcardHostname:"",
583583
DerpEnabled:true,
584584
DerpOnly:false,
585-
ReplicaID:uuid.New(),
585+
ReplicaID:replicaID,
586586
ReplicaHostname:hostname,
587587
ReplicaError:"",
588588
ReplicaRelayAddress:srv.URL,
589589
Version:buildinfo.Version(),
590590
})
591591
require.NoError(t,err)
592+
593+
returnreplicaID
592594
}
593595

594596
t.Run("ProbeOK",func(t*testing.T) {
@@ -781,8 +783,120 @@ func TestWorkspaceProxyDERPMeshProbe(t *testing.T) {
781783
resp.Body.Close()
782784
require.NoError(t,err)
783785

784-
require.Len(t,respJSON.Errors,1,"proxy is healthy")
785-
require.Contains(t,respJSON.Errors[0],"High availability networking")
786+
require.Len(t,respJSON.Warnings,1,"proxy is healthy")
787+
require.Contains(t,respJSON.Warnings[0],"High availability networking")
788+
})
789+
790+
// This test catches a regression we detected on dogfood which caused
791+
// proxies to remain unhealthy after a mesh failure if they dropped to zero
792+
// siblings after the failure.
793+
t.Run("HealthyZero",func(t*testing.T) {
794+
t.Parallel()
795+
796+
deploymentValues:=coderdtest.DeploymentValues(t)
797+
deploymentValues.Experiments= []string{
798+
"*",
799+
}
800+
801+
client,closer,api,_:=coderdenttest.NewWithAPI(t,&coderdenttest.Options{
802+
Options:&coderdtest.Options{
803+
DeploymentValues:deploymentValues,
804+
AppHostname:"*.primary.test.coder.com",
805+
IncludeProvisionerDaemon:true,
806+
RealIPConfig:&httpmw.RealIPConfig{
807+
TrustedOrigins: []*net.IPNet{{
808+
IP:net.ParseIP("127.0.0.1"),
809+
Mask:net.CIDRMask(8,32),
810+
}},
811+
TrustedHeaders: []string{
812+
"CF-Connecting-IP",
813+
},
814+
},
815+
},
816+
LicenseOptions:&coderdenttest.LicenseOptions{
817+
Features: license.Features{
818+
codersdk.FeatureWorkspaceProxy:1,
819+
},
820+
},
821+
})
822+
t.Cleanup(func() {
823+
_=closer.Close()
824+
})
825+
826+
proxyURL,err:=url.Parse("https://proxy2.test.coder.com")
827+
require.NoError(t,err)
828+
829+
// Create 1 real proxy replica.
830+
replicaPingErr:=make(chanstring,4)
831+
proxy:=coderdenttest.NewWorkspaceProxyReplica(t,api,client,&coderdenttest.ProxyOptions{
832+
Name:"proxy-2",
833+
ProxyURL:proxyURL,
834+
ReplicaPingCallback:func(replicas []codersdk.Replica,errstring) {
835+
replicaPingErr<-err
836+
},
837+
})
838+
839+
ctx:=testutil.Context(t,testutil.WaitLong)
840+
otherReplicaID:=registerBrokenProxy(ctx,t,api.AccessURL,proxyURL.String(),proxy.Options.ProxySessionToken)
841+
842+
// Force the proxy to re-register immediately.
843+
err=proxy.RegisterNow()
844+
require.NoError(t,err,"failed to force proxy to re-register")
845+
846+
// Wait for the ping to fail.
847+
for {
848+
replicaErr:=testutil.RequireRecvCtx(ctx,t,replicaPingErr)
849+
t.Log("replica ping error:",replicaErr)
850+
ifreplicaErr!="" {
851+
break
852+
}
853+
}
854+
855+
// GET /healthz-report
856+
u:=proxy.ServerURL.ResolveReference(&url.URL{Path:"/healthz-report"})
857+
req,err:=http.NewRequestWithContext(ctx,http.MethodGet,u.String(),nil)
858+
require.NoError(t,err)
859+
resp,err:=http.DefaultClient.Do(req)
860+
require.NoError(t,err)
861+
varrespJSON codersdk.ProxyHealthReport
862+
err=json.NewDecoder(resp.Body).Decode(&respJSON)
863+
resp.Body.Close()
864+
require.NoError(t,err)
865+
require.Len(t,respJSON.Warnings,1,"proxy is healthy")
866+
require.Contains(t,respJSON.Warnings[0],"High availability networking")
867+
868+
// Deregister the other replica.
869+
wsproxyClient:=wsproxysdk.New(api.AccessURL)
870+
wsproxyClient.SetSessionToken(proxy.Options.ProxySessionToken)
871+
err=wsproxyClient.DeregisterWorkspaceProxy(ctx, wsproxysdk.DeregisterWorkspaceProxyRequest{
872+
ReplicaID:otherReplicaID,
873+
})
874+
require.NoError(t,err)
875+
876+
// Force the proxy to re-register immediately.
877+
err=proxy.RegisterNow()
878+
require.NoError(t,err,"failed to force proxy to re-register")
879+
880+
// Wait for the ping to be skipped.
881+
for {
882+
replicaErr:=testutil.RequireRecvCtx(ctx,t,replicaPingErr)
883+
t.Log("replica ping error:",replicaErr)
884+
// Should be empty because there are no more peers. This was where
885+
// the regression was.
886+
ifreplicaErr=="" {
887+
break
888+
}
889+
}
890+
891+
// GET /healthz-report
892+
req,err=http.NewRequestWithContext(ctx,http.MethodGet,u.String(),nil)
893+
require.NoError(t,err)
894+
resp,err=http.DefaultClient.Do(req)
895+
require.NoError(t,err)
896+
err=json.NewDecoder(resp.Body).Decode(&respJSON)
897+
resp.Body.Close()
898+
require.NoError(t,err)
899+
require.Len(t,respJSON.Warnings,0,"proxy is unhealthy")
786900
})
787901
}
788902

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp