@@ -37,6 +37,7 @@ import (
3737"github.com/coder/coder/v2/codersdk/workspacesdk"
3838"github.com/coder/coder/v2/cryptorand"
3939"github.com/coder/coder/v2/pty"
40+ "github.com/coder/quartz"
4041"github.com/coder/retry"
4142"github.com/coder/serpent"
4243)
@@ -48,6 +49,7 @@ const (
// Tunable durations for the ssh command. They are declared as variables
// rather than constants.
// NOTE(review): presumably so tests can shorten them — confirm.
var (
	// workspacePollInterval is one minute.
	// NOTE(review): the name suggests this is how often the workspace is
	// polled — confirm against its users.
	workspacePollInterval = time.Minute
	// autostopNotifyCountdown holds a single 30 minute entry.
	// NOTE(review): presumably the lead times before autostop at which the
	// user is notified — confirm against its users.
	autostopNotifyCountdown = []time.Duration{30 * time.Minute}
	// gracefulShutdownTimeout is the timeout handed to newTimeoutCloser for
	// the agent connection: how long a graceful, top-down teardown may run
	// after the context is done before the connection is closed directly.
	gracefulShutdownTimeout = 5 * time.Second
)
5254
5355func (r * RootCmd )ssh ()* serpent.Command {
@@ -250,7 +252,16 @@ func (r *RootCmd) ssh() *serpent.Command {
250252if err != nil {
251253return xerrors .Errorf ("dial agent: %w" ,err )
252254}
253- if err = stack .push ("agent conn" ,conn );err != nil {
255+ if err = stack .push (
256+ "agent conn" ,
257+ // We set a long TCP timeout on SSH connections, which means if the underlying
258+ // network fails, the SSH layer can hang for a really long time trying to send a
259+ // shutdown message for any remote forwards (https://github.com/golang/go/issues/69484)
260+ // Normally, we want to tear stuff down top to bottom, but if we get stuck doing it
261+ // that way, this timeoutCloser will trip and close the underlying connection,
262+ // bottom-up.
263+ newTimeoutCloser (ctx ,logger ,gracefulShutdownTimeout ,conn ,quartz .NewReal ()),
264+ );err != nil {
254265return err
255266}
256267conn .AwaitReachable (ctx )
@@ -1085,3 +1096,49 @@ func getUsageAppName(usageApp string) codersdk.UsageAppName {
10851096
10861097return codersdk .UsageAppNameSSH
10871098}
1099+
1100+ type timeoutCloser struct {
1101+ target io.Closer
1102+ closeCalled chan struct {}
1103+
1104+ // for testing
1105+ clock quartz.Clock
1106+ }
1107+
1108+ func newTimeoutCloser (
1109+ ctx context.Context ,logger slog.Logger ,timeout time.Duration ,target io.Closer ,clock quartz.Clock ,
1110+ )* timeoutCloser {
1111+ b := & timeoutCloser {
1112+ target :target ,
1113+ closeCalled :make (chan struct {}),
1114+ clock :clock ,
1115+ }
1116+ go b .waitForCtxOrClose (ctx ,logger ,timeout )
1117+ return b
1118+ }
1119+
1120+ func (t * timeoutCloser )waitForCtxOrClose (ctx context.Context ,logger slog.Logger ,timeout time.Duration ) {
1121+ select {
1122+ case <- t .closeCalled :
1123+ return
1124+ case <- ctx .Done ():
1125+ }
1126+ tmr := t .clock .NewTimer (timeout ,"timeoutCloser" ,"waitForCtxOrClose" )
1127+ defer tmr .Stop ()
1128+ select {
1129+ case <- t .closeCalled :
1130+ return
1131+ case <- tmr .C :
1132+ logger .Warn (ctx ,"timed out waiting for graceful shutdown" )
1133+ err := t .target .Close ()
1134+ if err != nil {
1135+ logger .Debug (ctx ,"error closing target" ,slog .Error (err ))
1136+ }
1137+ }
1138+ }
1139+
1140+ // Close should only be called at most once, e.g. in the closerStack
1141+ func (t * timeoutCloser )Close ()error {
1142+ close (t .closeCalled )
1143+ return t .target .Close ()
1144+ }