@@ -20,12 +20,13 @@ import (
2020"golang.org/x/xerrors"
2121
2222"cdr.dev/slog"
23+ "github.com/coder/retry"
24+
2325"github.com/coder/coder/v2/coderd/tracing"
2426"github.com/coder/coder/v2/codersdk"
2527"github.com/coder/coder/v2/provisionerd/proto"
2628"github.com/coder/coder/v2/provisionerd/runner"
2729sdkproto"github.com/coder/coder/v2/provisionersdk/proto"
28- "github.com/coder/retry"
2930)
3031
3132// Dialer represents the function to create a daemon client connection.
@@ -290,7 +291,7 @@ func (p *Server) acquireLoop() {
290291defer p .wg .Done ()
291292defer func () {close (p .acquireDoneCh ) }()
292293ctx := p .closeContext
293- for {
294+ for retrier := retry . New ( 10 * time . Millisecond , 1 * time . Second ); retrier . Wait ( ctx ); {
294295if p .acquireExit () {
295296return
296297}
@@ -299,7 +300,17 @@ func (p *Server) acquireLoop() {
299300p .opts .Logger .Debug (ctx ,"shut down before client (re) connected" )
300301return
301302}
302- p .acquireAndRunOne (client )
303+ err := p .acquireAndRunOne (client )
304+ if err != nil && ctx .Err ()== nil {// Only log if context is not done.
305+ // Short-circuit: if an exit has been requested, return now instead of sleeping through the retry delay.
306+ if p .acquireExit () {
307+ return
308+ }
309+ p .opts .Logger .Warn (ctx ,"failed to acquire job, retrying" ,slog .F ("delay" ,fmt .Sprintf ("%vms" ,retrier .Delay .Milliseconds ())),slog .Error (err ))
310+ }else {
311+ // Nothing to back off from (no error, or the context is already done): reset the retrier.
312+ retrier .Reset ()
313+ }
303314}
304315}
305316
@@ -318,7 +329,7 @@ func (p *Server) acquireExit() bool {
318329return false
319330}
320331
321- func (p * Server )acquireAndRunOne (client proto.DRPCProvisionerDaemonClient ) {
332+ func (p * Server )acquireAndRunOne (client proto.DRPCProvisionerDaemonClient )error {
322333ctx := p .closeContext
323334p .opts .Logger .Debug (ctx ,"start of acquireAndRunOne" )
324335job ,err := p .acquireGraceful (client )
@@ -327,15 +338,15 @@ func (p *Server) acquireAndRunOne(client proto.DRPCProvisionerDaemonClient) {
327338if errors .Is (err ,context .Canceled )||
328339errors .Is (err ,yamux .ErrSessionShutdown )||
329340errors .Is (err ,fasthttputil .ErrInmemoryListenerClosed ) {
330- return
341+ return err
331342}
332343
333344p .opts .Logger .Warn (ctx ,"provisionerd was unable to acquire job" ,slog .Error (err ))
334- return
345+ return xerrors . Errorf ( "failed to acquire job: %w" , err )
335346}
336347if job .JobId == "" {
337348p .opts .Logger .Debug (ctx ,"acquire job successfully canceled" )
338- return
349+ return nil
339350}
340351
341352if len (job .TraceMetadata )> 0 {
@@ -390,9 +401,9 @@ func (p *Server) acquireAndRunOne(client proto.DRPCProvisionerDaemonClient) {
390401Error :fmt .Sprintf ("failed to connect to provisioner: %s" ,resp .Error ),
391402})
392403if err != nil {
393- p .opts .Logger .Error (ctx ,"provisioner job failed" ,slog .F ("job_id" ,job .JobId ),slog .Error (err ))
404+ p .opts .Logger .Error (ctx ,"failed to report provisioner job failed" ,slog .F ("job_id" ,job .JobId ),slog .Error (err ))
394405}
395- return
406+ return xerrors . Errorf ( "failed to report provisioner job failed: %w" , err )
396407}
397408
398409p .mutex .Lock ()
@@ -416,6 +427,7 @@ func (p *Server) acquireAndRunOne(client proto.DRPCProvisionerDaemonClient) {
416427p .mutex .Lock ()
417428p .activeJob = nil
418429p .mutex .Unlock ()
430+ return nil
419431}
420432
421433// acquireGraceful attempts to acquire a job from the server, handling canceling the acquisition if we gracefully shut