@@ -34,7 +34,8 @@ type podEventLoggerOptions struct {
3434
3535logger slog.Logger
3636logDebounce time.Duration
37- maxRetries int
37+ // maxRetries is the maximum number of retries for a log send failure.
38+ maxRetries int
3839
3940// The following fields are optional!
4041namespaces []string
@@ -414,7 +415,9 @@ type logQueuer struct {
414415loggers map [string ]agentLoggerLifecycle
415416logCache logCache
416417
417- retries map [string ]* retryState
418+ // retries maps agent tokens to their retry state for exponential backoff
419+ retries map [string ]* retryState
420+ // maxRetries is the maximum number of retries for a log send failure.
418421maxRetries int
419422}
420423
@@ -436,7 +439,7 @@ func (l *logQueuer) work(ctx context.Context) {
436439}
437440}
438441
439- func (l * logQueuer )newLogger (ctx context.Context ,log agentLog , queuedLogs []agentsdk. Log ) (agentLoggerLifecycle ,error ) {
442+ func (l * logQueuer )newLogger (ctx context.Context ,log agentLog ) (agentLoggerLifecycle ,error ) {
440443client := agentsdk .New (l .coderURL )
441444client .SetSessionToken (log .agentToken )
442445logger := l .logger .With (slog .F ("resource_name" ,log .resourceName ))
@@ -448,10 +451,9 @@ func (l *logQueuer) newLogger(ctx context.Context, log agentLog, queuedLogs []ag
448451DisplayName :"Kubernetes" ,
449452})
450453if err != nil {
451- //This shouldn't fail sending the log, as it only affects howthey
452- //appear .
454+ //Posting the log source failed, which affects howlogs appear.
455+ //We'll retry to ensure the log source is properly registered .
453456logger .Error (ctx ,"post log source" ,slog .Error (err ))
454- l .scheduleRetry (ctx ,log .agentToken )
455457return agentLoggerLifecycle {},err
456458}
457459
@@ -466,7 +468,6 @@ func (l *logQueuer) newLogger(ctx context.Context, log agentLog, queuedLogs []ag
466468if err != nil {
467469logger .Error (ctx ,"drpc connect" ,slog .Error (err ))
468470gracefulCancel ()
469- l .scheduleRetry (ctx ,log .agentToken )
470471return agentLoggerLifecycle {},err
471472}
472473go func () {
@@ -485,6 +486,8 @@ func (l *logQueuer) newLogger(ctx context.Context, log agentLog, queuedLogs []ag
485486lifecycle := agentLoggerLifecycle {
486487scriptLogger :sl ,
487488close :func () {
489+ defer arpc .DRPCConn ().Close ()
490+ defer client .SDK .HTTPClient .CloseIdleConnections ()
488491// We could be stopping for reasons other than the timeout. If
489492// so, stop the timer.
490493closeTimer .Stop ()
@@ -503,9 +506,6 @@ func (l *logQueuer) newLogger(ctx context.Context, log agentLog, queuedLogs []ag
503506// ctx err
504507logger .Warn (gracefulCtx ,"timeout reached while waiting for log queue to empty" )
505508}
506-
507- _ = arpc .DRPCConn ().Close ()
508- client .SDK .HTTPClient .CloseIdleConnections ()
509509},
510510}
511511lifecycle .closeTimer = closeTimer
@@ -533,7 +533,7 @@ func (l *logQueuer) processLog(ctx context.Context, log agentLog) {
533533}
534534
535535var err error
536- lgr ,err = l .newLogger (ctx ,log , queuedLogs )
536+ lgr ,err = l .newLogger (ctx ,log )
537537if err != nil {
538538l .scheduleRetry (ctx ,log .agentToken )
539539return
@@ -549,7 +549,7 @@ func (l *logQueuer) processLog(ctx context.Context, log agentLog) {
549549l .scheduleRetry (ctx ,log .agentToken )
550550return
551551}
552- l .clearRetry (log .agentToken )
552+ l .clearRetryLocked (log .agentToken )
553553l .logCache .delete (log .agentToken )
554554}
555555
@@ -558,9 +558,8 @@ func (l *logQueuer) processDelete(log agentLog) {
558558lgr ,ok := l .loggers [log .agentToken ]
559559if ok {
560560delete (l .loggers ,log .agentToken )
561-
562561}
563- l .clearRetry (log .agentToken )
562+ l .clearRetryLocked (log .agentToken )
564563l .logCache .delete (log .agentToken )
565564l .mu .Unlock ()
566565
@@ -598,6 +597,7 @@ type retryState struct {
598597delay time.Duration
599598timer * quartz.Timer
600599retryCount int
600+ exhausted bool // prevent retry state recreation after max retries
601601}
602602
603603func (l * logQueuer )scheduleRetry (ctx context.Context ,token string ) {
@@ -606,8 +606,13 @@ func (l *logQueuer) scheduleRetry(ctx context.Context, token string) {
606606}
607607
608608rs := l .retries [token ]
609+
610+ if rs != nil && rs .exhausted {
611+ return
612+ }
613+
609614if rs == nil {
610- rs = & retryState {delay :time .Second }
615+ rs = & retryState {delay :time .Second , retryCount : 0 , exhausted : false }
611616l .retries [token ]= rs
612617}
613618
@@ -618,7 +623,11 @@ func (l *logQueuer) scheduleRetry(ctx context.Context, token string) {
618623l .logger .Error (ctx ,"max retries exceeded" ,
619624slog .F ("retryCount" ,rs .retryCount ),
620625slog .F ("maxRetries" ,l .maxRetries ))
621- l .clearRetry (token )
626+ rs .exhausted = true
627+ if rs .timer != nil {
628+ rs .timer .Stop ()
629+ rs .timer = nil
630+ }
622631l .logCache .delete (token )
623632return
624633}
@@ -627,24 +636,18 @@ func (l *logQueuer) scheduleRetry(ctx context.Context, token string) {
627636return
628637}
629638
630- if rs .delay < time .Second {
631- rs .delay = time .Second
632- }else if rs .delay > 30 * time .Second {
633- rs .delay = 30 * time .Second
634- }
635-
636639l .logger .Info (ctx ,"scheduling retry" ,
637640slog .F ("delay" ,rs .delay .String ()),
638641slog .F ("retryCount" ,rs .retryCount ))
639642
640643rs .timer = l .clock .AfterFunc (rs .delay ,func () {
641644l .mu .Lock ()
642- if cur := l .retries [token ];cur != nil {
645+ defer l .mu .Unlock ()
646+
647+ if cur := l .retries [token ];cur != nil && ! cur .exhausted {
643648cur .timer = nil
649+ l .q <- agentLog {op :opLog ,agentToken :token }
644650}
645- l .mu .Unlock ()
646-
647- l .q <- agentLog {op :opLog ,agentToken :token }
648651})
649652
650653rs .delay *= 2
@@ -653,7 +656,9 @@ func (l *logQueuer) scheduleRetry(ctx context.Context, token string) {
653656}
654657}
655658
656- func (l * logQueuer )clearRetry (token string ) {
659+ // clearRetryLocked clears the retry state for the given token.
660+ // The caller must hold the mutex lock.
661+ func (l * logQueuer )clearRetryLocked (token string ) {
657662if rs := l .retries [token ];rs != nil {
658663if rs .timer != nil {
659664rs .timer .Stop ()