@@ -62,7 +62,10 @@ const (
62
62
63
63
const (
	// EnvProcPrioMgmt determines whether we attempt to manage
	// process CPU and OOM Killer priority.
	EnvProcPrioMgmt = "CODER_PROC_PRIO_MGMT"
	// EnvProcOOMScore optionally specifies the oom_score_adj to apply to
	// managed processes. The value must be an integer in [-1000, 1000];
	// anything else is logged as invalid and ignored.
	EnvProcOOMScore = "CODER_PROC_OOM_SCORE"
)
66
69
67
70
type Options struct {
68
71
Filesystem afero.Fs
@@ -1575,10 +1578,31 @@ func (a *agent) manageProcessPriorityUntilGracefulShutdown() {
1575
1578
a .processManagementTick = ticker .C
1576
1579
}
1577
1580
1581
+ oomScore := unsetOOMScore
1582
+ if scoreStr ,ok := a .environmentVariables [EnvProcOOMScore ];ok {
1583
+ score ,err := strconv .Atoi (strings .TrimSpace (scoreStr ))
1584
+ if err == nil && score >= - 1000 && score <= 1000 {
1585
+ oomScore = score
1586
+ }else {
1587
+ a .logger .Error (ctx ,"invalid oom score" ,
1588
+ slog .F ("min_value" ,- 1000 ),
1589
+ slog .F ("max_value" ,1000 ),
1590
+ slog .F ("value" ,scoreStr ),
1591
+ )
1592
+ }
1593
+ }
1594
+
1595
+ debouncer := & logDebouncer {
1596
+ logger :a .logger ,
1597
+ messages :map [string ]time.Time {},
1598
+ interval :time .Minute ,
1599
+ }
1600
+
1578
1601
for {
1579
- procs ,err := a .manageProcessPriority (ctx )
1602
+ procs ,err := a .manageProcessPriority (ctx ,debouncer ,oomScore )
1603
+ // Avoid spamming the logs too often.
1580
1604
if err != nil {
1581
- a . logger .Error (ctx ,"manage process priority" ,
1605
+ debouncer .Error (ctx ,"manage process priority" ,
1582
1606
slog .Error (err ),
1583
1607
)
1584
1608
}
@@ -1594,42 +1618,51 @@ func (a *agent) manageProcessPriorityUntilGracefulShutdown() {
1594
1618
}
1595
1619
}
1596
1620
1597
- func (a * agent )manageProcessPriority (ctx context.Context ) ([]* agentproc.Process ,error ) {
1621
// unsetOOMScore is a sentinel meaning "no OOM score has been chosen". It is
// one past the largest score accepted as valid (1000), so it can never
// collide with a real oom_score_adj value.
const unsetOOMScore = 1000 + 1
1623
+
1624
+ func (a * agent )manageProcessPriority (ctx context.Context ,debouncer * logDebouncer ,oomScore int ) ([]* agentproc.Process ,error ) {
1598
1625
const (
1599
1626
niceness = 10
1600
1627
)
1601
1628
1629
+ // We fetch the agent score each time because it's possible someone updates the
1630
+ // value after it is started.
1631
+ agentScore ,err := a .getAgentOOMScore ()
1632
+ if err != nil {
1633
+ agentScore = unsetOOMScore
1634
+ }
1635
+ if oomScore == unsetOOMScore && agentScore != unsetOOMScore {
1636
+ // If the child score has not been explicitly specified we should
1637
+ // set it to a score relative to the agent score.
1638
+ oomScore = childOOMScore (agentScore )
1639
+ }
1640
+
1602
1641
procs ,err := agentproc .List (a .filesystem ,a .syscaller )
1603
1642
if err != nil {
1604
1643
return nil ,xerrors .Errorf ("list: %w" ,err )
1605
1644
}
1606
1645
1607
- var (
1608
- modProcs = []* agentproc.Process {}
1609
- logger slog.Logger
1610
- )
1646
+ modProcs := []* agentproc.Process {}
1611
1647
1612
1648
for _ ,proc := range procs {
1613
- logger = a .logger .With (
1614
- slog .F ("cmd" ,proc .Cmd ()),
1615
- slog .F ("pid" ,proc .PID ),
1616
- )
1617
-
1618
1649
containsFn := func (e string )bool {
1619
1650
contains := strings .Contains (proc .Cmd (),e )
1620
1651
return contains
1621
1652
}
1622
1653
1623
1654
// If the process is prioritized we should adjust
1624
1655
// its oom_score_adj and avoid lowering its niceness.
1625
- if slices .ContainsFunc [[] string , string ] (prioritizedProcs ,containsFn ) {
1656
+ if slices .ContainsFunc (prioritizedProcs ,containsFn ) {
1626
1657
continue
1627
1658
}
1628
1659
1629
- score ,err := proc .Niceness (a .syscaller )
1630
- if err != nil {
1631
- logger .Warn (ctx ,"unable to get proc niceness" ,
1632
- slog .Error (err ),
1660
+ score ,niceErr := proc .Niceness (a .syscaller )
1661
+ if niceErr != nil && ! xerrors .Is (niceErr ,os .ErrPermission ) {
1662
+ debouncer .Warn (ctx ,"unable to get proc niceness" ,
1663
+ slog .F ("cmd" ,proc .Cmd ()),
1664
+ slog .F ("pid" ,proc .PID ),
1665
+ slog .Error (niceErr ),
1633
1666
)
1634
1667
continue
1635
1668
}
@@ -1643,15 +1676,31 @@ func (a *agent) manageProcessPriority(ctx context.Context) ([]*agentproc.Process
1643
1676
continue
1644
1677
}
1645
1678
1646
- err = proc .SetNiceness (a .syscaller ,niceness )
1647
- if err != nil {
1648
- logger .Warn (ctx ,"unable to set proc niceness" ,
1649
- slog .F ("niceness" ,niceness ),
1650
- slog .Error (err ),
1651
- )
1652
- continue
1679
+ if niceErr == nil {
1680
+ err := proc .SetNiceness (a .syscaller ,niceness )
1681
+ if err != nil && ! xerrors .Is (err ,os .ErrPermission ) {
1682
+ debouncer .Warn (ctx ,"unable to set proc niceness" ,
1683
+ slog .F ("cmd" ,proc .Cmd ()),
1684
+ slog .F ("pid" ,proc .PID ),
1685
+ slog .F ("niceness" ,niceness ),
1686
+ slog .Error (err ),
1687
+ )
1688
+ }
1653
1689
}
1654
1690
1691
+ // If the oom score is valid and it's not already set and isn't a custom value set by another process then it's ok to update it.
1692
+ if oomScore != unsetOOMScore && oomScore != proc .OOMScoreAdj && ! isCustomOOMScore (agentScore ,proc ) {
1693
+ oomScoreStr := strconv .Itoa (oomScore )
1694
+ err := afero .WriteFile (a .filesystem ,fmt .Sprintf ("/proc/%d/oom_score_adj" ,proc .PID ), []byte (oomScoreStr ),0o644 )
1695
+ if err != nil && ! xerrors .Is (err ,os .ErrPermission ) {
1696
+ debouncer .Warn (ctx ,"unable to set oom_score_adj" ,
1697
+ slog .F ("cmd" ,proc .Cmd ()),
1698
+ slog .F ("pid" ,proc .PID ),
1699
+ slog .F ("score" ,oomScoreStr ),
1700
+ slog .Error (err ),
1701
+ )
1702
+ }
1703
+ }
1655
1704
modProcs = append (modProcs ,proc )
1656
1705
}
1657
1706
return modProcs ,nil
@@ -2005,3 +2054,77 @@ func PrometheusMetricsHandler(prometheusRegistry *prometheus.Registry, logger sl
2005
2054
}
2006
2055
})
2007
2056
}
2057
+
2058
// childOOMScore derives the oom_score_adj to assign to a child process from
// the oom_score_adj of the agent process:
//
//   - agent score below 0      -> 0, so the child is treated like every
//     other process;
//   - agent score 998 or above -> 1000, the maximum;
//   - anything else            -> 998, slightly below the maximum.
//
// Users who want a different score are expected to set it directly.
func childOOMScore(agentScore int) int {
	switch {
	case agentScore < 0:
		return 0
	case agentScore >= 998:
		// The agent is already almost at the maximum; use the maximum.
		return 1000
	default:
		return 998
	}
}
2077
+
2078
+ func (a * agent )getAgentOOMScore () (int ,error ) {
2079
+ scoreStr ,err := afero .ReadFile (a .filesystem ,"/proc/self/oom_score_adj" )
2080
+ if err != nil {
2081
+ return 0 ,xerrors .Errorf ("read file: %w" ,err )
2082
+ }
2083
+
2084
+ score ,err := strconv .Atoi (strings .TrimSpace (string (scoreStr )))
2085
+ if err != nil {
2086
+ return 0 ,xerrors .Errorf ("parse int: %w" ,err )
2087
+ }
2088
+
2089
+ return score ,nil
2090
+ }
2091
+
2092
+ // isCustomOOMScore checks to see if the oom_score_adj is not a value that would
2093
+ // originate from an agent-spawned process.
2094
+ func isCustomOOMScore (agentScore int ,process * agentproc.Process )bool {
2095
+ score := process .OOMScoreAdj
2096
+ return agentScore != score && score != 1000 && score != 0 && score != 998
2097
+ }
2098
+
2099
+ // logDebouncer skips writing a log for a particular message if
2100
+ // it's been emitted within the given interval duration.
2101
+ // It's a shoddy implementation used in one spot that should be replaced at
2102
+ // some point.
2103
+ type logDebouncer struct {
2104
+ logger slog.Logger
2105
+ messages map [string ]time.Time
2106
+ interval time.Duration
2107
+ }
2108
+
2109
+ func (l * logDebouncer )Warn (ctx context.Context ,msg string ,fields ... any ) {
2110
+ l .log (ctx ,slog .LevelWarn ,msg ,fields ... )
2111
+ }
2112
+
2113
+ func (l * logDebouncer )Error (ctx context.Context ,msg string ,fields ... any ) {
2114
+ l .log (ctx ,slog .LevelError ,msg ,fields ... )
2115
+ }
2116
+
2117
+ func (l * logDebouncer )log (ctx context.Context ,level slog.Level ,msg string ,fields ... any ) {
2118
+ // This (bad) implementation assumes you wouldn't reuse the same msg
2119
+ // for different levels.
2120
+ if last ,ok := l .messages [msg ];ok && time .Since (last )< l .interval {
2121
+ return
2122
+ }
2123
+ switch level {
2124
+ case slog .LevelWarn :
2125
+ l .logger .Warn (ctx ,msg ,fields ... )
2126
+ case slog .LevelError :
2127
+ l .logger .Error (ctx ,msg ,fields ... )
2128
+ }
2129
+ l .messages [msg ]= time .Now ()
2130
+ }