- Notifications
You must be signed in to change notification settings - Fork 929
feat: make agent stats' cardinality configurable #12468
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Uh oh!
There was an error while loading. Please reload this page.
Changes from 12 commits
1172e09
ddd563e
6a1ab6e
ce0c22d
25fd616
cc1a0b0
122f68d
3e569ff
62e2624
6544d2d
5e89d05
ae8a912
023f7d4
9aedd97
92be1d6
9b16a3b
6c7d1bd
3538e78
5a97817
c861500
6bcfe99
765fe9d
37c3628
f1d2821
File filter
Filter by extension
Conversations
Uh oh!
There was an error while loading. Please reload this page.
Jump to
Uh oh!
There was an error while loading. Please reload this page.
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
package agentmetrics | ||
// Names of the labels that agent metrics can be annotated with. | ||
const ( | ||
// TemplateNameLabel is the label name carrying the template name. | ||
TemplateNameLabel = "template_name" | ||
dannykopping marked this conversation as resolved. Show resolvedHide resolvedUh oh!There was an error while loading.Please reload this page. | ||
// AgentNameLabel is the label name carrying the agent name. | ||
AgentNameLabel = "agent_name" | ||
// UsernameLabel is the label name carrying the username. | ||
UsernameLabel = "username" | ||
// WorkspaceNameLabel is the label name carrying the workspace name. | ||
WorkspaceNameLabel = "workspace_name" | ||
) |
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Uh oh!
There was an error while loading. Please reload this page.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Uh oh!
There was an error while loading. Please reload this page.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -8,8 +8,11 @@ import ( | ||
"time" | ||
"github.com/prometheus/client_golang/prometheus" | ||
"github.com/prometheus/common/model" | ||
"golang.org/x/xerrors" | ||
"github.com/coder/coder/v2/coderd/agentmetrics" | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Unfortunately our formatter doesn't handle merging import groups and leaves things in a messy state (depending on what program injected them). 😔 If you notice these, please feel free to fix, but the standard is we try our best but sometimes these slip through, so don't worry too much. | ||
"cdr.dev/slog" | ||
agentproto "github.com/coder/coder/v2/agent/proto" | ||
@@ -43,9 +46,10 @@ type MetricsAggregator struct { | ||
collectCh chan (chan []prometheus.Metric) | ||
updateCh chan updateRequest | ||
storeSizeGauge prometheus.Gauge | ||
updateHistogram prometheus.Histogram | ||
cleanupHistogram prometheus.Histogram | ||
aggregateByLabels []string | ||
} | ||
type updateRequest struct { | ||
@@ -68,6 +72,8 @@ type annotatedMetric struct { | ||
templateName string | ||
expiryDate time.Time | ||
aggregateByLabels []string | ||
} | ||
type metricKey struct { | ||
@@ -102,13 +108,28 @@ func hashKey(req *updateRequest, m *agentproto.Stats_Metric) metricKey { | ||
var _ prometheus.Collector = new(MetricsAggregator) | ||
func (am *annotatedMetric) asPrometheus() (prometheus.Metric, error) { | ||
var ( | ||
baseLabelNames = am.aggregateByLabels | ||
baseLabelValues []string | ||
extraLabels = am.Labels | ||
) | ||
for _, label := range am.aggregateByLabels { | ||
dannykopping marked this conversation as resolved. Show resolved Hide resolved Uh oh! There was an error while loading. Please reload this page. | ||
val, err := am.getFieldByLabel(label) | ||
if err != nil { | ||
return nil, err | ||
} | ||
baseLabelValues = append(baseLabelValues, val) | ||
} | ||
labels := make([]string, 0, len(baseLabelNames)+len(extraLabels)) | ||
labelValues := make([]string, 0, len(baseLabelNames)+len(extraLabels)) | ||
labels = append(labels, baseLabelNames...) | ||
labelValues = append(labelValues, baseLabelValues...) | ||
for _, l := range extraLabels { | ||
labels = append(labels, l.Name) | ||
labelValues = append(labelValues, l.Value) | ||
} | ||
@@ -118,10 +139,48 @@ func (am *annotatedMetric) asPrometheus() (prometheus.Metric, error) { | ||
if err != nil { | ||
return nil, err | ||
} | ||
return prometheus.MustNewConstMetric(desc, valueType, am.Value, labelValues...), nil | ||
} | ||
// getFieldByLabel returns the related field value for a given label | ||
func (am *annotatedMetric) getFieldByLabel(label string) (string, error) { | ||
var labelVal string | ||
switch label { | ||
case agentmetrics.WorkspaceNameLabel: | ||
labelVal = am.workspaceName | ||
case agentmetrics.TemplateNameLabel: | ||
labelVal = am.templateName | ||
case agentmetrics.AgentNameLabel: | ||
labelVal = am.agentName | ||
case agentmetrics.UsernameLabel: | ||
labelVal = am.username | ||
default: | ||
return "", xerrors.Errorf("unexpected label: %q", label) | ||
} | ||
return labelVal, nil | ||
} | ||
func (am *annotatedMetric) clone() annotatedMetric { | ||
dannykopping marked this conversation as resolved. Show resolvedHide resolvedUh oh!There was an error while loading.Please reload this page. | ||
stats := &agentproto.Stats_Metric{ | ||
Name: am.Name, | ||
Type: am.Type, | ||
Value: am.Value, | ||
Labels: am.Labels, | ||
} | ||
return annotatedMetric{ | ||
Stats_Metric: stats, | ||
username: am.username, | ||
workspaceName: am.workspaceName, | ||
agentName: am.agentName, | ||
templateName: am.templateName, | ||
expiryDate: am.expiryDate, | ||
} | ||
} | ||
func NewMetricsAggregator(logger slog.Logger, registerer prometheus.Registerer, duration time.Duration, aggregateByLabels []string) (*MetricsAggregator, error) { | ||
metricsCleanupInterval := defaultMetricsCleanupInterval | ||
if duration > 0 { | ||
metricsCleanupInterval = duration | ||
@@ -174,9 +233,66 @@ func NewMetricsAggregator(logger slog.Logger, registerer prometheus.Registerer, | ||
storeSizeGauge: storeSizeGauge, | ||
updateHistogram: updateHistogram, | ||
cleanupHistogram: cleanupHistogram, | ||
aggregateByLabels: aggregateByLabels, | ||
}, nil | ||
} | ||
// labelAggregator is used to control cardinality of collected Prometheus metrics by pre-aggregating series based on given labels. | ||
type labelAggregator struct { | ||
// aggregations holds the running sum of metric values, keyed by metric name plus label-set fingerprint. | ||
aggregations map[string]float64 | ||
// metrics holds one annotatedMetric per key, carrying the summed value and the labels it was aggregated by. | ||
metrics map[string]annotatedMetric | ||
} | ||
func newLabelAggregator(size int) *labelAggregator { | ||
return &labelAggregator{ | ||
aggregations: make(map[string]float64, size), | ||
metrics: make(map[string]annotatedMetric, size), | ||
} | ||
} | ||
func (a *labelAggregator) aggregate(am annotatedMetric, labels []string) error { | ||
dannykopping marked this conversation as resolved. Show resolvedHide resolvedUh oh!There was an error while loading.Please reload this page. | ||
// Use a LabelSet because it can give deterministic fingerprints of label combinations regardless of map ordering. | ||
labelSet := make(model.LabelSet, len(labels)) | ||
for _, label := range labels { | ||
val, err := am.getFieldByLabel(label) | ||
if err != nil { | ||
return err | ||
} | ||
labelSet[model.LabelName(label)] = model.LabelValue(val) | ||
} | ||
// Memoize based on the metric name & the unique combination of labels. | ||
key := fmt.Sprintf("%s:%v", am.Stats_Metric.Name, labelSet.FastFingerprint()) | ||
// Aggregate the value based on the key. | ||
a.aggregations[key] += am.Value | ||
metric, found := a.metrics[key] | ||
if !found { | ||
// Take a copy of the given annotatedMetric because it may be manipulated later and contains pointers. | ||
metric = am.clone() | ||
} | ||
// Store the metric. | ||
metric.aggregateByLabels = labels | ||
metric.Value = a.aggregations[key] | ||
a.metrics[key] = metric | ||
return nil | ||
} | ||
func (a *labelAggregator) listMetrics() []annotatedMetric { | ||
var out []annotatedMetric | ||
for _, am := range a.metrics { | ||
out = append(out, am) | ||
} | ||
return out | ||
} | ||
func (ma *MetricsAggregator) Run(ctx context.Context) func() { | ||
ctx, cancelFunc := context.WithCancel(ctx) | ||
done := make(chan struct{}) | ||
@@ -216,15 +332,41 @@ func (ma *MetricsAggregator) Run(ctx context.Context) func() { | ||
case outputCh := <-ma.collectCh: | ||
ma.log.Debug(ctx, "collect metrics") | ||
var input []annotatedMetric | ||
output := make([]prometheus.Metric, 0, len(ma.store)) | ||
// If custom aggregation labels have not been chosen, generate Prometheus metrics without any pre-aggregation. | ||
// This results in higher cardinality, but may be desirable in larger deployments. | ||
// Default behavior. | ||
dannykopping marked this conversation as resolved. Show resolved Hide resolved Uh oh! There was an error while loading. Please reload this page. | ||
if len(ma.aggregateByLabels) == 0 { | ||
for _, m := range ma.store { | ||
// Aggregate by all available labels. | ||
m.aggregateByLabels = defaultAgentMetricsLabels | ||
input = append(input, m) | ||
} | ||
} else { | ||
// However, if custom aggregations have been chosen, we need to aggregate the values from the annotated | ||
// metrics because we cannot register multiple metric series with the same labels. | ||
la := newLabelAggregator(len(ma.store)) | ||
for _, m := range ma.store { | ||
if err := la.aggregate(m, ma.aggregateByLabels); err != nil { | ||
ma.log.Error(ctx, "can't aggregate labels", slog.F("labels", strings.Join(ma.aggregateByLabels, ",")), slog.Error(err)) | ||
} | ||
} | ||
input = la.listMetrics() | ||
} | ||
for _, m := range input { | ||
promMetric, err := m.asPrometheus() | ||
if err != nil { | ||
ma.log.Error(ctx, "can't convert Prometheus value type", slog.F("name", m.Name), slog.F("type", m.Type), slog.F("value", m.Value), slog.Error(err)) | ||
continue | ||
} | ||
output = append(output, promMetric) | ||
} | ||
outputCh <- output | ||
close(outputCh) | ||
case <-cleanupTicker.C: | ||
@@ -260,7 +402,7 @@ func (ma *MetricsAggregator) Run(ctx context.Context) func() { | ||
// Describe implements the Describe half of prometheus.Collector. The body is empty, so no descriptors are ever sent on the channel. | ||
// NOTE(review): presumably this registers MetricsAggregator as an "unchecked" collector because its metric set is only known at collection time — confirm. | ||
func (*MetricsAggregator) Describe(_ chan<- *prometheus.Desc) { | ||
} | ||
vardefaultAgentMetricsLabels = []string{agentmetrics.UsernameLabel, agentmetrics.WorkspaceNameLabel, agentmetrics.AgentNameLabel, agentmetrics.TemplateNameLabel} | ||
// AgentMetricLabels are the labels used to decorate an agent's metrics. | ||
// This list should match the list of labels in agentMetricsLabels. | ||
Uh oh!
There was an error while loading. Please reload this page.