- Notifications
You must be signed in to change notification settings - Fork 906
feat: fetch prebuilds metrics state in background #17792
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Uh oh!
There was an error while loading. Please reload this page.
Changes from all commits
File filter
Filter by extension
Conversations
Uh oh!
There was an error while loading. Please reload this page.
Jump to
Uh oh!
There was an error while loading. Please reload this page.
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -2,14 +2,17 @@ package prebuilds | ||
import ( | ||
"context" | ||
"fmt" | ||
"sync/atomic" | ||
"time" | ||
"github.com/prometheus/client_golang/prometheus" | ||
"golang.org/x/xerrors" | ||
"cdr.dev/slog" | ||
dannykopping marked this conversation as resolved. Show resolvedHide resolvedUh oh!There was an error while loading.Please reload this page. | ||
"github.com/coder/coder/v2/coderd/database" | ||
"github.com/coder/coder/v2/coderd/database/dbtime" | ||
"github.com/coder/coder/v2/coderd/prebuilds" | ||
) | ||
@@ -55,20 +58,34 @@ var ( | ||
labels, | ||
nil, | ||
) | ||
lastUpdateDesc = prometheus.NewDesc( | ||
"coderd_prebuilt_workspaces_metrics_last_updated", | ||
"The unix timestamp when the metrics related to prebuilt workspaces were last updated; these metrics are cached.", | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others.Learn more. Is unix timestamp easy to alert on? Like can you do something like Member There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others.Learn more. +1 from me for duration since last successful fetch There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others.Learn more. The idiomatic approach is to use unix timestamps, see There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others.Learn more. So I guess we have an existing metric for the coder server start timestamp? ContributorAuthor
| ||
[]string{}, | ||
nil, | ||
) | ||
) | ||
// Timing parameters handed to MetricsCollector.BackgroundFetch by the
// reconciler's Run loop.
const (
	// metricsUpdateInterval is the period between refreshes of the cached
	// metrics state.
	metricsUpdateInterval = 15 * time.Second
	// metricsUpdateTimeout is the deadline applied to a single refresh of the
	// cached metrics state.
	metricsUpdateTimeout = 10 * time.Second
)
type MetricsCollector struct { | ||
database database.Store | ||
logger slog.Logger | ||
snapshotter prebuilds.StateSnapshotter | ||
latestState atomic.Pointer[metricsState] | ||
} | ||
var _ prometheus.Collector = new(MetricsCollector) | ||
func NewMetricsCollector(db database.Store, logger slog.Logger, snapshotter prebuilds.StateSnapshotter) *MetricsCollector { | ||
log := logger.Named("prebuilds_metrics_collector") | ||
return &MetricsCollector{ | ||
database: db, | ||
logger:log, | ||
snapshotter: snapshotter, | ||
} | ||
} | ||
@@ -80,38 +97,34 @@ func (*MetricsCollector) Describe(descCh chan<- *prometheus.Desc) { | ||
descCh <- desiredPrebuildsDesc | ||
descCh <- runningPrebuildsDesc | ||
descCh <- eligiblePrebuildsDesc | ||
descCh <- lastUpdateDesc | ||
} | ||
// Collect uses the cached state to set configured metrics. | ||
// The state is cached because this function can be called multiple times per second and retrieving the current state | ||
// is an expensive operation. | ||
func (mc *MetricsCollector) Collect(metricsCh chan<- prometheus.Metric) { | ||
currentState := mc.latestState.Load() // Grab a copy; it's ok if it goes stale during the course of this func. | ||
if currentState == nil { | ||
mc.logger.Warn(context.Background(), "failed to set prebuilds metrics; state not set") | ||
dannykopping marked this conversation as resolved. Show resolvedHide resolvedUh oh!There was an error while loading.Please reload this page. | ||
metricsCh <- prometheus.MustNewConstMetric(lastUpdateDesc, prometheus.GaugeValue, 0) | ||
return | ||
} | ||
for _, metric := rangecurrentState.prebuildMetrics { | ||
metricsCh <- prometheus.MustNewConstMetric(createdPrebuildsDesc, prometheus.CounterValue, float64(metric.CreatedCount), metric.TemplateName, metric.PresetName, metric.OrganizationName) | ||
metricsCh <- prometheus.MustNewConstMetric(failedPrebuildsDesc, prometheus.CounterValue, float64(metric.FailedCount), metric.TemplateName, metric.PresetName, metric.OrganizationName) | ||
metricsCh <- prometheus.MustNewConstMetric(claimedPrebuildsDesc, prometheus.CounterValue, float64(metric.ClaimedCount), metric.TemplateName, metric.PresetName, metric.OrganizationName) | ||
} | ||
for _, preset := range currentState.snapshot.Presets { | ||
if !preset.UsingActiveVersion { | ||
continue | ||
} | ||
presetSnapshot, err :=currentState.snapshot.FilterByPreset(preset.ID) | ||
if err != nil { | ||
mc.logger.Error(context.Background(), "failed to filter by preset", slog.Error(err)) | ||
continue | ||
} | ||
state := presetSnapshot.CalculateState() | ||
@@ -120,4 +133,57 @@ func (mc *MetricsCollector) Collect(metricsCh chan<- prometheus.Metric) { | ||
metricsCh <- prometheus.MustNewConstMetric(runningPrebuildsDesc, prometheus.GaugeValue, float64(state.Actual), preset.TemplateName, preset.Name, preset.OrganizationName) | ||
metricsCh <- prometheus.MustNewConstMetric(eligiblePrebuildsDesc, prometheus.GaugeValue, float64(state.Eligible), preset.TemplateName, preset.Name, preset.OrganizationName) | ||
} | ||
metricsCh <- prometheus.MustNewConstMetric(lastUpdateDesc, prometheus.GaugeValue, float64(currentState.createdAt.Unix())) | ||
} | ||
type metricsState struct { | ||
prebuildMetrics []database.GetPrebuildMetricsRow | ||
snapshot *prebuilds.GlobalSnapshot | ||
createdAt time.Time | ||
} | ||
// BackgroundFetch updates the metrics state every given interval. | ||
func (mc *MetricsCollector) BackgroundFetch(ctx context.Context, updateInterval, updateTimeout time.Duration) { | ||
tick := time.NewTicker(time.Nanosecond) | ||
defer tick.Stop() | ||
for { | ||
select { | ||
case <-ctx.Done(): | ||
return | ||
case <-tick.C: | ||
// Tick immediately, then set regular interval. | ||
tick.Reset(updateInterval) | ||
if err := mc.UpdateState(ctx, updateTimeout); err != nil { | ||
mc.logger.Error(ctx, "failed to update prebuilds metrics state", slog.Error(err)) | ||
} | ||
} | ||
} | ||
} | ||
// UpdateState builds the current metrics state. | ||
func (mc *MetricsCollector) UpdateState(ctx context.Context, timeout time.Duration) error { | ||
start := time.Now() | ||
fetchCtx, fetchCancel := context.WithTimeout(ctx, timeout) | ||
defer fetchCancel() | ||
prebuildMetrics, err := mc.database.GetPrebuildMetrics(fetchCtx) | ||
if err != nil { | ||
return xerrors.Errorf("fetch prebuild metrics: %w", err) | ||
} | ||
snapshot, err := mc.snapshotter.SnapshotState(fetchCtx, mc.database) | ||
if err != nil { | ||
return xerrors.Errorf("snapshot state: %w", err) | ||
} | ||
mc.logger.Debug(ctx, "fetched prebuilds metrics state", slog.F("duration_secs", fmt.Sprintf("%.2f", time.Since(start).Seconds()))) | ||
mc.latestState.Store(&metricsState{ | ||
prebuildMetrics: prebuildMetrics, | ||
snapshot: snapshot, | ||
createdAt: dbtime.Now(), | ||
}) | ||
return nil | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -5,6 +5,7 @@ import ( | ||
"database/sql" | ||
"fmt" | ||
"math" | ||
"sync" | ||
"sync/atomic" | ||
"time" | ||
@@ -67,10 +68,12 @@ func NewStoreReconciler(store database.Store, | ||
provisionNotifyCh: make(chan database.ProvisionerJob, 10), | ||
} | ||
if registerer != nil { | ||
reconciler.metrics = NewMetricsCollector(store, logger, reconciler) | ||
if err := registerer.Register(reconciler.metrics); err != nil { | ||
// If the registerer fails to register the metrics collector, it's not fatal. | ||
logger.Error(context.Background(), "failed to register prometheus metrics", slog.Error(err)) | ||
} | ||
} | ||
return reconciler | ||
@@ -87,16 +90,27 @@ func (c *StoreReconciler) Run(ctx context.Context) { | ||
slog.F("backoff_interval", c.cfg.ReconciliationBackoffInterval.String()), | ||
slog.F("backoff_lookback", c.cfg.ReconciliationBackoffLookback.String())) | ||
var wg sync.WaitGroup | ||
ticker := c.clock.NewTicker(reconciliationInterval) | ||
defer ticker.Stop() | ||
defer func() { | ||
wg.Wait() | ||
c.done <- struct{}{} | ||
}() | ||
// nolint:gocritic // Reconciliation Loop needs Prebuilds Orchestrator permissions. | ||
ctx, cancel := context.WithCancelCause(dbauthz.AsPrebuildsOrchestrator(ctx)) | ||
c.cancelFn = cancel | ||
// Start updating metrics in the background. | ||
if c.metrics != nil { | ||
dannykopping marked this conversation as resolved. Show resolvedHide resolvedUh oh!There was an error while loading.Please reload this page. | ||
wg.Add(1) | ||
go func() { | ||
defer wg.Done() | ||
c.metrics.BackgroundFetch(ctx, metricsUpdateInterval, metricsUpdateTimeout) | ||
}() | ||
} | ||
// Everything is in place, reconciler can now be considered as running. | ||
// | ||
// NOTE: without this atomic bool, Stop might race with Run for the c.cancelFn above. | ||
Uh oh!
There was an error while loading. Please reload this page.