Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

feat: fetch prebuilds metrics state in background#17792

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to ourterms of service andprivacy statement. We’ll occasionally send you account related emails.

Already on GitHub?Sign in to your account

Merged
dannykopping merged 4 commits intomainfromdk/debounce-metrics-collection
May 13, 2025
Merged
Show file tree
Hide file tree
Changes fromall commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
106 changes: 86 additions & 20 deletionsenterprise/coderd/prebuilds/metricscollector.go
View file
Open in desktop
Original file line numberDiff line numberDiff line change
Expand Up@@ -2,14 +2,17 @@ package prebuilds

import (
"context"
"fmt"
"sync/atomic"
"time"

"cdr.dev/slog"

"github.com/prometheus/client_golang/prometheus"
"golang.org/x/xerrors"

"cdr.dev/slog"

"github.com/coder/coder/v2/coderd/database"
"github.com/coder/coder/v2/coderd/database/dbauthz"
"github.com/coder/coder/v2/coderd/database/dbtime"
"github.com/coder/coder/v2/coderd/prebuilds"
)

Expand DownExpand Up@@ -55,20 +58,34 @@ var (
labels,
nil,
)
lastUpdateDesc = prometheus.NewDesc(
"coderd_prebuilt_workspaces_metrics_last_updated",
"The unix timestamp when the metrics related to prebuilt workspaces were last updated; these metrics are cached.",
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others.Learn more.

Is unix timestamp easy to alert on? Like can you do something likeunix_now() - metric_value > 1000 or something in grafana and co? If not, it might be better if this was a durationsince the last successful fetch instead.

Copy link
Member

@johnstcnjohnstcnMay 13, 2025
edited
Loading

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others.Learn more.

+1 from me for duration since last successful fetch

Copy link
ContributorAuthor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others.Learn more.

The idiomatic approach is to use unix timestamps, seeprometheus_config_last_reload_success_timestamp_seconds.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others.Learn more.

So I guess we have an existing metric for the coder server start timestamp?

Copy link
ContributorAuthor

@dannykoppingdannykoppingMay 13, 2025
edited
Loading

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others.Learn more.

I don't think so (or at least not one we export), but I think as long as this metric is updated relative to itselfandup is taken into consideration, it should be useful.

[]string{},
nil,
)
)

const (
metricsUpdateInterval = time.Second * 15
metricsUpdateTimeout = time.Second * 10
)

type MetricsCollector struct {
database database.Store
logger slog.Logger
snapshotter prebuilds.StateSnapshotter

latestState atomic.Pointer[metricsState]
}

var _ prometheus.Collector = new(MetricsCollector)

func NewMetricsCollector(db database.Store, logger slog.Logger, snapshotter prebuilds.StateSnapshotter) *MetricsCollector {
log := logger.Named("prebuilds_metrics_collector")
return &MetricsCollector{
database: db,
logger:logger.Named("prebuilds_metrics_collector"),
logger:log,
snapshotter: snapshotter,
}
}
Expand All@@ -80,38 +97,34 @@ func (*MetricsCollector) Describe(descCh chan<- *prometheus.Desc) {
descCh <- desiredPrebuildsDesc
descCh <- runningPrebuildsDesc
descCh <- eligiblePrebuildsDesc
descCh <- lastUpdateDesc
}

// Collect uses the cached state to set configured metrics.
// The state is cached because this function can be called multiple times per second and retrieving the current state
// is an expensive operation.
func (mc *MetricsCollector) Collect(metricsCh chan<- prometheus.Metric) {
// nolint:gocritic // We need to set an authz context to read metrics from the db.
ctx, cancel := context.WithTimeout(dbauthz.AsPrebuildsOrchestrator(context.Background()), 10*time.Second)
defer cancel()
prebuildMetrics, err := mc.database.GetPrebuildMetrics(ctx)
if err != nil {
mc.logger.Error(ctx, "failed to get prebuild metrics", slog.Error(err))
currentState := mc.latestState.Load() // Grab a copy; it's ok if it goes stale during the course of this func.
if currentState == nil {
mc.logger.Warn(context.Background(), "failed to set prebuilds metrics; state not set")
metricsCh <- prometheus.MustNewConstMetric(lastUpdateDesc, prometheus.GaugeValue, 0)
return
}

for _, metric := range prebuildMetrics {
for _, metric := rangecurrentState.prebuildMetrics {
metricsCh <- prometheus.MustNewConstMetric(createdPrebuildsDesc, prometheus.CounterValue, float64(metric.CreatedCount), metric.TemplateName, metric.PresetName, metric.OrganizationName)
metricsCh <- prometheus.MustNewConstMetric(failedPrebuildsDesc, prometheus.CounterValue, float64(metric.FailedCount), metric.TemplateName, metric.PresetName, metric.OrganizationName)
metricsCh <- prometheus.MustNewConstMetric(claimedPrebuildsDesc, prometheus.CounterValue, float64(metric.ClaimedCount), metric.TemplateName, metric.PresetName, metric.OrganizationName)
}

snapshot, err := mc.snapshotter.SnapshotState(ctx, mc.database)
if err != nil {
mc.logger.Error(ctx, "failed to get latest prebuild state", slog.Error(err))
return
}

for _, preset := range snapshot.Presets {
for _, preset := range currentState.snapshot.Presets {
if !preset.UsingActiveVersion {
continue
}

presetSnapshot, err := snapshot.FilterByPreset(preset.ID)
presetSnapshot, err :=currentState.snapshot.FilterByPreset(preset.ID)
if err != nil {
mc.logger.Error(ctx, "failed to filter by preset", slog.Error(err))
mc.logger.Error(context.Background(), "failed to filter by preset", slog.Error(err))
continue
}
state := presetSnapshot.CalculateState()
Expand All@@ -120,4 +133,57 @@ func (mc *MetricsCollector) Collect(metricsCh chan<- prometheus.Metric) {
metricsCh <- prometheus.MustNewConstMetric(runningPrebuildsDesc, prometheus.GaugeValue, float64(state.Actual), preset.TemplateName, preset.Name, preset.OrganizationName)
metricsCh <- prometheus.MustNewConstMetric(eligiblePrebuildsDesc, prometheus.GaugeValue, float64(state.Eligible), preset.TemplateName, preset.Name, preset.OrganizationName)
}

metricsCh <- prometheus.MustNewConstMetric(lastUpdateDesc, prometheus.GaugeValue, float64(currentState.createdAt.Unix()))
}

type metricsState struct {
prebuildMetrics []database.GetPrebuildMetricsRow
snapshot *prebuilds.GlobalSnapshot
createdAt time.Time
}

// BackgroundFetch updates the metrics state every given interval.
func (mc *MetricsCollector) BackgroundFetch(ctx context.Context, updateInterval, updateTimeout time.Duration) {
tick := time.NewTicker(time.Nanosecond)
defer tick.Stop()

for {
select {
case <-ctx.Done():
return
case <-tick.C:
// Tick immediately, then set regular interval.
tick.Reset(updateInterval)

if err := mc.UpdateState(ctx, updateTimeout); err != nil {
mc.logger.Error(ctx, "failed to update prebuilds metrics state", slog.Error(err))
}
}
}
}

// UpdateState builds the current metrics state.
func (mc *MetricsCollector) UpdateState(ctx context.Context, timeout time.Duration) error {
start := time.Now()
fetchCtx, fetchCancel := context.WithTimeout(ctx, timeout)
defer fetchCancel()

prebuildMetrics, err := mc.database.GetPrebuildMetrics(fetchCtx)
if err != nil {
return xerrors.Errorf("fetch prebuild metrics: %w", err)
}

snapshot, err := mc.snapshotter.SnapshotState(fetchCtx, mc.database)
if err != nil {
return xerrors.Errorf("snapshot state: %w", err)
}
mc.logger.Debug(ctx, "fetched prebuilds metrics state", slog.F("duration_secs", fmt.Sprintf("%.2f", time.Since(start).Seconds())))

mc.latestState.Store(&metricsState{
prebuildMetrics: prebuildMetrics,
snapshot: snapshot,
createdAt: dbtime.Now(),
})
return nil
}
5 changes: 5 additions & 0 deletionsenterprise/coderd/prebuilds/metricscollector_test.go
View file
Open in desktop
Original file line numberDiff line numberDiff line change
Expand Up@@ -16,6 +16,7 @@ import (
"github.com/coder/quartz"

"github.com/coder/coder/v2/coderd/database"
"github.com/coder/coder/v2/coderd/database/dbauthz"
"github.com/coder/coder/v2/coderd/database/dbgen"
"github.com/coder/coder/v2/coderd/database/dbtestutil"
agplprebuilds "github.com/coder/coder/v2/coderd/prebuilds"
Expand DownExpand Up@@ -248,6 +249,10 @@ func TestMetricsCollector(t *testing.T) {
setupTestDBWorkspaceAgent(t, db, workspace.ID, eligible)
}

// Force an update to the metrics state to allow the collector to collect fresh metrics.
// nolint:gocritic // Authz context needed to retrieve state.
require.NoError(t, collector.UpdateState(dbauthz.AsPrebuildsOrchestrator(ctx), testutil.WaitLong))

metricsFamilies, err := registry.Gather()
require.NoError(t, err)

Expand Down
22 changes: 18 additions & 4 deletionsenterprise/coderd/prebuilds/reconcile.go
View file
Open in desktop
Original file line numberDiff line numberDiff line change
Expand Up@@ -5,6 +5,7 @@ import (
"database/sql"
"fmt"
"math"
"sync"
"sync/atomic"
"time"

Expand DownExpand Up@@ -67,10 +68,12 @@ func NewStoreReconciler(store database.Store,
provisionNotifyCh: make(chan database.ProvisionerJob, 10),
}

reconciler.metrics = NewMetricsCollector(store, logger, reconciler)
if err := registerer.Register(reconciler.metrics); err != nil {
// If the registerer fails to register the metrics collector, it's not fatal.
logger.Error(context.Background(), "failed to register prometheus metrics", slog.Error(err))
if registerer != nil {
reconciler.metrics = NewMetricsCollector(store, logger, reconciler)
if err := registerer.Register(reconciler.metrics); err != nil {
// If the registerer fails to register the metrics collector, it's not fatal.
logger.Error(context.Background(), "failed to register prometheus metrics", slog.Error(err))
}
}

return reconciler
Expand All@@ -87,16 +90,27 @@ func (c *StoreReconciler) Run(ctx context.Context) {
slog.F("backoff_interval", c.cfg.ReconciliationBackoffInterval.String()),
slog.F("backoff_lookback", c.cfg.ReconciliationBackoffLookback.String()))

var wg sync.WaitGroup
ticker := c.clock.NewTicker(reconciliationInterval)
defer ticker.Stop()
defer func() {
wg.Wait()
c.done <- struct{}{}
}()

// nolint:gocritic // Reconciliation Loop needs Prebuilds Orchestrator permissions.
ctx, cancel := context.WithCancelCause(dbauthz.AsPrebuildsOrchestrator(ctx))
c.cancelFn = cancel

// Start updating metrics in the background.
if c.metrics != nil {
wg.Add(1)
go func() {
defer wg.Done()
c.metrics.BackgroundFetch(ctx, metricsUpdateInterval, metricsUpdateTimeout)
}()
}

// Everything is in place, reconciler can now be considered as running.
//
// NOTE: without this atomic bool, Stop might race with Run for the c.cancelFn above.
Expand Down
Loading

[8]ページ先頭

©2009-2025 Movatter.jp