- Notifications
You must be signed in to change notification settings - Fork 928
feat: expose workspace statuses (with details) as a prometheus metric #12762
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Uh oh!
There was an error while loading.Please reload this page.
Changes from all commits
843d650
2ed42a3
c31b498
2f5a948
fc61d37
6f95371
f42af07
2cb8ccc
b118044
acd104c
a333f98
c920508
cf14b9d
a94914f
8e6cde9
File filter
Filter by extension
Conversations
Uh oh!
There was an error while loading.Please reload this page.
Jump to
Uh oh!
There was an error while loading.Please reload this page.
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -973,26 +973,20 @@ func TestServer(t *testing.T) { | ||
scanner := bufio.NewScanner(res.Body) | ||
hasActiveUsers := false | ||
for scanner.Scan() { | ||
// This metric is manually registered to be tracked in the server. That's | ||
// why we test it's tracked here. | ||
if strings.HasPrefix(scanner.Text(), "coderd_api_active_users_duration_hour") { | ||
hasActiveUsers = true | ||
continue | ||
} | ||
Comment on lines -984 to -987 There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Side-effect of clearing the gauge when no db rows are loaded. | ||
if strings.HasPrefix(scanner.Text(), "coderd_db_query_latencies_seconds") { | ||
t.Fatal("db metrics should not be tracked when --prometheus-collect-db-metrics is not enabled") | ||
} | ||
t.Logf("scanned %s", scanner.Text()) | ||
} | ||
require.NoError(t, scanner.Err()) | ||
require.True(t, hasActiveUsers) | ||
}) | ||
t.Run("DBMetricsEnabled", func(t *testing.T) { | ||
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Uh oh!
There was an error while loading.Please reload this page.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -96,7 +96,8 @@ SELECT | ||
latest_build.completed_at as latest_build_completed_at, | ||
latest_build.canceled_at as latest_build_canceled_at, | ||
latest_build.error as latest_build_error, | ||
latest_build.transition as latest_build_transition, | ||
latest_build.job_status as latest_build_status | ||
FROM | ||
workspaces | ||
JOIN | ||
@@ -118,7 +119,7 @@ LEFT JOIN LATERAL ( | ||
provisioner_jobs.job_status | ||
FROM | ||
workspace_builds | ||
JOIN | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Check this with @mafredri if it was not intentional 👍 Member There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I would avoid making this change. I believe there's a window of time when there exists a new build but it's not assigned to a job. So the nullability is something that should be handled. This logic change essentially results in returning the previous build until it's been assigned to a job. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I'm not sure if that's possible? create table if not exists workspace_builds ( id uuid not null ... workspace_id uuid not null references public.workspaces on delete cascade, ... job_id uuid not null unique references public.provisioner_jobs on delete cascade, ... ); These keys are all non-nullable. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I should say though that I'd rather not make this change if you in any way suspect this will cause issues. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Builds are inserted with their respective job in a single transaction. c.f. Also, Danny's right that job_id is There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Given the constraint, the change is perfectly fine 👍. I should’ve checked it myself before commenting. Sorry for the noise @dannykopping. Member There was a problem hiding this comment.
Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Actually, where did you get the schema @dannykopping you referenced above? Doesn’t look the same as our dump which has NOT NULL. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. No worries @mafredri! I just generated the DDL from an existing database in my IDE (GoLand). The above also has There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Haha, it was perfectly hidden by the code block size 😂. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I know right??! Sorry about that lol | ||
provisioner_jobs | ||
ON | ||
provisioner_jobs.id = workspace_builds.job_id | ||
@@ -374,7 +375,8 @@ WHERE | ||
'0001-01-01 00:00:00+00'::timestamptz, -- latest_build_completed_at, | ||
'0001-01-01 00:00:00+00'::timestamptz, -- latest_build_canceled_at, | ||
'', -- latest_build_error | ||
'start'::workspace_transition, -- latest_build_transition | ||
'unknown'::provisioner_job_status -- latest_build_status | ||
WHERE | ||
@with_summary :: boolean = true | ||
), total_count AS ( | ||
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -24,10 +24,12 @@ import ( | ||
"github.com/coder/coder/v2/tailnet" | ||
) | ||
const defaultRefreshRate = time.Minute | ||
// ActiveUsers tracks the number of users that have authenticated within the past hour. | ||
func ActiveUsers(ctx context.Context, registerer prometheus.Registerer, db database.Store, duration time.Duration) (func(), error) { | ||
if duration == 0 { | ||
duration =defaultRefreshRate | ||
} | ||
gauge := prometheus.NewGauge(prometheus.GaugeOpts{ | ||
@@ -72,36 +74,42 @@ func ActiveUsers(ctx context.Context, registerer prometheus.Registerer, db datab | ||
} | ||
// Workspaces tracks the total number of workspaces with labels on status. | ||
func Workspaces(ctx context.Context,logger slog.Logger,registerer prometheus.Registerer, db database.Store, duration time.Duration) (func(), error) { | ||
if duration == 0 { | ||
duration =defaultRefreshRate | ||
} | ||
workspaceLatestBuildTotals := prometheus.NewGaugeVec(prometheus.GaugeOpts{ | ||
Namespace: "coderd", | ||
Subsystem: "api", | ||
Name: "workspace_latest_build_total", | ||
Help: "Thecurrent number ofworkspace buildsby status.", | ||
}, []string{"status"}) | ||
if err := registerer.Register(workspaceLatestBuildTotals); err != nil { | ||
return nil, err | ||
} | ||
workspaceLatestBuildStatuses := prometheus.NewGaugeVec(prometheus.GaugeOpts{ | ||
Namespace: "coderd", | ||
Name: "workspace_latest_build_status", | ||
Help: "The current workspace statuses by template, transition, and owner.", | ||
}, []string{"status", "template_name", "template_version", "workspace_owner", "workspace_transition"}) | ||
if err := registerer.Register(workspaceLatestBuildStatuses); err != nil { | ||
return nil, err | ||
} | ||
ctx, cancelFunc := context.WithCancel(ctx) | ||
done := make(chan struct{}) | ||
updateWorkspaceTotals := func() { | ||
builds, err := db.GetLatestWorkspaceBuilds(ctx) | ||
if err != nil { | ||
if errors.Is(err, sql.ErrNoRows) { | ||
// clear all series if there are no database entries | ||
workspaceLatestBuildTotals.Reset() | ||
} | ||
logger.Warn(ctx, "failed to load latest workspace builds", slog.Error(err)) | ||
return | ||
} | ||
jobIDs := make([]uuid.UUID, 0, len(builds)) | ||
@@ -110,16 +118,53 @@ func Workspaces(ctx context.Context, registerer prometheus.Registerer, db databa | ||
} | ||
jobs, err := db.GetProvisionerJobsByIDs(ctx, jobIDs) | ||
if err != nil { | ||
ids := make([]string, 0, len(jobIDs)) | ||
for _, id := range jobIDs { | ||
ids = append(ids, id.String()) | ||
} | ||
logger.Warn(ctx, "failed to load provisioner jobs", slog.F("ids", ids), slog.Error(err)) | ||
return | ||
} | ||
workspaceLatestBuildTotals.Reset() | ||
for _, job := range jobs { | ||
status := codersdk.ProvisionerJobStatus(job.JobStatus) | ||
workspaceLatestBuildTotals.WithLabelValues(string(status)).Add(1) | ||
} | ||
} | ||
updateWorkspaceStatuses := func() { | ||
ws, err := db.GetWorkspaces(ctx, database.GetWorkspacesParams{ | ||
Deleted: false, | ||
WithSummary: false, | ||
}) | ||
if err != nil { | ||
if errors.Is(err, sql.ErrNoRows) { | ||
// clear all series if there are no database entries | ||
workspaceLatestBuildStatuses.Reset() | ||
} | ||
logger.Warn(ctx, "failed to load active workspaces", slog.Error(err)) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. nit: maybe There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I wouldn't strictly call it an error because it could be an instance of There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Will it also return Contributor Author
| ||
return | ||
} | ||
workspaceLatestBuildStatuses.Reset() | ||
for _, w := range ws { | ||
workspaceLatestBuildStatuses.WithLabelValues(string(w.LatestBuildStatus), w.TemplateName, w.TemplateVersionName.String, w.Username, string(w.LatestBuildTransition)).Add(1) | ||
} | ||
} | ||
// Use time.Nanosecond to force an initial tick. It will be reset to the | ||
// correct duration after executing once. | ||
ticker := time.NewTicker(time.Nanosecond) | ||
doTick := func() { | ||
defer ticker.Reset(duration) | ||
updateWorkspaceTotals() | ||
updateWorkspaceStatuses() | ||
} | ||
go func() { | ||
defer close(done) | ||
defer ticker.Stop() | ||
@@ -141,7 +186,7 @@ func Workspaces(ctx context.Context, registerer prometheus.Registerer, db databa | ||
// Agents tracks the total number of workspaces with labels on status. | ||
func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Registerer, db database.Store, coordinator *atomic.Pointer[tailnet.Coordinator], derpMapFn func() *tailcfg.DERPMap, agentInactiveDisconnectTimeout, duration time.Duration) (func(), error) { | ||
if duration == 0 { | ||
duration =defaultRefreshRate | ||
} | ||
agentsGauge := NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{ | ||
@@ -330,7 +375,7 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis | ||
func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.Registerer, db database.Store, initialCreateAfter time.Time, duration time.Duration, aggregateByLabels []string) (func(), error) { | ||
if duration == 0 { | ||
duration =defaultRefreshRate | ||
} | ||
if len(aggregateByLabels) == 0 { | ||
Uh oh!
There was an error while loading.Please reload this page.