Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

feat: expose workspace statuses (with details) as a prometheus metric#12762

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to ourterms of service andprivacy statement. We’ll occasionally send you account related emails.

Already on GitHub?Sign in to your account

Merged
dannykopping merged 15 commits intocoder:mainfromdannykopping:dk/workspace-metric
Apr 2, 2024
Merged
Show file tree
Hide file tree
Changes fromall commits
Commits
Show all changes
15 commits
Select commitHold shift + click to select a range
843d650
Add metric to show operators what statuses workspaces are in, with re…
dannykoppingMar 25, 2024
2ed42a3
Clear gauges if no db entries are found
dannykoppingMar 26, 2024
c31b498
Change default refresh rates of Agent and Workspaces calls which seem…
dannykoppingMar 26, 2024
2f5a948
Added tests for workspace detail metrics
dannykoppingMar 26, 2024
fc61d37
Refactor common funcs
dannykoppingMar 26, 2024
6f95371
Remove unnecessary metric from test
dannykoppingMar 26, 2024
f42af07
Use NoLock variety to fix hung tests
dannykoppingMar 26, 2024
2cb8ccc
Fix duration on windows builds
dannykoppingMar 26, 2024
b118044
Changing join type so that latest_build_status is non-nullable
dannykoppingMar 27, 2024
acd104c
Remove SuperShort duration as extraneous
dannykoppingMar 28, 2024
a333f98
Remove workspace cardinality; rename for clarity
dannykoppingMar 28, 2024
c920508
Fixing tests
dannykoppingMar 28, 2024
cf14b9d
Merge branch 'main' of github.com:/coder/coder into dk/workspace-metric
dannykoppingMar 28, 2024
a94914f
Merge branch 'main' of github.com:/coder/coder into dk/workspace-metric
dannykoppingMar 28, 2024
8e6cde9
Renaming for clarity
dannykoppingMar 28, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletioncli/server.go
View file
Open in desktop
Original file line numberDiff line numberDiff line change
Expand Up@@ -209,7 +209,7 @@ func enablePrometheus(
}
afterCtx(ctx, closeUsersFunc)

closeWorkspacesFunc, err := prometheusmetrics.Workspaces(ctx, options.PrometheusRegistry, options.Database, 0)
closeWorkspacesFunc, err := prometheusmetrics.Workspaces(ctx, options.Logger.Named("workspaces_metrics"), options.PrometheusRegistry, options.Database, 0)
if err != nil {
return nil, xerrors.Errorf("register workspaces prometheus metric: %w", err)
}
Expand Down
6 changes: 0 additions & 6 deletionscli/server_test.go
View file
Open in desktop
Original file line numberDiff line numberDiff line change
Expand Up@@ -973,26 +973,20 @@ func TestServer(t *testing.T) {

scanner := bufio.NewScanner(res.Body)
hasActiveUsers := false
hasWorkspaces := false
for scanner.Scan() {
// This metric is manually registered to be tracked in the server. That's
// why we test it's tracked here.
if strings.HasPrefix(scanner.Text(), "coderd_api_active_users_duration_hour") {
hasActiveUsers = true
continue
}
if strings.HasPrefix(scanner.Text(), "coderd_api_workspace_latest_build_total") {
hasWorkspaces = true
continue
}
Comment on lines -984 to -987
Copy link
ContributorAuthor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others.Learn more.

Side-effect of clearing the gauge when no db rows are loaded.
This wasn't strictly necessary for the test since other manually-registered metrics are included to validate this test.

mtojek reacted with thumbs up emoji
if strings.HasPrefix(scanner.Text(), "coderd_db_query_latencies_seconds") {
t.Fatal("db metrics should not be tracked when --prometheus-collect-db-metrics is not enabled")
}
t.Logf("scanned %s", scanner.Text())
}
require.NoError(t, scanner.Err())
require.True(t, hasActiveUsers)
require.True(t, hasWorkspaces)
})

t.Run("DBMetricsEnabled", func(t *testing.T) {
Expand Down
10 changes: 10 additions & 0 deletionscoderd/database/dbmem/dbmem.go
View file
Open in desktop
Original file line numberDiff line numberDiff line change
Expand Up@@ -404,6 +404,16 @@ func (q *FakeQuerier) convertToWorkspaceRowsNoLock(ctx context.Context, workspac
break
}
}

if pj, err := q.getProvisionerJobByIDNoLock(ctx, build.JobID); err == nil {
wr.LatestBuildStatus = pj.JobStatus
}

wr.LatestBuildTransition = build.Transition
}

if u, err := q.getUserByIDNoLock(w.OwnerID); err == nil {
wr.Username = u.Username
}

rows = append(rows, wr)
Expand Down
1 change: 1 addition & 0 deletionscoderd/database/modelqueries.go
View file
Open in desktop
Original file line numberDiff line numberDiff line change
Expand Up@@ -266,6 +266,7 @@ func (q *sqlQuerier) GetAuthorizedWorkspaces(ctx context.Context, arg GetWorkspa
&i.LatestBuildCanceledAt,
&i.LatestBuildError,
&i.LatestBuildTransition,
&i.LatestBuildStatus,
&i.Count,
);err!=nil {
returnnil,err
Expand Down
64 changes: 34 additions & 30 deletionscoderd/database/queries.sql.go
View file
Open in desktop

Some generated files are not rendered by default. Learn more abouthow customized files appear on GitHub.

8 changes: 5 additions & 3 deletionscoderd/database/queries/workspaces.sql
View file
Open in desktop
Original file line numberDiff line numberDiff line change
Expand Up@@ -96,7 +96,8 @@ SELECT
latest_build.completed_at as latest_build_completed_at,
latest_build.canceled_at as latest_build_canceled_at,
latest_build.error as latest_build_error,
latest_build.transition as latest_build_transition
latest_build.transition as latest_build_transition,
latest_build.job_status as latest_build_status
FROM
workspaces
JOIN
Expand All@@ -118,7 +119,7 @@ LEFT JOIN LATERAL (
provisioner_jobs.job_status
FROM
workspace_builds
LEFTJOIN
JOIN
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others.Learn more.

Check this with@mafredri if it was not intentional 👍

Copy link
Member

@mafredrimafredriMar 27, 2024
edited
Loading

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others.Learn more.

I would avoid making this change. I believe there's a window of time when there exists a new build but it's not assigned to a job. So the nullability is something that should be handled.

This logic change essentially results in returning the previous build until the it's been assigned to a job.

Copy link
ContributorAuthor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others.Learn more.

I'm not sure if that's possible?

createtableif not exists workspace_builds(    id                  uuidnot null    ...    workspace_id        uuidnot nullreferencespublic.workspaceson delete cascade,    ...    job_id              uuidnot null        uniquereferencespublic.provisioner_jobson delete cascade,    ...);

These keys are all non-nullable.
Is there something I'm missing?

Copy link
ContributorAuthor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others.Learn more.

I should say though that I'd rather not make this change if you in any way suspect this will cause issues.
The way I had it prior to#12762 (comment) worked fine.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others.Learn more.

Builds are inserted with their respective job in a single transaction. c.f.wsbuilder

Also, Danny's right that job_id isNOT NULL with a foreign key constraint, so even if we brokewsbuilder it still wouldn't be possible to have a build with no job.

mtojek reacted with eyes emoji
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others.Learn more.

Given the constraint, the change is perfectly fine 👍. I should’ve checked it myself before commenting. Sorry for the noise@dannykopping.

Copy link
Member

@mafredrimafredriMar 28, 2024
edited
Loading

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others.Learn more.

The DB schema should probably be updated to include NOT NULL on the foreign keys, though. Otherwise NULL entries are not enforced by the DB.

Actually, where did you get the schema@dannykopping you referenced above? Doesn’t look the same as our dump which has NOT NULL.

Copy link
ContributorAuthor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others.Learn more.

No worries@mafredri!

I just generated the DDL from an existing database in my IDE (GoLand). The above also hasnot null but you have to scroll over a bit to the right.

mafredri reacted with heart emoji
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others.Learn more.

Haha, it was perfectly hidden by the code block size 😂.

Copy link
ContributorAuthor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others.Learn more.

I know right??! Sorry about that lol

provisioner_jobs
ON
provisioner_jobs.id = workspace_builds.job_id
Expand DownExpand Up@@ -374,7 +375,8 @@ WHERE
'0001-01-01 00:00:00+00'::timestamptz, -- latest_build_completed_at,
'0001-01-01 00:00:00+00'::timestamptz, -- latest_build_canceled_at,
'', -- latest_build_error
'start'::workspace_transition -- latest_build_transition
'start'::workspace_transition, -- latest_build_transition
'unknown'::provisioner_job_status -- latest_build_status
WHERE
@with_summary :: boolean = true
), total_count AS (
Expand Down
85 changes: 65 additions & 20 deletionscoderd/prometheusmetrics/prometheusmetrics.go
View file
Open in desktop
Original file line numberDiff line numberDiff line change
Expand Up@@ -24,10 +24,12 @@ import (
"github.com/coder/coder/v2/tailnet"
)

const defaultRefreshRate = time.Minute

// ActiveUsers tracks the number of users that have authenticated within the past hour.
func ActiveUsers(ctx context.Context, registerer prometheus.Registerer, db database.Store, duration time.Duration) (func(), error) {
if duration == 0 {
duration =5 * time.Minute
duration =defaultRefreshRate
}

gauge := prometheus.NewGauge(prometheus.GaugeOpts{
Expand DownExpand Up@@ -72,36 +74,42 @@ func ActiveUsers(ctx context.Context, registerer prometheus.Registerer, db datab
}

// Workspaces tracks the total number of workspaces with labels on status.
func Workspaces(ctx context.Context, registerer prometheus.Registerer, db database.Store, duration time.Duration) (func(), error) {
func Workspaces(ctx context.Context,logger slog.Logger,registerer prometheus.Registerer, db database.Store, duration time.Duration) (func(), error) {
if duration == 0 {
duration =5 * time.Minute
duration =defaultRefreshRate
}

gauge := prometheus.NewGaugeVec(prometheus.GaugeOpts{
workspaceLatestBuildTotals := prometheus.NewGaugeVec(prometheus.GaugeOpts{
Namespace: "coderd",
Subsystem: "api",
Name: "workspace_latest_build_total",
Help: "Thelatestworkspace buildswith a status.",
Help: "Thecurrent number ofworkspace buildsby status.",
}, []string{"status"})
err := registerer.Register(gauge)
if err != nil {
if err := registerer.Register(workspaceLatestBuildTotals); err != nil {
return nil, err
}

workspaceLatestBuildStatuses := prometheus.NewGaugeVec(prometheus.GaugeOpts{
Namespace: "coderd",
Name: "workspace_latest_build_status",
Help: "The current workspace statuses by template, transition, and owner.",
}, []string{"status", "template_name", "template_version", "workspace_owner", "workspace_transition"})
if err := registerer.Register(workspaceLatestBuildStatuses); err != nil {
return nil, err
}
// This exists so the prometheus metric exports immediately when set.
// It helps with tests so they don't have to wait for a tick.
gauge.WithLabelValues("pending").Set(0)

ctx, cancelFunc := context.WithCancel(ctx)
done := make(chan struct{})

// Use time.Nanosecond to force an initial tick. It will be reset to the
// correct duration after executing once.
ticker := time.NewTicker(time.Nanosecond)
doTick := func() {
defer ticker.Reset(duration)

updateWorkspaceTotals := func() {
builds, err := db.GetLatestWorkspaceBuilds(ctx)
if err != nil {
if errors.Is(err, sql.ErrNoRows) {
// clear all series if there are no database entries
workspaceLatestBuildTotals.Reset()
}

logger.Warn(ctx, "failed to load latest workspace builds", slog.Error(err))
return
}
jobIDs := make([]uuid.UUID, 0, len(builds))
Expand All@@ -110,16 +118,53 @@ func Workspaces(ctx context.Context, registerer prometheus.Registerer, db databa
}
jobs, err := db.GetProvisionerJobsByIDs(ctx, jobIDs)
if err != nil {
ids := make([]string, 0, len(jobIDs))
for _, id := range jobIDs {
ids = append(ids, id.String())
}

logger.Warn(ctx, "failed to load provisioner jobs", slog.F("ids", ids), slog.Error(err))
return
}

gauge.Reset()
workspaceLatestBuildTotals.Reset()
for _, job := range jobs {
status := codersdk.ProvisionerJobStatus(job.JobStatus)
gauge.WithLabelValues(string(status)).Add(1)
workspaceLatestBuildTotals.WithLabelValues(string(status)).Add(1)
}
}

updateWorkspaceStatuses := func() {
ws, err := db.GetWorkspaces(ctx, database.GetWorkspacesParams{
Deleted: false,
WithSummary: false,
})
if err != nil {
if errors.Is(err, sql.ErrNoRows) {
// clear all series if there are no database entries
workspaceLatestBuildStatuses.Reset()
}

logger.Warn(ctx, "failed to load active workspaces", slog.Error(err))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others.Learn more.

nit: maybelogger.Error?

Copy link
ContributorAuthor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others.Learn more.

I wouldn't strictly call it an error because it could be an instance ofsql.ErrNoRows which is valid but undesirable. To be the most correct I would make it warn in case ofsql.ErrNoRows and error in all other cases, but that feels unnecessary.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others.Learn more.

which is valid but undesirable

Will it also returnsql.ErrNoRows if there are zero workspaces = fresh deployment?

Copy link
ContributorAuthor

@dannykoppingdannykoppingMar 28, 2024
edited
Loading

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others.Learn more.

Believe so, yes.
Or if all workspaces have been deleted.

return
}

workspaceLatestBuildStatuses.Reset()
for _, w := range ws {
workspaceLatestBuildStatuses.WithLabelValues(string(w.LatestBuildStatus), w.TemplateName, w.TemplateVersionName.String, w.Username, string(w.LatestBuildTransition)).Add(1)
}
}

// Use time.Nanosecond to force an initial tick. It will be reset to the
// correct duration after executing once.
ticker := time.NewTicker(time.Nanosecond)
doTick := func() {
defer ticker.Reset(duration)

updateWorkspaceTotals()
updateWorkspaceStatuses()
}

go func() {
defer close(done)
defer ticker.Stop()
Expand All@@ -141,7 +186,7 @@ func Workspaces(ctx context.Context, registerer prometheus.Registerer, db databa
// Agents tracks the total number of workspaces with labels on status.
func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Registerer, db database.Store, coordinator *atomic.Pointer[tailnet.Coordinator], derpMapFn func() *tailcfg.DERPMap, agentInactiveDisconnectTimeout, duration time.Duration) (func(), error) {
if duration == 0 {
duration =1 * time.Minute
duration =defaultRefreshRate
}

agentsGauge := NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{
Expand DownExpand Up@@ -330,7 +375,7 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis

func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.Registerer, db database.Store, initialCreateAfter time.Time, duration time.Duration, aggregateByLabels []string) (func(), error) {
if duration == 0 {
duration =1 * time.Minute
duration =defaultRefreshRate
}

if len(aggregateByLabels) == 0 {
Expand Down
Loading

[8]ページ先頭

©2009-2025 Movatter.jp