feat: implement observability of notifications subsystem #13799
Changes from 1 commit
Signed-off-by: Danny Kopping <danny@coder.com>
@@ -28,21 +28,26 @@ const (
func NewMetrics(reg prometheus.Registerer) *Metrics {
	return &Metrics{
		DispatchedCount: promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
			Name: "dispatched_count", Namespace: ns, Subsystem: subsystem,
Review comment: accumulating counts should end in the units and _total, as in …
Reply: Wow, 4 months out of Grafana Labs and I forget all my Prometheus manners.
Help: "The count of notifications successfully dispatched.", | ||
}, []string{LabelMethod, LabelTemplateID}), | ||
TempFailureCount: promauto.With(reg).NewCounterVec(prometheus.CounterOpts{ | ||
Name: "temporary_failures_count", Namespace: ns, Subsystem: subsystem, | ||
Help: "The count of notifications which failed but have retry attempts remaining.", | ||
}, []string{LabelMethod, LabelTemplateID}), | ||
PermFailureCount: promauto.With(reg).NewCounterVec(prometheus.CounterOpts{ | ||
Name: "permanent_failures_count", Namespace: ns, Subsystem: subsystem, | ||
Help: "The count of notifications which failed and have exceeded their retry attempts.", | ||
}, []string{LabelMethod, LabelTemplateID}), | ||
Review comment: DispatchedCount, TempFailureCount, and PermFailureCount might make sense as a single metric with different "disposition" (or "result") labels. The help text refers to them as "notifications", but I think that's misleading. What we are counting is "notification attempts" or "dispatch attempts", which might individually end in success or some kind of failure. In particular, in the case of failure and retry, the number of delivery attempts can exceed the number of notifications, so it's important to make the distinction. Prometheus guidelines say that something should be a single metric with labels (vs. multiple metrics) when the sum or average across the label is sensible and coherent. Filtering by method and summing across disposition would be useful, for example, to track the number of web requests we are making via Webhook delivery.
Reply: Super idea! Simplifies things a lot.
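A minimal sketch of what that consolidation could look like, reusing identifiers already present in this file (ns, subsystem, LabelMethod, LabelTemplateID). The field name DispatchAttempts, the metric name dispatch_attempts_total (which also picks up the _total suffix requested in the earlier comment), and the result label and its values are illustrative assumptions, not necessarily what the PR ended up using:

		// Illustrative sketch: a single counter for all dispatch attempts,
		// partitioned by outcome, replacing the three separate counters above.
		// The "result" label would carry values such as "success",
		// "temp_failure", and "perm_failure".
		DispatchAttempts: promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
			Name: "dispatch_attempts_total", Namespace: ns, Subsystem: subsystem,
			Help: "The number of notification dispatch attempts, by delivery method, template, and result.",
		}, []string{LabelMethod, LabelTemplateID, "result"}),

With this shape, summing across the result label while filtering on the webhook method gives the count of outbound web requests mentioned in the comment, and summing across results stays coherent because every increment is exactly one dispatch attempt.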
		RetryCount: promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
			Name: "retry_count", Namespace: ns, Subsystem: subsystem,
			Help: "The count of notification dispatch retry attempts.",
		}, []string{LabelMethod, LabelTemplateID}),
		// Aggregating on LabelTemplateID as well would cause a cardinality explosion.
		QueuedSeconds: promauto.With(reg).NewHistogramVec(prometheus.HistogramOpts{
			Name: "queued_seconds", Namespace: ns, Subsystem: subsystem,
			Buckets: []float64{0.1, 1, 5, 15, 30, 60, 120, 300, 600, 3600, 86400},
			Help: "The time elapsed between a notification being enqueued in the store and retrieved for processing " +
				"(measures the latency of the notifications system). This should generally be within CODER_NOTIFICATIONS_FETCH_INTERVAL " +
@@ -51,13 +56,15 @@ func NewMetrics(reg prometheus.Registerer) *Metrics {
		}, []string{LabelMethod}),
		// Aggregating on LabelTemplateID as well would cause a cardinality explosion.
		DispatcherSendSeconds: promauto.With(reg).NewHistogramVec(prometheus.HistogramOpts{
dannykopping marked this conversation as resolved.
Name: "dispatcher_send_seconds", Namespace: ns, Subsystem: subsystem, | ||
Buckets: []float64{0.001, 0.05, 0.1, 0.5, 1, 2, 5, 10, 15, 30, 60, 120}, | ||
Review comment: should be …
Reply: I don't see much value in distinguishing between 1ms, 5ms, and 10ms; for most systems the latency will be higher than that (I'm assuming). The high end is what we're more concerned about here, since that's indicative of a problem. In other words: this metric is more for diagnosing issues than for measuring the performance of the endpoint/server.
Reply: Dispatching, say, a webhook within the same cluster could be a few milliseconds of latency. The same geographic region could be 10-20ms. Almost nothing is going to be 1ms unless it's extremely simple and extremely close, but in 50ms you could be dispatching across an entire continent. With buckets like this, you could have something that normally returns in 15ms take 3x its normal time and be close to falling over, and this metric would not clock it. That seems much more important than 30s, 60s, 120s.
Reply: I hear you and I can see where you're coming from. Let's wait for users to ask for this. I think there's sufficient coverage for now, and the additional datapoints are likely not going to be very useful for the majority of operators IMHO.
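Purely for illustration, and not the reviewer's actual suggestion (which is not preserved above): a layout that trades the 30s/60s/120s tail for finer mid-range resolution could use exponentially spaced buckets, e.g.:

			// Illustrative alternative: roughly 5ms to 10s, doubling each step
			// (0.005, 0.01, 0.02, 0.04, 0.08, 0.16, 0.32, 0.64, 1.28, 2.56, 5.12, 10.24 seconds).
			Buckets: prometheus.ExponentialBuckets(0.005, 2, 12),

With buckets like these, a dispatch that normally lands around 15ms and suddenly takes 3x as long shows up in a visibly higher bucket, which is the mid-range concern raised above.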
Help: "The time taken to dispatch notifications.", | ||
}, []string{LabelMethod}), | ||
// Currently no requirement to discriminate between success and failure updates which are pending. | ||
PendingUpdates: promauto.With(reg).NewGauge(prometheus.GaugeOpts{ | ||
Name: "pending_updates", Namespace: ns, Subsystem: subsystem, | ||
Help: "The number of updates waiting to be flushed to the store.", | ||
Review comment: Sounds too generic to me. "The number of delivery attempt results waiting to be flushed to the store"?
		}),
Review comment: A really useful gauge is the number of "in progress" delivery attempts. You increment it when you kick off the delivery goroutine and decrement it when you send to …
Reply: Good idea!
Reply: Also a count of success/failure updates flushed to the database.
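A sketch of the in-flight gauge pattern described in that comment, under assumed names (InflightDispatches and the exact increment/decrement points are illustrative, not taken from this PR):

		// Illustrative sketch: incremented when a delivery goroutine starts,
		// decremented once its result has been handed off, so the gauge
		// reflects dispatch attempts currently in progress.
		InflightDispatches: promauto.With(reg).NewGauge(prometheus.GaugeOpts{
			Name: "inflight_dispatches", Namespace: ns, Subsystem: subsystem,
			Help: "The number of dispatch attempts currently in progress.",
		}),

The caller would wrap each delivery attempt with Inc()/Dec() on this gauge, and the success/failure flush counts mentioned in the last reply could follow the same CounterVec pattern as the fields above.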
} | ||