1
1
package notifications
2
2
3
3
import (
4
+ "fmt"
5
+ "strings"
6
+
4
7
"github.com/prometheus/client_golang/prometheus"
5
8
"github.com/prometheus/client_golang/prometheus/promauto"
6
9
)
7
10
8
11
type Metrics struct {
9
- DispatchedCount * prometheus.CounterVec
10
- TempFailureCount * prometheus.CounterVec
11
- PermFailureCount * prometheus.CounterVec
12
+ DispatchAttempts * prometheus.CounterVec
12
13
RetryCount * prometheus.CounterVec
13
14
14
15
QueuedSeconds * prometheus.HistogramVec
15
16
17
+ InflightDispatches * prometheus.GaugeVec
16
18
DispatcherSendSeconds * prometheus.HistogramVec
17
19
18
20
PendingUpdates prometheus.Gauge
21
+ SyncedUpdates prometheus.Counter
19
22
}
20
23
21
24
// Naming components, label keys and label values shared by the notification
// metrics defined in this file.
const (
	// ns and subsystem are used as the Namespace and Subsystem of every
	// metric created in NewMetrics.
	ns        = "coderd"
	subsystem = "notifications"

	// Label keys attached to the metric vectors.
	LabelMethod     = "method"
	LabelTemplateID = "notification_template_id"
	LabelResult     = "result"

	// Values for LabelResult; these are the result types listed in the
	// DispatchAttempts help text.
	ResultSuccess  = "success"
	ResultTempFail = "temp_fail"
	ResultPermFail = "perm_fail"
)
28
36
29
37
// NewMetrics creates every notification collector through promauto.With(reg),
// using ns as the namespace and subsystem as the subsystem, and returns them
// in a Metrics value.
//
// Per the promauto package documentation, collectors created this way are
// registered with reg, and a failed registration panics rather than
// returning an error.
//
// NOTE(review): this block is a diff rendering. Lines prefixed "-" are removed
// by the change, lines prefixed "+" are added, the bare numbers are old/new
// line numbers, and each "@@" hunk header elides unchanged lines that are not
// visible here.
func NewMetrics (reg prometheus.Registerer )* Metrics {
30
38
return & Metrics {
31
- DispatchedCount :promauto .With (reg ).NewCounterVec (prometheus.CounterOpts {
32
- Name :"dispatched_count" ,Namespace :ns ,Subsystem :subsystem ,
33
- Help :"The count of notifications successfully dispatched." ,
34
- }, []string {LabelMethod ,LabelTemplateID }),
35
- TempFailureCount :promauto .With (reg ).NewCounterVec (prometheus.CounterOpts {
36
- Name :"temporary_failures_count" ,Namespace :ns ,Subsystem :subsystem ,
37
- Help :"The count of notifications which failed but have retry attempts remaining." ,
38
- }, []string {LabelMethod ,LabelTemplateID }),
39
- PermFailureCount :promauto .With (reg ).NewCounterVec (prometheus.CounterOpts {
40
- Name :"permanent_failures_count" ,Namespace :ns ,Subsystem :subsystem ,
41
- Help :"The count of notifications which failed and have exceeded their retry attempts." ,
42
- }, []string {LabelMethod ,LabelTemplateID }),
39
// One counter replaces the three per-outcome counters removed above; the
// outcome is carried in the LabelResult label instead of the metric name.
+ DispatchAttempts :promauto .With (reg ).NewCounterVec (prometheus.CounterOpts {
40
+ Name :"dispatch_attempts_total" ,Namespace :ns ,Subsystem :subsystem ,
41
+ Help :fmt .Sprintf ("The number of dispatch attempts, aggregated by the result type (%s)" ,
42
+ strings .Join ([]string {ResultSuccess ,ResultTempFail ,ResultPermFail },", " )),
43
+ }, []string {LabelMethod ,LabelTemplateID ,LabelResult }),
43
44
RetryCount :promauto .With (reg ).NewCounterVec (prometheus.CounterOpts {
44
45
Name :"retry_count" ,Namespace :ns ,Subsystem :subsystem ,
45
46
Help :"The count of notification dispatch retry attempts." ,
@@ -48,13 +49,17 @@ func NewMetrics(reg prometheus.Registerer) *Metrics {
48
49
// Aggregating on LabelTemplateID as well would cause a cardinality explosion.
49
50
QueuedSeconds :promauto .With (reg ).NewHistogramVec (prometheus.HistogramOpts {
50
51
Name :"queued_seconds" ,Namespace :ns ,Subsystem :subsystem ,
51
- Buckets : []float64 {1 ,5 ,15 ,30 ,60 ,120 ,300 ,600 ,3600 , 86400 },
52
- Help :"The time elapsed between a notification being enqueued in the store and retrieved forprocessing " +
52
+ Buckets : []float64 {1 ,2. 5 ,5 , 7.5 , 10 , 15 ,20 , 30 ,60 ,120 ,300 ,600 ,3600 },
53
+ Help :"The time elapsed between a notification being enqueued in the store and retrieved fordispatching " +
53
54
"(measures the latency of the notifications system). This should generally be within CODER_NOTIFICATIONS_FETCH_INTERVAL " +
54
55
"seconds; higher values for a sustained period indicates delayed processing and CODER_NOTIFICATIONS_LEASE_COUNT " +
55
56
"can be increased to accommodate this." ,
56
57
}, []string {LabelMethod }),
57
58
59
// NOTE(review): unlike QueuedSeconds above, this gauge is labeled by
// LabelTemplateID in addition to LabelMethod.
+ InflightDispatches :promauto .With (reg ).NewGaugeVec (prometheus.GaugeOpts {
60
+ Name :"inflight_dispatches" ,Namespace :ns ,Subsystem :subsystem ,
61
+ Help :"The number of dispatch attempts which are currently in progress." ,
62
+ }, []string {LabelMethod ,LabelTemplateID }),
58
63
// Aggregating on LabelTemplateID as well would cause a cardinality explosion.
59
64
DispatcherSendSeconds :promauto .With (reg ).NewHistogramVec (prometheus.HistogramOpts {
60
65
Name :"dispatcher_send_seconds" ,Namespace :ns ,Subsystem :subsystem ,
@@ -65,7 +70,11 @@ func NewMetrics(reg prometheus.Registerer) *Metrics {
65
70
// Currently no requirement to discriminate between success and failure updates which are pending.
66
71
PendingUpdates :promauto .With (reg ).NewGauge (prometheus.GaugeOpts {
67
72
Name :"pending_updates" ,Namespace :ns ,Subsystem :subsystem ,
68
- Help :"The number of updates waiting to be flushed to the store." ,
73
+ Help :"The number of dispatch attempt results waiting to be flushed to the store." ,
74
+ }),
75
// Counter of results already flushed to the store; PendingUpdates above is
// the gauge of results still waiting to be flushed.
+ SyncedUpdates :promauto .With (reg ).NewCounter (prometheus.CounterOpts {
76
+ Name :"synced_updates_total" ,Namespace :ns ,Subsystem :subsystem ,
77
+ Help :"The number of dispatch attempt results flushed to the store." ,
69
78
}),
70
79
}
71
80
}