- Notifications
You must be signed in to change notification settings - Fork 928
feat(coderd): add prometheus metrics to servertailnet #11988
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Uh oh!
There was an error while loading. Please reload this page.
Changes from all commits
File filter
Filter by extension
Conversations
Uh oh!
There was an error while loading. Please reload this page.
Jump to
Uh oh!
There was an error while loading. Please reload this page.
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -14,6 +14,7 @@ import ( | ||
"time" | ||
"github.com/google/uuid" | ||
"github.com/prometheus/client_golang/prometheus" | ||
"go.opentelemetry.io/otel/trace" | ||
"golang.org/x/xerrors" | ||
"tailscale.com/derp" | ||
@@ -97,6 +98,18 @@ func NewServerTailnet( | ||
agentConnectionTimes: map[uuid.UUID]time.Time{}, | ||
agentTickets: map[uuid.UUID]map[uuid.UUID]struct{}{}, | ||
transport: tailnetTransport.Clone(), | ||
connsPerAgent: prometheus.NewGaugeVec(prometheus.GaugeOpts{ | ||
Namespace: "coder", | ||
Subsystem: "servertailnet", | ||
Name: "open_connections", | ||
Help: "Total number of TCP connections currently open to workspace agents.", | ||
}, []string{"network"}), | ||
totalConns: prometheus.NewCounterVec(prometheus.CounterOpts{ | ||
Namespace: "coder", | ||
Subsystem: "servertailnet", | ||
Name: "connections_total", | ||
Help: "Total number of TCP connections made to workspace agents.", | ||
}, []string{"network"}), | ||
} | ||
tn.transport.DialContext = tn.dialContext | ||
// These options are mostly just picked at random, and they can likely be | ||
@@ -170,6 +183,16 @@ func NewServerTailnet( | ||
return tn, nil | ||
} | ||
// Describe forwards the metric descriptors of the connsPerAgent gauge vector | ||
// and the totalConns counter vector to descs, in that order. | ||
// NOTE(review): the signature matches prometheus.Collector's Describe method, | ||
// so ServerTailnet can presumably be registered as a collector - confirm at | ||
// the registration site. | ||
func (s *ServerTailnet) Describe(descs chan<- *prometheus.Desc) { | ||
s.connsPerAgent.Describe(descs) | ||
s.totalConns.Describe(descs) | ||
} | ||
// Collect forwards the current samples of the connsPerAgent gauge vector and | ||
// the totalConns counter vector to metrics, in that order. | ||
// NOTE(review): the signature matches prometheus.Collector's Collect method; | ||
// confirm that ServerTailnet is registered with a Prometheus registry. | ||
func (s *ServerTailnet) Collect(metrics chan<- prometheus.Metric) { | ||
s.connsPerAgent.Collect(metrics) | ||
s.totalConns.Collect(metrics) | ||
} | ||
func (s *ServerTailnet) expireOldAgents() { | ||
const ( | ||
tick = 5 * time.Minute | ||
@@ -304,6 +327,9 @@ type ServerTailnet struct { | ||
agentTickets map[uuid.UUID]map[uuid.UUID]struct{} | ||
transport *http.Transport | ||
connsPerAgent *prometheus.GaugeVec | ||
totalConns *prometheus.CounterVec | ||
} | ||
func (s *ServerTailnet) ReverseProxy(targetURL, dashboardURL *url.URL, agentID uuid.UUID) *httputil.ReverseProxy { | ||
@@ -349,7 +375,18 @@ func (s *ServerTailnet) dialContext(ctx context.Context, network, addr string) ( | ||
return nil, xerrors.Errorf("no agent id attached") | ||
} | ||
nc, err := s.DialAgentNetConn(ctx, agentID, network, addr) | ||
if err != nil { | ||
return nil, err | ||
} | ||
s.connsPerAgent.WithLabelValues("tcp").Inc() | ||
s.totalConns.WithLabelValues("tcp").Inc() | ||
return &instrumentedConn{ | ||
Conn: nc, | ||
agentID: agentID, | ||
connsPerAgent: s.connsPerAgent, | ||
}, nil | ||
} | ||
func (s *ServerTailnet) ensureAgent(agentID uuid.UUID) error { | ||
@@ -455,3 +492,18 @@ func (s *ServerTailnet) Close() error { | ||
<-s.derpMapUpdaterClosed | ||
return nil | ||
} | ||
// instrumentedConn wraps a net.Conn so that closing it also updates the | ||
// open-connections gauge. All net.Conn methods other than Close are promoted | ||
// unchanged from the embedded Conn. | ||
type instrumentedConn struct { | ||
net.Conn | ||
// agentID is recorded when the connection is created; the Close method | ||
// of this type does not read it. | ||
agentID uuid.UUID | ||
// closeOnce ensures the gauge is decremented at most once per connection. | ||
closeOnce sync.Once | ||
// connsPerAgent is the gauge vector decremented on the first Close. | ||
connsPerAgent *prometheus.GaugeVec | ||
} | ||
// Close decrements the "tcp"-labelled connsPerAgent gauge the first time it | ||
// is called and then closes the embedded Conn, returning that Close's error. | ||
// Only the gauge decrement is guarded by closeOnce; the embedded Conn's Close | ||
// is invoked on every call. | ||
func (c *instrumentedConn) Close() error { | ||
c.closeOnce.Do(func() { | ||
// NOTE(review): the next row is PR review-thread text captured from the | ||
// page, not Go code. It questions whether connections are always | ||
// explicitly closed, which the gauge would reveal as a leak if not. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others.Learn more. are network connectionsalways explicitly closed? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others.Learn more. This one I'm not 100% sure on. Admittedly, I found this idea from a stackoverflow post which seemed to work for a couple other people. Was planning to get this into dev and monitor to make sure it works as intended with a lot more usage than I can reproduce myself. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others.Learn more. I guess maybe theyshould be and this gauge will tell us if there's a leak... | ||
c.connsPerAgent.WithLabelValues("tcp").Dec() | ||
}) | ||
return c.Conn.Close() | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
package testutil | ||
import ( | ||
"testing" | ||
dto "github.com/prometheus/client_model/go" | ||
"github.com/stretchr/testify/require" | ||
) | ||
// PromGaugeHasValue reports whether a gauge in metrics has the given value. | ||
// | ||
// It scans the families whose name equals name and, within each, picks the | ||
// first metric whose label values equal label position by position. It | ||
// returns whether that metric's gauge value equals value exactly. It returns | ||
// false when no family or metric matches. | ||
// | ||
// The test fails via require.Equal if any inspected metric has a different | ||
// number of labels than len(label). | ||
// NOTE(review): matching is by label position, not label name; presumably | ||
// callers pass values in the order the gathered metric lists its labels - | ||
// confirm against callers. | ||
func PromGaugeHasValue(t testing.TB, metrics []*dto.MetricFamily, value float64, name string, label ...string) bool { | ||
t.Helper() | ||
for _, family := range metrics { | ||
if family.GetName() != name { | ||
continue | ||
} | ||
ms := family.GetMetric() | ||
metricsLoop: | ||
for _, m := range ms { | ||
// Guard the positional indexing below: label counts must agree. | ||
require.Equal(t, len(label), len(m.GetLabel())) | ||
for i, lv := range label { | ||
if lv != m.GetLabel()[i].GetValue() { | ||
// Label mismatch: try the next metric in this family. | ||
continue metricsLoop | ||
} | ||
} | ||
return value == m.GetGauge().GetValue() | ||
} | ||
} | ||
return false | ||
} | ||
// PromCounterHasValue reports whether a counter in metrics has the given | ||
// value. | ||
// | ||
// It scans the families whose name equals name and, within each, picks the | ||
// first metric whose label values equal label position by position. It | ||
// returns whether that metric's counter value equals value exactly. It | ||
// returns false when no family or metric matches. | ||
// | ||
// The test fails via require.Equal if any inspected metric has a different | ||
// number of labels than len(label). | ||
// NOTE(review): matching is by label position, not label name; presumably | ||
// callers pass values in the order the gathered metric lists its labels - | ||
// confirm against callers. | ||
func PromCounterHasValue(t testing.TB, metrics []*dto.MetricFamily, value float64, name string, label ...string) bool { | ||
t.Helper() | ||
for _, family := range metrics { | ||
if family.GetName() != name { | ||
continue | ||
} | ||
ms := family.GetMetric() | ||
metricsLoop: | ||
for _, m := range ms { | ||
// Guard the positional indexing below: label counts must agree. | ||
require.Equal(t, len(label), len(m.GetLabel())) | ||
for i, lv := range label { | ||
if lv != m.GetLabel()[i].GetValue() { | ||
// Label mismatch: try the next metric in this family. | ||
continue metricsLoop | ||
} | ||
} | ||
return value == m.GetCounter().GetValue() | ||
} | ||
} | ||
return false | ||
} |