diff --git a/internal/instrumentation/prometheus.go b/internal/instrumentation/prometheus.go index 2b216dd..7a48783 100644 --- a/internal/instrumentation/prometheus.go +++ b/internal/instrumentation/prometheus.go @@ -2,6 +2,10 @@ package instrumentation import ( "context" + "reflect" + "sort" + "strings" + "sync" "time" "github.com/99designs/gqlgen/graphql" @@ -13,15 +17,37 @@ const ( exitStatusSuccess = "success" ) +var latencyMapMutex sync.Mutex +var latencyMap = make(map[string]float64, 128) + +type ctxKeyType string + +var ctxKey = ctxKeyType("metricglobal") + var ( - requestStartedCounter prometheusclient.Counter - requestCompletedCounter prometheusclient.Counter - resolverStartedCounter *prometheusclient.CounterVec - resolverCompletedCounter *prometheusclient.CounterVec - timeToResolveField *prometheusclient.HistogramVec - timeToHandleRequest *prometheusclient.HistogramVec + requestStartedCounter prometheusclient.Counter + requestCompletedCounter prometheusclient.Counter + requestFailureCounter *prometheusclient.CounterVec + requestSuccessCounter *prometheusclient.CounterVec + requestComplexityHisto prometheusclient.Histogram + resolverStartedCounter *prometheusclient.CounterVec + resolverCompletedCounter *prometheusclient.CounterVec + timeToResolveField *prometheusclient.HistogramVec + timeToHandleRequest *prometheusclient.HistogramVec + timeToHandleRequestQueryMAvg *prometheusclient.GaugeVec + timeToHandleRequestQueryTotal *prometheusclient.CounterVec + responseSizeMAvg *prometheusclient.GaugeVec + responseSizeTotal *prometheusclient.CounterVec ) +// Globals is monitoring data. +type Globals struct { + sync.Mutex + + Queries []string + Failed bool +} + // Register registers the prometheus metrics. func Register() { requestStartedCounter = prometheusclient.NewCounter( @@ -38,6 +64,29 @@ func Register() { }, ) + requestFailureCounter = prometheusclient.NewCounterVec( + prometheusclient.CounterOpts{ + Name: "graphql_request_failures", + Help: "Number of requests with either success or failure status.", + }, + []string{"queries"}, + ) + requestSuccessCounter = prometheusclient.NewCounterVec( + prometheusclient.CounterOpts{ + Name: "graphql_request_successes", + Help: "Number of requests with either success or failure status.", + }, + []string{"queries"}, + ) + + requestComplexityHisto = prometheusclient.NewHistogram( + prometheusclient.HistogramOpts{ + Name: "graphql_request_complexity", + Help: "The complexity value of incoming queries.", + Buckets: prometheusclient.LinearBuckets(100, 25, 9), + }, + ) + resolverStartedCounter = prometheusclient.NewCounterVec( prometheusclient.CounterOpts{ Name: "graphql_resolver_started_total", @@ -66,13 +115,50 @@ func Register() { Buckets: prometheusclient.ExponentialBuckets(1, 2, 11), }, []string{"exitStatus"}) + timeToHandleRequestQueryMAvg = prometheusclient.NewGaugeVec( + prometheusclient.GaugeOpts{ + Name: "graphql_request_query_duration_mavg_ms", + Help: "Moving average of graphql request query duration for a query set.", + }, + []string{"queries"}, + ) + timeToHandleRequestQueryTotal = prometheusclient.NewCounterVec( + prometheusclient.CounterOpts{ + Name: "graphql_request_query_duration_total_ms", + Help: "Total duration spent responding to a query.", + }, + []string{"queries"}, + ) + + responseSizeMAvg = prometheusclient.NewGaugeVec( + prometheusclient.GaugeOpts{ + Name: "graphql_response_size_mavg_bytes", + Help: "Moving average of graphql response size for a query set.", + }, + []string{"queries"}, + ) + responseSizeTotal = prometheusclient.NewCounterVec( + prometheusclient.CounterOpts{ + Name: "graphql_response_size_total_bytes", + Help: "Total bytes of response data to a query.", + }, + []string{"queries"}, + ) + prometheusclient.MustRegister( requestStartedCounter, requestCompletedCounter, + requestSuccessCounter, + requestFailureCounter, + requestComplexityHisto, resolverStartedCounter, resolverCompletedCounter, timeToResolveField, timeToHandleRequest, + timeToHandleRequestQueryMAvg, + timeToHandleRequestQueryTotal, + responseSizeMAvg, + responseSizeTotal, ) } @@ -80,10 +166,17 @@ func Register() { func UnRegister() { prometheusclient.Unregister(requestStartedCounter) prometheusclient.Unregister(requestCompletedCounter) + prometheusclient.Unregister(requestSuccessCounter) + prometheusclient.Unregister(requestFailureCounter) + prometheusclient.Unregister(requestComplexityHisto) prometheusclient.Unregister(resolverStartedCounter) prometheusclient.Unregister(resolverCompletedCounter) prometheusclient.Unregister(timeToResolveField) prometheusclient.Unregister(timeToHandleRequest) + prometheusclient.Unregister(timeToHandleRequestQueryMAvg) + prometheusclient.Unregister(timeToHandleRequestQueryTotal) + prometheusclient.Unregister(responseSizeMAvg) + prometheusclient.Unregister(responseSizeTotal) } // ResolverMiddleware is a resolver middleware that logs metrics for prometheus. @@ -95,17 +188,36 @@ func ResolverMiddleware() graphql.FieldMiddleware { observerStart := time.Now() + path := rctx.Path() + if len(path) == 1 { + if globals, ok := ctx.Value(ctxKey).(*Globals); ok { + globals.Lock() + if rctx.Field.Field != nil { + globals.Queries = append(globals.Queries, rctx.Field.Name+"("+argsLabel(rctx.Args)+")") + } else if name, ok := path[0].(string); ok { + globals.Queries = append(globals.Queries, name) + } + globals.Unlock() + } + } + res, err := next(ctx) var exitStatus string if err != nil { exitStatus = existStatusFailure + + if globals, ok := ctx.Value(ctxKey).(*Globals); ok { + globals.Lock() + globals.Failed = true + globals.Unlock() + } } else { exitStatus = exitStatusSuccess } - timeToResolveField.With(prometheusclient.Labels{"exitStatus": exitStatus}). - Observe(float64(time.Since(observerStart).Nanoseconds() / int64(time.Millisecond))) + exitStatusLabel := prometheusclient.Labels{"exitStatus": exitStatus} + timeToResolveField.With(exitStatusLabel).Observe(float64(time.Since(observerStart).Nanoseconds() / int64(time.Millisecond))) resolverCompletedCounter.WithLabelValues(rctx.Object, rctx.Field.Name).Inc() @@ -113,31 +225,122 @@ func ResolverMiddleware() graphql.FieldMiddleware { } } -// RequestMiddleware is a request middleware that logs metrics for prometheus. +// RequestMiddleware is a request middleware that logs metrics for prometheus. It looks busy, but the +// overhead is less than 10µs. func RequestMiddleware() graphql.RequestMiddleware { return func(ctx context.Context, next func(ctx context.Context) []byte) []byte { requestStartedCounter.Inc() - observerStart := time.Now() + // Store some values in a global context. + globals := Globals{} - res := next(ctx) + // Measure the time to serve the request + observerStart := time.Now() + res := next(context.WithValue(ctx, ctxKey, &globals)) + observedDuration := time.Since(observerStart) + observedDurationMS := float64(observedDuration.Nanoseconds()) / float64(time.Millisecond) - rctx := graphql.GetResolverContext(ctx) reqCtx := graphql.GetRequestContext(ctx) - errList := reqCtx.GetErrors(rctx) - var exitStatus string - if len(errList) > 0 { - exitStatus = existStatusFailure - } else { - exitStatus = exitStatusSuccess + // exitStatus label + exitStatusLabel := prometheusclient.Labels{"exitStatus": exitStatusSuccess} + if globals.Failed { + exitStatusLabel["exitStatus"] = existStatusFailure } - timeToHandleRequest.With(prometheusclient.Labels{"exitStatus": exitStatus}). - Observe(float64(time.Since(observerStart).Nanoseconds() / int64(time.Millisecond))) + // queries label + sort.Strings(globals.Queries) + queriesStr := strings.Join(globals.Queries, ";") + queriesLabel := prometheusclient.Labels{ + "queries": queriesStr, + } + // Write metrics + timeToHandleRequest.With(exitStatusLabel).Observe(observedDurationMS) requestCompletedCounter.Inc() + if len(globals.Queries) > 0 { + if globals.Failed { + requestFailureCounter.With(queriesLabel).Inc() + } else { + requestSuccessCounter.With(queriesLabel).Inc() + } + + timeToHandleRequestQueryMAvg.With(queriesLabel).Set(movingAvgLatency("ms:"+queriesStr, observedDurationMS)) + timeToHandleRequestQueryTotal.With(queriesLabel).Add(observedDurationMS) + responseSizeMAvg.With(queriesLabel).Set(movingAvgLatency("bytes:"+queriesStr, float64(len(res)))) + responseSizeTotal.With(queriesLabel).Add(float64(len(res))) + } + requestComplexityHisto.Observe(float64(reqCtx.OperationComplexity)) return res } } + +func movingAvgLatency(key string, sample float64) float64 { + latencyMapMutex.Lock() + latency, ok := latencyMap[key] + if ok { + latency = (latency * 0.9) + (sample * 0.1) + latencyMap[key] = latency + } else { + latencyMap[key] = sample + latency = sample + } + latencyMapMutex.Unlock() + + return latency +} + +func argsLabel(args map[string]interface{}) string { + result := "" + + for key, value := range args { + if len(result) > 0 { + result += "," + } + + result = key + + rv := reflect.ValueOf(value) + for rv.Kind() == reflect.Ptr { + rv = rv.Elem() + } + + if rv.Kind() != reflect.Struct { + continue + } + + result += "{" + + first := true + for i := 0; i < rv.NumField(); i++ { + rf := rv.Field(i) + if (rf.Kind() == reflect.Ptr || rf.Kind() == reflect.Interface) && rf.IsNil() { + continue + } + + rtf := rv.Type().Field(i) + + // Empty limits are as good as no limit + if rtf.Name == "Limit" && rf.Type().Kind() == reflect.Int && rf.Int() <= 0 { + continue + } + + if !first { + result += "," + } + first = false + + jsonTag := rtf.Tag.Get("json") + if jsonTag != "" { + result += jsonTag + } else { + result += strings.ToLower(rtf.Name[0:1]) + rtf.Name[1:] + } + } + + result += "}" + } + + return result +}