From 084a3cddc613bdaa3a0cbebcef69d8e110284ea0 Mon Sep 17 00:00:00 2001 From: Daniel Dreier Date: Sun, 1 Sep 2024 19:52:29 -0500 Subject: [PATCH] Consul registry performance improvements (#928) Two changes: 1. Limit the initial list of Health Checks to those with the configured Tag Prefix 2. Reduce the amount of loops done in passingServices --- registry/consul/passing.go | 90 +++++++++++++------------------------- registry/consul/service.go | 24 +++++++++- 2 files changed, 53 insertions(+), 61 deletions(-) diff --git a/registry/consul/passing.go b/registry/consul/passing.go index e566c6a9b..3537b7ce1 100644 --- a/registry/consul/passing.go +++ b/registry/consul/passing.go @@ -7,28 +7,48 @@ import ( "github.com/hashicorp/consul/api" ) +// passingServices takes a list of Consul Health Checks and only returns ones where the overall health of +// the Service Instance is passing. This includes the health of the Node that the Services Instance runs on. func passingServices(checks []*api.HealthCheck, status []string, strict bool) []*api.HealthCheck { var p []*api.HealthCheck + +CHECKS: for _, svc := range checks { if !isServiceCheck(svc) { continue } - total, passing := countChecks(svc, checks, status) + var total, passing int + + for _, c := range checks { + if svc.Node == c.Node { + if svc.ServiceID == c.ServiceID { + total++ + if hasStatus(c, status) { + passing++ + } + } + if c.CheckID == "serfHealth" && c.Status == "critical" { + log.Printf("[DEBUG] consul: Skipping service %q since agent on node %q is down: %s", c.ServiceID, c.Node, c.Output) + continue CHECKS + } + if c.CheckID == "_node_maintenance" { + log.Printf("[DEBUG] consul: Skipping service %q since node %q is in maintenance mode: %s", c.ServiceID, c.Node, c.Output) + continue CHECKS + } + if c.CheckID == "_service_maintenance:"+svc.ServiceID && c.Status == "critical" { + log.Printf("[DEBUG] consul: Skipping service %q since it is in maintenance mode: %s", svc.ServiceID, c.Output) + continue CHECKS + } + } + } + if passing == 0 { continue } if strict && total != passing { continue } - if isAgentCritical(svc, checks) { - continue - } - if isNodeInMaintenance(svc, checks) { - continue - } - if isServiceInMaintenance(svc, checks) { - continue - } + p = append(p, svc) } @@ -43,56 +63,6 @@ func isServiceCheck(c *api.HealthCheck) bool { !strings.HasPrefix(c.CheckID, "_service_maintenance:") } -// isAgentCritical returns true if the agent on the node on which the service -// runs is critical. -func isAgentCritical(svc *api.HealthCheck, checks []*api.HealthCheck) bool { - for _, c := range checks { - if svc.Node == c.Node && c.CheckID == "serfHealth" && c.Status == "critical" { - log.Printf("[DEBUG] consul: Skipping service %q since agent on node %q is down: %s", c.ServiceID, c.Node, c.Output) - return true - } - } - return false -} - -// isNodeInMaintenance returns true if the node on which the service runs is in -// maintenance mode. -func isNodeInMaintenance(svc *api.HealthCheck, checks []*api.HealthCheck) bool { - for _, c := range checks { - if svc.Node == c.Node && c.CheckID == "_node_maintenance" { - log.Printf("[DEBUG] consul: Skipping service %q since node %q is in maintenance mode: %s", c.ServiceID, c.Node, c.Output) - return true - } - } - return false -} - -// isServiceInMaintenance returns true if the service instance is in -// maintenance mode. -func isServiceInMaintenance(svc *api.HealthCheck, checks []*api.HealthCheck) bool { - for _, c := range checks { - if svc.Node == c.Node && c.CheckID == "_service_maintenance:"+svc.ServiceID && c.Status == "critical" { - log.Printf("[DEBUG] consul: Skipping service %q since it is in maintenance mode: %s", svc.ServiceID, c.Output) - return true - } - } - return false -} - -// countChecks counts the number of service checks exist for a given service -// and how many of them are passing. -func countChecks(svc *api.HealthCheck, checks []*api.HealthCheck, status []string) (total int, passing int) { - for _, c := range checks { - if svc.Node == c.Node && svc.ServiceID == c.ServiceID { - total++ - if hasStatus(c, status) { - passing++ - } - } - } - return -} - // hasStatus returns true if the health check status is one of the given // values. func hasStatus(c *api.HealthCheck, status []string) bool { diff --git a/registry/consul/service.go b/registry/consul/service.go index 363363c14..37e84bbe5 100644 --- a/registry/consul/service.go +++ b/registry/consul/service.go @@ -48,8 +48,11 @@ func (w *ServiceMonitor) Watch(updates chan string) { } log.Printf("[DEBUG] consul: Health changed to #%d", meta.LastIndex) + prefixedChecks := checksWithTagPrefix(w.config.TagPrefix, checks) + log.Printf("[DEBUG] consul: only %d of %d checks have the configured tag prefix", len(prefixedChecks), len(checks)) + // determine which services have passing health checks - passing := passingServices(checks, w.config.ServiceStatus, w.strict) + passing := passingServices(prefixedChecks, w.config.ServiceStatus, w.strict) // build the config for the passing services updates <- w.makeConfig(passing) @@ -138,3 +141,22 @@ func (w *ServiceMonitor) serviceConfig(name string, passing map[string]bool) (co } return config } + +// checksWithTagPrefix filters a list of Consul Health Checks to only the Checks with a Tag that begins with the prefix +func checksWithTagPrefix(prefix string, checks api.HealthChecks) api.HealthChecks { + checksWithPrefix := make(api.HealthChecks, 0, len(checks)) + for _, c := range checks { + if c.CheckID == "serfHealth" || c.CheckID == "_node_maintenance" || strings.HasPrefix(c.CheckID, "_service_maintenance") { + checksWithPrefix = append(checksWithPrefix, c) + continue + } + for _, t := range c.ServiceTags { + if strings.HasPrefix(t, prefix) { + checksWithPrefix = append(checksWithPrefix, c) + break + } + } + } + + return checksWithPrefix +}