Skip to content

Commit

Permalink
Alloy-Mixin: allow k8s cluster and alloy cluster disable, add logs da…
Browse files Browse the repository at this point in the history
…shboard (#808)
  • Loading branch information
gaantunes authored May 24, 2024
1 parent ae92ce1 commit a30cc8a
Show file tree
Hide file tree
Showing 17 changed files with 965 additions and 684 deletions.
21 changes: 16 additions & 5 deletions operations/alloy-mixin/alerts.libsonnet
Original file line number Diff line number Diff line change
@@ -1,9 +1,20 @@
local clusterAlerts = (import './alerts/clustering.libsonnet');
local controllerAlerts = (import './alerts/controller.libsonnet');
local openTelemetryAlerts = (import './alerts/opentelemetry.libsonnet');

{
local alloyClusterAlerts = [clusterAlerts.newAlloyClusterAlertsGroup($._config.enableK8sCluster)],

local otherAlerts = [
controllerAlerts.newControllerAlertsGroup($._config.enableK8sCluster),
openTelemetryAlerts.newOpenTelemetryAlertsGroup($._config.enableK8sCluster)
],

prometheusAlerts+: {
groups+: [
(import './alerts/clustering.libsonnet'),
(import './alerts/controller.libsonnet'),
(import './alerts/opentelemetry.libsonnet'),
],
groups+:
if $._config.enableAlloyCluster then
alloyClusterAlerts + otherAlerts
else
otherAlerts
},
}
144 changes: 84 additions & 60 deletions operations/alloy-mixin/alerts/clustering.libsonnet
Original file line number Diff line number Diff line change
@@ -1,67 +1,91 @@
local alert = import './utils/alert.jsonnet';

alert.newGroup(
'alloy_clustering',
[
// Cluster not converging.
alert.newRule(
'ClusterNotConverging',
'stddev by (cluster, namespace) (sum without (state) (cluster_node_peers)) != 0',
'Cluster is not converging: nodes report different number of peers in the cluster.',
'10m',
),
{
newAlloyClusterAlertsGroup(enableK8sCluster=true)::
alert.newGroup(
'alloy_clustering',
[
// Cluster not converging.
alert.newRule(
'ClusterNotConverging',
if enableK8sCluster then
'stddev by (cluster, namespace, job) (sum without (state) (cluster_node_peers)) != 0'
else
'stddev by (job) (sum without (state) (cluster_node_peers)) != 0',
'Cluster is not converging: nodes report different number of peers in the cluster.',
'10m',
),

alert.newRule(
'ClusterNodeCountMismatch',
// Assert that the number of known peers (regardless of state) reported by each
// Alloy instance matches the number of running Alloy instances in the
// same cluster and namespace as reported by a count of Prometheus
// metrics.
|||
sum without (state) (cluster_node_peers) !=
on (cluster, namespace, job) group_left
count by (cluster, namespace, job) (cluster_node_info)
|||,
'Nodes report different number of peers vs. the count of observed Alloy metrics. Some Alloy metrics may be missing or the cluster is in a split brain state.',
'15m',
),
alert.newRule(
'ClusterNodeCountMismatch',
// Assert that the number of known peers (regardless of state) reported by each
// Alloy instance matches the number of running Alloy instances in the
// same cluster and namespace as reported by a count of Prometheus
// metrics.
if enableK8sCluster then |||
sum without (state) (cluster_node_peers) !=
on (cluster, namespace, job) group_left
count by (cluster, namespace, job) (cluster_node_info)
||| else |||
sum without (state) (cluster_node_peers) !=
on (job) group_left
count by (job) (cluster_node_info)
|||
,
'Nodes report different number of peers vs. the count of observed Alloy metrics. Some Alloy metrics may be missing or the cluster is in a split brain state.',
'15m',
),
// Nodes health score is not zero.
alert.newRule(
'ClusterNodeUnhealthy',
|||
cluster_node_gossip_health_score > 0
|||,
'Cluster node is reporting a gossip protocol health score > 0.',
'10m',
),
// Nodes health score is not zero.
alert.newRule(
'ClusterNodeUnhealthy',
|||
cluster_node_gossip_health_score > 0
|||,
'Cluster node is reporting a gossip protocol health score > 0.',
'10m',
),
// Node tried to join the cluster with an already-present node name.
alert.newRule(
'ClusterNodeNameConflict',
'sum by (cluster, namespace) (rate(cluster_node_gossip_received_events_total{event="node_conflict"}[2m])) > 0',
'A node tried to join the cluster with a name conflicting with an existing peer.',
'10m',
),
// Node tried to join the cluster with an already-present node name.
alert.newRule(
'ClusterNodeNameConflict',
if enableK8sCluster then
'sum by (cluster, namespace, job) (rate(cluster_node_gossip_received_events_total{event="node_conflict"}[2m])) > 0'
else
'sum by (job) (rate(cluster_node_gossip_received_events_total{event="node_conflict"}[2m])) > 0'
,
'A node tried to join the cluster with a name conflicting with an existing peer.',
'10m',
),
// Node stuck in Terminating state.
alert.newRule(
'ClusterNodeStuckTerminating',
'sum by (cluster, namespace, instance) (cluster_node_peers{state="terminating"}) > 0',
'Cluster node stuck in Terminating state.',
'10m',
),
// Node stuck in Terminating state.
alert.newRule(
'ClusterNodeStuckTerminating',
if enableK8sCluster then
'sum by (cluster, namespace, job, instance) (cluster_node_peers{state="terminating"}) > 0'
else
'sum by (job, instance) (cluster_node_peers{state="terminating"}) > 0'
,
'Cluster node stuck in Terminating state.',
'10m',
),
// Nodes are not using the same configuration file.
alert.newRule(
'ClusterConfigurationDrift',
|||
count without (sha256) (
max by (cluster, namespace, sha256, job) (alloy_config_hash and on(cluster, namespace) cluster_node_info)
) > 1
|||,
'Cluster nodes are not using the same configuration file.',
'5m',
),
]
)
// Nodes are not using the same configuration file.
alert.newRule(
'ClusterConfigurationDrift',
if enableK8sCluster then |||
count without (sha256) (
max by (cluster, namespace, sha256, job) (alloy_config_hash and on(cluster, namespace, job) cluster_node_info)
) > 1
||| else |||
count without (sha256) (
max by (sha256, job) (alloy_config_hash and on(job) cluster_node_info)
) > 1
|||
,
'Cluster nodes are not using the same configuration file.',
'5m',
),
]
)
}
49 changes: 30 additions & 19 deletions operations/alloy-mixin/alerts/controller.libsonnet
Original file line number Diff line number Diff line change
@@ -1,22 +1,33 @@
local alert = import './utils/alert.jsonnet';

alert.newGroup(
'alloy_controller',
[
// Component evaluations are taking too long, which can lead to e.g. stale targets.
alert.newRule(
'SlowComponentEvaluations',
'sum by (cluster, namespace, component_path, component_id) (rate(alloy_component_evaluation_slow_seconds[10m])) > 0',
'Component evaluations are taking too long.',
'15m',
),
{
newControllerAlertsGroup(enableK8sCluster=true):
alert.newGroup(
'alloy_controller',
[
// Component evaluations are taking too long, which can lead to e.g. stale targets.
alert.newRule(
'SlowComponentEvaluations',
if enableK8sCluster then
'sum by (cluster, namespace, job, component_path, component_id) (rate(alloy_component_evaluation_slow_seconds[10m])) > 0'
else
'sum by (job, component_path, component_id) (rate(alloy_component_evaluation_slow_seconds[10m])) > 0'
,
'Component evaluations are taking too long.',
'15m',
),

// Unhealthy components detected.
alert.newRule(
'UnhealthyComponents',
'sum by (cluster, namespace) (alloy_component_controller_running_components{health_type!="healthy"}) > 0',
'Unhealthy components detected.',
'15m',
),
]
)
// Unhealthy components detected.
alert.newRule(
'UnhealthyComponents',
if enableK8sCluster then
'sum by (cluster, namespace, job) (alloy_component_controller_running_components{health_type!="healthy"}) > 0'
else
'sum by (job) (alloy_component_controller_running_components{health_type!="healthy"}) > 0'
,
'Unhealthy components detected.',
'15m',
),
]
)
}
55 changes: 33 additions & 22 deletions operations/alloy-mixin/alerts/opentelemetry.libsonnet
Original file line number Diff line number Diff line change
@@ -1,25 +1,36 @@
local alert = import './utils/alert.jsonnet';

alert.newGroup(
'alloy_otelcol',
[
// An otelcol.receiver component could not push some spans to the pipeline.
// This could be due to reaching a limit such as the ones
// imposed by otelcol.processor.memory_limiter.
alert.newRule(
'OtelcolReceiverRefusedSpans',
'sum by (cluster, namespace) (rate(receiver_refused_spans_ratio_total{}[1m])) > 0',
'The receiver could not push some spans to the pipeline.',
'5m',
),
{
newOpenTelemetryAlertsGroup(enableK8sCluster=true):
alert.newGroup(
'alloy_otelcol',
[
// An otelcol.receiver component could not push some spans to the pipeline.
// This could be due to reaching a limit such as the ones
// imposed by otelcol.processor.memory_limiter.
alert.newRule(
'OtelcolReceiverRefusedSpans',
if enableK8sCluster then
'sum by (cluster, namespace, job) (rate(receiver_refused_spans_ratio_total{}[1m])) > 0'
else
'sum by (job) (rate(receiver_refused_spans_ratio_total{}[1m])) > 0'
,
'The receiver could not push some spans to the pipeline.',
'5m',
),

// The exporter failed to send spans to their destination.
// There could be an issue with the payload or with the destination endpoint.
alert.newRule(
'OtelcolExporterFailedSpans',
'sum by (cluster, namespace) (rate(exporter_send_failed_spans_ratio_total{}[1m])) > 0',
'The exporter failed to send spans to their destination.',
'5m',
),
]
)
// The exporter failed to send spans to their destination.
// There could be an issue with the payload or with the destination endpoint.
alert.newRule(
'OtelcolExporterFailedSpans',
if enableK8sCluster then
'sum by (cluster, namespace, job) (rate(exporter_send_failed_spans_ratio_total{}[1m])) > 0'
else
'sum by (job) (rate(exporter_send_failed_spans_ratio_total{}[1m])) > 0'
,
'The exporter failed to send spans to their destination.',
'5m',
),
]
)
}
12 changes: 12 additions & 0 deletions operations/alloy-mixin/config.libsonnet
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
{
  // Mixin-wide settings. `_config` is hidden (`::`) and declared with `+`, so
  // it merges with any `_config` supplied by whatever object this is added to.
  _config+:: {
    // When true, alert expressions aggregate by `cluster` and `namespace` in
    // addition to `job`, and groupSelector includes the k8s label matchers.
    enableK8sCluster: true,
    // When true, the Alloy clustering alert group and the cluster dashboards
    // are included in the generated output.
    enableAlloyCluster: true,
    // When true, the 'alloy-logs.json' dashboard is generated.
    enableLokiLogs: true,

    // Label-matcher fragments. Each one builds on the previous; they reference
    // the `$cluster`, `$namespace`, `$job` and `$instance` variables.
    k8sClusterSelector: 'cluster=~"$cluster", namespace=~"$namespace"',
    filterSelector: 'job=~"$job"',
    groupSelector:
      if self.enableK8sCluster then
        std.join(', ', [self.k8sClusterSelector, self.filterSelector])
      else
        self.filterSelector,
    instanceSelector: '%s, instance=~"$instance"' % self.groupSelector,

    // NOTE(review): presumably the tag applied to generated dashboards; its
    // consumers are not visible from this file.
    dashboardTag: 'alloy-mixin',
  },
}
28 changes: 20 additions & 8 deletions operations/alloy-mixin/dashboards.libsonnet
Original file line number Diff line number Diff line change
@@ -1,9 +1,21 @@
{
grafanaDashboards+:
(import './dashboards/controller.libsonnet') +
(import './dashboards/resources.libsonnet') +
(import './dashboards/prometheus.libsonnet') +
(import './dashboards/cluster-node.libsonnet') +
(import './dashboards/opentelemetry.libsonnet') +
(import './dashboards/cluster-overview.libsonnet'),
// Shared settings, appended to each dashboard set below.
// NOTE(review): because the sets are merged into the *value* of
// `grafanaDashboards`, the hidden `_config` from config.libsonnet ends up inside
// that object rather than at the mixin root; confirm the dashboard files
// expect to find it there.
local config = import './config.libsonnet';

// Dashboards that are only emitted when `enableAlloyCluster` is true.
local clusteringDashboards =
  (import './dashboards/cluster-node.libsonnet')
  + (import './dashboards/cluster-overview.libsonnet')
  + config;

// Dashboards that are always emitted.
local coreDashboards =
  (import './dashboards/resources.libsonnet')
  + (import './dashboards/controller.libsonnet')
  + (import './dashboards/prometheus.libsonnet')
  + (import './dashboards/opentelemetry.libsonnet')
  + config;

// alloy-logs.libsonnet is merged at the top level (not into the field value)
// because it contributes its own `grafanaDashboards+:` entry.
(import './dashboards/alloy-logs.libsonnet')
+ {
  grafanaDashboards+:
    (if $._config.enableAlloyCluster then clusteringDashboards else {})
    + coreDashboards,
}
35 changes: 35 additions & 0 deletions operations/alloy-mixin/dashboards/alloy-logs.libsonnet
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
local g = import 'github.com/grafana/grafonnet/gen/grafonnet-v10.0.0/main.libsonnet';
local logsDashboard = import 'github.com/grafana/jsonnet-libs/logs-lib/logs/main.libsonnet';

{
  // Labels handed to the logs dashboard; the k8s labels are only included
  // when `enableK8sCluster` is set.
  local logLabels =
    if $._config.enableK8sCluster then
      ['cluster', 'namespace', 'job', 'instance', 'level']
    else
      ['job', 'instance', 'level'],

  // Dashboard as produced by logs-lib, before Alloy-specific adjustments.
  local baseLogs = logsDashboard.new(
    'Alloy logs overview',
    datasourceName='loki_datasource',
    datasourceRegex='',
    filterSelector=$._config.filterSelector,
    labels=logLabels,
    formatParser=null,
    showLogsVolume=true
  ),

  // Alloy-specific overrides layered on top of the base dashboard.
  local alloyOverrides = {
    panels+: {
      // Alloy logs already have timestamp
      logs+: g.panel.logs.options.withShowTime(false),
    },
    dashboards+: {
      // Reuse the links of the 'alloy-resources.json' dashboard and refresh
      // every 10 seconds.
      logs+:
        g.dashboard.withLinksMixin($.grafanaDashboards['alloy-resources.json'].links)
        + g.dashboard.withRefresh('10s'),
    },
  },

  local alloyLogs = baseLogs + alloyOverrides,

  // Contributes 'alloy-logs.json' only when `enableLokiLogs` is true;
  // otherwise adds nothing to grafanaDashboards.
  grafanaDashboards+:
    if $._config.enableLokiLogs then
      { 'alloy-logs.json': alloyLogs.dashboards.logs }
    else
      {},
}
Loading

0 comments on commit a30cc8a

Please sign in to comment.