Skip to content

Commit

Permalink
Merge pull request #82 from w3f/test-endpoint
Browse files Browse the repository at this point in the history
alert now has to be triggered manually
  • Loading branch information
ironoa authored Dec 17, 2021
2 parents 4677a7d + c8878be commit 464eead
Show file tree
Hide file tree
Showing 14 changed files with 145 additions and 110 deletions.
21 changes: 11 additions & 10 deletions .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ jobs:
unitTest:
docker:
- image: web3f/node-dind:v1.0.0
- image: parity/polkadot:v0.9.12
- image: parity/polkadot:v0.9.13
name: polkadot
command: --chain=kusama-dev --ws-port 11000 --alice --ws-external --rpc-methods=Unsafe --rpc-cors=all
steps:
Expand All @@ -23,7 +23,7 @@ jobs:

helmLint:
docker:
- image: web3f/ci-commons:v2.4.8
- image: web3f/ci-commons:v2.4.11
steps:
- checkout
- run:
Expand All @@ -32,7 +32,7 @@ jobs:
buildImage:
docker:
- image: web3f/ci-commons:v2.4.8
- image: web3f/ci-commons:v2.4.11
steps:
- checkout
- setup_remote_docker:
Expand All @@ -44,7 +44,7 @@ jobs:
integrationTests:
docker:
- image: web3f/ci-commons:v2.4.8
- image: web3f/ci-commons:v2.4.11
steps:
- checkout
- setup_remote_docker
Expand All @@ -55,17 +55,18 @@ jobs:
testPrometheusRules:
docker:
- image: web3f/ci-commons:v2.4.8
- image: web3f/ci-commons:v2.4.11
steps:
- checkout
- run:
name: Install missing dependencies
command: |
YQ_VER=3.4.1
wget -O /usr/local/bin/yq https://github.com/mikefarah/yq/releases/download/${YQ_VER}/yq_linux_amd64
YQ_VER=4.16.1
wget -O /usr/local/bin/yq https://github.com/mikefarah/yq/releases/download/v${YQ_VER}/yq_linux_amd64
chmod +x /usr/local/bin/yq
PROM_VER=2.30.1
PROM_VER=2.31.1
wget -O /tmp/prometheus.tgz https://github.com/prometheus/prometheus/releases/download/v${PROM_VER}/prometheus-${PROM_VER}.linux-amd64.tar.gz
tar -xvf /tmp/prometheus.tgz prometheus-${PROM_VER}.linux-amd64/promtool -C /tmp
mv /tmp/prometheus-$PROM_VER.linux-amd64/promtool /usr/local/bin/
Expand All @@ -76,7 +77,7 @@ jobs:
publishImage:
docker:
- image: web3f/ci-commons:v2.4.8
- image: web3f/ci-commons:v2.4.11
steps:
- checkout
- setup_remote_docker
Expand All @@ -86,7 +87,7 @@ jobs:
publishChart:
docker:
- image: web3f/ci-commons:v2.4.8
- image: web3f/ci-commons:v2.4.11
steps:
- checkout
- run:
Expand Down
2 changes: 1 addition & 1 deletion charts/polkadot-watcher/Chart.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
description: Polkadot Watcher
name: polkadot-watcher
version: v3.1.2
version: v3.1.3
apiVersion: v2
16 changes: 8 additions & 8 deletions charts/polkadot-watcher/templates/alertrules.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
{{ if ne .Values.environment "ci" }}
{{ if and .Values.prometheusRules.enabled ( ne .Values.environment "ci" ) }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
Expand All @@ -17,7 +17,7 @@ spec:
for: 60m
labels:
severity: warning
origin: {{ .Values.origin }}
origin: {{ .Values.prometheusRules.origin }}
- alert: ProducerStallLong
annotations:
message: 'Blocks were not produced for 3 hours by {{`{{ $labels.name }}`}}'
Expand All @@ -26,16 +26,16 @@ spec:
for: 180m
labels:
severity: critical
origin: {{ .Values.origin }}
origin: {{ .Values.prometheusRules.origin }}
- alert: ValidatorOffline
annotations:
message: '{{`{{ $labels.name }}`}} was reported offline, check https://polkascan.io/pre/{{ .Values.networkId }}/event?module=imonline&event=SomeOffline&page=1 for details'
# Both links are built from the network and address labels of the firing series.
message: '{{`{{ $labels.name }}`}} was reported offline, check https://{{`{{ $labels.network }}`}}.subscan.io/event?address=&module=imonline&event=someoffline for details. Check the account for eventual slashes: https://{{`{{ $labels.network }}`}}.subscan.io/account/{{`{{ $labels.address }}`}}?tab=reward'
runbook_url: "https://github.com/w3f/infrastructure/wiki/Validator-Offline"
expr: polkadot_offline_validator_reports_total > 0
for: 30s
labels:
severity: critical
origin: {{ .Values.origin }}
origin: {{ .Values.prometheusRules.origin }}
{{ if ne .Values.prometheusRules.heartbeat false }}
- alert: ValidatorOfflineSessionLong
annotations:
Expand All @@ -44,15 +44,15 @@ spec:
for: 10m
labels:
severity: critical
origin: {{ .Values.origin }}
origin: {{ .Values.prometheusRules.origin }}
- alert: ValidatorOfflineSessionShort
annotations:
message: 'Target {{`{{ $labels.name }}`}} has either not authored any block or sent any heartbeat yet in this session'
expr: polkadot_offline_validator_session_reports_state > 0
for: 8m
labels:
severity: warning
origin: {{ .Values.origin }}
origin: {{ .Values.prometheusRules.origin }}
{{ end }}
- alert: ValidatorOutOfActiveSet
annotations:
Expand All @@ -61,6 +61,6 @@ spec:
for: 2m
labels:
severity: info
origin: {{ .Values.origin }}
origin: {{ .Values.prometheusRules.origin }}

{{ end }}
2 changes: 1 addition & 1 deletion charts/polkadot-watcher/templates/servicemonitor.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
{{ if ne .Values.environment "ci" }}
{{ if and .Values.serviceMonitor.enabled ( ne .Values.environment "ci" ) }}
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
Expand Down
7 changes: 4 additions & 3 deletions charts/polkadot-watcher/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ port: 3000

image:
repo: web3f/polkadot-watcher
tag: v3.1.2
tag: v3.1.3

config:
endpoint: "wss://kusama-rpc.polkadot.io"
Expand All @@ -12,19 +12,20 @@ config:
validators: []

serviceMonitor:
enabled: true #to be enabled for each instance
labels:
group: w3f
release: prometheus-operator

prometheusRules:
enabled: false #if you have multiple instances, enable it just once to avoid duplicated alerts
labels:
app: w3f
origin: cluster
heartbeat: true

networkId: kusama

origin: cluster

resources:
requests:
cpu: "100m"
Expand Down
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "polkadot-watcher",
"version": "3.1.2",
"version": "3.1.3",
"description": "Monitor events on Polkadot networks",
"repository": "git@github.com:w3f/polkadot-watcher.git",
"author": "W3F Infrastructure Team <devops@web3.foundation>",
Expand Down
4 changes: 2 additions & 2 deletions scripts/test_prometheus_rules.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,6 @@ CHART_NAME="polkadot-watcher"
for file in $(find "${TEST_FOLDER}" -name *yaml | sed 's|^'"${TEST_FOLDER}"/'||'); do
template_name=${file##*/}

helm template --set monitoring=true -s "templates/${template_name}" "charts/${CHART_NAME}" | yq r - spec | promtool check rules /dev/stdin
helm template --set monitoring=true -s "templates/${template_name}" "charts/${CHART_NAME}" | yq r - spec | promtool test rules "${TEST_FOLDER}/${template_name}"
helm template --set prometheusRules.enabled=true -s "templates/${template_name}" "charts/${CHART_NAME}" | yq e '.spec' - | promtool check rules /dev/stdin
helm template --set prometheusRules.enabled=true -s "templates/${template_name}" "charts/${CHART_NAME}" | yq e '.spec' - | promtool test rules "${TEST_FOLDER}/${template_name}"
done
19 changes: 17 additions & 2 deletions src/actions/start.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,16 @@ import { Config } from '@w3f/config';
import { Subscriber } from '../subscriber';
import { Prometheus } from '../prometheus';
import { InputConfig } from '../types';
import { Client } from '../client';

/**
 * Registers a `GET /test` route on the given express application.
 *
 * The handler calls `subscriber.triggerConnectivityTest()` and then answers
 * with HTTP 200 and a fixed text body.
 *
 * @param server - express application the route is attached to
 * @param subscriber - subscriber whose connectivity test is triggered per request
 */
const _addTestEndpoint = (server: express.Application, subscriber: Subscriber): void => {
  const handleTestRequest = async (_req: express.Request, res: express.Response): Promise<void> => {
    // Kick off the test first, then acknowledge the request.
    subscriber.triggerConnectivityTest()
    res.status(200).send('A test alert should fire and then resolve...')
  }

  server.get('/test', handleTestRequest)
}

export async function startAction(cmd): Promise<void> {
const cfg = new Config<InputConfig>().parse(cmd.config);
Expand All @@ -19,10 +28,16 @@ export async function startAction(cmd): Promise<void> {

const logger = createLogger(cfg.logLevel);

const promClient = new Prometheus(logger);
const api = await new Client(cfg,logger).connect()
const chain = await api.rpc.system.chain()
const networkId = chain.toString().toLowerCase()

const promClient = new Prometheus(networkId,logger);
promClient.injectMetricsRoute(server);
promClient.startCollection();

const subscriber = new Subscriber(cfg, promClient, logger);
const subscriber = new Subscriber(cfg, api, promClient, logger);
await subscriber.start();

_addTestEndpoint(server,subscriber)
}
51 changes: 51 additions & 0 deletions src/client.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
import { ApiPromise, WsProvider } from '@polkadot/api';
import { Logger } from '@w3f/logger';

import {
InputConfig,
} from './types';

/**
 * Opens an `ApiPromise` connection to the endpoint given in the input
 * configuration and hands the connected API object back to the caller.
 *
 * Any failure while initialising the API, and any API "error" event whose
 * text contains "FATAL", is logged and terminates the process with exit code 1.
 */
export class Client {
  private api: ApiPromise;
  private endpoint: string;

  /**
   * @param cfg - input configuration; only `cfg.endpoint` is read here
   * @param logger - logger used for connection info and error reporting
   */
  constructor(
    cfg: InputConfig,
    private readonly logger: Logger) {
    this.endpoint = cfg.endpoint;
  }

  /**
   * Initialises the API and returns it.
   *
   * @returns the `ApiPromise` instance created by `_initAPI`
   */
  public async connect(): Promise<ApiPromise> {
    try {
      await this._initAPI();
    } catch (error) {
      // JSON.stringify() on an Error yields "{}", which would hide the actual
      // cause; _describeError keeps the message/stack in the log line.
      this.logger.error("initAPI error... exiting: " + Client._describeError(error))
      process.exit(1)
    }
    return this.api
  }

  /**
   * Creates the WebSocket provider and the API object, installs the
   * fatal-error handler, awaits `isReadyOrError`, and logs the chain name,
   * node name and node version reported by the node.
   */
  private async _initAPI(): Promise<void> {
    const provider = new WsProvider(this.endpoint);
    this.api = new ApiPromise({provider})

    // The handler is attached before awaiting readiness so that errors
    // emitted while connecting are seen as well.
    this.api.on("error", error => {
      if( error.toString().includes("FATAL") || JSON.stringify(error).includes("FATAL") ){
        this.logger.error("The API had a FATAL error... exiting!")
        process.exit(1)
      }
    })

    await this.api.isReadyOrError;

    const [chain, nodeName, nodeVersion] = await Promise.all([
      this.api.rpc.system.chain(),
      this.api.rpc.system.name(),
      this.api.rpc.system.version()
    ]);
    this.logger.info(
      `You are connected to chain ${chain} using ${nodeName} v${nodeVersion}`
    );
  }

  /**
   * Turns a caught value into readable text for logging.
   *
   * `Error` instances are rendered through their stack (or message, when no
   * stack is available); strings are returned unchanged; anything else is
   * JSON-encoded, falling back to `String()` when encoding fails or yields
   * nothing.
   */
  private static _describeError(error: unknown): string {
    if (error instanceof Error) {
      return error.stack || error.message;
    }
    if (typeof error === "string") {
      return error;
    }
    try {
      return JSON.stringify(error) || String(error);
    } catch (_serializationError) {
      // e.g. circular structures
      return String(error);
    }
  }

}
36 changes: 18 additions & 18 deletions src/prometheus.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ export class Prometheus implements PromClient {
private stateValidatorOfflineSessionReports: promClient.Gauge;
private stateValidatorOutOfActiveSetReports: promClient.Gauge;

constructor(private readonly logger: Logger) {
constructor(private readonly network: string, private readonly logger: Logger) {
this._initMetrics()
}

Expand All @@ -32,64 +32,64 @@ export class Prometheus implements PromClient {
})
}

increaseTotalBlocksProduced(name: string, account: string): void {
this.totalBlocksProduced.inc({ name, account })
increaseTotalBlocksProduced(name: string, address: string): void {
this.totalBlocksProduced.inc({network:this.network, name, address })
}

initTotalBlocksProduced(name: string, account: string): void {
this.totalBlocksProduced.inc({ name, account },0)
initTotalBlocksProduced(name: string, address: string): void {
this.totalBlocksProduced.inc({network:this.network, name, address },0)
}

/* condition where you have been reported offline */
increaseTotalValidatorOfflineReports(name: string): void {
this.totalValidatorOfflineReports.inc({ name });
increaseTotalValidatorOfflineReports(name: string, address: string): void {
this.totalValidatorOfflineReports.inc({network:this.network, name, address });
}

resetTotalValidatorOfflineReports(name: string): void {
this.totalValidatorOfflineReports.set({ name }, 0);
resetTotalValidatorOfflineReports(name: string, address: string): void {
this.totalValidatorOfflineReports.set({network:this.network, name, address }, 0);
}

/* condition where you are risking to be reported as offline */
setStatusValidatorOffline(name: string): void {
this.stateValidatorOfflineSessionReports.set({ name }, 1);
this.stateValidatorOfflineSessionReports.set({network:this.network, name }, 1);
}

resetStatusValidatorOffline(name: string): void {
this.stateValidatorOfflineSessionReports.set({ name }, 0);
this.stateValidatorOfflineSessionReports.set({network:this.network, name }, 0);
}

isValidatorStatusOffline(name: string): boolean {
return promClient.register.getSingleMetric(Prometheus.nameValidatorOfflineSessionMetric)['hashMap']['name:'+name]['value'] === 1
return promClient.register.getSingleMetric(Prometheus.nameValidatorOfflineSessionMetric)['hashMap'][`name:${name},network:${this.network}`]['value'] === 1
}

setStatusValidatorOutOfActiveSet(name: string): void{
this.stateValidatorOutOfActiveSetReports.set({ name }, 1);
this.stateValidatorOutOfActiveSetReports.set({network:this.network, name }, 1);
}

resetStatusValidatorOutOfActiveSet(name: string): void{
this.stateValidatorOutOfActiveSetReports.set({ name }, 0);
this.stateValidatorOutOfActiveSetReports.set({network:this.network, name }, 0);
}

_initMetrics(): void {
this.totalBlocksProduced = new promClient.Counter({
name: 'polkadot_blocks_produced_total',
help: 'Total number of blocks produced by a validator',
labelNames: ['name', 'account']
labelNames: ['network', 'name', 'address']
});
this.totalValidatorOfflineReports = new promClient.Gauge({
name: 'polkadot_offline_validator_reports_total',
help: 'Total times a validator has been reported offline',
labelNames: ['name']
labelNames: ['network', 'name', 'address']
});
this.stateValidatorOfflineSessionReports = new promClient.Gauge({
name: Prometheus.nameValidatorOfflineSessionMetric,
help: 'Whether a validator is reported as offline in the current session',
labelNames: ['name']
labelNames: ['network', 'name']
});
this.stateValidatorOutOfActiveSetReports = new promClient.Gauge({
name: 'polkadot_validator_out_of_active_set_reports_state',
help: 'Whether a validator is reported as outside of the current Era validators active set',
labelNames: ['name']
labelNames: ['network', 'name']
});
}
}
Loading

0 comments on commit 464eead

Please sign in to comment.