summary refs log tree commit diff stats
path: root/monitoring/ceph-mixin
diff options
context:
space:
mode:
author Guillaume Abrioux <gabrioux@ibm.com> 2023-10-04 12:00:26 +0200
committer Guillaume Abrioux <gabrioux@ibm.com> 2024-01-25 15:43:30 +0100
commit 76d8e0bbbf2c5130a325943ffe09791cbd4f2feb (patch)
tree 3488ac9fc5af4143118b03a93a50a1e58e632da0 /monitoring/ceph-mixin
parent node-proxy: validate_node_proxy_data() refactor (diff)
download ceph-76d8e0bbbf2c5130a325943ffe09791cbd4f2feb.tar.xz
ceph-76d8e0bbbf2c5130a325943ffe09791cbd4f2feb.zip
monitoring: add new alerts
This adds new hardware monitoring alerts.

Signed-off-by: Guillaume Abrioux <gabrioux@ibm.com>
Diffstat (limited to 'monitoring/ceph-mixin')
-rw-r--r-- monitoring/ceph-mixin/prometheus_alerts.libsonnet 65
-rw-r--r-- monitoring/ceph-mixin/prometheus_alerts.yml 62
-rw-r--r-- monitoring/ceph-mixin/tests_alerts/test_alerts.yml 144
3 files changed, 271 insertions, 0 deletions
diff --git a/monitoring/ceph-mixin/prometheus_alerts.libsonnet b/monitoring/ceph-mixin/prometheus_alerts.libsonnet
index b90573be6c0..b5fed4fe7cf 100644
--- a/monitoring/ceph-mixin/prometheus_alerts.libsonnet
+++ b/monitoring/ceph-mixin/prometheus_alerts.libsonnet
@@ -690,6 +690,71 @@
],
},
{
+ name: 'hardware',
+ rules: [
+ {
+ alert: 'HardwareStorageError',
+ 'for': '30s',
+ expr: 'ceph_health_detail{name="HARDWARE_STORAGE"} > 0',
+ labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.13.1' },
+ annotations: {
+ summary: 'Storage devices error(s) detected%(cluster)s' % $.MultiClusterSummary(),
+ description: 'Some storage devices are in error. Check `ceph health detail`.',
+ },
+ },
+ {
+ alert: 'HardwareMemoryError',
+ 'for': '30s',
+ expr: 'ceph_health_detail{name="HARDWARE_MEMORY"} > 0',
+ labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.13.2' },
+ annotations: {
+ summary: 'DIMM error(s) detected%(cluster)s' % $.MultiClusterSummary(),
+ description: 'DIMM error(s) detected. Check `ceph health detail`.',
+ },
+ },
+ {
+ alert: 'HardwareProcessorError',
+ 'for': '30s',
+ expr: 'ceph_health_detail{name="HARDWARE_PROCESSOR"} > 0',
+ labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.13.3' },
+ annotations: {
+ summary: 'Processor error(s) detected%(cluster)s' % $.MultiClusterSummary(),
+ description: 'Processor error(s) detected. Check `ceph health detail`.',
+ },
+ },
+ {
+ alert: 'HardwareNetworkError',
+ 'for': '30s',
+ expr: 'ceph_health_detail{name="HARDWARE_NETWORK"} > 0',
+ labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.13.4' },
+ annotations: {
+ summary: 'Network error(s) detected%(cluster)s' % $.MultiClusterSummary(),
+ description: 'Network error(s) detected. Check `ceph health detail`.',
+ },
+ },
+ {
+ alert: 'HardwarePowerError',
+ 'for': '30s',
+ expr: 'ceph_health_detail{name="HARDWARE_POWER"} > 0',
+ labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.13.5' },
+ annotations: {
+ summary: 'Power supply error(s) detected%(cluster)s' % $.MultiClusterSummary(),
+ description: 'Power supply error(s) detected. Check `ceph health detail`.',
+ },
+ },
+ {
+ alert: 'HardwareFanError',
+ 'for': '30s',
+ expr: 'ceph_health_detail{name="HARDWARE_FANS"} > 0',
+ labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.13.6' },
+ annotations: {
+ summary: 'Fan error(s) detected%(cluster)s' % $.MultiClusterSummary(),
+ description: 'Fan error(s) detected. Check `ceph health detail`.',
+ },
+ },
+ ],
+ },
+ {
name: 'PrometheusServer',
rules: [
{
diff --git a/monitoring/ceph-mixin/prometheus_alerts.yml b/monitoring/ceph-mixin/prometheus_alerts.yml
index b549d6bf49a..5b011aa3568 100644
--- a/monitoring/ceph-mixin/prometheus_alerts.yml
+++ b/monitoring/ceph-mixin/prometheus_alerts.yml
@@ -614,6 +614,68 @@ groups:
labels:
severity: "warning"
type: "ceph_default"
+ - name: "hardware"
+ rules:
+ - alert: "HardwareStorageError"
+ annotations:
+ description: "Some storage devices are in error. Check `ceph health detail`."
+ summary: "Storage devices error(s) detected"
+ expr: "ceph_health_detail{name=\"HARDWARE_STORAGE\"} > 0"
+ for: "30s"
+ labels:
+ oid: "1.3.6.1.4.1.50495.1.2.1.13.1"
+ severity: "critical"
+ type: "ceph_default"
+ - alert: "HardwareMemoryError"
+ annotations:
+ description: "DIMM error(s) detected. Check `ceph health detail`."
+ summary: "DIMM error(s) detected"
+ expr: "ceph_health_detail{name=\"HARDWARE_MEMORY\"} > 0"
+ for: "30s"
+ labels:
+ oid: "1.3.6.1.4.1.50495.1.2.1.13.2"
+ severity: "critical"
+ type: "ceph_default"
+ - alert: "HardwareProcessorError"
+ annotations:
+ description: "Processor error(s) detected. Check `ceph health detail`."
+ summary: "Processor error(s) detected"
+ expr: "ceph_health_detail{name=\"HARDWARE_PROCESSOR\"} > 0"
+ for: "30s"
+ labels:
+ oid: "1.3.6.1.4.1.50495.1.2.1.13.3"
+ severity: "critical"
+ type: "ceph_default"
+ - alert: "HardwareNetworkError"
+ annotations:
+ description: "Network error(s) detected. Check `ceph health detail`."
+ summary: "Network error(s) detected"
+ expr: "ceph_health_detail{name=\"HARDWARE_NETWORK\"} > 0"
+ for: "30s"
+ labels:
+ oid: "1.3.6.1.4.1.50495.1.2.1.13.4"
+ severity: "critical"
+ type: "ceph_default"
+ - alert: "HardwarePowerError"
+ annotations:
+ description: "Power supply error(s) detected. Check `ceph health detail`."
+ summary: "Power supply error(s) detected"
+ expr: "ceph_health_detail{name=\"HARDWARE_POWER\"} > 0"
+ for: "30s"
+ labels:
+ oid: "1.3.6.1.4.1.50495.1.2.1.13.5"
+ severity: "critical"
+ type: "ceph_default"
+ - alert: "HardwareFanError"
+ annotations:
+ description: "Fan error(s) detected. Check `ceph health detail`."
+ summary: "Fan error(s) detected"
+ expr: "ceph_health_detail{name=\"HARDWARE_FANS\"} > 0"
+ for: "30s"
+ labels:
+ oid: "1.3.6.1.4.1.50495.1.2.1.13.6"
+ severity: "critical"
+ type: "ceph_default"
- name: "PrometheusServer"
rules:
- alert: "PrometheusJobMissing"
diff --git a/monitoring/ceph-mixin/tests_alerts/test_alerts.yml b/monitoring/ceph-mixin/tests_alerts/test_alerts.yml
index 9eaa096d5db..9cb688e9ca0 100644
--- a/monitoring/ceph-mixin/tests_alerts/test_alerts.yml
+++ b/monitoring/ceph-mixin/tests_alerts/test_alerts.yml
@@ -2075,3 +2075,147 @@ tests:
description: "Detected a heavy increase in bandwidth for rbd replications (over 80%) in the last 30 min. This might not be a problem, but it is good to review the number of images being replicated simultaneously"
summary: "The replication network usage has been increased over 80% in the last 30 minutes. Review the number of images being replicated. This alert will be cleaned automatically after 30 minutes"
+ - interval: 30s
+ input_series:
+ - series: 'ceph_health_detail{name="HARDWARE_STORAGE"}'
+ values: '1+0x40'
+ promql_expr_test:
+ - expr: ceph_health_detail{name="HARDWARE_STORAGE"} > 0
+ eval_time: 2m
+ exp_samples:
+ - labels: '{__name__="ceph_health_detail", name="HARDWARE_STORAGE"}'
+ value: 1
+ alert_rule_test:
+ - eval_time: 1m
+ alertname: HardwareStorageError
+ - eval_time: 5m
+ alertname: HardwareStorageError
+ exp_alerts:
+ - exp_labels:
+ name: HARDWARE_STORAGE
+ severity: critical
+ type: ceph_default
+ oid: 1.3.6.1.4.1.50495.1.2.1.13.1
+ exp_annotations:
+ summary: Storage devices error(s) detected
+ description: "Some storage devices are in error. Check `ceph health detail`."
+ - interval: 30s
+ input_series:
+ - series: 'ceph_health_detail{name="HARDWARE_MEMORY"}'
+ values: '1+0x40'
+ promql_expr_test:
+ - expr: ceph_health_detail{name="HARDWARE_MEMORY"} > 0
+ eval_time: 2m
+ exp_samples:
+ - labels: '{__name__="ceph_health_detail", name="HARDWARE_MEMORY"}'
+ value: 1
+ alert_rule_test:
+ - eval_time: 1m
+ alertname: HardwareMemoryError
+ - eval_time: 5m
+ alertname: HardwareMemoryError
+ exp_alerts:
+ - exp_labels:
+ name: HARDWARE_MEMORY
+ severity: critical
+ type: ceph_default
+ oid: 1.3.6.1.4.1.50495.1.2.1.13.2
+ exp_annotations:
+ summary: DIMM error(s) detected
+ description: "DIMM error(s) detected. Check `ceph health detail`."
+ - interval: 30s
+ input_series:
+ - series: 'ceph_health_detail{name="HARDWARE_PROCESSOR"}'
+ values: '1+0x40'
+ promql_expr_test:
+ - expr: ceph_health_detail{name="HARDWARE_PROCESSOR"} > 0
+ eval_time: 2m
+ exp_samples:
+ - labels: '{__name__="ceph_health_detail", name="HARDWARE_PROCESSOR"}'
+ value: 1
+ alert_rule_test:
+ - eval_time: 1m
+ alertname: HardwareProcessorError
+ - eval_time: 5m
+ alertname: HardwareProcessorError
+ exp_alerts:
+ - exp_labels:
+ name: HARDWARE_PROCESSOR
+ severity: critical
+ type: ceph_default
+ oid: 1.3.6.1.4.1.50495.1.2.1.13.3
+ exp_annotations:
+ summary: Processor error(s) detected
+ description: "Processor error(s) detected. Check `ceph health detail`."
+ - interval: 30s
+ input_series:
+ - series: 'ceph_health_detail{name="HARDWARE_NETWORK"}'
+ values: '1+0x40'
+ promql_expr_test:
+ - expr: ceph_health_detail{name="HARDWARE_NETWORK"} > 0
+ eval_time: 2m
+ exp_samples:
+ - labels: '{__name__="ceph_health_detail", name="HARDWARE_NETWORK"}'
+ value: 1
+ alert_rule_test:
+ - eval_time: 1m
+ alertname: HardwareNetworkError
+ - eval_time: 5m
+ alertname: HardwareNetworkError
+ exp_alerts:
+ - exp_labels:
+ name: HARDWARE_NETWORK
+ severity: critical
+ type: ceph_default
+ oid: 1.3.6.1.4.1.50495.1.2.1.13.4
+ exp_annotations:
+ summary: Network error(s) detected
+ description: "Network error(s) detected. Check `ceph health detail`."
+ - interval: 30s
+ input_series:
+ - series: 'ceph_health_detail{name="HARDWARE_POWER"}'
+ values: '1+0x40'
+ promql_expr_test:
+ - expr: ceph_health_detail{name="HARDWARE_POWER"} > 0
+ eval_time: 2m
+ exp_samples:
+ - labels: '{__name__="ceph_health_detail", name="HARDWARE_POWER"}'
+ value: 1
+ alert_rule_test:
+ - eval_time: 1m
+ alertname: HardwarePowerError
+ - eval_time: 5m
+ alertname: HardwarePowerError
+ exp_alerts:
+ - exp_labels:
+ name: HARDWARE_POWER
+ severity: critical
+ type: ceph_default
+ oid: 1.3.6.1.4.1.50495.1.2.1.13.5
+ exp_annotations:
+ summary: Power supply error(s) detected
+ description: "Power supply error(s) detected. Check `ceph health detail`."
+ - interval: 30s
+ input_series:
+ - series: 'ceph_health_detail{name="HARDWARE_FANS"}'
+ values: '1+0x40'
+ promql_expr_test:
+ - expr: ceph_health_detail{name="HARDWARE_FANS"} > 0
+ eval_time: 2m
+ exp_samples:
+ - labels: '{__name__="ceph_health_detail", name="HARDWARE_FANS"}'
+ value: 1
+ alert_rule_test:
+ - eval_time: 1m
+ alertname: HardwareFanError
+ - eval_time: 5m
+ alertname: HardwareFanError
+ exp_alerts:
+ - exp_labels:
+ name: HARDWARE_FANS
+ severity: critical
+ type: ceph_default
+ oid: 1.3.6.1.4.1.50495.1.2.1.13.6
+ exp_annotations:
+ summary: Fan error(s) detected
+ description: "Fan error(s) detected. Check `ceph health detail`."