diff options
author | Guillaume Abrioux <gabrioux@ibm.com> | 2023-10-04 12:00:26 +0200 |
---|---|---|
committer | Guillaume Abrioux <gabrioux@ibm.com> | 2024-01-25 15:43:30 +0100 |
commit | 76d8e0bbbf2c5130a325943ffe09791cbd4f2feb (patch) | |
tree | 3488ac9fc5af4143118b03a93a50a1e58e632da0 /monitoring/ceph-mixin | |
parent | node-proxy: validate_node_proxy_data() refactor (diff) | |
download | ceph-76d8e0bbbf2c5130a325943ffe09791cbd4f2feb.tar.xz ceph-76d8e0bbbf2c5130a325943ffe09791cbd4f2feb.zip |
monitoring: add new alerts
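This adds new hardware monitoring alerts for storage, memory (DIMM), processor, network, power supply, and fan errors. Each alert is critical and fires when the corresponding `ceph_health_detail{name="HARDWARE_*"}` health check has been greater than 0 for 30s.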
Signed-off-by: Guillaume Abrioux <gabrioux@ibm.com>
Diffstat (limited to 'monitoring/ceph-mixin')
-rw-r--r-- | monitoring/ceph-mixin/prometheus_alerts.libsonnet | 65 | ||||
-rw-r--r-- | monitoring/ceph-mixin/prometheus_alerts.yml | 62 | ||||
-rw-r--r-- | monitoring/ceph-mixin/tests_alerts/test_alerts.yml | 144 |
3 files changed, 271 insertions, 0 deletions
diff --git a/monitoring/ceph-mixin/prometheus_alerts.libsonnet b/monitoring/ceph-mixin/prometheus_alerts.libsonnet index b90573be6c0..b5fed4fe7cf 100644 --- a/monitoring/ceph-mixin/prometheus_alerts.libsonnet +++ b/monitoring/ceph-mixin/prometheus_alerts.libsonnet @@ -690,6 +690,71 @@ ], }, { + name: 'hardware', + rules: [ + { + alert: 'HardwareStorageError', + 'for': '30s', + expr: 'ceph_health_detail{name="HARDWARE_STORAGE"} > 0', + labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.13.1' }, + annotations: { + summary: 'Storage devices error(s) detected%(cluster)s' % $.MultiClusterSummary(), + description: 'Some storage devices are in error. Check `ceph health detail`.', + }, + }, + { + alert: 'HardwareMemoryError', + 'for': '30s', + expr: 'ceph_health_detail{name="HARDWARE_MEMORY"} > 0', + labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.13.2' }, + annotations: { + summary: 'DIMM error(s) detected%(cluster)s' % $.MultiClusterSummary(), + description: 'DIMM error(s) detected. Check `ceph health detail`.', + }, + }, + { + alert: 'HardwareProcessorError', + 'for': '30s', + expr: 'ceph_health_detail{name="HARDWARE_PROCESSOR"} > 0', + labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.13.3' }, + annotations: { + summary: 'Processor error(s) detected%(cluster)s' % $.MultiClusterSummary(), + description: 'Processor error(s) detected. Check `ceph health detail`.', + }, + }, + { + alert: 'HardwareNetworkError', + 'for': '30s', + expr: 'ceph_health_detail{name="HARDWARE_NETWORK"} > 0', + labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.13.4' }, + annotations: { + summary: 'Network error(s) detected%(cluster)s' % $.MultiClusterSummary(), + description: 'Network error(s) detected. 
Check `ceph health detail`.', + }, + }, + { + alert: 'HardwarePowerError', + 'for': '30s', + expr: 'ceph_health_detail{name="HARDWARE_POWER"} > 0', + labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.13.5' }, + annotations: { + summary: 'Power supply error(s) detected%(cluster)s' % $.MultiClusterSummary(), + description: 'Power supply error(s) detected. Check `ceph health detail`.', + }, + }, + { + alert: 'HardwareFanError', + 'for': '30s', + expr: 'ceph_health_detail{name="HARDWARE_FANS"} > 0', + labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.13.6' }, + annotations: { + summary: 'Fan error(s) detected%(cluster)s' % $.MultiClusterSummary(), + description: 'Fan error(s) detected. Check `ceph health detail`.', + }, + }, + ], + }, + { name: 'PrometheusServer', rules: [ { diff --git a/monitoring/ceph-mixin/prometheus_alerts.yml b/monitoring/ceph-mixin/prometheus_alerts.yml index b549d6bf49a..5b011aa3568 100644 --- a/monitoring/ceph-mixin/prometheus_alerts.yml +++ b/monitoring/ceph-mixin/prometheus_alerts.yml @@ -614,6 +614,68 @@ groups: labels: severity: "warning" type: "ceph_default" + - name: "hardware" + rules: + - alert: "HardwareStorageError" + annotations: + description: "Some storage devices are in error. Check `ceph health detail`." + summary: "Storage devices error(s) detected" + expr: "ceph_health_detail{name=\"HARDWARE_STORAGE\"} > 0" + for: "30s" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.13.1" + severity: "critical" + type: "ceph_default" + - alert: "HardwareMemoryError" + annotations: + description: "DIMM error(s) detected. Check `ceph health detail`." + summary: "DIMM error(s) detected" + expr: "ceph_health_detail{name=\"HARDWARE_MEMORY\"} > 0" + for: "30s" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.13.2" + severity: "critical" + type: "ceph_default" + - alert: "HardwareProcessorError" + annotations: + description: "Processor error(s) detected. Check `ceph health detail`." 
+ summary: "Processor error(s) detected" + expr: "ceph_health_detail{name=\"HARDWARE_PROCESSOR\"} > 0" + for: "30s" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.13.3" + severity: "critical" + type: "ceph_default" + - alert: "HardwareNetworkError" + annotations: + description: "Network error(s) detected. Check `ceph health detail`." + summary: "Network error(s) detected" + expr: "ceph_health_detail{name=\"HARDWARE_NETWORK\"} > 0" + for: "30s" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.13.4" + severity: "critical" + type: "ceph_default" + - alert: "HardwarePowerError" + annotations: + description: "Power supply error(s) detected. Check `ceph health detail`." + summary: "Power supply error(s) detected" + expr: "ceph_health_detail{name=\"HARDWARE_POWER\"} > 0" + for: "30s" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.13.5" + severity: "critical" + type: "ceph_default" + - alert: "HardwareFanError" + annotations: + description: "Fan error(s) detected. Check `ceph health detail`." + summary: "Fan error(s) detected" + expr: "ceph_health_detail{name=\"HARDWARE_FANS\"} > 0" + for: "30s" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.13.6" + severity: "critical" + type: "ceph_default" - name: "PrometheusServer" rules: - alert: "PrometheusJobMissing" diff --git a/monitoring/ceph-mixin/tests_alerts/test_alerts.yml b/monitoring/ceph-mixin/tests_alerts/test_alerts.yml index 9eaa096d5db..9cb688e9ca0 100644 --- a/monitoring/ceph-mixin/tests_alerts/test_alerts.yml +++ b/monitoring/ceph-mixin/tests_alerts/test_alerts.yml @@ -2075,3 +2075,147 @@ tests: description: "Detected a heavy increase in bandwidth for rbd replications (over 80%) in the last 30 min. This might not be a problem, but it is good to review the number of images being replicated simultaneously" summary: "The replication network usage has been increased over 80% in the last 30 minutes. Review the number of images being replicated. 
This alert will be cleaned automatically after 30 minutes" + - interval: 30s + input_series: + - series: 'ceph_health_detail{name="HARDWARE_STORAGE"}' + values: '1+0x40' + promql_expr_test: + - expr: ceph_health_detail{name="HARDWARE_STORAGE"} > 0 + eval_time: 2m + exp_samples: + - labels: '{__name__="ceph_health_detail", name="HARDWARE_STORAGE"}' + value: 1 + alert_rule_test: + - eval_time: 1m + alertname: HardwareStorageError + - eval_time: 5m + alertname: HardwareStorageError + exp_alerts: + - exp_labels: + name: HARDWARE_STORAGE + severity: critical + type: ceph_default + oid: 1.3.6.1.4.1.50495.1.2.1.13.1 + exp_annotations: + summary: Storage devices error(s) detected + description: "Some storage devices are in error. Check `ceph health detail`." + - interval: 30s + input_series: + - series: 'ceph_health_detail{name="HARDWARE_MEMORY"}' + values: '1+0x40' + promql_expr_test: + - expr: ceph_health_detail{name="HARDWARE_MEMORY"} > 0 + eval_time: 2m + exp_samples: + - labels: '{__name__="ceph_health_detail", name="HARDWARE_MEMORY"}' + value: 1 + alert_rule_test: + - eval_time: 1m + alertname: HardwareMemoryError + - eval_time: 5m + alertname: HardwareMemoryError + exp_alerts: + - exp_labels: + name: HARDWARE_MEMORY + severity: critical + type: ceph_default + oid: 1.3.6.1.4.1.50495.1.2.1.13.2 + exp_annotations: + summary: DIMM error(s) detected + description: "DIMM error(s) detected. Check `ceph health detail`." 
+ - interval: 30s + input_series: + - series: 'ceph_health_detail{name="HARDWARE_PROCESSOR"}' + values: '1+0x40' + promql_expr_test: + - expr: ceph_health_detail{name="HARDWARE_PROCESSOR"} > 0 + eval_time: 2m + exp_samples: + - labels: '{__name__="ceph_health_detail", name="HARDWARE_PROCESSOR"}' + value: 1 + alert_rule_test: + - eval_time: 1m + alertname: HardwareProcessorError + - eval_time: 5m + alertname: HardwareProcessorError + exp_alerts: + - exp_labels: + name: HARDWARE_PROCESSOR + severity: critical + type: ceph_default + oid: 1.3.6.1.4.1.50495.1.2.1.13.3 + exp_annotations: + summary: Processor error(s) detected + description: "Processor error(s) detected. Check `ceph health detail`." + - interval: 30s + input_series: + - series: 'ceph_health_detail{name="HARDWARE_NETWORK"}' + values: '1+0x40' + promql_expr_test: + - expr: ceph_health_detail{name="HARDWARE_NETWORK"} > 0 + eval_time: 2m + exp_samples: + - labels: '{__name__="ceph_health_detail", name="HARDWARE_NETWORK"}' + value: 1 + alert_rule_test: + - eval_time: 1m + alertname: HardwareNetworkError + - eval_time: 5m + alertname: HardwareNetworkError + exp_alerts: + - exp_labels: + name: HARDWARE_NETWORK + severity: critical + type: ceph_default + oid: 1.3.6.1.4.1.50495.1.2.1.13.4 + exp_annotations: + summary: Network error(s) detected + description: "Network error(s) detected. Check `ceph health detail`." 
+ - interval: 30s + input_series: + - series: 'ceph_health_detail{name="HARDWARE_POWER"}' + values: '1+0x40' + promql_expr_test: + - expr: ceph_health_detail{name="HARDWARE_POWER"} > 0 + eval_time: 2m + exp_samples: + - labels: '{__name__="ceph_health_detail", name="HARDWARE_POWER"}' + value: 1 + alert_rule_test: + - eval_time: 1m + alertname: HardwarePowerError + - eval_time: 5m + alertname: HardwarePowerError + exp_alerts: + - exp_labels: + name: HARDWARE_POWER + severity: critical + type: ceph_default + oid: 1.3.6.1.4.1.50495.1.2.1.13.5 + exp_annotations: + summary: Power supply error(s) detected + description: "Power supply error(s) detected. Check `ceph health detail`." + - interval: 30s + input_series: + - series: 'ceph_health_detail{name="HARDWARE_FANS"}' + values: '1+0x40' + promql_expr_test: + - expr: ceph_health_detail{name="HARDWARE_FANS"} > 0 + eval_time: 2m + exp_samples: + - labels: '{__name__="ceph_health_detail", name="HARDWARE_FANS"}' + value: 1 + alert_rule_test: + - eval_time: 1m + alertname: HardwareFanError + - eval_time: 5m + alertname: HardwareFanError + exp_alerts: + - exp_labels: + name: HARDWARE_FANS + severity: critical + type: ceph_default + oid: 1.3.6.1.4.1.50495.1.2.1.13.6 + exp_annotations: + summary: Fan error(s) detected + description: "Fan error(s) detected. Check `ceph health detail`." |