summary | refs | log | tree | commit | diff | stats
path: root/monitoring/ceph-mixin/prometheus_alerts.yml
diff options
context:
space:
mode:
Diffstat (limited to 'monitoring/ceph-mixin/prometheus_alerts.yml')
-rw-r--r-- monitoring/ceph-mixin/prometheus_alerts.yml 50
1 files changed, 43 insertions, 7 deletions
diff --git a/monitoring/ceph-mixin/prometheus_alerts.yml b/monitoring/ceph-mixin/prometheus_alerts.yml
index ba6a6ded0a3..7c0da4d51a4 100644
--- a/monitoring/ceph-mixin/prometheus_alerts.yml
+++ b/monitoring/ceph-mixin/prometheus_alerts.yml
@@ -765,20 +765,29 @@ groups:
labels:
severity: "warning"
type: "ceph_default"
+ - alert: "NVMeoFMultipleNamespacesOfRBDImage"
+ annotations:
+ description: "Each NVMeoF namespace must have a unique RBD pool and image, across all different gateway groups."
+ summary: "RBD image {{ $labels.pool_name }}/{{ $labels.rbd_name }} cannot be reused for multiple NVMeoF namespaces"
+ expr: "count by(pool_name, rbd_name) (count by(bdev_name, pool_name, rbd_name) (ceph_nvmeof_bdev_metadata and on (bdev_name) ceph_nvmeof_subsystem_namespace_metadata)) > 1"
+ for: "1m"
+ labels:
+ severity: "warning"
+ type: "ceph_default"
- alert: "NVMeoFTooManyGateways"
annotations:
- description: "You may create many gateways, but 4 is the tested limit"
+ description: "You may create many gateways, but 32 is the tested limit"
summary: "Max supported gateways exceeded on cluster {{ $labels.cluster }}"
- expr: "count(ceph_nvmeof_gateway_info) by (cluster) > 4.00"
+ expr: "count(ceph_nvmeof_gateway_info) by (cluster) > 32.00"
for: "1m"
labels:
severity: "warning"
type: "ceph_default"
- alert: "NVMeoFMaxGatewayGroupSize"
annotations:
- description: "You may create many gateways in a gateway group, but 4 is the tested limit"
+ description: "You may create many gateways in a gateway group, but 8 is the tested limit"
summary: "Max gateways within a gateway group ({{ $labels.group }}) exceeded on cluster {{ $labels.cluster }}"
- expr: "count(ceph_nvmeof_gateway_info) by (cluster,group) > 4.00"
+ expr: "count(ceph_nvmeof_gateway_info) by (cluster,group) > 8.00"
for: "1m"
labels:
severity: "warning"
@@ -814,7 +823,16 @@ groups:
annotations:
description: "Although you may continue to create subsystems in {{ $labels.gateway_host }}, the configuration may not be supported"
summary: "The number of subsystems defined to the gateway exceeds supported values on cluster {{ $labels.cluster }}"
- expr: "count by(gateway_host, cluster) (label_replace(ceph_nvmeof_subsystem_metadata,\"gateway_host\",\"$1\",\"instance\",\"(.*):.*\")) > 16.00"
+ expr: "count by(gateway_host, cluster) (label_replace(ceph_nvmeof_subsystem_metadata,\"gateway_host\",\"$1\",\"instance\",\"(.*?)(?::.*)?\")) > 128.00"
+ for: "1m"
+ labels:
+ severity: "warning"
+ type: "ceph_default"
+ - alert: "NVMeoFTooManyNamespaces"
+ annotations:
+ description: "Although you may continue to create namespaces in {{ $labels.gateway_host }}, the configuration may not be supported"
+ summary: "The number of namespaces defined to the gateway exceeds supported values on cluster {{ $labels.cluster }}"
+ expr: "sum by(gateway_host, cluster) (label_replace(ceph_nvmeof_subsystem_namespace_count,\"gateway_host\",\"$1\",\"instance\",\"(.*?)(?::.*)?\")) > 2048.00"
for: "1m"
labels:
severity: "warning"
@@ -830,13 +848,31 @@ groups:
type: "ceph_default"
- alert: "NVMeoFHighClientCount"
annotations:
- description: "The supported limit for clients connecting to a subsystem is 32"
+ description: "The supported limit for clients connecting to a subsystem is 128"
summary: "The number of clients connected to {{ $labels.nqn }} is too high on cluster {{ $labels.cluster }}"
- expr: "ceph_nvmeof_subsystem_host_count > 32.00"
+ expr: "ceph_nvmeof_subsystem_host_count > 128.00"
for: "1m"
labels:
severity: "warning"
type: "ceph_default"
+ - alert: "NVMeoFMissingListener"
+ annotations:
+ description: "For every subsystem, each gateway should have a listener to balance traffic between gateways."
+ summary: "No listener added for {{ $labels.instance }} NVMe-oF Gateway to {{ $labels.nqn }} subsystem"
+ expr: "ceph_nvmeof_subsystem_listener_count == 0 and on(nqn) sum(ceph_nvmeof_subsystem_listener_count) by (nqn) > 0"
+ for: "10m"
+ labels:
+ severity: "warning"
+ type: "ceph_default"
+ - alert: "NVMeoFZeroListenerSubsystem"
+ annotations:
+ description: "NVMeoF gateway configuration incomplete; one of the subsystems has zero listeners."
+ summary: "No listeners added to {{ $labels.nqn }} subsystem"
+ expr: "sum(ceph_nvmeof_subsystem_listener_count) by (nqn) == 0"
+ for: "10m"
+ labels:
+ severity: "warning"
+ type: "ceph_default"
- alert: "NVMeoFHighHostCPU"
annotations:
description: "High CPU on a gateway host can lead to CPU contention and performance degradation"