summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: afreen23 <afreen23.git@gmail.com> 2024-12-20 14:16:48 +0100
committer: GitHub <noreply@github.com> 2024-12-20 14:16:48 +0100
commit: 919c2a6af6bf6c928adfc63901652f24eba1fa5e (patch)
tree: 375a1e361fe2f1f0483edaaad7aaaab5a2cbe981
parent: Merge pull request #61146 from zdover23/wip-doc-2024-12-19-radosgw-uadk-accel (diff)
parent: monitoring: Add prometheus alert NVMeoFMultipleNamespacesOfRBDImage (diff)
download: ceph-919c2a6af6bf6c928adfc63901652f24eba1fa5e.tar.xz
download: ceph-919c2a6af6bf6c928adfc63901652f24eba1fa5e.zip
Merge pull request #60777 from VallariAg/wip-nvmeof-prometheus-rbd-image-reused
monitoring: Add prometheus alert NVMeoFMultipleNamespacesOfRBDImage

Reviewed-by: Afreen Misbah <afreen@ibm.com>
-rw-r--r-- monitoring/ceph-mixin/prometheus_alerts.libsonnet 10
-rw-r--r-- monitoring/ceph-mixin/prometheus_alerts.yml 9
-rw-r--r-- monitoring/ceph-mixin/tests_alerts/test_alerts.yml 48
3 files changed, 67 insertions, 0 deletions
diff --git a/monitoring/ceph-mixin/prometheus_alerts.libsonnet b/monitoring/ceph-mixin/prometheus_alerts.libsonnet
index 143e65f20e7..5d1ab49b533 100644
--- a/monitoring/ceph-mixin/prometheus_alerts.libsonnet
+++ b/monitoring/ceph-mixin/prometheus_alerts.libsonnet
@@ -856,6 +856,16 @@
},
},
{
+ alert: 'NVMeoFMultipleNamespacesOfRBDImage',
+ 'for': '1m',
+ expr: 'count by(pool_name, rbd_name) (count by(bdev_name, pool_name, rbd_name) (ceph_nvmeof_bdev_metadata and on (bdev_name) ceph_nvmeof_subsystem_namespace_metadata)) > 1',
+ labels: { severity: 'warning', type: 'ceph_default' },
+ annotations: {
+ summary: 'RBD image {{ $labels.pool_name }}/{{ $labels.rbd_name }} cannot be reused for multiple NVMeoF namespace ',
+ description: 'Each NVMeoF namespace must have a unique RBD pool and image, across all different gateway groups.',
+ },
+ },
+ {
alert: 'NVMeoFTooManyGateways',
'for': '1m',
expr: 'count(ceph_nvmeof_gateway_info) by (cluster) > %.2f' % [$._config.NVMeoFMaxGatewaysPerCluster],
diff --git a/monitoring/ceph-mixin/prometheus_alerts.yml b/monitoring/ceph-mixin/prometheus_alerts.yml
index 3eb8a8db4fa..3440d761351 100644
--- a/monitoring/ceph-mixin/prometheus_alerts.yml
+++ b/monitoring/ceph-mixin/prometheus_alerts.yml
@@ -765,6 +765,15 @@ groups:
labels:
severity: "warning"
type: "ceph_default"
+ - alert: "NVMeoFMultipleNamespacesOfRBDImage"
+ annotations:
+ description: "Each NVMeoF namespace must have a unique RBD pool and image, across all different gateway groups."
+ summary: "RBD image {{ $labels.pool_name }}/{{ $labels.rbd_name }} cannot be reused for multiple NVMeoF namespace "
+ expr: "count by(pool_name, rbd_name) (count by(bdev_name, pool_name, rbd_name) (ceph_nvmeof_bdev_metadata and on (bdev_name) ceph_nvmeof_subsystem_namespace_metadata)) > 1"
+ for: "1m"
+ labels:
+ severity: "warning"
+ type: "ceph_default"
- alert: "NVMeoFTooManyGateways"
annotations:
description: "You may create many gateways, but 4 is the tested limit"
diff --git a/monitoring/ceph-mixin/tests_alerts/test_alerts.yml b/monitoring/ceph-mixin/tests_alerts/test_alerts.yml
index a4e63bbcf73..b3b29308d08 100644
--- a/monitoring/ceph-mixin/tests_alerts/test_alerts.yml
+++ b/monitoring/ceph-mixin/tests_alerts/test_alerts.yml
@@ -2270,6 +2270,54 @@ tests:
summary: "wah subsystem has reached its maximum number of namespaces on cluster mycluster"
description: "Subsystems have a max namespace limit defined at creation time. This alert means that no more namespaces can be added to wah"
+# NVMeoFMultipleNamespacesOfRBDImage
+ - interval: 1m
+ input_series:
+ - series: 'ceph_nvmeof_bdev_metadata{bdev_name="bdev1", instance="ceph-nvme-vm1", pool_name="mypool", rbd_name="myimage1"}'
+ values: '1x10'
+ - series: 'ceph_nvmeof_bdev_metadata{bdev_name="bdev1", instance="ceph-nvme-vm2", pool_name="mypool", rbd_name="myimage1"}'
+ values: '1x10'
+ - series: 'ceph_nvmeof_bdev_metadata{bdev_name="bdev2", instance="ceph-nvme-vm1", pool_name="mypool", rbd_name="myimage2"}'
+ values: '1x10'
+ - series: 'ceph_nvmeof_bdev_metadata{bdev_name="bdev2", instance="ceph-nvme-vm2", pool_name="mypool", rbd_name="myimage2"}'
+ values: '1x10'
+ - series: 'ceph_nvmeof_bdev_metadata{bdev_name="bdev3", instance="ceph-nvme-vm1", pool_name="mypool", rbd_name="myimage1"}'
+ values: '1x10'
+ - series: 'ceph_nvmeof_bdev_metadata{bdev_name="bdev3", instance="ceph-nvme-vm2", pool_name="mypool", rbd_name="myimage1"}'
+ values: '1x10'
+ - series: 'ceph_nvmeof_bdev_metadata{bdev_name="bdev4", instance="ceph-nvme-vm1", pool_name="mypool", rbd_name="myimage1"}' # bdev with no ns
+ values: '1x10'
+ - series: 'ceph_nvmeof_subsystem_namespace_metadata{nqn="nqn1", nsid="1", bdev_name="bdev1", instance="ceph-nvme-vm1", cluster="mycluster"}'
+ values: '1x10'
+ - series: 'ceph_nvmeof_subsystem_namespace_metadata{nqn="nqn1", nsid="1", bdev_name="bdev1", instance="ceph-nvme-vm2", cluster="mycluster"}'
+ values: '1x10'
+ - series: 'ceph_nvmeof_subsystem_namespace_metadata{nqn="nqn1", nsid="2", bdev_name="bdev2", instance="ceph-nvme-vm1", cluster="mycluster"}'
+ values: '1x10'
+ - series: 'ceph_nvmeof_subsystem_namespace_metadata{nqn="nqn1", nsid="2", bdev_name="bdev2", instance="ceph-nvme-vm2", cluster="mycluster"}'
+ values: '1x10'
+ - series: 'ceph_nvmeof_subsystem_namespace_metadata{nqn="nqn2", nsid="1", bdev_name="bdev3", instance="ceph-nvme-vm1", cluster="mycluster"}'
+ values: '1x10'
+ - series: 'ceph_nvmeof_subsystem_namespace_metadata{nqn="nqn2", nsid="1", bdev_name="bdev3", instance="ceph-nvme-vm2", cluster="mycluster"}'
+ values: '1x10'
+ promql_expr_test:
+ - expr: count by(pool_name, rbd_name) (count by(bdev_name, pool_name, rbd_name) (ceph_nvmeof_bdev_metadata and on (bdev_name) ceph_nvmeof_subsystem_namespace_metadata)) > 1
+ eval_time: 1m
+ exp_samples:
+ - labels: '{pool_name="mypool", rbd_name="myimage1"}'
+ value: 2
+ alert_rule_test:
+ - eval_time: 5m
+ alertname: NVMeoFMultipleNamespacesOfRBDImage
+ exp_alerts:
+ - exp_labels:
+ pool_name: mypool
+ rbd_name: myimage1
+ severity: warning
+ type: ceph_default
+ exp_annotations:
+ summary: "RBD image mypool/myimage1 cannot be reused for multiple NVMeoF namespace "
+ description: "Each NVMeoF namespace must have a unique RBD pool and image, across all different gateway groups."
+
# NVMeoFTooManyGateways
- interval: 1m
input_series: