diff options
author | afreen23 <afreen23.git@gmail.com> | 2024-12-20 14:16:48 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-12-20 14:16:48 +0100 |
commit | 919c2a6af6bf6c928adfc63901652f24eba1fa5e (patch) | |
tree | 375a1e361fe2f1f0483edaaad7aaaab5a2cbe981 | |
parent | Merge pull request #61146 from zdover23/wip-doc-2024-12-19-radosgw-uadk-accel (diff) | |
parent | monitoring: Add prometheus alert NVMeoFMultipleNamespacesOfRBDImage (diff) | |
download | ceph-919c2a6af6bf6c928adfc63901652f24eba1fa5e.tar.xz ceph-919c2a6af6bf6c928adfc63901652f24eba1fa5e.zip |
Merge pull request #60777 from VallariAg/wip-nvmeof-prometheus-rbd-image-reused
monitoring: Add prometheus alert NVMeoFMultipleNamespacesOfRBDImage
Reviewed-by: Afreen Misbah <afreen@ibm.com>
-rw-r--r-- | monitoring/ceph-mixin/prometheus_alerts.libsonnet | 10 |
-rw-r--r-- | monitoring/ceph-mixin/prometheus_alerts.yml | 9 |
-rw-r--r-- | monitoring/ceph-mixin/tests_alerts/test_alerts.yml | 48 |
3 files changed, 67 insertions, 0 deletions
diff --git a/monitoring/ceph-mixin/prometheus_alerts.libsonnet b/monitoring/ceph-mixin/prometheus_alerts.libsonnet index 143e65f20e7..5d1ab49b533 100644 --- a/monitoring/ceph-mixin/prometheus_alerts.libsonnet +++ b/monitoring/ceph-mixin/prometheus_alerts.libsonnet @@ -856,6 +856,16 @@ }, }, { + alert: 'NVMeoFMultipleNamespacesOfRBDImage', + 'for': '1m', + expr: 'count by(pool_name, rbd_name) (count by(bdev_name, pool_name, rbd_name) (ceph_nvmeof_bdev_metadata and on (bdev_name) ceph_nvmeof_subsystem_namespace_metadata)) > 1', + labels: { severity: 'warning', type: 'ceph_default' }, + annotations: { + summary: 'RBD image {{ $labels.pool_name }}/{{ $labels.rbd_name }} cannot be reused for multiple NVMeoF namespace ', + description: 'Each NVMeoF namespace must have a unique RBD pool and image, across all different gateway groups.', + }, + }, + { alert: 'NVMeoFTooManyGateways', 'for': '1m', expr: 'count(ceph_nvmeof_gateway_info) by (cluster) > %.2f' % [$._config.NVMeoFMaxGatewaysPerCluster], diff --git a/monitoring/ceph-mixin/prometheus_alerts.yml b/monitoring/ceph-mixin/prometheus_alerts.yml index 3eb8a8db4fa..3440d761351 100644 --- a/monitoring/ceph-mixin/prometheus_alerts.yml +++ b/monitoring/ceph-mixin/prometheus_alerts.yml @@ -765,6 +765,15 @@ groups: labels: severity: "warning" type: "ceph_default" + - alert: "NVMeoFMultipleNamespacesOfRBDImage" + annotations: + description: "Each NVMeoF namespace must have a unique RBD pool and image, across all different gateway groups." 
+ summary: "RBD image {{ $labels.pool_name }}/{{ $labels.rbd_name }} cannot be reused for multiple NVMeoF namespace " + expr: "count by(pool_name, rbd_name) (count by(bdev_name, pool_name, rbd_name) (ceph_nvmeof_bdev_metadata and on (bdev_name) ceph_nvmeof_subsystem_namespace_metadata)) > 1" + for: "1m" + labels: + severity: "warning" + type: "ceph_default" - alert: "NVMeoFTooManyGateways" annotations: description: "You may create many gateways, but 4 is the tested limit" diff --git a/monitoring/ceph-mixin/tests_alerts/test_alerts.yml b/monitoring/ceph-mixin/tests_alerts/test_alerts.yml index a4e63bbcf73..b3b29308d08 100644 --- a/monitoring/ceph-mixin/tests_alerts/test_alerts.yml +++ b/monitoring/ceph-mixin/tests_alerts/test_alerts.yml @@ -2270,6 +2270,54 @@ tests: summary: "wah subsystem has reached its maximum number of namespaces on cluster mycluster" description: "Subsystems have a max namespace limit defined at creation time. This alert means that no more namespaces can be added to wah" +# NVMeoFMultipleNamespacesOfRBDImage + - interval: 1m + input_series: + - series: 'ceph_nvmeof_bdev_metadata{bdev_name="bdev1", instance="ceph-nvme-vm1", pool_name="mypool", rbd_name="myimage1"}' + values: '1x10' + - series: 'ceph_nvmeof_bdev_metadata{bdev_name="bdev1", instance="ceph-nvme-vm2", pool_name="mypool", rbd_name="myimage1"}' + values: '1x10' + - series: 'ceph_nvmeof_bdev_metadata{bdev_name="bdev2", instance="ceph-nvme-vm1", pool_name="mypool", rbd_name="myimage2"}' + values: '1x10' + - series: 'ceph_nvmeof_bdev_metadata{bdev_name="bdev2", instance="ceph-nvme-vm2", pool_name="mypool", rbd_name="myimage2"}' + values: '1x10' + - series: 'ceph_nvmeof_bdev_metadata{bdev_name="bdev3", instance="ceph-nvme-vm1", pool_name="mypool", rbd_name="myimage1"}' + values: '1x10' + - series: 'ceph_nvmeof_bdev_metadata{bdev_name="bdev3", instance="ceph-nvme-vm2", pool_name="mypool", rbd_name="myimage1"}' + values: '1x10' + - series: 'ceph_nvmeof_bdev_metadata{bdev_name="bdev4", 
instance="ceph-nvme-vm1", pool_name="mypool", rbd_name="myimage1"}' # bdev with no ns + values: '1x10' + - series: 'ceph_nvmeof_subsystem_namespace_metadata{nqn="nqn1", nsid="1", bdev_name="bdev1", instance="ceph-nvme-vm1", cluster="mycluster"}' + values: '1x10' + - series: 'ceph_nvmeof_subsystem_namespace_metadata{nqn="nqn1", nsid="1", bdev_name="bdev1", instance="ceph-nvme-vm2", cluster="mycluster"}' + values: '1x10' + - series: 'ceph_nvmeof_subsystem_namespace_metadata{nqn="nqn1", nsid="2", bdev_name="bdev2", instance="ceph-nvme-vm1", cluster="mycluster"}' + values: '1x10' + - series: 'ceph_nvmeof_subsystem_namespace_metadata{nqn="nqn1", nsid="2", bdev_name="bdev2", instance="ceph-nvme-vm2", cluster="mycluster"}' + values: '1x10' + - series: 'ceph_nvmeof_subsystem_namespace_metadata{nqn="nqn2", nsid="1", bdev_name="bdev3", instance="ceph-nvme-vm1", cluster="mycluster"}' + values: '1x10' + - series: 'ceph_nvmeof_subsystem_namespace_metadata{nqn="nqn2", nsid="1", bdev_name="bdev3", instance="ceph-nvme-vm2", cluster="mycluster"}' + values: '1x10' + promql_expr_test: + - expr: count by(pool_name, rbd_name) (count by(bdev_name, pool_name, rbd_name) (ceph_nvmeof_bdev_metadata and on (bdev_name) ceph_nvmeof_subsystem_namespace_metadata)) > 1 + eval_time: 1m + exp_samples: + - labels: '{pool_name="mypool", rbd_name="myimage1"}' + value: 2 + alert_rule_test: + - eval_time: 5m + alertname: NVMeoFMultipleNamespacesOfRBDImage + exp_alerts: + - exp_labels: + pool_name: mypool + rbd_name: myimage1 + severity: warning + type: ceph_default + exp_annotations: + summary: "RBD image mypool/myimage1 cannot be reused for multiple NVMeoF namespace " + description: "Each NVMeoF namespace must have a unique RBD pool and image, across all different gateway groups." + # NVMeoFTooManyGateways - interval: 1m input_series: |