diff options
-rw-r--r-- | monitoring/ceph-mixin/prometheus_alerts.libsonnet | 10 | ||||
-rw-r--r-- | monitoring/ceph-mixin/prometheus_alerts.yml | 9 | ||||
-rw-r--r-- | monitoring/ceph-mixin/tests_alerts/test_alerts.yml | 48 |
3 files changed, 67 insertions, 0 deletions
diff --git a/monitoring/ceph-mixin/prometheus_alerts.libsonnet b/monitoring/ceph-mixin/prometheus_alerts.libsonnet index 143e65f20e7..5d1ab49b533 100644 --- a/monitoring/ceph-mixin/prometheus_alerts.libsonnet +++ b/monitoring/ceph-mixin/prometheus_alerts.libsonnet @@ -856,6 +856,16 @@ }, }, { + alert: 'NVMeoFMultipleNamespacesOfRBDImage', + 'for': '1m', + expr: 'count by(pool_name, rbd_name) (count by(bdev_name, pool_name, rbd_name) (ceph_nvmeof_bdev_metadata and on (bdev_name) ceph_nvmeof_subsystem_namespace_metadata)) > 1', + labels: { severity: 'warning', type: 'ceph_default' }, + annotations: { + summary: 'RBD image {{ $labels.pool_name }}/{{ $labels.rbd_name }} cannot be reused for multiple NVMeoF namespaces', + description: 'Each NVMeoF namespace must have a unique RBD pool and image, across all different gateway groups.', + }, + }, + { alert: 'NVMeoFTooManyGateways', 'for': '1m', expr: 'count(ceph_nvmeof_gateway_info) by (cluster) > %.2f' % [$._config.NVMeoFMaxGatewaysPerCluster], diff --git a/monitoring/ceph-mixin/prometheus_alerts.yml b/monitoring/ceph-mixin/prometheus_alerts.yml index 3eb8a8db4fa..3440d761351 100644 --- a/monitoring/ceph-mixin/prometheus_alerts.yml +++ b/monitoring/ceph-mixin/prometheus_alerts.yml @@ -765,6 +765,15 @@ groups: labels: severity: "warning" type: "ceph_default" + - alert: "NVMeoFMultipleNamespacesOfRBDImage" + annotations: + description: "Each NVMeoF namespace must have a unique RBD pool and image, across all different gateway groups." 
+ summary: "RBD image {{ $labels.pool_name }}/{{ $labels.rbd_name }} cannot be reused for multiple NVMeoF namespaces" + expr: "count by(pool_name, rbd_name) (count by(bdev_name, pool_name, rbd_name) (ceph_nvmeof_bdev_metadata and on (bdev_name) ceph_nvmeof_subsystem_namespace_metadata)) > 1" + for: "1m" + labels: + severity: "warning" + type: "ceph_default" - alert: "NVMeoFTooManyGateways" annotations: description: "You may create many gateways, but 4 is the tested limit" diff --git a/monitoring/ceph-mixin/tests_alerts/test_alerts.yml b/monitoring/ceph-mixin/tests_alerts/test_alerts.yml index a4e63bbcf73..b3b29308d08 100644 --- a/monitoring/ceph-mixin/tests_alerts/test_alerts.yml +++ b/monitoring/ceph-mixin/tests_alerts/test_alerts.yml @@ -2270,6 +2270,54 @@ tests: summary: "wah subsystem has reached its maximum number of namespaces on cluster mycluster" description: "Subsystems have a max namespace limit defined at creation time. This alert means that no more namespaces can be added to wah" +# NVMeoFMultipleNamespacesOfRBDImage + - interval: 1m + input_series: + - series: 'ceph_nvmeof_bdev_metadata{bdev_name="bdev1", instance="ceph-nvme-vm1", pool_name="mypool", rbd_name="myimage1"}' + values: '1x10' + - series: 'ceph_nvmeof_bdev_metadata{bdev_name="bdev1", instance="ceph-nvme-vm2", pool_name="mypool", rbd_name="myimage1"}' + values: '1x10' + - series: 'ceph_nvmeof_bdev_metadata{bdev_name="bdev2", instance="ceph-nvme-vm1", pool_name="mypool", rbd_name="myimage2"}' + values: '1x10' + - series: 'ceph_nvmeof_bdev_metadata{bdev_name="bdev2", instance="ceph-nvme-vm2", pool_name="mypool", rbd_name="myimage2"}' + values: '1x10' + - series: 'ceph_nvmeof_bdev_metadata{bdev_name="bdev3", instance="ceph-nvme-vm1", pool_name="mypool", rbd_name="myimage1"}' + values: '1x10' + - series: 'ceph_nvmeof_bdev_metadata{bdev_name="bdev3", instance="ceph-nvme-vm2", pool_name="mypool", rbd_name="myimage1"}' + values: '1x10' + - series: 'ceph_nvmeof_bdev_metadata{bdev_name="bdev4", 
instance="ceph-nvme-vm1", pool_name="mypool", rbd_name="myimage1"}' # bdev with no ns + values: '1x10' + - series: 'ceph_nvmeof_subsystem_namespace_metadata{nqn="nqn1", nsid="1", bdev_name="bdev1", instance="ceph-nvme-vm1", cluster="mycluster"}' + values: '1x10' + - series: 'ceph_nvmeof_subsystem_namespace_metadata{nqn="nqn1", nsid="1", bdev_name="bdev1", instance="ceph-nvme-vm2", cluster="mycluster"}' + values: '1x10' + - series: 'ceph_nvmeof_subsystem_namespace_metadata{nqn="nqn1", nsid="2", bdev_name="bdev2", instance="ceph-nvme-vm1", cluster="mycluster"}' + values: '1x10' + - series: 'ceph_nvmeof_subsystem_namespace_metadata{nqn="nqn1", nsid="2", bdev_name="bdev2", instance="ceph-nvme-vm2", cluster="mycluster"}' + values: '1x10' + - series: 'ceph_nvmeof_subsystem_namespace_metadata{nqn="nqn2", nsid="1", bdev_name="bdev3", instance="ceph-nvme-vm1", cluster="mycluster"}' + values: '1x10' + - series: 'ceph_nvmeof_subsystem_namespace_metadata{nqn="nqn2", nsid="1", bdev_name="bdev3", instance="ceph-nvme-vm2", cluster="mycluster"}' + values: '1x10' + promql_expr_test: + - expr: count by(pool_name, rbd_name) (count by(bdev_name, pool_name, rbd_name) (ceph_nvmeof_bdev_metadata and on (bdev_name) ceph_nvmeof_subsystem_namespace_metadata)) > 1 + eval_time: 1m + exp_samples: + - labels: '{pool_name="mypool", rbd_name="myimage1"}' + value: 2 + alert_rule_test: + - eval_time: 5m + alertname: NVMeoFMultipleNamespacesOfRBDImage + exp_alerts: + - exp_labels: + pool_name: mypool + rbd_name: myimage1 + severity: warning + type: ceph_default + exp_annotations: + summary: "RBD image mypool/myimage1 cannot be reused for multiple NVMeoF namespaces" + description: "Each NVMeoF namespace must have a unique RBD pool and image, across all different gateway groups." + # NVMeoFTooManyGateways - interval: 1m input_series: |