summary | refs | log | tree | commit | diff | stats
path: root/monitoring
diff options
context:
space:
mode:
author: Vallari Agrawal <vallari.agrawal@ibm.com> 2025-01-09 11:54:33 +0100
committer: Vallari Agrawal <vallari.agrawal@ibm.com> 2025-01-09 11:59:29 +0100
commit: f3c18818b9b937550f503d94ca37f75639db5514 (patch)
tree: 954bd34cfae3850b4abfffaba7fa5ae0e87c7f43 /monitoring
parent: Merge pull request #60891 from xxhdx1985126/wip-seastore-fadvise-backfill (diff)
download: ceph-f3c18818b9b937550f503d94ca37f75639db5514.tar.xz
download: ceph-f3c18818b9b937550f503d94ca37f75639db5514.zip
monitoring: Update nvmeof alert limits in config
Update these in config.libsonnet:
- NVMeoFMaxGatewaysPerGroup (4->8)
- NVMeoFMaxGatewaysPerCluster (4->32)
- NVMeoFMaxNamespaces (1024->2048)
- NVMeoFHighClientCount (32->128)

Also update prometheus_alerts.yml and test_alerts.yml accordingly.

Signed-off-by: Vallari Agrawal <vallari.agrawal@ibm.com>
Diffstat (limited to 'monitoring')
-rw-r--r-- monitoring/ceph-mixin/config.libsonnet 8
-rw-r--r-- monitoring/ceph-mixin/prometheus_alerts.yml 14
-rw-r--r-- monitoring/ceph-mixin/tests_alerts/test_alerts.yml 93
3 files changed, 91 insertions, 24 deletions
diff --git a/monitoring/ceph-mixin/config.libsonnet b/monitoring/ceph-mixin/config.libsonnet
index a15b88422fc..e917b4c2dac 100644
--- a/monitoring/ceph-mixin/config.libsonnet
+++ b/monitoring/ceph-mixin/config.libsonnet
@@ -9,12 +9,12 @@
CephNodeNetworkPacketDropsPerSec: 10,
CephRBDMirrorImageTransferBandwidthThreshold: 0.8,
CephRBDMirrorImagesPerDaemonThreshold: 100,
- NVMeoFMaxGatewaysPerGroup: 4,
- NVMeoFMaxGatewaysPerCluster: 4,
+ NVMeoFMaxGatewaysPerGroup: 8,
+ NVMeoFMaxGatewaysPerCluster: 32,
NVMeoFHighGatewayCPU: 80,
NVMeoFMaxSubsystemsPerGateway: 128,
- NVMeoFMaxNamespaces: 1024,
- NVMeoFHighClientCount: 32,
+ NVMeoFMaxNamespaces: 2048,
+ NVMeoFHighClientCount: 128,
NVMeoFHighHostCPU: 80,
//
// Read/Write latency is defined in ms
diff --git a/monitoring/ceph-mixin/prometheus_alerts.yml b/monitoring/ceph-mixin/prometheus_alerts.yml
index 3440d761351..7c0da4d51a4 100644
--- a/monitoring/ceph-mixin/prometheus_alerts.yml
+++ b/monitoring/ceph-mixin/prometheus_alerts.yml
@@ -776,18 +776,18 @@ groups:
type: "ceph_default"
- alert: "NVMeoFTooManyGateways"
annotations:
- description: "You may create many gateways, but 4 is the tested limit"
+ description: "You may create many gateways, but 32 is the tested limit"
summary: "Max supported gateways exceeded on cluster {{ $labels.cluster }}"
- expr: "count(ceph_nvmeof_gateway_info) by (cluster) > 4.00"
+ expr: "count(ceph_nvmeof_gateway_info) by (cluster) > 32.00"
for: "1m"
labels:
severity: "warning"
type: "ceph_default"
- alert: "NVMeoFMaxGatewayGroupSize"
annotations:
- description: "You may create many gateways in a gateway group, but 4 is the tested limit"
+ description: "You may create many gateways in a gateway group, but 8 is the tested limit"
summary: "Max gateways within a gateway group ({{ $labels.group }}) exceeded on cluster {{ $labels.cluster }}"
- expr: "count(ceph_nvmeof_gateway_info) by (cluster,group) > 4.00"
+ expr: "count(ceph_nvmeof_gateway_info) by (cluster,group) > 8.00"
for: "1m"
labels:
severity: "warning"
@@ -832,7 +832,7 @@ groups:
annotations:
description: "Although you may continue to create namespaces in {{ $labels.gateway_host }}, the configuration may not be supported"
summary: "The number of namespaces defined to the gateway exceeds supported values on cluster {{ $labels.cluster }}"
- expr: "sum by(gateway_host, cluster) (label_replace(ceph_nvmeof_subsystem_namespace_count,\"gateway_host\",\"$1\",\"instance\",\"(.*?)(?::.*)?\")) > 1024.00"
+ expr: "sum by(gateway_host, cluster) (label_replace(ceph_nvmeof_subsystem_namespace_count,\"gateway_host\",\"$1\",\"instance\",\"(.*?)(?::.*)?\")) > 2048.00"
for: "1m"
labels:
severity: "warning"
@@ -848,9 +848,9 @@ groups:
type: "ceph_default"
- alert: "NVMeoFHighClientCount"
annotations:
- description: "The supported limit for clients connecting to a subsystem is 32"
+ description: "The supported limit for clients connecting to a subsystem is 128"
summary: "The number of clients connected to {{ $labels.nqn }} is too high on cluster {{ $labels.cluster }}"
- expr: "ceph_nvmeof_subsystem_host_count > 32.00"
+ expr: "ceph_nvmeof_subsystem_host_count > 128.00"
for: "1m"
labels:
severity: "warning"
diff --git a/monitoring/ceph-mixin/tests_alerts/test_alerts.yml b/monitoring/ceph-mixin/tests_alerts/test_alerts.yml
index b3b29308d08..83b4ff80375 100644
--- a/monitoring/ceph-mixin/tests_alerts/test_alerts.yml
+++ b/monitoring/ceph-mixin/tests_alerts/test_alerts.yml
@@ -2331,12 +2331,69 @@ tests:
values: '1+0x20'
- series: 'ceph_nvmeof_gateway_info{addr="1.1.1.5",cluster="mycluster"}'
values: '1+0x20'
+ - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.6",cluster="mycluster"}'
+ values: '1+0x20'
+ - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.7",cluster="mycluster"}'
+ values: '1+0x20'
+ - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.8",cluster="mycluster"}'
+ values: '1+0x20'
+ - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.9",cluster="mycluster"}'
+ values: '1+0x20'
+ - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.10",cluster="mycluster"}'
+ values: '1+0x20'
+ - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.11",cluster="mycluster"}'
+ values: '1+0x20'
+ - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.12",cluster="mycluster"}'
+ values: '1+0x20'
+ - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.13",cluster="mycluster"}'
+ values: '1+0x20'
+ - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.14",cluster="mycluster"}'
+ values: '1+0x20'
+ - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.15",cluster="mycluster"}'
+ values: '1+0x20'
+ - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.16",cluster="mycluster"}'
+ values: '1+0x20'
+ - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.17",cluster="mycluster"}'
+ values: '1+0x20'
+ - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.18",cluster="mycluster"}'
+ values: '1+0x20'
+ - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.19",cluster="mycluster"}'
+ values: '1+0x20'
+ - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.20",cluster="mycluster"}'
+ values: '1+0x20'
+ - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.21",cluster="mycluster"}'
+ values: '1+0x20'
+ - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.22",cluster="mycluster"}'
+ values: '1+0x20'
+ - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.23",cluster="mycluster"}'
+ values: '1+0x20'
+ - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.24",cluster="mycluster"}'
+ values: '1+0x20'
+ - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.25",cluster="mycluster"}'
+ values: '1+0x20'
+ - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.26",cluster="mycluster"}'
+ values: '1+0x20'
+ - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.27",cluster="mycluster"}'
+ values: '1+0x20'
+ - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.28",cluster="mycluster"}'
+ values: '1+0x20'
+ - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.29",cluster="mycluster"}'
+ values: '1+0x20'
+ - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.30",cluster="mycluster"}'
+ values: '1+0x20'
+ - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.31",cluster="mycluster"}'
+ values: '1+0x20'
+ - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.32",cluster="mycluster"}'
+ values: '1+0x20'
+ - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.33",cluster="mycluster"}'
+ values: '1+0x20'
+
promql_expr_test:
- - expr: count(ceph_nvmeof_gateway_info) by (cluster) > 4.00
+ - expr: count(ceph_nvmeof_gateway_info) by (cluster) > 32.00
eval_time: 1m
exp_samples:
- labels: '{cluster="mycluster"}'
- value: 5
+ value: 33
alert_rule_test:
- eval_time: 5m
alertname: NVMeoFTooManyGateways
@@ -2347,7 +2404,7 @@ tests:
type: ceph_default
exp_annotations:
summary: "Max supported gateways exceeded on cluster mycluster"
- description: "You may create many gateways, but 4 is the tested limit"
+ description: "You may create many gateways, but 32 is the tested limit"
# NVMeoFMaxGatewayGroupSize
- interval: 1m
@@ -2362,16 +2419,24 @@ tests:
values: '1+0x20'
- series: 'ceph_nvmeof_gateway_info{group="group-1",addr="1.1.1.12",cluster="mycluster"}'
values: '1+0x20'
+ - series: 'ceph_nvmeof_gateway_info{group="group-1",addr="1.1.1.10",cluster="mycluster"}'
+ values: '1+0x20'
+ - series: 'ceph_nvmeof_gateway_info{group="group-1",addr="1.1.1.14",cluster="mycluster"}'
+ values: '1+0x20'
+ - series: 'ceph_nvmeof_gateway_info{group="group-1",addr="1.1.1.11",cluster="mycluster"}'
+ values: '1+0x20'
+ - series: 'ceph_nvmeof_gateway_info{group="group-1",addr="1.1.1.13",cluster="mycluster"}'
+ values: '1+0x20'
- series: 'ceph_nvmeof_gateway_info{group="group-2",addr="1.1.1.4",cluster="mycluster"}'
values: '1+0x20'
- series: 'ceph_nvmeof_gateway_info{group="group-2",addr="1.1.1.5",cluster="mycluster"}'
values: '1+0x20'
promql_expr_test:
- - expr: count(ceph_nvmeof_gateway_info) by (cluster, group) > 4.00
+ - expr: count(ceph_nvmeof_gateway_info) by (cluster, group) > 8.00
eval_time: 1m
exp_samples:
- labels: '{cluster="mycluster",group="group-1"}'
- value: 5
+ value: 9
alert_rule_test:
- eval_time: 5m
alertname: NVMeoFMaxGatewayGroupSize
@@ -2383,7 +2448,7 @@ tests:
type: ceph_default
exp_annotations:
summary: "Max gateways within a gateway group (group-1) exceeded on cluster mycluster"
- description: "You may create many gateways in a gateway group, but 4 is the tested limit"
+ description: "You may create many gateways in a gateway group, but 8 is the tested limit"
# NVMeoFSingleGatewayGroup
- interval: 1m
@@ -2767,12 +2832,14 @@ tests:
values: '200+0x10'
- series: 'ceph_nvmeof_subsystem_namespace_count{instance="node-1:10008",nqn="nqn10",cluster="mycluster"}'
values: '200+0x10'
+ - series: 'ceph_nvmeof_subsystem_namespace_count{instance="node-1:10008",nqn="nqn11",cluster="mycluster"}'
+ values: '200+0x10'
promql_expr_test:
- - expr: sum by(gateway_host, cluster) (label_replace(ceph_nvmeof_subsystem_namespace_count,"gateway_host","$1","instance","(.*):.*")) > 1024
+ - expr: sum by(gateway_host, cluster) (label_replace(ceph_nvmeof_subsystem_namespace_count,"gateway_host","$1","instance","(.*):.*")) > 2048
eval_time: 1m
exp_samples:
- labels: '{gateway_host="node-1", cluster="mycluster"}'
- value: 2000
+ value: 2200
alert_rule_test:
- eval_time: 5m
alertname: NVMeoFTooManyNamespaces
@@ -2815,15 +2882,15 @@ tests:
- interval: 1m
input_series:
- series: 'ceph_nvmeof_subsystem_host_count{nqn="nqn1",cluster="mycluster"}'
- values: '2 2 2 4 4 8 8 8 10 10 20 20 32 34 34 38 38 40 44 44'
+ values: '2 4 8 10 20 30 40 50 62 74 80 95 100 110 130 130 130 130 130 130'
- series: 'ceph_nvmeof_subsystem_host_count{nqn="nqn2",cluster="mycluster"}'
- values: '2 2 2 8 8 8 16 16 16 16 16 16 16 16 16 16 16 16 16 16'
+ values: '2 8 16 16 16 16 16 16 16 16 20 20 32 34 34 36 37 37 37 37'
promql_expr_test:
- - expr: ceph_nvmeof_subsystem_host_count > 32.00
+ - expr: ceph_nvmeof_subsystem_host_count > 128.00
eval_time: 15m
exp_samples:
- labels: '{__name__="ceph_nvmeof_subsystem_host_count",nqn="nqn1",cluster="mycluster"}'
- value: 38
+ value: 130
alert_rule_test:
- eval_time: 20m
alertname: NVMeoFHighClientCount
@@ -2835,7 +2902,7 @@ tests:
type: ceph_default
exp_annotations:
summary: "The number of clients connected to nqn1 is too high on cluster mycluster"
- description: "The supported limit for clients connecting to a subsystem is 32"
+ description: "The supported limit for clients connecting to a subsystem is 128"
# NVMeoFMissingListener
- interval: 1m