From f3c18818b9b937550f503d94ca37f75639db5514 Mon Sep 17 00:00:00 2001 From: Vallari Agrawal Date: Thu, 9 Jan 2025 16:24:33 +0530 Subject: monitoring: Update nvmeof alert limits in config Update these in config.libsonnet: - NVMeoFMaxGatewaysPerGroup (4->8) - NVMeoFMaxGatewaysPerCluster (4->32) - NVMeoFMaxNamespaces (1024->2048) - NVMeoFHighClientCount (32->128) Also update prometheus_alerts.yml and test_alerts.yml accordingly. Signed-off-by: Vallari Agrawal --- monitoring/ceph-mixin/config.libsonnet | 8 +- monitoring/ceph-mixin/prometheus_alerts.yml | 14 ++-- monitoring/ceph-mixin/tests_alerts/test_alerts.yml | 93 +++++++++++++++++++--- 3 files changed, 91 insertions(+), 24 deletions(-) diff --git a/monitoring/ceph-mixin/config.libsonnet b/monitoring/ceph-mixin/config.libsonnet index a15b88422fc..e917b4c2dac 100644 --- a/monitoring/ceph-mixin/config.libsonnet +++ b/monitoring/ceph-mixin/config.libsonnet @@ -9,12 +9,12 @@ CephNodeNetworkPacketDropsPerSec: 10, CephRBDMirrorImageTransferBandwidthThreshold: 0.8, CephRBDMirrorImagesPerDaemonThreshold: 100, - NVMeoFMaxGatewaysPerGroup: 4, - NVMeoFMaxGatewaysPerCluster: 4, + NVMeoFMaxGatewaysPerGroup: 8, + NVMeoFMaxGatewaysPerCluster: 32, NVMeoFHighGatewayCPU: 80, NVMeoFMaxSubsystemsPerGateway: 128, - NVMeoFMaxNamespaces: 1024, - NVMeoFHighClientCount: 32, + NVMeoFMaxNamespaces: 2048, + NVMeoFHighClientCount: 128, NVMeoFHighHostCPU: 80, // // Read/Write latency is defined in ms diff --git a/monitoring/ceph-mixin/prometheus_alerts.yml b/monitoring/ceph-mixin/prometheus_alerts.yml index 3440d761351..7c0da4d51a4 100644 --- a/monitoring/ceph-mixin/prometheus_alerts.yml +++ b/monitoring/ceph-mixin/prometheus_alerts.yml @@ -776,18 +776,18 @@ groups: type: "ceph_default" - alert: "NVMeoFTooManyGateways" annotations: - description: "You may create many gateways, but 4 is the tested limit" + description: "You may create many gateways, but 32 is the tested limit" summary: "Max supported gateways exceeded on cluster {{ $labels.cluster }}" - expr: "count(ceph_nvmeof_gateway_info) by (cluster) > 4.00" + expr: "count(ceph_nvmeof_gateway_info) by (cluster) > 32.00" for: "1m" labels: severity: "warning" type: "ceph_default" - alert: "NVMeoFMaxGatewayGroupSize" annotations: - description: "You may create many gateways in a gateway group, but 4 is the tested limit" + description: "You may create many gateways in a gateway group, but 8 is the tested limit" summary: "Max gateways within a gateway group ({{ $labels.group }}) exceeded on cluster {{ $labels.cluster }}" - expr: "count(ceph_nvmeof_gateway_info) by (cluster,group) > 4.00" + expr: "count(ceph_nvmeof_gateway_info) by (cluster,group) > 8.00" for: "1m" labels: severity: "warning" @@ -832,7 +832,7 @@ groups: annotations: description: "Although you may continue to create namespaces in {{ $labels.gateway_host }}, the configuration may not be supported" summary: "The number of namespaces defined to the gateway exceeds supported values on cluster {{ $labels.cluster }}" - expr: "sum by(gateway_host, cluster) (label_replace(ceph_nvmeof_subsystem_namespace_count,\"gateway_host\",\"$1\",\"instance\",\"(.*?)(?::.*)?\")) > 1024.00" + expr: "sum by(gateway_host, cluster) (label_replace(ceph_nvmeof_subsystem_namespace_count,\"gateway_host\",\"$1\",\"instance\",\"(.*?)(?::.*)?\")) > 2048.00" for: "1m" labels: severity: "warning" @@ -848,9 +848,9 @@ groups: type: "ceph_default" - alert: "NVMeoFHighClientCount" annotations: - description: "The supported limit for clients connecting to a subsystem is 32" + description: "The supported limit for clients connecting to a subsystem is 128" summary: "The number of clients connected to {{ $labels.nqn }} is too high on cluster {{ $labels.cluster }}" - expr: "ceph_nvmeof_subsystem_host_count > 32.00" + expr: "ceph_nvmeof_subsystem_host_count > 128.00" for: "1m" labels: severity: "warning" diff --git a/monitoring/ceph-mixin/tests_alerts/test_alerts.yml b/monitoring/ceph-mixin/tests_alerts/test_alerts.yml index b3b29308d08..83b4ff80375 100644 --- a/monitoring/ceph-mixin/tests_alerts/test_alerts.yml +++ b/monitoring/ceph-mixin/tests_alerts/test_alerts.yml @@ -2331,12 +2331,69 @@ tests: values: '1+0x20' - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.5",cluster="mycluster"}' values: '1+0x20' + - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.6",cluster="mycluster"}' + values: '1+0x20' + - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.7",cluster="mycluster"}' + values: '1+0x20' + - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.8",cluster="mycluster"}' + values: '1+0x20' + - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.9",cluster="mycluster"}' + values: '1+0x20' + - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.10",cluster="mycluster"}' + values: '1+0x20' + - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.11",cluster="mycluster"}' + values: '1+0x20' + - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.12",cluster="mycluster"}' + values: '1+0x20' + - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.13",cluster="mycluster"}' + values: '1+0x20' + - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.14",cluster="mycluster"}' + values: '1+0x20' + - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.15",cluster="mycluster"}' + values: '1+0x20' + - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.16",cluster="mycluster"}' + values: '1+0x20' + - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.17",cluster="mycluster"}' + values: '1+0x20' + - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.18",cluster="mycluster"}' + values: '1+0x20' + - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.19",cluster="mycluster"}' + values: '1+0x20' + - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.20",cluster="mycluster"}' + values: '1+0x20' + - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.21",cluster="mycluster"}' + values: '1+0x20' + - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.22",cluster="mycluster"}' + values: '1+0x20' + - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.23",cluster="mycluster"}' + values: '1+0x20' + - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.24",cluster="mycluster"}' + values: '1+0x20' + - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.25",cluster="mycluster"}' + values: '1+0x20' + - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.26",cluster="mycluster"}' + values: '1+0x20' + - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.27",cluster="mycluster"}' + values: '1+0x20' + - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.28",cluster="mycluster"}' + values: '1+0x20' + - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.29",cluster="mycluster"}' + values: '1+0x20' + - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.30",cluster="mycluster"}' + values: '1+0x20' + - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.31",cluster="mycluster"}' + values: '1+0x20' + - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.32",cluster="mycluster"}' + values: '1+0x20' + - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.33",cluster="mycluster"}' + values: '1+0x20' + promql_expr_test: - - expr: count(ceph_nvmeof_gateway_info) by (cluster) > 4.00 + - expr: count(ceph_nvmeof_gateway_info) by (cluster) > 32.00 eval_time: 1m exp_samples: - labels: '{cluster="mycluster"}' - value: 5 + value: 33 alert_rule_test: - eval_time: 5m alertname: NVMeoFTooManyGateways @@ -2347,7 +2404,7 @@ tests: type: ceph_default exp_annotations: summary: "Max supported gateways exceeded on cluster mycluster" - description: "You may create many gateways, but 4 is the tested limit" + description: "You may create many gateways, but 32 is the tested limit" # NVMeoFMaxGatewayGroupSize - interval: 1m @@ -2362,16 +2419,24 @@ tests: values: '1+0x20' - series: 'ceph_nvmeof_gateway_info{group="group-1",addr="1.1.1.12",cluster="mycluster"}' values: '1+0x20' + - series: 'ceph_nvmeof_gateway_info{group="group-1",addr="1.1.1.10",cluster="mycluster"}' + values: '1+0x20' + - series: 'ceph_nvmeof_gateway_info{group="group-1",addr="1.1.1.14",cluster="mycluster"}' + values: '1+0x20' + - series: 'ceph_nvmeof_gateway_info{group="group-1",addr="1.1.1.11",cluster="mycluster"}' + values: '1+0x20' + - series: 'ceph_nvmeof_gateway_info{group="group-1",addr="1.1.1.13",cluster="mycluster"}' + values: '1+0x20' - series: 'ceph_nvmeof_gateway_info{group="group-2",addr="1.1.1.4",cluster="mycluster"}' values: '1+0x20' - series: 'ceph_nvmeof_gateway_info{group="group-2",addr="1.1.1.5",cluster="mycluster"}' values: '1+0x20' promql_expr_test: - - expr: count(ceph_nvmeof_gateway_info) by (cluster, group) > 4.00 + - expr: count(ceph_nvmeof_gateway_info) by (cluster, group) > 8.00 eval_time: 1m exp_samples: - labels: '{cluster="mycluster",group="group-1"}' - value: 5 + value: 9 alert_rule_test: - eval_time: 5m alertname: NVMeoFMaxGatewayGroupSize @@ -2383,7 +2448,7 @@ tests: type: ceph_default exp_annotations: summary: "Max gateways within a gateway group (group-1) exceeded on cluster mycluster" - description: "You may create many gateways in a gateway group, but 4 is the tested limit" + description: "You may create many gateways in a gateway group, but 8 is the tested limit" # NVMeoFSingleGatewayGroup - interval: 1m @@ -2767,12 +2832,14 @@ tests: values: '200+0x10' - series: 'ceph_nvmeof_subsystem_namespace_count{instance="node-1:10008",nqn="nqn10",cluster="mycluster"}' values: '200+0x10' + - series: 'ceph_nvmeof_subsystem_namespace_count{instance="node-1:10008",nqn="nqn11",cluster="mycluster"}' + values: '200+0x10' promql_expr_test: - - expr: sum by(gateway_host, cluster) (label_replace(ceph_nvmeof_subsystem_namespace_count,"gateway_host","$1","instance","(.*):.*")) > 1024 + - expr: sum by(gateway_host, cluster) (label_replace(ceph_nvmeof_subsystem_namespace_count,"gateway_host","$1","instance","(.*):.*")) > 2048 eval_time: 1m exp_samples: - labels: '{gateway_host="node-1", cluster="mycluster"}' - value: 2000 + value: 2200 alert_rule_test: - eval_time: 5m alertname: NVMeoFTooManyNamespaces @@ -2815,15 +2882,15 @@ tests: - interval: 1m input_series: - series: 'ceph_nvmeof_subsystem_host_count{nqn="nqn1",cluster="mycluster"}' - values: '2 2 2 4 4 8 8 8 10 10 20 20 32 34 34 38 38 40 44 44' + values: '2 4 8 10 20 30 40 50 62 74 80 95 100 110 130 130 130 130 130 130' - series: 'ceph_nvmeof_subsystem_host_count{nqn="nqn2",cluster="mycluster"}' - values: '2 2 2 8 8 8 16 16 16 16 16 16 16 16 16 16 16 16 16 16' + values: '2 8 16 16 16 16 16 16 16 16 20 20 32 34 34 36 37 37 37 37' promql_expr_test: - - expr: ceph_nvmeof_subsystem_host_count > 32.00 + - expr: ceph_nvmeof_subsystem_host_count > 128.00 eval_time: 15m exp_samples: - labels: '{__name__="ceph_nvmeof_subsystem_host_count",nqn="nqn1",cluster="mycluster"}' - value: 38 + value: 130 alert_rule_test: - eval_time: 20m alertname: NVMeoFHighClientCount @@ -2835,7 +2902,7 @@ tests: type: ceph_default exp_annotations: summary: "The number of clients connected to nqn1 is too high on cluster mycluster" - description: "The supported limit for clients connecting to a subsystem is 32" + description: "The supported limit for clients connecting to a subsystem is 128" # NVMeoFMissingListener - interval: 1m -- cgit v1.2.3