Diffstat (limited to 'monitoring/ceph-mixin')
-rw-r--r-- | monitoring/ceph-mixin/dashboards/host.libsonnet | 27
-rw-r--r-- | monitoring/ceph-mixin/dashboards/osd.libsonnet | 25
-rw-r--r-- | monitoring/ceph-mixin/dashboards_out/host-details.json | 87
-rw-r--r-- | monitoring/ceph-mixin/dashboards_out/osds-overview.json | 85
-rw-r--r-- | monitoring/ceph-mixin/prometheus_alerts.libsonnet | 22
-rw-r--r-- | monitoring/ceph-mixin/prometheus_alerts.yml | 21
-rw-r--r-- | monitoring/ceph-mixin/tests_alerts/test_alerts.yml | 89
-rw-r--r-- | monitoring/ceph-mixin/tox.ini | 8
8 files changed, 346 insertions, 18 deletions
diff --git a/monitoring/ceph-mixin/dashboards/host.libsonnet b/monitoring/ceph-mixin/dashboards/host.libsonnet
index 3e0b31f2c45..4fd35c3ede4 100644
--- a/monitoring/ceph-mixin/dashboards/host.libsonnet
+++ b/monitoring/ceph-mixin/dashboards/host.libsonnet
@@ -318,7 +318,7 @@ local g = import 'grafonnet/grafana.libsonnet';
     .addTemplate(
       $.addTemplateSchema('ceph_hosts',
                           '$datasource',
-                          'label_values({%(clusterMatcher)s}, instance)' % $.matchers(),
+                          if $._config.showMultiCluster then ('label_values({%(clusterMatcher)s}, instance)' % $.matchers()) else 'label_values(instance)',
                           1,
                           false,
                           3,
@@ -719,5 +719,30 @@ local g = import 'grafonnet/grafana.libsonnet';
       11,
       9
     ),
+    $.addTableSchema(
+      '$datasource',
+      'This table shows the 10 hosts with the highest number of slow ops',
+      { col: 2, desc: true },
+      [
+        $.overviewStyle('Instance', 'instance', 'string', 'short'),
+        $.overviewStyle('Slow Ops', 'Value', 'number', 'none'),
+        $.overviewStyle('', '/.*/', 'hidden', 'short'),
+      ],
+      'Top Slow Ops per Host',
+      'table'
+    )
+    .addTarget(
+      $.addTargetSchema(
+        |||
+          topk(10,
+            (sum by (instance)(ceph_daemon_health_metrics{type="SLOW_OPS", ceph_daemon=~"osd.*"}))
+          )
+        ||| % $.matchers(),
+        '',
+        'table',
+        1,
+        true
+      )
+    ) + { gridPos: { x: 0, y: 40, w: 4, h: 8 } },
   ]),
 }
diff --git a/monitoring/ceph-mixin/dashboards/osd.libsonnet b/monitoring/ceph-mixin/dashboards/osd.libsonnet
index 129b74ba666..0ea43c96ff9 100644
--- a/monitoring/ceph-mixin/dashboards/osd.libsonnet
+++ b/monitoring/ceph-mixin/dashboards/osd.libsonnet
@@ -300,6 +300,31 @@ local g = import 'grafonnet/grafana.libsonnet';
       .addTargets([$.addTargetSchema(
         'round(sum(rate(ceph_pool_wr{%(matchers)s}[$__rate_interval])))' % $.matchers(), 'Writes'
       )]),
+      $.addTableSchema(
+        '$datasource',
+        'This table shows the 10 OSDs with the highest number of slow ops',
+        { col: 2, desc: true },
+        [
+          $.overviewStyle('OSD ID', 'ceph_daemon', 'string', 'short'),
+          $.overviewStyle('Slow Ops', 'Value', 'number', 'none'),
+          $.overviewStyle('', '/.*/', 'hidden', 'short'),
+        ],
+        'Top Slow Ops',
+        'table'
+      )
+      .addTarget(
+        $.addTargetSchema(
+          |||
+            topk(10,
+              (ceph_daemon_health_metrics{type="SLOW_OPS", ceph_daemon=~"osd.*"})
+            )
+          ||| % $.matchers(),
+          '',
+          'table',
+          1,
+          true
+        )
+      ) + { gridPos: { x: 0, y: 20, w: 4, h: 8 } },
     ]),
   'osd-device-details.json':
     local OsdDeviceDetailsPanel(title,
diff --git a/monitoring/ceph-mixin/dashboards_out/host-details.json b/monitoring/ceph-mixin/dashboards_out/host-details.json
index 93c51f00941..7a5ac25917a 100644
--- a/monitoring/ceph-mixin/dashboards_out/host-details.json
+++ b/monitoring/ceph-mixin/dashboards_out/host-details.json
@@ -1119,6 +1119,91 @@
                "show": true
             }
          ]
+      },
+      {
+         "columns": [ ],
+         "datasource": "$datasource",
+         "description": "This table shows the 10 hosts with the highest number of slow ops",
+         "gridPos": {
+            "h": 8,
+            "w": 4,
+            "x": 0,
+            "y": 40
+         },
+         "id": 15,
+         "links": [ ],
+         "sort": {
+            "col": 2,
+            "desc": true
+         },
+         "styles": [
+            {
+               "alias": "Instance",
+               "colorMode": null,
+               "colors": [
+                  "rgba(245, 54, 54, 0.9)",
+                  "rgba(237, 129, 40, 0.89)",
+                  "rgba(50, 172, 45, 0.97)"
+               ],
+               "dateFormat": "YYYY-MM-DD HH:mm:ss",
+               "decimals": 2,
+               "mappingType": 1,
+               "pattern": "instance",
+               "thresholds": [ ],
+               "type": "string",
+               "unit": "short",
+               "valueMaps": [ ]
+            },
+            {
+               "alias": "Slow Ops",
+               "colorMode": null,
+               "colors": [
+                  "rgba(245, 54, 54, 0.9)",
+                  "rgba(237, 129, 40, 0.89)",
+                  "rgba(50, 172, 45, 0.97)"
+               ],
+               "dateFormat": "YYYY-MM-DD HH:mm:ss",
+               "decimals": 2,
+               "mappingType": 1,
+               "pattern": "Value",
+               "thresholds": [ ],
+               "type": "number",
+               "unit": "none",
+               "valueMaps": [ ]
+            },
+            {
+               "alias": "",
+               "colorMode": null,
+               "colors": [
+                  "rgba(245, 54, 54, 0.9)",
+                  "rgba(237, 129, 40, 0.89)",
+                  "rgba(50, 172, 45, 0.97)"
+               ],
+               "dateFormat": "YYYY-MM-DD HH:mm:ss",
+               "decimals": 2,
+               "mappingType": 1,
+               "pattern": "/.*/",
+               "thresholds": [ ],
+               "type": "hidden",
+               "unit": "short",
+               "valueMaps": [ ]
+            }
+         ],
+         "targets": [
+            {
+               "expr": "topk(10,\n (sum by (instance)(ceph_daemon_health_metrics{type=\"SLOW_OPS\", ceph_daemon=~\"osd.*\"}))\n)\n",
+               "format": "table",
+               "instant": true,
+               "intervalFactor": 1,
+               "legendFormat": "",
+               "refId": "A"
+            }
+         ],
+         "timeFrom": null,
+         "timeShift": null,
+         "title": "Top Slow Ops per Host",
+         "transform": "table",
+         "type": "table"
       }
    ],
    "refresh": "30s",
@@ -1195,7 +1280,7 @@
             "multi": false,
             "name": "ceph_hosts",
             "options": [ ],
-            "query": "label_values({}, instance)",
+            "query": "label_values(instance)",
             "refresh": 1,
             "regex": "([^.:]*).*",
             "sort": 3,
diff --git a/monitoring/ceph-mixin/dashboards_out/osds-overview.json b/monitoring/ceph-mixin/dashboards_out/osds-overview.json
index 5ea8955b297..b34c6642263 100644
--- a/monitoring/ceph-mixin/dashboards_out/osds-overview.json
+++ b/monitoring/ceph-mixin/dashboards_out/osds-overview.json
@@ -860,6 +860,91 @@
                "show": true
             }
          ]
+      },
+      {
+         "columns": [ ],
+         "datasource": "$datasource",
+         "description": "This table shows the 10 OSDs with the highest number of slow ops",
+         "gridPos": {
+            "h": 8,
+            "w": 4,
+            "x": 0,
+            "y": 20
+         },
+         "id": 13,
+         "links": [ ],
+         "sort": {
+            "col": 2,
+            "desc": true
+         },
+         "styles": [
+            {
+               "alias": "OSD ID",
+               "colorMode": null,
+               "colors": [
+                  "rgba(245, 54, 54, 0.9)",
+                  "rgba(237, 129, 40, 0.89)",
+                  "rgba(50, 172, 45, 0.97)"
+               ],
+               "dateFormat": "YYYY-MM-DD HH:mm:ss",
+               "decimals": 2,
+               "mappingType": 1,
+               "pattern": "ceph_daemon",
+               "thresholds": [ ],
+               "type": "string",
+               "unit": "short",
+               "valueMaps": [ ]
+            },
+            {
+               "alias": "Slow Ops",
+               "colorMode": null,
+               "colors": [
+                  "rgba(245, 54, 54, 0.9)",
+                  "rgba(237, 129, 40, 0.89)",
+                  "rgba(50, 172, 45, 0.97)"
+               ],
+               "dateFormat": "YYYY-MM-DD HH:mm:ss",
+               "decimals": 2,
+               "mappingType": 1,
+               "pattern": "Value",
+               "thresholds": [ ],
+               "type": "number",
+               "unit": "none",
+               "valueMaps": [ ]
+            },
+            {
+               "alias": "",
+               "colorMode": null,
+               "colors": [
+                  "rgba(245, 54, 54, 0.9)",
+                  "rgba(237, 129, 40, 0.89)",
+                  "rgba(50, 172, 45, 0.97)"
+               ],
+               "dateFormat": "YYYY-MM-DD HH:mm:ss",
+               "decimals": 2,
+               "mappingType": 1,
+               "pattern": "/.*/",
+               "thresholds": [ ],
+               "type": "hidden",
+               "unit": "short",
+               "valueMaps": [ ]
+            }
+         ],
+         "targets": [
+            {
+               "expr": "topk(10,\n (ceph_daemon_health_metrics{type=\"SLOW_OPS\", ceph_daemon=~\"osd.*\"})\n)\n",
+               "format": "table",
+               "instant": true,
+               "intervalFactor": 1,
+               "legendFormat": "",
+               "refId": "A"
+            }
+         ],
+         "timeFrom": null,
+         "timeShift": null,
+         "title": "Top Slow Ops",
+         "transform": "table",
+         "type": "table"
       }
    ],
    "refresh": "30s",
diff --git a/monitoring/ceph-mixin/prometheus_alerts.libsonnet b/monitoring/ceph-mixin/prometheus_alerts.libsonnet
index bed89a87906..b7ec0da2f04 100644
--- a/monitoring/ceph-mixin/prometheus_alerts.libsonnet
+++ b/monitoring/ceph-mixin/prometheus_alerts.libsonnet
@@ -552,6 +552,17 @@
           },
         },
         {
+          alert: 'CephNodeNetworkBondDegraded',
+          expr: |||
+            node_bonding_slaves - node_bonding_active != 0
+          |||,
+          labels: { severity: 'warning', type: 'ceph_default' },
+          annotations: {
+            summary: 'Degraded Bond on Node {{ $labels.instance }}%(cluster)s' % $.MultiClusterSummary(),
+            description: 'Bond {{ $labels.master }} is degraded on Node {{ $labels.instance }}.',
+          },
+        },
+        {
           alert: 'CephNodeDiskspaceWarning',
           expr: 'predict_linear(node_filesystem_free_bytes{device=~"/.*"}[2d], 3600 * 24 * 5) *on(instance) group_left(nodename) node_uname_info < 0',
           labels: { severity: 'warning', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.8.4' },
@@ -629,6 +640,17 @@
            description: '{{ $value }} OSD requests are taking too long to process (osd_op_complaint_time exceeded)',
          },
        },
+       {
+         alert: 'CephDaemonSlowOps',
+         'for': '30s',
+         expr: 'ceph_daemon_health_metrics{type="SLOW_OPS"} > 0',
+         labels: { severity: 'warning', type: 'ceph_default' },
+         annotations: {
+           documentation: 'https://docs.ceph.com/en/latest/rados/operations/health-checks#slow-ops',
+           summary: '{{ $labels.ceph_daemon }} operations are slow to complete',
+           description: '{{ $labels.ceph_daemon }} operations are taking too long to process (complaint time exceeded)',
+         },
+       },
      ],
    },
    {
diff --git a/monitoring/ceph-mixin/prometheus_alerts.yml b/monitoring/ceph-mixin/prometheus_alerts.yml
index a544d41eb0e..4a3e6acf389 100644
--- a/monitoring/ceph-mixin/prometheus_alerts.yml
+++ b/monitoring/ceph-mixin/prometheus_alerts.yml
@@ -495,6 +495,15 @@ groups:
           oid: "1.3.6.1.4.1.50495.1.2.1.8.3"
           severity: "warning"
           type: "ceph_default"
+      - alert: "CephNodeNetworkBondDegraded"
+        annotations:
+          summary: "Degraded Bond on Node {{ $labels.instance }}"
+          description: "Bond {{ $labels.master }} is degraded on Node {{ $labels.instance }}."
+        expr: |
+          node_bonding_slaves - node_bonding_active != 0
+        labels:
+          severity: "warning"
+          type: "ceph_default"
       - alert: "CephNodeDiskspaceWarning"
         annotations:
           description: "Mountpoint {{ $labels.mountpoint }} on {{ $labels.nodename }} will be full in less than 5 days based on the 48 hour trailing fill rate."
@@ -518,7 +527,7 @@
         annotations:
           description: "Pool '{{ $labels.name }}' will be full in less than 5 days assuming the average fill-up rate of the past 48 hours."
           summary: "Pool growth rate may soon exceed capacity"
-        expr: "(predict_linear(ceph_pool_percent_used[2d], 3600 * 24 * 5) * on(pool_id) group_right ceph_pool_metadata) >= 95"
+        expr: "(predict_linear(ceph_pool_percent_used[2d], 3600 * 24 * 5) * on(pool_id, instance) group_right() ceph_pool_metadata) >= 95"
         labels:
           oid: "1.3.6.1.4.1.50495.1.2.1.9.2"
           severity: "warning"
@@ -563,6 +572,16 @@
         labels:
           severity: "warning"
           type: "ceph_default"
+      - alert: "CephDaemonSlowOps"
+        for: "30s"
+        expr: "ceph_daemon_health_metrics{type=\"SLOW_OPS\"} > 0"
+        labels:
+          severity: 'warning'
+          type: 'ceph_default'
+        annotations:
+          summary: "{{ $labels.ceph_daemon }} operations are slow to complete"
+          description: "{{ $labels.ceph_daemon }} operations are taking too long to process (complaint time exceeded)"
+          documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#slow-ops"
 - name: "cephadm"
   rules:
   - alert: "CephadmUpgradeFailed"
diff --git a/monitoring/ceph-mixin/tests_alerts/test_alerts.yml b/monitoring/ceph-mixin/tests_alerts/test_alerts.yml
index 7b7e7db7301..1aaea88e7f5 100644
--- a/monitoring/ceph-mixin/tests_alerts/test_alerts.yml
+++ b/monitoring/ceph-mixin/tests_alerts/test_alerts.yml
@@ -470,6 +470,37 @@ tests:
             summary: One or more NICs reports packet errors
             description: "Node node-exporter experiences packet errors > 0.01% or > 10 packets/s on interface eth0."

+ # Bond is missing a peer
+ - interval: 1m
+   input_series:
+    - series: 'node_bonding_active{master="bond0",
+      instance="node-exporter",job="node-exporter"}'
+      values: '3'
+    - series: 'node_bonding_slaves{master="bond0",
+      instance="node-exporter",job="node-exporter"}'
+      values: '4'
+   promql_expr_test:
+     - expr: |
+         node_bonding_slaves - node_bonding_active != 0
+       eval_time: 5m
+       exp_samples:
+         - labels: '{master="bond0", instance="node-exporter",
+           job="node-exporter"}'
+           value: 1
+   alert_rule_test:
+     - eval_time: 5m
+       alertname: CephNodeNetworkBondDegraded
+       exp_alerts:
+         - exp_labels:
+             master: bond0
+             instance: node-exporter
+             job: node-exporter
+             severity: warning
+             type: ceph_default
+           exp_annotations:
+             summary: Degraded Bond on Node node-exporter
+             description: "Bond bond0 is degraded on Node node-exporter."
+
  # Node Storage disk space filling up
  - interval: 1m
    # 20GB = 21474836480, 256MB = 268435456
@@ -679,6 +710,33 @@
             summary: OSD operations are slow to complete
             description: "1 OSD requests are taking too long to process (osd_op_complaint_time exceeded)"

+ # slow daemon ops
+ - interval : 1m
+   input_series:
+    - series: 'ceph_daemon_health_metrics{ceph_daemon="osd.1", instance="ceph:9283",job="ceph", type="SLOW_OPS"}'
+      values: '1+0x120'
+   promql_expr_test:
+     - expr: 'ceph_daemon_health_metrics{type="SLOW_OPS"} > 0'
+       eval_time: 1m
+       exp_samples:
+         - labels: '{__name__="ceph_daemon_health_metrics", ceph_daemon="osd.1",instance="ceph:9283",
+           job="ceph", type="SLOW_OPS"}'
+           value: 1
+   alert_rule_test:
+     - eval_time: 20m
+       alertname: CephDaemonSlowOps
+       exp_alerts:
+         - exp_labels:
+             instance: ceph:9283
+             ceph_daemon: "osd.1"
+             job: ceph
+             severity: warning
+             type: ceph_default
+           exp_annotations:
+             documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#slow-ops
+             summary: osd.1 operations are slow to complete
+             description: "osd.1 operations are taking too long to process (complaint time exceeded)"
+
  # CEPHADM orchestrator alert triggers
  - interval: 30s
    input_series:
@@ -1472,35 +1530,44 @@
  # trigger percent full prediction on pools 1 and 2 only
  - interval: 12h
    input_series:
-    - series: 'ceph_pool_percent_used{pool_id="1"}'
-      values: '70 75 80 87 92'
-    - series: 'ceph_pool_percent_used{pool_id="2"}'
+    - series: 'ceph_pool_percent_used{pool_id="1", instance="9090"}'
+      values: '1 1 1 1 1'
+    - series: 'ceph_pool_percent_used{pool_id="1", instance="8090"}'
+      values: '78 89 79 98 78'
+    - series: 'ceph_pool_percent_used{pool_id="2", instance="9090"}'
+      values: '1 1 1 1 1'
+    - series: 'ceph_pool_percent_used{pool_id="2", instance="8090"}'
       values: '22 22 23 23 24'
-    - series: 'ceph_pool_metadata{pool_id="1",name="rbd",type="replicated"}'
+    - series: 'ceph_pool_metadata{pool_id="1" , instance="9090" ,name="rbd",type="replicated"}'
+      values: '1 1 1 1 1'
+    - series: 'ceph_pool_metadata{pool_id="1", instance="8090",name="default.rgw.index",type="replicated"}'
+      values: '1 1 1 1 1'
+    - series: 'ceph_pool_metadata{pool_id="2" , instance="9090" ,name="rbd",type="replicated"}'
       values: '1 1 1 1 1'
-    - series: 'ceph_pool_metadata{pool_id="2",name="default.rgw.index",type="replicated"}'
+    - series: 'ceph_pool_metadata{pool_id="2", instance="8090",name="default.rgw.index",type="replicated"}'
       values: '1 1 1 1 1'
    promql_expr_test:
      - expr: |
-         (predict_linear(ceph_pool_percent_used[2d], 3600 * 24 * 5) * on(pool_id)
-           group_right ceph_pool_metadata) >= 95
+         (predict_linear(ceph_pool_percent_used[2d], 3600 * 24 * 5) * on(pool_id, instance)
+           group_right() ceph_pool_metadata) >= 95
        eval_time: 36h
        exp_samples:
-         - labels: '{name="rbd",pool_id="1",type="replicated"}'
-           value: 1.424E+02 # 142%
+         - labels: '{instance="8090",name="default.rgw.index",pool_id="1",type="replicated"}'
+           value: 1.435E+02 # 142%
    alert_rule_test:
      - eval_time: 48h
        alertname: CephPoolGrowthWarning
        exp_alerts:
          - exp_labels:
-             name: rbd
+             instance: 8090
+             name: default.rgw.index
              pool_id: 1
              severity: warning
              type: ceph_default
              oid: 1.3.6.1.4.1.50495.1.2.1.9.2
            exp_annotations:
              summary: Pool growth rate may soon exceed capacity
-             description: Pool 'rbd' will be full in less than 5 days assuming the average fill-up rate of the past 48 hours.
+             description: Pool 'default.rgw.index' will be full in less than 5 days assuming the average fill-up rate of the past 48 hours.
  - interval: 1m
    input_series:
     - series: 'ceph_health_detail{name="POOL_BACKFILLFULL"}'
diff --git a/monitoring/ceph-mixin/tox.ini b/monitoring/ceph-mixin/tox.ini
index df5bb46ded9..d0d97fc98b9 100644
--- a/monitoring/ceph-mixin/tox.ini
+++ b/monitoring/ceph-mixin/tox.ini
@@ -8,7 +8,7 @@ envlist =
 skipsdist = true

 [testenv:jsonnet-bundler-{install,update}]
-whitelist_externals =
+allowlist_externals =
   jb
 description =
   install: Install the jsonnet dependencies
@@ -19,7 +19,7 @@ commands =

 [testenv:jsonnet-{check,fix,lint}]
 basepython = python3
-whitelist_externals =
+allowlist_externals =
   find
   jb
   jsonnet
@@ -56,7 +56,7 @@ deps =
   -rrequirements-lint.txt
 depends = grafonnet-check
 setenv =
-whitelist_externals =
+allowlist_externals =
   promtool
 commands =
   behave tests_dashboards/features
@@ -66,7 +66,7 @@ deps =
   -rrequirements-alerts.txt
   pytest
 depends = grafonnet-check
-whitelist_externals =
+allowlist_externals =
   promtool
 commands =
   fix: jsonnet -J vendor -S alerts.jsonnet -o prometheus_alerts.yml