Diffstat (limited to 'monitoring/ceph-mixin')
-rw-r--r-- | monitoring/ceph-mixin/dashboards/host.libsonnet | 27
-rw-r--r-- | monitoring/ceph-mixin/dashboards/osd.libsonnet | 25
-rw-r--r-- | monitoring/ceph-mixin/dashboards_out/host-details.json | 87
-rw-r--r-- | monitoring/ceph-mixin/dashboards_out/osds-overview.json | 85
-rw-r--r-- | monitoring/ceph-mixin/prometheus_alerts.libsonnet | 22
-rw-r--r-- | monitoring/ceph-mixin/prometheus_alerts.yml | 21
-rw-r--r-- | monitoring/ceph-mixin/tests_alerts/test_alerts.yml | 89
-rw-r--r-- | monitoring/ceph-mixin/tox.ini | 8
8 files changed, 346 insertions, 18 deletions
diff --git a/monitoring/ceph-mixin/dashboards/host.libsonnet b/monitoring/ceph-mixin/dashboards/host.libsonnet
index 3e0b31f2c45..4fd35c3ede4 100644
--- a/monitoring/ceph-mixin/dashboards/host.libsonnet
+++ b/monitoring/ceph-mixin/dashboards/host.libsonnet
@@ -318,7 +318,7 @@ local g = import 'grafonnet/grafana.libsonnet';
     .addTemplate(
       $.addTemplateSchema('ceph_hosts',
                           '$datasource',
-                          'label_values({%(clusterMatcher)s}, instance)' % $.matchers(),
+                          if $._config.showMultiCluster then ('label_values({%(clusterMatcher)s}, instance)' % $.matchers()) else 'label_values(instance)',
                           1,
                           false,
                           3,
@@ -719,5 +719,30 @@ local g = import 'grafonnet/grafana.libsonnet';
       11,
       9
     ),
+    $.addTableSchema(
+      '$datasource',
+      'This table shows the 10 hosts with the highest number of slow ops',
+      { col: 2, desc: true },
+      [
+        $.overviewStyle('Instance', 'instance', 'string', 'short'),
+        $.overviewStyle('Slow Ops', 'Value', 'number', 'none'),
+        $.overviewStyle('', '/.*/', 'hidden', 'short'),
+      ],
+      'Top Slow Ops per Host',
+      'table'
+    )
+    .addTarget(
+      $.addTargetSchema(
+        |||
+          topk(10,
+            (sum by (instance)(ceph_daemon_health_metrics{type="SLOW_OPS", ceph_daemon=~"osd.*"}))
+          )
+        ||| % $.matchers(),
+        '',
+        'table',
+        1,
+        true
+      )
+    ) + { gridPos: { x: 0, y: 40, w: 4, h: 8 } },
   ]),
 }
diff --git a/monitoring/ceph-mixin/dashboards/osd.libsonnet b/monitoring/ceph-mixin/dashboards/osd.libsonnet
index 129b74ba666..0ea43c96ff9 100644
--- a/monitoring/ceph-mixin/dashboards/osd.libsonnet
+++ b/monitoring/ceph-mixin/dashboards/osd.libsonnet
@@ -300,6 +300,31 @@ local g = import 'grafonnet/grafana.libsonnet';
       .addTargets([$.addTargetSchema(
         'round(sum(rate(ceph_pool_wr{%(matchers)s}[$__rate_interval])))' % $.matchers(), 'Writes'
       )]),
+      $.addTableSchema(
+        '$datasource',
+        'This table shows the 10 OSDs with the highest number of slow ops',
+        { col: 2, desc: true },
+        [
+          $.overviewStyle('OSD ID', 'ceph_daemon', 'string', 'short'),
+          $.overviewStyle('Slow Ops', 'Value', 'number', 'none'),
+          $.overviewStyle('', '/.*/', 'hidden', 'short'),
+        ],
+        'Top Slow Ops',
+        'table'
+      )
+      .addTarget(
+        $.addTargetSchema(
+          |||
+            topk(10,
+              (ceph_daemon_health_metrics{type="SLOW_OPS", ceph_daemon=~"osd.*"})
+            )
+          ||| % $.matchers(),
+          '',
+          'table',
+          1,
+          true
+        )
+      ) + { gridPos: { x: 0, y: 20, w: 4, h: 8 } },
     ]),
   'osd-device-details.json':
     local OsdDeviceDetailsPanel(title,
diff --git a/monitoring/ceph-mixin/dashboards_out/host-details.json b/monitoring/ceph-mixin/dashboards_out/host-details.json
index 93c51f00941..7a5ac25917a 100644
--- a/monitoring/ceph-mixin/dashboards_out/host-details.json
+++ b/monitoring/ceph-mixin/dashboards_out/host-details.json
@@ -1119,6 +1119,91 @@
                "show": true
             }
          ]
+      },
+      {
+         "columns": [ ],
+         "datasource": "$datasource",
+         "description": "This table shows the 10 hosts with the highest number of slow ops",
+         "gridPos": {
+            "h": 8,
+            "w": 4,
+            "x": 0,
+            "y": 40
+         },
+         "id": 15,
+         "links": [ ],
+         "sort": {
+            "col": 2,
+            "desc": true
+         },
+         "styles": [
+            {
+               "alias": "Instance",
+               "colorMode": null,
+               "colors": [
+                  "rgba(245, 54, 54, 0.9)",
+                  "rgba(237, 129, 40, 0.89)",
+                  "rgba(50, 172, 45, 0.97)"
+               ],
+               "dateFormat": "YYYY-MM-DD HH:mm:ss",
+               "decimals": 2,
+               "mappingType": 1,
+               "pattern": "instance",
+               "thresholds": [ ],
+               "type": "string",
+               "unit": "short",
+               "valueMaps": [ ]
+            },
+            {
+               "alias": "Slow Ops",
+               "colorMode": null,
+               "colors": [
+                  "rgba(245, 54, 54, 0.9)",
+                  "rgba(237, 129, 40, 0.89)",
+                  "rgba(50, 172, 45, 0.97)"
+               ],
+               "dateFormat": "YYYY-MM-DD HH:mm:ss",
+               "decimals": 2,
+               "mappingType": 1,
+               "pattern": "Value",
+               "thresholds": [ ],
+               "type": "number",
+               "unit": "none",
+               "valueMaps": [ ]
+            },
+            {
+               "alias": "",
+               "colorMode": null,
+               "colors": [
+                  "rgba(245, 54, 54, 0.9)",
+                  "rgba(237, 129, 40, 0.89)",
+                  "rgba(50, 172, 45, 0.97)"
+               ],
+               "dateFormat": "YYYY-MM-DD HH:mm:ss",
+               "decimals": 2,
+               "mappingType": 1,
+               "pattern": "/.*/",
+               "thresholds": [ ],
+               "type": "hidden",
+               "unit": "short",
+               "valueMaps": [ ]
+            }
+         ],
+         "targets": [
+            {
+               "expr": "topk(10,\n (sum by (instance)(ceph_daemon_health_metrics{type=\"SLOW_OPS\", ceph_daemon=~\"osd.*\"}))\n)\n",
+               "format": "table",
+               "instant": true,
+               "intervalFactor": 1,
+               "legendFormat": "",
+               "refId": "A"
+            }
+         ],
+         "timeFrom": null,
+         "timeShift": null,
+         "title": "Top Slow Ops per Host",
+         "transform": "table",
+         "type": "table"
       }
    ],
    "refresh": "30s",
@@ -1195,7 +1280,7 @@
             "multi": false,
             "name": "ceph_hosts",
             "options": [ ],
-            "query": "label_values({}, instance)",
+            "query": "label_values(instance)",
             "refresh": 1,
             "regex": "([^.:]*).*",
             "sort": 3,
diff --git a/monitoring/ceph-mixin/dashboards_out/osds-overview.json b/monitoring/ceph-mixin/dashboards_out/osds-overview.json
index 5ea8955b297..b34c6642263 100644
--- a/monitoring/ceph-mixin/dashboards_out/osds-overview.json
+++ b/monitoring/ceph-mixin/dashboards_out/osds-overview.json
@@ -860,6 +860,91 @@
                "show": true
             }
          ]
+      },
+      {
+         "columns": [ ],
+         "datasource": "$datasource",
+         "description": "This table shows the 10 OSDs with the highest number of slow ops",
+         "gridPos": {
+            "h": 8,
+            "w": 4,
+            "x": 0,
+            "y": 20
+         },
+         "id": 13,
+         "links": [ ],
+         "sort": {
+            "col": 2,
+            "desc": true
+         },
+         "styles": [
+            {
+               "alias": "OSD ID",
+               "colorMode": null,
+               "colors": [
+                  "rgba(245, 54, 54, 0.9)",
+                  "rgba(237, 129, 40, 0.89)",
+                  "rgba(50, 172, 45, 0.97)"
+               ],
+               "dateFormat": "YYYY-MM-DD HH:mm:ss",
+               "decimals": 2,
+               "mappingType": 1,
+               "pattern": "ceph_daemon",
+               "thresholds": [ ],
+               "type": "string",
+               "unit": "short",
+               "valueMaps": [ ]
+            },
+            {
+               "alias": "Slow Ops",
+               "colorMode": null,
+               "colors": [
+                  "rgba(245, 54, 54, 0.9)",
+                  "rgba(237, 129, 40, 0.89)",
+                  "rgba(50, 172, 45, 0.97)"
+               ],
+               "dateFormat": "YYYY-MM-DD HH:mm:ss",
+               "decimals": 2,
+               "mappingType": 1,
+               "pattern": "Value",
+               "thresholds": [ ],
+               "type": "number",
+               "unit": "none",
+               "valueMaps": [ ]
+            },
+            {
+               "alias": "",
+               "colorMode": null,
+               "colors": [
+                  "rgba(245, 54, 54, 0.9)",
+                  "rgba(237, 129, 40, 0.89)",
+                  "rgba(50, 172, 45, 0.97)"
+               ],
+               "dateFormat": "YYYY-MM-DD HH:mm:ss",
+               "decimals": 2,
+               "mappingType": 1,
+               "pattern": "/.*/",
+               "thresholds": [ ],
+               "type": "hidden",
+               "unit": "short",
+               "valueMaps": [ ]
+            }
+         ],
+         "targets": [
+            {
+               "expr": "topk(10,\n (ceph_daemon_health_metrics{type=\"SLOW_OPS\", ceph_daemon=~\"osd.*\"})\n)\n",
+               "format": "table",
+               "instant": true,
+               "intervalFactor": 1,
+               "legendFormat": "",
+               "refId": "A"
+            }
+         ],
+         "timeFrom": null,
+         "timeShift": null,
+         "title": "Top Slow Ops",
+         "transform": "table",
+         "type": "table"
       }
    ],
    "refresh": "30s",
diff --git a/monitoring/ceph-mixin/prometheus_alerts.libsonnet b/monitoring/ceph-mixin/prometheus_alerts.libsonnet
index bed89a87906..b7ec0da2f04 100644
--- a/monitoring/ceph-mixin/prometheus_alerts.libsonnet
+++ b/monitoring/ceph-mixin/prometheus_alerts.libsonnet
@@ -552,6 +552,17 @@
           },
         },
         {
+          alert: 'CephNodeNetworkBondDegraded',
+          expr: |||
+            node_bonding_slaves - node_bonding_active != 0
+          |||,
+          labels: { severity: 'warning', type: 'ceph_default' },
+          annotations: {
+            summary: 'Degraded Bond on Node {{ $labels.instance }}%(cluster)s' % $.MultiClusterSummary(),
+            description: 'Bond {{ $labels.master }} is degraded on Node {{ $labels.instance }}.',
+          },
+        },
+        {
           alert: 'CephNodeDiskspaceWarning',
           expr: 'predict_linear(node_filesystem_free_bytes{device=~"/.*"}[2d], 3600 * 24 * 5) *on(instance) group_left(nodename) node_uname_info < 0',
           labels: { severity: 'warning', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.8.4' },
@@ -629,6 +640,17 @@
            description: '{{ $value }} OSD requests are taking too long to process (osd_op_complaint_time exceeded)',
          },
        },
+       {
+         alert: 'CephDaemonSlowOps',
+         'for': '30s',
+         expr: 'ceph_daemon_health_metrics{type="SLOW_OPS"} > 0',
+         labels: { severity: 'warning', type: 'ceph_default' },
+         annotations: {
+           documentation: 'https://docs.ceph.com/en/latest/rados/operations/health-checks#slow-ops',
+           summary: '{{ $labels.ceph_daemon }} operations are slow to complete',
+           description: '{{ $labels.ceph_daemon }} operations are taking too long to process (complaint time exceeded)',
+         },
+       },
      ],
    },
    {
diff --git a/monitoring/ceph-mixin/prometheus_alerts.yml b/monitoring/ceph-mixin/prometheus_alerts.yml
index a544d41eb0e..4a3e6acf389 100644
--- a/monitoring/ceph-mixin/prometheus_alerts.yml
+++ b/monitoring/ceph-mixin/prometheus_alerts.yml
@@ -495,6 +495,15 @@ groups:
           oid: "1.3.6.1.4.1.50495.1.2.1.8.3"
           severity: "warning"
           type: "ceph_default"
+      - alert: "CephNodeNetworkBondDegraded"
+        annotations:
+          summary: "Degraded Bond on Node {{ $labels.instance }}"
+          description: "Bond {{ $labels.master }} is degraded on Node {{ $labels.instance }}."
+        expr: |
+          node_bonding_slaves - node_bonding_active != 0
+        labels:
+          severity: "warning"
+          type: "ceph_default"
       - alert: "CephNodeDiskspaceWarning"
         annotations:
           description: "Mountpoint {{ $labels.mountpoint }} on {{ $labels.nodename }} will be full in less than 5 days based on the 48 hour trailing fill rate."
@@ -518,7 +527,7 @@
         annotations:
           description: "Pool '{{ $labels.name }}' will be full in less than 5 days assuming the average fill-up rate of the past 48 hours."
           summary: "Pool growth rate may soon exceed capacity"
-        expr: "(predict_linear(ceph_pool_percent_used[2d], 3600 * 24 * 5) * on(pool_id) group_right ceph_pool_metadata) >= 95"
+        expr: "(predict_linear(ceph_pool_percent_used[2d], 3600 * 24 * 5) * on(pool_id, instance) group_right() ceph_pool_metadata) >= 95"
         labels:
           oid: "1.3.6.1.4.1.50495.1.2.1.9.2"
           severity: "warning"
@@ -563,6 +572,16 @@
         labels:
           severity: "warning"
           type: "ceph_default"
+      - alert: "CephDaemonSlowOps"
+        for: "30s"
+        expr: "ceph_daemon_health_metrics{type=\"SLOW_OPS\"} > 0"
+        labels:
+          severity: 'warning'
+          type: 'ceph_default'
+        annotations:
+          summary: "{{ $labels.ceph_daemon }} operations are slow to complete"
+          description: "{{ $labels.ceph_daemon }} operations are taking too long to process (complaint time exceeded)"
+          documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#slow-ops"
 - name: "cephadm"
   rules:
   - alert: "CephadmUpgradeFailed"
diff --git a/monitoring/ceph-mixin/tests_alerts/test_alerts.yml b/monitoring/ceph-mixin/tests_alerts/test_alerts.yml
index 7b7e7db7301..1aaea88e7f5 100644
--- a/monitoring/ceph-mixin/tests_alerts/test_alerts.yml
+++ b/monitoring/ceph-mixin/tests_alerts/test_alerts.yml
@@ -470,6 +470,37 @@ tests:
             summary: One or more NICs reports packet errors
             description: "Node node-exporter experiences packet errors > 0.01% or > 10 packets/s on interface eth0."

+ # Bond is missing a peer
+ - interval: 1m
+   input_series:
+    - series: 'node_bonding_active{master="bond0",
+      instance="node-exporter",job="node-exporter"}'
+      values: '3'
+    - series: 'node_bonding_slaves{master="bond0",
+      instance="node-exporter",job="node-exporter"}'
+      values: '4'
+   promql_expr_test:
+     - expr: |
+         node_bonding_slaves - node_bonding_active != 0
+       eval_time: 5m
+       exp_samples:
+         - labels: '{master="bond0", instance="node-exporter",
+           job="node-exporter"}'
+           value: 1
+   alert_rule_test:
+     - eval_time: 5m
+       alertname: CephNodeNetworkBondDegraded
+       exp_alerts:
+         - exp_labels:
+             master: bond0
+             instance: node-exporter
+             job: node-exporter
+             severity: warning
+             type: ceph_default
+           exp_annotations:
+             summary: Degraded Bond on Node node-exporter
+             description: "Bond bond0 is degraded on Node node-exporter."
+
  # Node Storage disk space filling up
  - interval: 1m
    # 20GB = 21474836480, 256MB = 268435456
@@ -679,6 +710,33 @@
             summary: OSD operations are slow to complete
             description: "1 OSD requests are taking too long to process (osd_op_complaint_time exceeded)"

+ # slow daemon ops
+ - interval : 1m
+   input_series:
+    - series: 'ceph_daemon_health_metrics{ceph_daemon="osd.1", instance="ceph:9283",job="ceph", type="SLOW_OPS"}'
+      values: '1+0x120'
+   promql_expr_test:
+     - expr: 'ceph_daemon_health_metrics{type="SLOW_OPS"} > 0'
+       eval_time: 1m
+       exp_samples:
+         - labels: '{__name__="ceph_daemon_health_metrics", ceph_daemon="osd.1",instance="ceph:9283",
+           job="ceph", type="SLOW_OPS"}'
+           value: 1
+   alert_rule_test:
+     - eval_time: 20m
+       alertname: CephDaemonSlowOps
+       exp_alerts:
+         - exp_labels:
+             instance: ceph:9283
+             ceph_daemon: "osd.1"
+             job: ceph
+             severity: warning
+             type: ceph_default
+           exp_annotations:
+             documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#slow-ops
+             summary: osd.1 operations are slow to complete
+             description: "osd.1 operations are taking too long to process (complaint time exceeded)"
+
  # CEPHADM orchestrator alert triggers
  - interval: 30s
    input_series:
@@ -1472,35 +1530,44 @@
  # trigger percent full prediction on pools 1 and 2 only
  - interval: 12h
    input_series:
-    - series: 'ceph_pool_percent_used{pool_id="1"}'
-      values: '70 75 80 87 92'
-    - series: 'ceph_pool_percent_used{pool_id="2"}'
+    - series: 'ceph_pool_percent_used{pool_id="1", instance="9090"}'
+      values: '1 1 1 1 1'
+    - series: 'ceph_pool_percent_used{pool_id="1", instance="8090"}'
+      values: '78 89 79 98 78'
+    - series: 'ceph_pool_percent_used{pool_id="2", instance="9090"}'
+      values: '1 1 1 1 1'
+    - series: 'ceph_pool_percent_used{pool_id="2", instance="8090"}'
       values: '22 22 23 23 24'
-    - series: 'ceph_pool_metadata{pool_id="1",name="rbd",type="replicated"}'
+    - series: 'ceph_pool_metadata{pool_id="1" , instance="9090" ,name="rbd",type="replicated"}'
+      values: '1 1 1 1 1'
+    - series: 'ceph_pool_metadata{pool_id="1", instance="8090",name="default.rgw.index",type="replicated"}'
+      values: '1 1 1 1 1'
+    - series: 'ceph_pool_metadata{pool_id="2" , instance="9090" ,name="rbd",type="replicated"}'
       values: '1 1 1 1 1'
-    - series: 'ceph_pool_metadata{pool_id="2",name="default.rgw.index",type="replicated"}'
+    - series: 'ceph_pool_metadata{pool_id="2", instance="8090",name="default.rgw.index",type="replicated"}'
       values: '1 1 1 1 1'
    promql_expr_test:
      - expr: |
-         (predict_linear(ceph_pool_percent_used[2d], 3600 * 24 * 5) * on(pool_id)
-           group_right ceph_pool_metadata) >= 95
+         (predict_linear(ceph_pool_percent_used[2d], 3600 * 24 * 5) * on(pool_id, instance)
+           group_right() ceph_pool_metadata) >= 95
        eval_time: 36h
        exp_samples:
-         - labels: '{name="rbd",pool_id="1",type="replicated"}'
-           value: 1.424E+02 # 142%
+         - labels: '{instance="8090",name="default.rgw.index",pool_id="1",type="replicated"}'
+           value: 1.435E+02 # 142%
    alert_rule_test:
      - eval_time: 48h
        alertname: CephPoolGrowthWarning
        exp_alerts:
          - exp_labels:
-             name: rbd
+             instance: 8090
+             name: default.rgw.index
              pool_id: 1
              severity: warning
              type: ceph_default
              oid: 1.3.6.1.4.1.50495.1.2.1.9.2
            exp_annotations:
              summary: Pool growth rate may soon exceed capacity
-             description: Pool 'rbd' will be full in less than 5 days assuming the average fill-up rate of the past 48 hours.
+             description: Pool 'default.rgw.index' will be full in less than 5 days assuming the average fill-up rate of the past 48 hours.
  - interval: 1m
    input_series:
     - series: 'ceph_health_detail{name="POOL_BACKFILLFULL"}'
diff --git a/monitoring/ceph-mixin/tox.ini b/monitoring/ceph-mixin/tox.ini
index df5bb46ded9..d0d97fc98b9 100644
--- a/monitoring/ceph-mixin/tox.ini
+++ b/monitoring/ceph-mixin/tox.ini
@@ -8,7 +8,7 @@ envlist =
 skipsdist = true

 [testenv:jsonnet-bundler-{install,update}]
-whitelist_externals =
+allowlist_externals =
   jb
 description =
   install: Install the jsonnet dependencies
@@ -19,7 +19,7 @@ commands =

 [testenv:jsonnet-{check,fix,lint}]
 basepython = python3
-whitelist_externals =
+allowlist_externals =
   find
   jb
   jsonnet
@@ -56,7 +56,7 @@ deps =
   -rrequirements-lint.txt
 depends = grafonnet-check
 setenv =
-whitelist_externals =
+allowlist_externals =
   promtool
 commands =
   behave tests_dashboards/features
@@ -66,7 +66,7 @@ deps =
   -rrequirements-alerts.txt
   pytest
 depends = grafonnet-check
-whitelist_externals =
+allowlist_externals =
   promtool
 commands =
   fix: jsonnet -J vendor -S alerts.jsonnet -o prometheus_alerts.yml