diff options
author | Aashish Sharma <aasharma@li-e74156cc-2f67-11b2-a85c-e98659a63c5c.ibm.com> | 2024-11-28 06:58:59 +0100 |
---|---|---|
committer | Aashish Sharma <aasharma@li-e74156cc-2f67-11b2-a85c-e98659a63c5c.ibm.com> | 2024-12-05 06:32:30 +0100 |
commit | 666f8faf11c0a639ce6ead06026850dd72a14d41 (patch) | |
tree | 28c0df079a8c9a7ff8d33a8e6342f9a3c80d2dd7 /monitoring | |
parent | Merge pull request #60702 from ceph/wip-yuriw-release-17.2.8-main (diff) | |
download | ceph-666f8faf11c0a639ce6ead06026850dd72a14d41.tar.xz ceph-666f8faf11c0a639ce6ead06026850dd72a14d41.zip |
mgr/dashboard: Add ceph_daemon filter to rgw overview grafana panel
queries
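Currently, rgw_servers filtering does not work in the RGW Overview Grafana graphs.
The panels show data for all RGW services even when the filter is set to a single service.
This PR fixes the issue by adding a ceph_daemon=~"$rgw_servers" matcher to the ceph_rgw_metadata selector in the affected panel queries.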
Fixes: https://tracker.ceph.com/issues/69074
Signed-off-by: Aashish Sharma <aasharma@redhat.com>
Diffstat (limited to 'monitoring')
3 files changed, 18 insertions, 12 deletions
diff --git a/monitoring/ceph-mixin/dashboards/rgw.libsonnet b/monitoring/ceph-mixin/dashboards/rgw.libsonnet index 79a4b7a14eb..c0c548b79c8 100644 --- a/monitoring/ceph-mixin/dashboards/rgw.libsonnet +++ b/monitoring/ceph-mixin/dashboards/rgw.libsonnet @@ -298,7 +298,7 @@ local g = import 'grafonnet/grafana.libsonnet'; label_replace( rate(ceph_rgw_op_get_obj_lat_sum{%(matchers)s}[$__rate_interval]) / rate(ceph_rgw_op_get_obj_lat_count{%(matchers)s}[$__rate_interval]) * - on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s}, + on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s}, "rgw_host", "$1", "ceph_daemon", "rgw.(.*)" ) ||| % $.matchers(), @@ -314,7 +314,7 @@ local g = import 'grafonnet/grafana.libsonnet'; label_replace( rate(ceph_rgw_op_put_obj_lat_sum{%(matchers)s}[$__rate_interval]) / rate(ceph_rgw_op_put_obj_lat_count{%(matchers)s}[$__rate_interval]) * - on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s}, + on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s}, "rgw_host", "$1", "ceph_daemon", "rgw.(.*)" ) ||| % $.matchers(), @@ -331,7 +331,7 @@ local g = import 'grafonnet/grafana.libsonnet'; sum by (rgw_host) ( label_replace( rate(ceph_rgw_req{%(matchers)s}[$__rate_interval]) * - on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s}, + on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s}, "rgw_host", "$1", "ceph_daemon", "rgw.(.*)" ) ) @@ -351,7 +351,7 @@ local g = import 'grafonnet/grafana.libsonnet'; label_replace( rate(ceph_rgw_op_get_obj_lat_sum{%(matchers)s}[$__rate_interval]) / rate(ceph_rgw_op_get_obj_lat_count{%(matchers)s}[$__rate_interval]) * - on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s}, + on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s}, "rgw_host", 
"$1", "ceph_daemon", "rgw.(.*)" ) ||| % $.matchers(), @@ -385,7 +385,7 @@ local g = import 'grafonnet/grafana.libsonnet'; label_replace(sum by (instance_id) ( rate(ceph_rgw_op_get_obj_bytes{%(matchers)s}[$__rate_interval]) + rate(ceph_rgw_op_put_obj_bytes{%(matchers)s}[$__rate_interval])) * - on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s}, + on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s}, "rgw_host", "$1", "ceph_daemon", "rgw.(.*)" ) ||| % $.matchers(), @@ -404,7 +404,7 @@ local g = import 'grafonnet/grafana.libsonnet'; label_replace( rate(ceph_rgw_op_put_obj_lat_sum{%(matchers)s}[$__rate_interval]) / rate(ceph_rgw_op_put_obj_lat_count{%(matchers)s}[$__rate_interval]) * - on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s}, + on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s}, "rgw_host", "$1", "ceph_daemon", "rgw.(.*)" ) ||| % $.matchers(), diff --git a/monitoring/ceph-mixin/dashboards_out/radosgw-overview.json b/monitoring/ceph-mixin/dashboards_out/radosgw-overview.json index 5e185b63b7f..5bf8279c27c 100644 --- a/monitoring/ceph-mixin/dashboards_out/radosgw-overview.json +++ b/monitoring/ceph-mixin/dashboards_out/radosgw-overview.json @@ -108,14 +108,14 @@ "steppedLine": false, "targets": [ { - "expr": "label_replace(\n rate(ceph_rgw_op_get_obj_lat_sum{cluster=~\"$cluster\", }[$__rate_interval]) /\n rate(ceph_rgw_op_get_obj_lat_count{cluster=~\"$cluster\", }[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{cluster=~\"$cluster\", },\n \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"\n)\n", + "expr": "label_replace(\n rate(ceph_rgw_op_get_obj_lat_sum{cluster=~\"$cluster\", }[$__rate_interval]) /\n rate(ceph_rgw_op_get_obj_lat_count{cluster=~\"$cluster\", }[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", 
cluster=~\"$cluster\", },\n \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"\n)\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "GET {{rgw_host}}", "refId": "A" }, { - "expr": "label_replace(\n rate(ceph_rgw_op_put_obj_lat_sum{cluster=~\"$cluster\", }[$__rate_interval]) /\n rate(ceph_rgw_op_put_obj_lat_count{cluster=~\"$cluster\", }[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{cluster=~\"$cluster\", },\n \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"\n)\n", + "expr": "label_replace(\n rate(ceph_rgw_op_put_obj_lat_sum{cluster=~\"$cluster\", }[$__rate_interval]) /\n rate(ceph_rgw_op_put_obj_lat_count{cluster=~\"$cluster\", }[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", },\n \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"\n)\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "PUT {{rgw_host}}", @@ -210,7 +210,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (rgw_host) (\n label_replace(\n rate(ceph_rgw_req{cluster=~\"$cluster\", }[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{cluster=~\"$cluster\", },\n \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"\n )\n)\n", + "expr": "sum by (rgw_host) (\n label_replace(\n rate(ceph_rgw_req{cluster=~\"$cluster\", }[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", },\n \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"\n )\n)\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{rgw_host}}", @@ -305,7 +305,7 @@ "steppedLine": false, "targets": [ { - "expr": "label_replace(\n rate(ceph_rgw_op_get_obj_lat_sum{cluster=~\"$cluster\", }[$__rate_interval]) /\n rate(ceph_rgw_op_get_obj_lat_count{cluster=~\"$cluster\", }[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) 
ceph_rgw_metadata{cluster=~\"$cluster\", },\n \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"\n)\n", + "expr": "label_replace(\n rate(ceph_rgw_op_get_obj_lat_sum{cluster=~\"$cluster\", }[$__rate_interval]) /\n rate(ceph_rgw_op_get_obj_lat_count{cluster=~\"$cluster\", }[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", },\n \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"\n)\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{rgw_host}}", @@ -502,7 +502,7 @@ "steppedLine": false, "targets": [ { - "expr": "label_replace(sum by (instance_id) (\n rate(ceph_rgw_op_get_obj_bytes{cluster=~\"$cluster\", }[$__rate_interval]) +\n rate(ceph_rgw_op_put_obj_bytes{cluster=~\"$cluster\", }[$__rate_interval])) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{cluster=~\"$cluster\", },\n \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"\n)\n", + "expr": "label_replace(sum by (instance_id) (\n rate(ceph_rgw_op_get_obj_bytes{cluster=~\"$cluster\", }[$__rate_interval]) +\n rate(ceph_rgw_op_put_obj_bytes{cluster=~\"$cluster\", }[$__rate_interval])) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", },\n \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"\n)\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{rgw_host}}", @@ -597,7 +597,7 @@ "steppedLine": false, "targets": [ { - "expr": "label_replace(\n rate(ceph_rgw_op_put_obj_lat_sum{cluster=~\"$cluster\", }[$__rate_interval]) /\n rate(ceph_rgw_op_put_obj_lat_count{cluster=~\"$cluster\", }[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{cluster=~\"$cluster\", },\n \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"\n)\n", + "expr": "label_replace(\n rate(ceph_rgw_op_put_obj_lat_sum{cluster=~\"$cluster\", }[$__rate_interval]) /\n rate(ceph_rgw_op_put_obj_lat_count{cluster=~\"$cluster\", 
}[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", },\n \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"\n)\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{rgw_host}}", diff --git a/monitoring/ceph-mixin/tests_dashboards/features/radosgw_overview.feature b/monitoring/ceph-mixin/tests_dashboards/features/radosgw_overview.feature index 8d96dcdd610..a34d5759437 100644 --- a/monitoring/ceph-mixin/tests_dashboards/features/radosgw_overview.feature +++ b/monitoring/ceph-mixin/tests_dashboards/features/radosgw_overview.feature @@ -7,6 +7,7 @@ Scenario: "Test Average GET Latencies" | ceph_rgw_op_get_obj_lat_count{instance="127.0.0.1", instance_id="58892247", job="ceph", cluster="mycluster"} | 20 60 80 | | ceph_rgw_metadata{ceph_daemon="rgw.foo", hostname="localhost", instance="127.0.0.1", instance_id="58892247", job="ceph", cluster="mycluster"} | 1 1 1 | When interval is `30s` + And variable `rgw_servers` is `rgw.foo` Then Grafana panel `Average GET/PUT Latencies by RGW Instance` with legend `GET {{rgw_host}}` shows: | metrics | values | | {ceph_daemon="rgw.foo", instance="127.0.0.1", instance_id="58892247", job="ceph", rgw_host="foo", cluster="mycluster"} | 1.5 | @@ -18,6 +19,7 @@ Scenario: "Test Average PUT Latencies" | ceph_rgw_op_put_obj_lat_count{instance="127.0.0.1", instance_id="58892247", job="ceph", cluster="mycluster"} | 10 30 50 | | ceph_rgw_metadata{ceph_daemon="rgw.foo", hostname="localhost", instance="127.0.0.1", instance_id="58892247", job="ceph", cluster="mycluster"} | 1 1 1 | When interval is `30s` + And variable `rgw_servers` is `rgw.foo` Then Grafana panel `Average GET/PUT Latencies by RGW Instance` with legend `PUT {{rgw_host}}` shows: | metrics | values | | {ceph_daemon="rgw.foo", instance="127.0.0.1", instance_id="58892247", job="ceph", rgw_host="foo", cluster="mycluster"} | 1 | @@ -28,6 +30,7 @@ Scenario: "Test Total Requests/sec by RGW 
Instance" | ceph_rgw_req{instance="127.0.0.1", instance_id="92806566", job="ceph", cluster="mycluster"} | 10 50 100 | | ceph_rgw_metadata{ceph_daemon="rgw.1", hostname="localhost", instance="127.0.0.1", instance_id="92806566", job="ceph", cluster="mycluster"} | 1 1 1 | When interval is `30s` + And variable `rgw_servers` is `rgw.1` Then Grafana panel `Total Requests/sec by RGW Instance` with legend `{{rgw_host}}` shows: | metrics | values | | {rgw_host="1"} | 1.5 | @@ -39,6 +42,7 @@ Scenario: "Test GET Latencies by RGW Instance" | ceph_rgw_op_get_obj_lat_count{instance="127.0.0.1", instance_id="58892247", job="ceph", cluster="mycluster"} | 20 60 80 | | ceph_rgw_metadata{ceph_daemon="rgw.foo", hostname="localhost", instance="127.0.0.1", instance_id="58892247", job="ceph", cluster="mycluster"} | 1 1 1 | When interval is `30s` + And variable `rgw_servers` is `rgw.foo` Then Grafana panel `GET Latencies by RGW Instance` with legend `{{rgw_host}}` shows: | metrics | values | | {ceph_daemon="rgw.foo", instance="127.0.0.1", instance_id="58892247", job="ceph", rgw_host="foo", cluster="mycluster"} | 1.5 | @@ -71,6 +75,7 @@ Scenario: "Test Bandwidth by RGW Instance" | ceph_rgw_metadata{ceph_daemon="rgw.1", hostname="localhost", instance="127.0.0.1", instance_id="92806566", job="ceph", cluster="mycluster"} | 1 1 1 | When evaluation time is `1m` And interval is `30s` + And variable `rgw_servers` is `rgw.1` Then Grafana panel `Bandwidth by RGW Instance` with legend `{{rgw_host}}` shows: | metrics | values | | {ceph_daemon="rgw.1", instance_id="92806566", rgw_host="1"} | 2.25 | @@ -83,6 +88,7 @@ Scenario: "Test PUT Latencies by RGW Instance" | ceph_rgw_metadata{ceph_daemon="rgw.foo", hostname="localhost", instance="127.0.0.1", instance_id="58892247", job="ceph", cluster="mycluster"} | 1 1 1 | When evaluation time is `1m` And interval is `30s` + And variable `rgw_servers` is `rgw.foo` Then Grafana panel `PUT Latencies by RGW Instance` with legend `{{rgw_host}}` shows: | metrics | 
values | | {ceph_daemon="rgw.foo", instance="127.0.0.1", instance_id="58892247", job="ceph", rgw_host="foo", cluster="mycluster"} | 1 | |