summaryrefslogtreecommitdiffstats
path: root/monitoring
diff options
context:
space:
mode:
authorAashish Sharma <aasharma@li-e74156cc-2f67-11b2-a85c-e98659a63c5c.ibm.com>2024-11-28 06:58:59 +0100
committerAashish Sharma <aasharma@li-e74156cc-2f67-11b2-a85c-e98659a63c5c.ibm.com>2024-12-05 06:32:30 +0100
commit666f8faf11c0a639ce6ead06026850dd72a14d41 (patch)
tree28c0df079a8c9a7ff8d33a8e6342f9a3c80d2dd7 /monitoring
parentMerge pull request #60702 from ceph/wip-yuriw-release-17.2.8-main (diff)
downloadceph-666f8faf11c0a639ce6ead06026850dd72a14d41.tar.xz
ceph-666f8faf11c0a639ce6ead06026850dd72a14d41.zip
mgr/dashboard: Add ceph_daemon filter to rgw overview grafana panel
queries Currently rgw_servers filtering is not working in RGW Overview garfana graphs. It is showing data of all the RGW services, even though filter set to single service. This PR intends to solve this issue Fixes: https://tracker.ceph.com/issues/69074 Signed-off-by: Aashish Sharma <aasharma@redhat.com>
Diffstat (limited to 'monitoring')
-rw-r--r--monitoring/ceph-mixin/dashboards/rgw.libsonnet12
-rw-r--r--monitoring/ceph-mixin/dashboards_out/radosgw-overview.json12
-rw-r--r--monitoring/ceph-mixin/tests_dashboards/features/radosgw_overview.feature6
3 files changed, 18 insertions, 12 deletions
diff --git a/monitoring/ceph-mixin/dashboards/rgw.libsonnet b/monitoring/ceph-mixin/dashboards/rgw.libsonnet
index 79a4b7a14eb..c0c548b79c8 100644
--- a/monitoring/ceph-mixin/dashboards/rgw.libsonnet
+++ b/monitoring/ceph-mixin/dashboards/rgw.libsonnet
@@ -298,7 +298,7 @@ local g = import 'grafonnet/grafana.libsonnet';
label_replace(
rate(ceph_rgw_op_get_obj_lat_sum{%(matchers)s}[$__rate_interval]) /
rate(ceph_rgw_op_get_obj_lat_count{%(matchers)s}[$__rate_interval]) *
- on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s},
+ on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s},
"rgw_host", "$1", "ceph_daemon", "rgw.(.*)"
)
||| % $.matchers(),
@@ -314,7 +314,7 @@ local g = import 'grafonnet/grafana.libsonnet';
label_replace(
rate(ceph_rgw_op_put_obj_lat_sum{%(matchers)s}[$__rate_interval]) /
rate(ceph_rgw_op_put_obj_lat_count{%(matchers)s}[$__rate_interval]) *
- on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s},
+ on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s},
"rgw_host", "$1", "ceph_daemon", "rgw.(.*)"
)
||| % $.matchers(),
@@ -331,7 +331,7 @@ local g = import 'grafonnet/grafana.libsonnet';
sum by (rgw_host) (
label_replace(
rate(ceph_rgw_req{%(matchers)s}[$__rate_interval]) *
- on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s},
+ on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s},
"rgw_host", "$1", "ceph_daemon", "rgw.(.*)"
)
)
@@ -351,7 +351,7 @@ local g = import 'grafonnet/grafana.libsonnet';
label_replace(
rate(ceph_rgw_op_get_obj_lat_sum{%(matchers)s}[$__rate_interval]) /
rate(ceph_rgw_op_get_obj_lat_count{%(matchers)s}[$__rate_interval]) *
- on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s},
+ on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s},
"rgw_host", "$1", "ceph_daemon", "rgw.(.*)"
)
||| % $.matchers(),
@@ -385,7 +385,7 @@ local g = import 'grafonnet/grafana.libsonnet';
label_replace(sum by (instance_id) (
rate(ceph_rgw_op_get_obj_bytes{%(matchers)s}[$__rate_interval]) +
rate(ceph_rgw_op_put_obj_bytes{%(matchers)s}[$__rate_interval])) *
- on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s},
+ on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s},
"rgw_host", "$1", "ceph_daemon", "rgw.(.*)"
)
||| % $.matchers(),
@@ -404,7 +404,7 @@ local g = import 'grafonnet/grafana.libsonnet';
label_replace(
rate(ceph_rgw_op_put_obj_lat_sum{%(matchers)s}[$__rate_interval]) /
rate(ceph_rgw_op_put_obj_lat_count{%(matchers)s}[$__rate_interval]) *
- on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s},
+ on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s},
"rgw_host", "$1", "ceph_daemon", "rgw.(.*)"
)
||| % $.matchers(),
diff --git a/monitoring/ceph-mixin/dashboards_out/radosgw-overview.json b/monitoring/ceph-mixin/dashboards_out/radosgw-overview.json
index 5e185b63b7f..5bf8279c27c 100644
--- a/monitoring/ceph-mixin/dashboards_out/radosgw-overview.json
+++ b/monitoring/ceph-mixin/dashboards_out/radosgw-overview.json
@@ -108,14 +108,14 @@
"steppedLine": false,
"targets": [
{
- "expr": "label_replace(\n rate(ceph_rgw_op_get_obj_lat_sum{cluster=~\"$cluster\", }[$__rate_interval]) /\n rate(ceph_rgw_op_get_obj_lat_count{cluster=~\"$cluster\", }[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{cluster=~\"$cluster\", },\n \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"\n)\n",
+ "expr": "label_replace(\n rate(ceph_rgw_op_get_obj_lat_sum{cluster=~\"$cluster\", }[$__rate_interval]) /\n rate(ceph_rgw_op_get_obj_lat_count{cluster=~\"$cluster\", }[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", },\n \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"\n)\n",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "GET {{rgw_host}}",
"refId": "A"
},
{
- "expr": "label_replace(\n rate(ceph_rgw_op_put_obj_lat_sum{cluster=~\"$cluster\", }[$__rate_interval]) /\n rate(ceph_rgw_op_put_obj_lat_count{cluster=~\"$cluster\", }[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{cluster=~\"$cluster\", },\n \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"\n)\n",
+ "expr": "label_replace(\n rate(ceph_rgw_op_put_obj_lat_sum{cluster=~\"$cluster\", }[$__rate_interval]) /\n rate(ceph_rgw_op_put_obj_lat_count{cluster=~\"$cluster\", }[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", },\n \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"\n)\n",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "PUT {{rgw_host}}",
@@ -210,7 +210,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "sum by (rgw_host) (\n label_replace(\n rate(ceph_rgw_req{cluster=~\"$cluster\", }[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{cluster=~\"$cluster\", },\n \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"\n )\n)\n",
+ "expr": "sum by (rgw_host) (\n label_replace(\n rate(ceph_rgw_req{cluster=~\"$cluster\", }[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", },\n \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"\n )\n)\n",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "{{rgw_host}}",
@@ -305,7 +305,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "label_replace(\n rate(ceph_rgw_op_get_obj_lat_sum{cluster=~\"$cluster\", }[$__rate_interval]) /\n rate(ceph_rgw_op_get_obj_lat_count{cluster=~\"$cluster\", }[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{cluster=~\"$cluster\", },\n \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"\n)\n",
+ "expr": "label_replace(\n rate(ceph_rgw_op_get_obj_lat_sum{cluster=~\"$cluster\", }[$__rate_interval]) /\n rate(ceph_rgw_op_get_obj_lat_count{cluster=~\"$cluster\", }[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", },\n \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"\n)\n",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "{{rgw_host}}",
@@ -502,7 +502,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "label_replace(sum by (instance_id) (\n rate(ceph_rgw_op_get_obj_bytes{cluster=~\"$cluster\", }[$__rate_interval]) +\n rate(ceph_rgw_op_put_obj_bytes{cluster=~\"$cluster\", }[$__rate_interval])) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{cluster=~\"$cluster\", },\n \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"\n)\n",
+ "expr": "label_replace(sum by (instance_id) (\n rate(ceph_rgw_op_get_obj_bytes{cluster=~\"$cluster\", }[$__rate_interval]) +\n rate(ceph_rgw_op_put_obj_bytes{cluster=~\"$cluster\", }[$__rate_interval])) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", },\n \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"\n)\n",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "{{rgw_host}}",
@@ -597,7 +597,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "label_replace(\n rate(ceph_rgw_op_put_obj_lat_sum{cluster=~\"$cluster\", }[$__rate_interval]) /\n rate(ceph_rgw_op_put_obj_lat_count{cluster=~\"$cluster\", }[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{cluster=~\"$cluster\", },\n \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"\n)\n",
+ "expr": "label_replace(\n rate(ceph_rgw_op_put_obj_lat_sum{cluster=~\"$cluster\", }[$__rate_interval]) /\n rate(ceph_rgw_op_put_obj_lat_count{cluster=~\"$cluster\", }[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", },\n \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"\n)\n",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "{{rgw_host}}",
diff --git a/monitoring/ceph-mixin/tests_dashboards/features/radosgw_overview.feature b/monitoring/ceph-mixin/tests_dashboards/features/radosgw_overview.feature
index 8d96dcdd610..a34d5759437 100644
--- a/monitoring/ceph-mixin/tests_dashboards/features/radosgw_overview.feature
+++ b/monitoring/ceph-mixin/tests_dashboards/features/radosgw_overview.feature
@@ -7,6 +7,7 @@ Scenario: "Test Average GET Latencies"
| ceph_rgw_op_get_obj_lat_count{instance="127.0.0.1", instance_id="58892247", job="ceph", cluster="mycluster"} | 20 60 80 |
| ceph_rgw_metadata{ceph_daemon="rgw.foo", hostname="localhost", instance="127.0.0.1", instance_id="58892247", job="ceph", cluster="mycluster"} | 1 1 1 |
When interval is `30s`
+ And variable `rgw_servers` is `rgw.foo`
Then Grafana panel `Average GET/PUT Latencies by RGW Instance` with legend `GET {{rgw_host}}` shows:
| metrics | values |
| {ceph_daemon="rgw.foo", instance="127.0.0.1", instance_id="58892247", job="ceph", rgw_host="foo", cluster="mycluster"} | 1.5 |
@@ -18,6 +19,7 @@ Scenario: "Test Average PUT Latencies"
| ceph_rgw_op_put_obj_lat_count{instance="127.0.0.1", instance_id="58892247", job="ceph", cluster="mycluster"} | 10 30 50 |
| ceph_rgw_metadata{ceph_daemon="rgw.foo", hostname="localhost", instance="127.0.0.1", instance_id="58892247", job="ceph", cluster="mycluster"} | 1 1 1 |
When interval is `30s`
+ And variable `rgw_servers` is `rgw.foo`
Then Grafana panel `Average GET/PUT Latencies by RGW Instance` with legend `PUT {{rgw_host}}` shows:
| metrics | values |
| {ceph_daemon="rgw.foo", instance="127.0.0.1", instance_id="58892247", job="ceph", rgw_host="foo", cluster="mycluster"} | 1 |
@@ -28,6 +30,7 @@ Scenario: "Test Total Requests/sec by RGW Instance"
| ceph_rgw_req{instance="127.0.0.1", instance_id="92806566", job="ceph", cluster="mycluster"} | 10 50 100 |
| ceph_rgw_metadata{ceph_daemon="rgw.1", hostname="localhost", instance="127.0.0.1", instance_id="92806566", job="ceph", cluster="mycluster"} | 1 1 1 |
When interval is `30s`
+ And variable `rgw_servers` is `rgw.1`
Then Grafana panel `Total Requests/sec by RGW Instance` with legend `{{rgw_host}}` shows:
| metrics | values |
| {rgw_host="1"} | 1.5 |
@@ -39,6 +42,7 @@ Scenario: "Test GET Latencies by RGW Instance"
| ceph_rgw_op_get_obj_lat_count{instance="127.0.0.1", instance_id="58892247", job="ceph", cluster="mycluster"} | 20 60 80 |
| ceph_rgw_metadata{ceph_daemon="rgw.foo", hostname="localhost", instance="127.0.0.1", instance_id="58892247", job="ceph", cluster="mycluster"} | 1 1 1 |
When interval is `30s`
+ And variable `rgw_servers` is `rgw.foo`
Then Grafana panel `GET Latencies by RGW Instance` with legend `{{rgw_host}}` shows:
| metrics | values |
| {ceph_daemon="rgw.foo", instance="127.0.0.1", instance_id="58892247", job="ceph", rgw_host="foo", cluster="mycluster"} | 1.5 |
@@ -71,6 +75,7 @@ Scenario: "Test Bandwidth by RGW Instance"
| ceph_rgw_metadata{ceph_daemon="rgw.1", hostname="localhost", instance="127.0.0.1", instance_id="92806566", job="ceph", cluster="mycluster"} | 1 1 1 |
When evaluation time is `1m`
And interval is `30s`
+ And variable `rgw_servers` is `rgw.1`
Then Grafana panel `Bandwidth by RGW Instance` with legend `{{rgw_host}}` shows:
| metrics | values |
| {ceph_daemon="rgw.1", instance_id="92806566", rgw_host="1"} | 2.25 |
@@ -83,6 +88,7 @@ Scenario: "Test PUT Latencies by RGW Instance"
| ceph_rgw_metadata{ceph_daemon="rgw.foo", hostname="localhost", instance="127.0.0.1", instance_id="58892247", job="ceph", cluster="mycluster"} | 1 1 1 |
When evaluation time is `1m`
And interval is `30s`
+ And variable `rgw_servers` is `rgw.foo`
Then Grafana panel `PUT Latencies by RGW Instance` with legend `{{rgw_host}}` shows:
| metrics | values |
| {ceph_daemon="rgw.foo", instance="127.0.0.1", instance_id="58892247", job="ceph", rgw_host="foo", cluster="mycluster"} | 1 |