# qa/tasks/check_counter.py

import logging
import json
import errno

from teuthology.task import Task
from teuthology import misc

from tasks import ceph_manager
from tasks.cephfs.filesystem import MDSCluster
from teuthology.exceptions import CommandFailedError

log = logging.getLogger(__name__)


class CheckCounter(Task):
    """
    Use this task to validate that some daemon perf counters were
    incremented by the nested tasks.

    Config:
     'cluster_name': optional, specify which cluster
     'counters': dictionary of daemon type to list of performance counters.
                 Each entry is either a counter name (a dotted path into
                 the 'perf dump' output), which must reach at least 1, or
                 a dictionary with a 'name' and optionally 'min' and/or
                 'expected_val'.  A dictionary entry is satisfied when the
                 value is >= 'min' or == 'expected_val'; when neither is
                 given, 'min' defaults to 1.
     'dry_run': just log the value of the counters, don't fail if they
                aren't nonzero.

    Success condition is that for all of the named counters, at least
    one of the daemons of that type has the counter nonzero.

    Example to check cephfs dirfrag splits are happening:
    - install:
    - ceph:
    - ceph-fuse:
    - check-counter:
        counters:
            mds:
                - "mds.dir_split"
                -
                    name: "mds.dir_update"
                    min: 3
    - workunit: ...
    """
    @property
    def admin_remote(self):
        """
        Return the remote hosting the first monitor.

        The tuple unpacking requires that exactly one remote matches.
        """
        first_mon = misc.get_first_mon(self.ctx, None)
        (result,) = self.ctx.cluster.only(first_mon).remotes.keys()
        return result

    def start(self):
        """Emit a 'START' log line."""
        log.info("START")

    @staticmethod
    def _parse_counter(counter):
        """
        Normalise one configured counter into (name, minval, expected_val).

        A bare string means "at least 1".  A dictionary supplies 'name' and
        optionally 'min' / 'expected_val'; a dictionary with neither is
        treated like a bare string.  Thresholds that were not configured
        are returned as None so they can never match.
        """
        if not isinstance(counter, dict):
            return counter, 1, None

        name = counter['name']
        minval = counter.get('min')
        expected_val = counter.get('expected_val')
        if minval is None and expected_val is None:
            minval = 1
        return name, minval, expected_val

    @staticmethod
    def _lookup_counter(perf_dump, name, daemon_label):
        """
        Walk the dotted path `name` through `perf_dump`.

        Returns the value found, or None (after logging a warning) when a
        path component is missing or the path descends into a non-dict.
        """
        val = perf_dump
        for key in name.split('.'):
            # Guard against paths that try to index into a scalar value.
            if not isinstance(val, dict) or key not in val:
                log.warning(f"Counter '{name}' not found on daemon {daemon_label}")
                return None
            val = val[key]
        return val

    @staticmethod
    def _counter_satisfied(val, minval, expected_val):
        """
        Return True when `val` meets the 'min' or the 'expected_val' rule.

        Only integer thresholds are honoured, and only numeric values can
        satisfy them (anything else would make the comparison raise).
        """
        if not isinstance(val, (int, float)):
            return False
        if isinstance(minval, int) and val >= minval:
            return True
        return isinstance(expected_val, int) and val == expected_val

    def end(self):
        """
        Collect 'perf dump' from the configured daemons and verify counters.

        For every daemon type under 'counters', each running daemon is
        queried (only the active mgr for 'mgr', only up:active daemons for
        'mds') and every configured counter is compared with its threshold.

        Raises:
            RuntimeError: when 'dry_run' is false and some counter was not
                satisfied by any queried daemon of its type.
        """
        overrides = self.ctx.config.get('overrides', {})
        misc.deep_merge(self.config, overrides.get('check-counter', {}))

        cluster_name = self.config.get('cluster_name', None)
        dry_run = self.config.get('dry_run', False)
        targets = self.config.get('counters', {})

        if cluster_name is None:
            cluster_name = next(iter(self.ctx.managers.keys()))

        mon_manager = ceph_manager.CephManager(self.admin_remote, ctx=self.ctx, logger=log.getChild('ceph_manager'))
        active_mgr = json.loads(mon_manager.raw_cluster_cmd("mgr", "dump", "--format=json-pretty"))["active_name"]

        mds_cluster = MDSCluster(self.ctx)
        status = mds_cluster.status()

        for daemon_type, counters in targets.items():
            # List of 'a', 'b', 'c'...
            daemon_ids = list(misc.all_roles_of_type(self.ctx.cluster, daemon_type))
            daemons = {
                daemon_id: self.ctx.daemons.get_daemon(daemon_type, daemon_id)
                for daemon_id in daemon_ids
            }

            # NOTE(review): `expected` is only filled in for daemons that
            # returned a perf dump, so if no daemon of this type responds
            # nothing is checked for it.
            expected = set()
            seen = set()

            for daemon_id, daemon in daemons.items():
                if not daemon.running():
                    log.info("Ignoring daemon {0}, it isn't running".format(daemon_id))
                    continue
                elif daemon_type == 'mgr' and daemon_id != active_mgr:
                    continue
                else:
                    log.debug("Getting stats from {0}".format(daemon_id))

                if daemon_type == 'mds':
                    mds_info = status.get_mds(daemon_id)
                    if not mds_info:
                        continue
                    mds = f"mds.{mds_info['gid']}"
                    if mds_info['state'] != "up:active":
                        log.debug(f"skipping {mds}")
                        continue
                    log.debug(f"Getting stats from {mds}")
                    try:
                        proc = mon_manager.raw_cluster_cmd("tell", mds, "perf", "dump",
                                                           "--format=json-pretty")
                        response_data = proc.strip()
                    except CommandFailedError as e:
                        # Best effort: a failing MDS is skipped whatever
                        # the exit status; only ENOENT is logged.
                        if e.exitstatus == errno.ENOENT:
                            log.debug(f"Failed to do 'perf dump' on {mds}")
                        continue
                else:
                    manager = self.ctx.managers[cluster_name]
                    proc = manager.admin_socket(daemon_type, daemon_id, ["perf", "dump"])
                    response_data = proc.stdout.getvalue().strip()
                if response_data:
                    perf_dump = json.loads(response_data)
                else:
                    log.warning("No response from {0}, skipping".format(daemon_id))
                    continue

                daemon_label = f"{daemon_type}.{daemon_id}"
                for counter in counters:
                    # Thresholds are derived afresh for every counter so
                    # one entry's 'min'/'expected_val' cannot leak into
                    # the next.
                    name, minval, expected_val = self._parse_counter(counter)
                    expected.add(name)

                    val = self._lookup_counter(perf_dump, name, daemon_label)
                    if val is None:
                        continue

                    log.info(f"Daemon {daemon_type}.{daemon_id} {name}={val}")
                    if self._counter_satisfied(val, minval, expected_val):
                        seen.add(name)

            if not dry_run:
                unseen = expected - seen
                if unseen:
                    raise RuntimeError("The following counters failed to be set "
                                       "on {0} daemons: {1}".format(
                        daemon_type, unseen
                    ))

# Module-level alias exposing the task class under the name `task`;
# presumably this is the name the teuthology task loader resolves for a
# `check-counter` job entry -- confirm against the loader.
task = CheckCounter