# qa/tasks/mon_connection_score.py

from tasks.ceph_test_case import CephTestCase
import json
import logging
log = logging.getLogger(__name__)


class TestStretchClusterNew(CephTestCase):
    """
    Check that every mon reports a clean connection score.

    The test waits for a three-mon quorum and then polls the
    ``connection scores dump`` output of each mon listed in ``MONS`` until
    all of them look consistent and stay that way for the hold time.
    """

    CLUSTER = "ceph"
    # Expected mons, keyed by mon name, with the rank each one must report.
    MONS = {
        "a": {
            "rank": 0,
        },
        "b": {
            "rank": 1,
        },
        "c": {
            "rank": 2,
        },
    }
    # Base period; in this class it is only used to derive RECOVERY_PERIOD.
    WRITE_PERIOD = 10
    # 60; test_connection_score() waits up to 4x this value (4 minutes).
    RECOVERY_PERIOD = WRITE_PERIOD * 6
    # 10; test_connection_score() holds the result for 6x this (60 seconds).
    SUCCESS_HOLD_TIME = 10

    def setUp(self):
        """
        Set up the cluster for the test.
        """
        super(TestStretchClusterNew, self).setUp()

    def tearDown(self):
        """
        Clean up the cluster after the test.
        """
        super(TestStretchClusterNew, self).tearDown()

    def _check_connection_score(self):
        """
        Check the connection score of all the mons.

        For every mon in ``MONS`` the ``connection scores dump`` output is
        fetched, parsed as JSON, and validated:

        * the dumped ``rank`` equals the expected rank of that mon;
        * there is exactly one entry in ``reports`` per expected mon;
        * every peer in every report is marked ``peer_alive``;
        * the peer ranks of every report cover the expected ranks, with no
          duplicates and no unknown ranks.

        :return: True if every mon's dump passes all checks, False as soon
                 as one check fails (the failure is logged as an error).
        """
        # Ranks are compared as sorted lists so duplicates are detected too.
        expected_ranks = sorted(info["rank"] for info in self.MONS.values())

        for mon, info in self.MONS.items():
            # get the connection score and parse it
            raw_dump = self.ceph_cluster.mon_manager.raw_cluster_cmd(
                'daemon', 'mon.{}'.format(mon),
                'connection', 'scores', 'dump')
            cscore = json.loads(raw_dump)

            # check if the current mon rank is correct
            if cscore["rank"] != info["rank"]:
                log.error(
                    "Rank mismatch %s != %s", cscore["rank"], info["rank"]
                )
                return False

            # check if current mon have all the peer reports and ourself
            if len(cscore['reports']) != len(self.MONS):
                log.error("Reports count mismatch %s", cscore['reports'])
                return False

            for report in cscore["reports"]:
                peer_ranks = []
                for peer in report["peer_scores"]:
                    # check if the peer is alive
                    if not peer["peer_alive"]:
                        log.error("Peer %s is not alive", peer)
                        return False
                    peer_ranks.append(peer["peer_rank"])

                # check if the report has all the ranks and no duplicates.
                # sorted() is required here: list.sort() sorts in place and
                # returns None, so comparing its results never detects a
                # mismatch.
                # NOTE(review): it is not visible from this file whether a
                # report lists the reporting mon's own rank among its
                # peer_scores, so both shapes are accepted; tighten this to
                # a single expectation once confirmed against real output.
                own_rank = report.get("rank")
                ranks_without_self = [
                    rank for rank in expected_ranks if rank != own_rank
                ]
                acceptable = (expected_ranks, ranks_without_self)
                if sorted(peer_ranks) not in acceptable:
                    log.error("Rank mismatch in report %s", report)
                    return False

        log.info("Connection score is clean!")
        return True

    def test_connection_score(self):
        """
        Wait for a full quorum, then require a stable clean connection score.
        """
        # check if all mons are in quorum
        self.ceph_cluster.mon_manager.wait_for_mon_quorum_size(3)
        # check if all connection scores reflect this
        self.wait_until_true_and_hold(
            self._check_connection_score,
            # Wait for 4 minutes for the connection score to recover
            timeout=self.RECOVERY_PERIOD * 4,
            # Hold the clean connection score for 60 seconds
            success_hold_time=self.SUCCESS_HOLD_TIME * 6
        )