Diffstat (limited to 'qa/tasks/stretch_mode_disable_enable.py')
-rw-r--r--  qa/tasks/stretch_mode_disable_enable.py  547
1 file changed, 547 insertions(+), 0 deletions(-)
diff --git a/qa/tasks/stretch_mode_disable_enable.py b/qa/tasks/stretch_mode_disable_enable.py
new file mode 100644
index 00000000000..a84a85bb307
--- /dev/null
+++ b/qa/tasks/stretch_mode_disable_enable.py
@@ -0,0 +1,547 @@
+import logging
+from tasks.mgr.mgr_test_case import MgrTestCase
+
+log = logging.getLogger(__name__)
+
+class TestStretchMode(MgrTestCase):
+ """
+ Test the stretch mode feature of Ceph
+ """
+ POOL = 'stretch_pool'
+ CLUSTER = "ceph"
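+    # Timing knobs: rados bench writes for WRITE_PERIOD seconds,
+    # RECOVERY_PERIOD bounds waits for PG/quorum state changes, and
+    # SUCCESS_HOLD_TIME is how long a condition must hold to count as stable.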
+ WRITE_PERIOD = 10
+ RECOVERY_PERIOD = WRITE_PERIOD * 6
+ SUCCESS_HOLD_TIME = 7
+ STRETCH_CRUSH_RULE = 'stretch_rule'
+ STRETCH_CRUSH_RULE_ID = None
+ STRETCH_BUCKET_TYPE = 'datacenter'
+ TIEBREAKER_MON_NAME = 'e'
+ DEFAULT_POOL_TYPE = 'replicated'
+ DEFAULT_POOL_CRUSH_RULE = 'replicated_rule'
+ DEFAULT_POOL_SIZE = 3
+ DEFAULT_POOL_MIN_SIZE = 2
+ DEFAULT_POOL_CRUSH_RULE_ID = None
+ # This dictionary maps the datacenter to the osd ids and hosts
+ DC_OSDS = {
+ 'dc1': {
+ "host01": [0, 1],
+ "host02": [2, 3],
+ },
+ 'dc2': {
+ "host03": [4, 5],
+ "host04": [6, 7],
+ },
+ }
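+    # This dictionary maps the datacenter to the mon ids and hosts;
+    # mon 'e' in dc3 is the tiebreaker (see TIEBREAKER_MON_NAME).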
+ DC_MONS = {
+ 'dc1': {
+ "host01": ['a'],
+ "host02": ['b'],
+ },
+ 'dc2': {
+ "host03": ['c'],
+ "host04": ['d'],
+ },
+ 'dc3': {
+ "host05": ['e'],
+ }
+ }
+ def _osd_count(self):
+ """
+ Get the number of OSDs in the cluster.
+ """
+ osd_map = self.mgr_cluster.mon_manager.get_osd_dump_json()
+ return len(osd_map['osds'])
+
+ def setUp(self):
+ """
+        Set up the cluster and
+        ensure we have a clean state before the test.
+        """
+        super(TestStretchMode, self).setUp()
+        self.DEFAULT_POOL_CRUSH_RULE_ID = self.mgr_cluster.mon_manager.get_crush_rule_id(self.DEFAULT_POOL_CRUSH_RULE)
+        self.STRETCH_CRUSH_RULE_ID = self.mgr_cluster.mon_manager.get_crush_rule_id(self.STRETCH_CRUSH_RULE)
+        # Skip early if the cluster doesn't have at least 4 OSDs
+        if self._osd_count() < 4:
+            self.skipTest("Not enough OSDs!")
+
+ # Remove any filesystems so that we can remove their pools
+ if self.mds_cluster:
+ self.mds_cluster.mds_stop()
+ self.mds_cluster.mds_fail()
+ self.mds_cluster.delete_all_filesystems()
+
+ # Remove all other pools
+ for pool in self.mgr_cluster.mon_manager.get_osd_dump_json()['pools']:
+ try:
+ self.mgr_cluster.mon_manager.remove_pool(pool['pool_name'])
+            except Exception:
+ self.mgr_cluster.mon_manager.raw_cluster_cmd(
+ 'osd', 'pool', 'delete',
+ pool['pool_name'],
+ pool['pool_name'],
+ '--yes-i-really-really-mean-it')
+
+ def _setup_pool(
+ self,
+ pool_name=POOL,
+ pg_num=16,
+ pool_type=DEFAULT_POOL_TYPE,
+ crush_rule=DEFAULT_POOL_CRUSH_RULE,
+ size=None,
+ min_size=None
+ ):
+ """
+        Create a pool and set its size and min_size if specified.
+ """
+ self.mgr_cluster.mon_manager.raw_cluster_cmd(
+ 'osd', 'pool', 'create', pool_name, str(pg_num), pool_type, crush_rule)
+
+ if size is not None:
+ self.mgr_cluster.mon_manager.raw_cluster_cmd(
+ 'osd', 'pool', 'set', pool_name, 'size', str(size))
+
+ if min_size is not None:
+ self.mgr_cluster.mon_manager.raw_cluster_cmd(
+ 'osd', 'pool', 'set', pool_name, 'min_size', str(min_size))
+
+ def _write_some_data(self, t):
+ """
+ Write some data to the pool to simulate a workload.
+ """
+ args = [
+ "rados", "-p", self.POOL, "bench", str(t), "write", "-t", "16"]
+ self.mgr_cluster.admin_remote.run(args=args, wait=True)
+
+ def _get_all_mons_from_all_dc(self):
+ """
+ Get all mons from all datacenters.
+ """
+ return [mon for dc in self.DC_MONS.values() for mons in dc.values() for mon in mons]
+
+ def _bring_back_mon(self, mon):
+ """
+ Bring back the mon.
+ """
+ try:
+ self.ctx.daemons.get_daemon('mon', mon, self.CLUSTER).restart()
+ except Exception:
+ log.error("Failed to bring back mon.{}".format(str(mon)))
+
+ def _get_host(self, osd):
+ """
+ Get the host of the osd.
+ """
+        for nodes in self.DC_OSDS.values():
+ for node, osds in nodes.items():
+ if osd in osds:
+ return node
+ return None
+
+ def _move_osd_back_to_host(self, osd):
+ """
+ Move the osd back to the host.
+ """
+ host = self._get_host(osd)
+ assert host is not None, "The host of osd {} is not found.".format(osd)
+ log.debug("Moving osd.%d back to %s", osd, host)
+ self.mgr_cluster.mon_manager.raw_cluster_cmd(
+ 'osd', 'crush', 'move', 'osd.{}'.format(str(osd)),
+ 'host={}'.format(host)
+ )
+
+ def tearDown(self):
+ """
+ Clean up the cluster after the test.
+ """
+ # Remove the pool
+ if self.POOL in self.mgr_cluster.mon_manager.pools:
+ self.mgr_cluster.mon_manager.remove_pool(self.POOL)
+
+ osd_map = self.mgr_cluster.mon_manager.get_osd_dump_json()
+ for osd in osd_map['osds']:
+            # Mark back "in" any OSDs that were marked "out" (weight 0.0)
+ if osd['weight'] == 0.0:
+ self.mgr_cluster.mon_manager.raw_cluster_cmd(
+ 'osd', 'in', str(osd['osd']))
+            # Revive any down OSDs and move them back to their host.
+ if osd['up'] == 0:
+ self.mgr_cluster.mon_manager.revive_osd(osd['osd'])
+ self._move_osd_back_to_host(osd['osd'])
+
+ # Bring back all the mons
+ mons = self._get_all_mons_from_all_dc()
+ for mon in mons:
+ self._bring_back_mon(mon)
+ super(TestStretchMode, self).tearDown()
+
+ def _kill_osd(self, osd):
+ """
+ Kill the osd.
+ """
+ try:
+ self.ctx.daemons.get_daemon('osd', osd, self.CLUSTER).stop()
+ except Exception:
+ log.error("Failed to stop osd.{}".format(str(osd)))
+
+ def _get_osds_data(self, want_osds):
+ """
+        Get the dump entries for the requested OSDs.
+ """
+ all_osds_data = \
+ self.mgr_cluster.mon_manager.get_osd_dump_json()['osds']
+ return [
+ osd_data for osd_data in all_osds_data
+ if int(osd_data['osd']) in want_osds
+ ]
+
+ def _get_osds_by_dc(self, dc):
+ """
+ Get osds by datacenter.
+ """
+ ret = []
+        for osds in self.DC_OSDS[dc].values():
+ ret.extend(osds)
+ return ret
+
+ def _fail_over_all_osds_in_dc(self, dc):
+ """
+        Fail over all OSDs in the specified datacenter.
+ """
+ if not isinstance(dc, str):
+ raise ValueError("dc must be a string")
+ if dc not in self.DC_OSDS:
+ raise ValueError(
+ "dc must be one of the following: %s" % self.DC_OSDS.keys()
+ )
+ log.debug("Failing over all osds in %s", dc)
+ osds = self._get_osds_by_dc(dc)
+ # fail over all the OSDs in the DC
+ log.debug("OSDs to failed over: %s", osds)
+ for osd_id in osds:
+ self._kill_osd(osd_id)
+ # wait until all the osds are down
+ self.wait_until_true(
+ lambda: all([int(osd['up']) == 0
+ for osd in self._get_osds_data(osds)]),
+ timeout=self.RECOVERY_PERIOD
+ )
+
+ def _check_mons_out_of_quorum(self, want_mons):
+ """
+ Check if the mons are not in quorum.
+ """
+ quorum_names = self.mgr_cluster.mon_manager.get_mon_quorum_names()
+ return all([mon not in quorum_names for mon in want_mons])
+
+ def _kill_mon(self, mon):
+ """
+ Kill the mon.
+ """
+ try:
+ self.ctx.daemons.get_daemon('mon', mon, self.CLUSTER).stop()
+ except Exception:
+ log.error("Failed to stop mon.{}".format(str(mon)))
+
+ def _get_mons_by_dc(self, dc):
+ """
+ Get mons by datacenter.
+ """
+ ret = []
+        for mons in self.DC_MONS[dc].values():
+ ret.extend(mons)
+ return ret
+
+ def _fail_over_all_mons_in_dc(self, dc):
+ """
+        Fail over all mons in the specified datacenter.
+ """
+ if not isinstance(dc, str):
+ raise ValueError("dc must be a string")
+ if dc not in self.DC_MONS:
+ raise ValueError("dc must be one of the following: %s" %
+ ", ".join(self.DC_MONS.keys()))
+ log.debug("Failing over all mons %s", dc)
+ mons = self._get_mons_by_dc(dc)
+ log.debug("Mons to be failed over: %s", mons)
+ for mon in mons:
+ self._kill_mon(mon)
+ # wait until all the mons are out of quorum
+ self.wait_until_true(
+ lambda: self._check_mons_out_of_quorum(mons),
+ timeout=self.RECOVERY_PERIOD
+ )
+
+ def _stretch_mode_enabled_correctly(self):
+ """
+        Evaluate whether stretch mode is enabled correctly
+        by checking the OSDMap and MonMap.
+ """
+ # Checking the OSDMap
+ osdmap = self.mgr_cluster.mon_manager.get_osd_dump_json()
+ for pool in osdmap['pools']:
+ # expects crush_rule to be stretch_rule
+ self.assertEqual(
+ self.STRETCH_CRUSH_RULE_ID,
+ pool['crush_rule']
+ )
+ # expects pool size to be 4
+ self.assertEqual(
+ 4,
+ pool['size']
+ )
+ # expects pool min_size to be 2
+ self.assertEqual(
+ 2,
+ pool['min_size']
+ )
+ # expects pool is_stretch_pool flag to be true
+ self.assertEqual(
+ True,
+ pool['is_stretch_pool']
+ )
+ # expects peering_crush_bucket_count = 2 (always this value for stretch mode)
+ self.assertEqual(
+ 2,
+ pool['peering_crush_bucket_count']
+ )
+ # expects peering_crush_bucket_target = 2 (always this value for stretch mode)
+ self.assertEqual(
+ 2,
+ pool['peering_crush_bucket_target']
+ )
+ # expects peering_crush_bucket_barrier = 8 (crush type of datacenter is 8)
+ self.assertEqual(
+ 8,
+ pool['peering_crush_bucket_barrier']
+ )
+ # expects stretch_mode_enabled to be True
+ self.assertEqual(
+ True,
+ osdmap['stretch_mode']['stretch_mode_enabled']
+ )
+        # expects stretch_bucket_count to be 2
+ self.assertEqual(
+ 2,
+ osdmap['stretch_mode']['stretch_bucket_count']
+ )
+ # expects degraded_stretch_mode to be 0
+ self.assertEqual(
+ 0,
+ osdmap['stretch_mode']['degraded_stretch_mode']
+ )
+ # expects recovering_stretch_mode to be 0
+ self.assertEqual(
+ 0,
+ osdmap['stretch_mode']['recovering_stretch_mode']
+ )
+ # expects stretch_mode_bucket to be 8 (datacenter crush type = 8)
+ self.assertEqual(
+ 8,
+ osdmap['stretch_mode']['stretch_mode_bucket']
+ )
+ # Checking the MonMap
+ monmap = self.mgr_cluster.mon_manager.get_mon_dump_json()
+ # expects stretch_mode to be True
+ self.assertEqual(
+ True,
+ monmap['stretch_mode']
+ )
+ # expects disallowed_leaders to be tiebreaker_mon
+ self.assertEqual(
+ self.TIEBREAKER_MON_NAME,
+ monmap['disallowed_leaders']
+ )
+        # expects tiebreaker_mon to be the designated tiebreaker mon
+ self.assertEqual(
+ self.TIEBREAKER_MON_NAME,
+ monmap['tiebreaker_mon']
+ )
+
+ def _stretch_mode_disabled_correctly(self):
+ """
+        Evaluate whether stretch mode is disabled correctly
+        by checking the OSDMap and MonMap.
+ """
+ # Checking the OSDMap
+ osdmap = self.mgr_cluster.mon_manager.get_osd_dump_json()
+ for pool in osdmap['pools']:
+ # expects crush_rule to be default
+ self.assertEqual(
+ self.DEFAULT_POOL_CRUSH_RULE_ID,
+ pool['crush_rule']
+ )
+ # expects pool size to be default
+ self.assertEqual(
+ self.DEFAULT_POOL_SIZE,
+ pool['size']
+ )
+ # expects pool min_size to be default
+ self.assertEqual(
+ self.DEFAULT_POOL_MIN_SIZE,
+ pool['min_size']
+ )
+ # expects pool is_stretch_pool flag to be false
+ self.assertEqual(
+ False,
+ pool['is_stretch_pool']
+ )
+ # expects peering_crush_bucket_count = 0
+ self.assertEqual(
+ 0,
+ pool['peering_crush_bucket_count']
+ )
+ # expects peering_crush_bucket_target = 0
+ self.assertEqual(
+ 0,
+ pool['peering_crush_bucket_target']
+ )
+ # expects peering_crush_bucket_barrier = 0
+ self.assertEqual(
+ 0,
+ pool['peering_crush_bucket_barrier']
+ )
+ # expects stretch_mode_enabled to be False
+ self.assertEqual(
+ False,
+ osdmap['stretch_mode']['stretch_mode_enabled']
+ )
+        # expects stretch_bucket_count to be 0
+ self.assertEqual(
+ 0,
+ osdmap['stretch_mode']['stretch_bucket_count']
+ )
+ # expects degraded_stretch_mode to be 0
+ self.assertEqual(
+ 0,
+ osdmap['stretch_mode']['degraded_stretch_mode']
+ )
+ # expects recovering_stretch_mode to be 0
+ self.assertEqual(
+ 0,
+ osdmap['stretch_mode']['recovering_stretch_mode']
+ )
+ # expects stretch_mode_bucket to be 0
+ self.assertEqual(
+ 0,
+ osdmap['stretch_mode']['stretch_mode_bucket']
+ )
+ # Checking the MonMap
+ monmap = self.mgr_cluster.mon_manager.get_mon_dump_json()
+ # expects stretch_mode to be False
+ self.assertEqual(
+ False,
+ monmap['stretch_mode']
+ )
+ # expects disallowed_leaders to be empty
+ self.assertEqual(
+ "",
+ monmap['disallowed_leaders']
+ )
+ # expects tiebreaker_mon to be empty
+ self.assertEqual(
+ "",
+ monmap['tiebreaker_mon']
+ )
+
+ def test_disable_stretch_mode(self):
+ """
+ Test disabling stretch mode with the following scenario:
+ 1. Healthy Stretch Mode
+ 2. Degraded Stretch Mode
+ """
+        # Create a stretch pool (size 4, min_size 2) with the stretch rule
+ self._setup_pool(self.POOL, 16, 'replicated', self.STRETCH_CRUSH_RULE, 4, 2)
+ # Write some data to the pool
+ self._write_some_data(self.WRITE_PERIOD)
+        # Disable stretch mode without --yes-i-really-mean-it (expects -EPERM, 1)
+ self.assertEqual(
+ 1,
+ self.mgr_cluster.mon_manager.raw_cluster_cmd_result(
+ 'mon',
+ 'disable_stretch_mode'
+ ))
+ # Disable stretch mode with non-existent crush rule (expects -EINVAL 22)
+ self.assertEqual(
+ 22,
+ self.mgr_cluster.mon_manager.raw_cluster_cmd_result(
+ 'mon',
+ 'disable_stretch_mode',
+ 'non_existent_rule',
+ '--yes-i-really-mean-it'
+ ))
+ # Disable stretch mode with the current stretch rule (expect -EINVAL 22)
+ self.assertEqual(
+ 22,
+ self.mgr_cluster.mon_manager.raw_cluster_cmd_result(
+ 'mon',
+ 'disable_stretch_mode',
+ self.STRETCH_CRUSH_RULE,
+                '--yes-i-really-mean-it'
+            ))
+ # Disable stretch mode without crush rule (expect success 0)
+ self.assertEqual(
+ 0,
+ self.mgr_cluster.mon_manager.raw_cluster_cmd_result(
+ 'mon',
+ 'disable_stretch_mode',
+ '--yes-i-really-mean-it'
+ ))
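+        # Without a crush rule argument, pools should fall back to the
+        # default rule and replicated size 3 / min_size 2 (verified below).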
+ # Check if stretch mode is disabled correctly
+ self._stretch_mode_disabled_correctly()
+ # all PGs are active + clean
+ self.wait_until_true_and_hold(
+ lambda: self.mgr_cluster.mon_manager.pg_all_active_clean(),
+ timeout=self.RECOVERY_PERIOD,
+ success_hold_time=self.SUCCESS_HOLD_TIME
+ )
+ # write some data to the pool
+ self._write_some_data(self.WRITE_PERIOD)
+ # Enable stretch mode
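+        # CLI form: ceph mon enable_stretch_mode <tiebreaker_mon> <new_crush_rule> <dividing_bucket>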
+ self.assertEqual(
+ 0,
+ self.mgr_cluster.mon_manager.raw_cluster_cmd_result(
+ 'mon',
+ 'enable_stretch_mode',
+ self.TIEBREAKER_MON_NAME,
+ self.STRETCH_CRUSH_RULE,
+ self.STRETCH_BUCKET_TYPE
+ ))
+ self._stretch_mode_enabled_correctly()
+ # all PGs are active + clean
+ self.wait_until_true_and_hold(
+ lambda: self.mgr_cluster.mon_manager.pg_all_active_clean(),
+ timeout=self.RECOVERY_PERIOD,
+ success_hold_time=self.SUCCESS_HOLD_TIME
+ )
+ # Bring down dc1
+ self._fail_over_all_osds_in_dc('dc1')
+ self._fail_over_all_mons_in_dc('dc1')
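+        # With every OSD and mon in dc1 down, the monitors should flag
+        # degraded stretch mode so the surviving DC keeps serving I/O.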
+ # should be in degraded stretch mode
+ self.wait_until_true_and_hold(
+ lambda: self.mgr_cluster.mon_manager.is_degraded_stretch_mode(),
+ timeout=self.RECOVERY_PERIOD,
+ success_hold_time=self.SUCCESS_HOLD_TIME
+ )
+ # Disable stretch mode with valid crush rule (expect success 0)
+ self.assertEqual(
+ 0,
+ self.mgr_cluster.mon_manager.raw_cluster_cmd_result(
+ 'mon',
+ 'disable_stretch_mode',
+ self.DEFAULT_POOL_CRUSH_RULE,
+ '--yes-i-really-mean-it'
+ ))
+ # Check if stretch mode is disabled correctly
+ self._stretch_mode_disabled_correctly()
+        # all PGs are active (not necessarily clean, since dc1 is still down)
+ self.wait_until_true_and_hold(
+ lambda: self.mgr_cluster.mon_manager.pg_all_active(),
+ timeout=self.RECOVERY_PERIOD,
+ success_hold_time=self.SUCCESS_HOLD_TIME
+ )