author | Adam King <47704447+adk3798@users.noreply.github.com> | 2023-12-13 18:15:05 +0100
---|---|---
committer | GitHub <noreply@github.com> | 2023-12-13 18:15:05 +0100
commit | df7773bec7ce47930ee88f71a234d2c392cdb1a9 |
tree | 0698f520e7966ad9b7b92392bdeb9cd5cb8ad5fe /src |
parent | Merge pull request #54536 from phlogistonjohn/jjm-cephadm-units-n-scripts |
parent | python-common/drive_selection: fix limit with existing devices |
Merge pull request #54681 from adk3798/device-limit-testing
python-common/drive_selection: fix limit with existing devices
Reviewed-by: Michael Fritch <mfritch@suse.com>
Reviewed-by: Guillaume Abrioux <gabrioux@ibm.com>
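
The substance of the fix is the limit accounting in `DriveSelection._limit_reached`: a device that already backs an OSD can still match the filter (ceph-volume still needs it in the lvm batch command), but it is already represented by `existing_daemons`, so counting it again made the limit appear reached one device too early. A minimal sketch of the corrected counting, using illustrative stand-ins (`Device` and `limit_reached` below are simplified models, not the real `ceph.deployment` classes):

```python
from dataclasses import dataclass
from typing import List


@dataclass
class Device:
    """Illustrative stand-in for ceph.deployment.inventory.Device."""
    path: str
    ceph_device: bool = False  # True if the device already backs an OSD


def limit_reached(limit: int, devices: List[Device], existing_daemons: int) -> bool:
    """Simplified model of the fixed DriveSelection._limit_reached.

    Devices that already back an OSD stay in `devices` (ceph-volume still
    needs them in the lvm batch command) but are excluded from the count,
    since they are already represented by `existing_daemons`.
    """
    if limit <= 0:
        return False
    non_ceph_devices = [d for d in devices if not d.ceph_device]
    return len(non_ceph_devices) + existing_daemons >= limit


# /dev/sda already backs an OSD, so it only counts via existing_daemons.
sda = Device('/dev/sda', ceph_device=True)
sdb = Device('/dev/sdb')
assert not limit_reached(2, [sda], existing_daemons=1)   # room for one new device
assert limit_reached(2, [sda, sdb], existing_daemons=1)  # limit of 2 now reached
```
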
Diffstat (limited to 'src')
-rw-r--r-- | src/pybind/mgr/cephadm/services/osd.py | 9
-rw-r--r-- | src/pybind/mgr/cephadm/tests/test_cephadm.py | 7
-rw-r--r-- | src/python-common/ceph/deployment/drive_selection/selector.py | 24
-rw-r--r-- | src/python-common/ceph/tests/test_disk_selector.py | 41
4 files changed, 69 insertions, 12 deletions
diff --git a/src/pybind/mgr/cephadm/services/osd.py b/src/pybind/mgr/cephadm/services/osd.py
index 3616d42deee..75b3fc58c76 100644
--- a/src/pybind/mgr/cephadm/services/osd.py
+++ b/src/pybind/mgr/cephadm/services/osd.py
@@ -319,11 +319,16 @@ class OSDService(CephService):
                 logger.exception('Cannot decode JSON: \'%s\'' % ' '.join(out))
                 concat_out = {}
             notes = []
-            if osdspec.data_devices is not None and osdspec.data_devices.limit and len(concat_out) < osdspec.data_devices.limit:
+            if (
+                osdspec.data_devices is not None
+                and osdspec.data_devices.limit
+                and (len(concat_out) + ds.existing_daemons) < osdspec.data_devices.limit
+            ):
                 found = len(concat_out)
                 limit = osdspec.data_devices.limit
                 notes.append(
-                    f'NOTE: Did not find enough disks matching filter on host {host} to reach data device limit (Found: {found} | Limit: {limit})')
+                    f'NOTE: Did not find enough disks matching filter on host {host} to reach data device limit\n'
+                    f'(New Devices: {found} | Existing Matching Daemons: {ds.existing_daemons} | Limit: {limit})')
             ret_all.append({'data': concat_out,
                             'osdspec': osdspec.service_id,
                             'host': host,
diff --git a/src/pybind/mgr/cephadm/tests/test_cephadm.py b/src/pybind/mgr/cephadm/tests/test_cephadm.py
index ebdf7eea5aa..2477de13e00 100644
--- a/src/pybind/mgr/cephadm/tests/test_cephadm.py
+++ b/src/pybind/mgr/cephadm/tests/test_cephadm.py
@@ -1193,7 +1193,8 @@ class TestCephadm(object):
     @mock.patch('cephadm.services.osd.OSDService.driveselection_to_ceph_volume')
     @mock.patch('cephadm.services.osd.OsdIdClaims.refresh', lambda _: None)
     @mock.patch('cephadm.services.osd.OsdIdClaims.get', lambda _: {})
-    def test_limit_not_reached(self, d_to_cv, _run_cv_cmd, cephadm_module):
+    @mock.patch('cephadm.inventory.HostCache.get_daemons_by_service')
+    def test_limit_not_reached(self, _get_daemons_by_service, d_to_cv, _run_cv_cmd, cephadm_module):
         with with_host(cephadm_module, 'test'):
             dg = DriveGroupSpec(placement=PlacementSpec(host_pattern='test'),
                                 data_devices=DeviceSelection(limit=5, rotational=1),
@@ -1203,12 +1204,14 @@ class TestCephadm(object):
                 '[{"data": "/dev/vdb", "data_size": "50.00 GB", "encryption": "None"}, {"data": "/dev/vdc", "data_size": "50.00 GB", "encryption": "None"}]']
             d_to_cv.return_value = 'foo'
             _run_cv_cmd.side_effect = async_side_effect((disks_found, '', 0))
+            _get_daemons_by_service.return_value = [DaemonDescription(daemon_type='osd', hostname='test', service_name='not_enough')]
             preview = cephadm_module.osd_service.generate_previews([dg], 'test')

             for osd in preview:
                 assert 'notes' in osd
                 assert osd['notes'] == [
-                    'NOTE: Did not find enough disks matching filter on host test to reach data device limit (Found: 2 | Limit: 5)']
+                    ('NOTE: Did not find enough disks matching filter on host test to reach '
+                     'data device limit\n(New Devices: 2 | Existing Matching Daemons: 1 | Limit: 5)')]

     @mock.patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('{}'))
     def test_prepare_drivegroup(self, cephadm_module):
diff --git a/src/python-common/ceph/deployment/drive_selection/selector.py b/src/python-common/ceph/deployment/drive_selection/selector.py
index 1b3bfbb4ee3..31e330432cd 100644
--- a/src/python-common/ceph/deployment/drive_selection/selector.py
+++ b/src/python-common/ceph/deployment/drive_selection/selector.py
@@ -53,9 +53,12 @@ class DriveSelection(object):
         # type: () -> List[Device]
         return self._journal

-    def _limit_reached(self, device_filter, len_devices,
-                       disk_path):
-        # type: (DeviceSelection, int, str) -> bool
+    def _limit_reached(
+        self,
+        device_filter: DeviceSelection,
+        devices: List[Device],
+        disk_path: str
+    ) -> bool:
         """ Check for the <limit> property and apply logic
             If a limit is set in 'device_attrs' we have to stop adding
@@ -63,14 +66,21 @@ class DriveSelection(object):
             If limit is set (>0) and len(devices) >= limit

-        :param int len_devices: Length of the already populated device set/list
+        :param List[Device] devices: Already populated device set/list
         :param str disk_path: The disk identifier (for logging purposes)
         :return: True/False if the device should be added to the list of devices
         :rtype: bool
         """
         limit = device_filter.limit or 0
-
-        if limit > 0 and (len_devices + self.existing_daemons >= limit):
+        # If device A is being used for an OSD already, it can still
+        # match the filter (this is necessary as we still want the
+        # device in the resulting ceph-volume lvm batch command).
+        # If that is the case, we don't want to count the device
+        # towards the limit as it will already be counted through the
+        # existing daemons
+        non_ceph_devices = [d for d in devices if not d.ceph_device]
+
+        if limit > 0 and (len(non_ceph_devices) + self.existing_daemons >= limit):
             logger.debug("Refuse to add {} due to limit policy of <{}>".format(
                 disk_path, limit))
             return True
@@ -147,7 +157,7 @@ class DriveSelection(object):
                 continue

             # break on this condition.
-            if self._limit_reached(device_filter, len(devices), disk.path):
+            if self._limit_reached(device_filter, devices, disk.path):
                 logger.debug("Ignoring disk {}. Limit reached".format(
                     disk.path))
                 break
diff --git a/src/python-common/ceph/tests/test_disk_selector.py b/src/python-common/ceph/tests/test_disk_selector.py
index b08236130e0..03bfcbe16c9 100644
--- a/src/python-common/ceph/tests/test_disk_selector.py
+++ b/src/python-common/ceph/tests/test_disk_selector.py
@@ -557,4 +557,43 @@ class TestDriveSelection(object):
         inventory = _mk_inventory(_mk_device(rotational=True)*2)
         m = 'Failed to validate OSD spec "foobar.data_devices": No filters applied'
         with pytest.raises(DriveGroupValidationError, match=m):
-            drive_selection.DriveSelection(spec, inventory)
\ No newline at end of file
+            drive_selection.DriveSelection(spec, inventory)
+
+
+class TestDeviceSelectionLimit:
+
+    def test_limit_existing_devices(self):
+        # Initial setup for this test is meant to be that /dev/sda
+        # is already being used for an OSD, hence it being marked
+        # as a ceph_device. /dev/sdb and /dev/sdc are not being used
+        # for OSDs yet. The limit will be set to 2 and the DriveSelection
+        # is set to have 1 pre-existing device (corresponding to /dev/sda)
+        dev_a = Device('/dev/sda', ceph_device=True, available=False)
+        dev_b = Device('/dev/sdb', ceph_device=False, available=True)
+        dev_c = Device('/dev/sdc', ceph_device=False, available=True)
+        all_devices: List[Device] = [dev_a, dev_b, dev_c]
+        processed_devices: List[Device] = []
+        filter = DeviceSelection(all=True, limit=2)
+        dgs = DriveGroupSpec(data_devices=filter)
+        ds = drive_selection.DriveSelection(dgs, all_devices, existing_daemons=1)
+
+        # Check /dev/sda. It's already being used for an OSD and will
+        # be counted in existing_daemons. This check should return False
+        # as we are not over the limit.
+        assert not ds._limit_reached(filter, processed_devices, '/dev/sda')
+        processed_devices.append(dev_a)
+
+        # We should still not be over the limit here with /dev/sdb since
+        # we will have only one pre-existing daemon and /dev/sdb itself. This
+        # case previously failed as devices that contributed to existing_daemons
+        # would be double counted both as devices and daemons.
+        assert not ds._limit_reached(filter, processed_devices, '/dev/sdb')
+        processed_devices.append(dev_b)
+
+        # Now, with /dev/sdb and the pre-existing daemon taking up the 2
+        # slots, /dev/sdc should be rejected for us being over the limit.
+        assert ds._limit_reached(filter, processed_devices, '/dev/sdc')
+
+        # DriveSelection does device assignment on initialization. Let's check
+        # it picked up the expected devices
+        assert ds._data == [dev_a, dev_b]
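
For contrast, the pre-fix check compared `len_devices + self.existing_daemons` against the limit, so a matched device that already backs an OSD was counted twice. A sketch of how the scenario exercised by the new regression test trips the old logic (again with simplified stand-ins, self-contained here):

```python
from dataclasses import dataclass
from typing import List


@dataclass
class Device:
    """Illustrative stand-in; only the fields used by the check are modeled."""
    path: str
    ceph_device: bool = False


def limit_reached_old(limit: int, devices: List[Device], existing_daemons: int) -> bool:
    # Pre-fix behaviour: every matched device counts, even one that already
    # backs an OSD and is therefore also reflected in existing_daemons.
    return limit > 0 and len(devices) + existing_daemons >= limit


# Scenario from the new regression test: /dev/sda already backs an OSD
# (so existing_daemons == 1) but still matches the filter; limit is 2.
processed = [Device('/dev/sda', ceph_device=True)]
# /dev/sda is double counted (1 device + 1 daemon >= 2), so the old check
# wrongly refuses /dev/sdb even though only one OSD exists so far.
assert limit_reached_old(2, processed, existing_daemons=1)
```
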