cephadm: handle "systemctl start" failures during deployment better

Previously it was assumed that when the deploy command fails, whatever daemon we were trying to deploy does not exist on the host. However, in the specific case where deploy fails while trying to start the daemon's systemd unit, this is not the case. That assumption leads us to clean up the keyring for the daemon and also to skip triggering a refresh of the daemons on the host, which can make cephadm attempt to deploy another daemon instead of just reporting the existing one as failed. To get around this, we need to handle that specific failure as a success in the mgr module's deploy workflow so that we refresh the daemons and report the failure as intended. https://tracker.ceph.com/issues/68536 Signed-off-by: Adam King <adking@redhat.com>
author: Adam King <adking@dhcp-41-165.bos.redhat.com> 2024-10-14 19:44:03 +0200
committer: Adam King <adking@redhat.com> 2024-10-24 20:51:34 +0200
commit: 5818305e8094f88949a7a63c93c6d76d0efa03d9 (patch)
tree: 9008ddc4fdde1f8d24137f3f94672e9e255d30f9 /src/cephadm
parent: Merge pull request #59982 from rkachach/fix_issue_mgmt_gw_high_availability (diff)
download: ceph-5818305e8094f88949a7a63c93c6d76d0efa03d9.tar.xz
ceph-5818305e8094f88949a7a63c93c6d76d0efa03d9.zip
3 files changed, 29 insertions, 3 deletions
diff --git a/src/cephadm/cephadm.py b/src/cephadm/cephadm.py
index e32e2bc49f3..5d11d4700ff 100755
--- a/src/cephadm/cephadm.py
+++ b/src/cephadm/cephadm.py
@@ -57,6 +57,7 @@ from cephadmlib.constants import (
     LOG_DIR_MODE,
     SYSCTL_DIR,
     UNIT_DIR,
+    DAEMON_FAILED_ERROR,
 )
 from cephadmlib.context import CephadmContext
 from cephadmlib.context_getters import (
@@ -72,6 +73,7 @@ from cephadmlib.exceptions import (
     ClusterAlreadyExists,
     Error,
     UnauthorizedRegistryError,
+    DaemonStartException,
 )
 from cephadmlib.exe_utils import find_executable, find_program
 from cephadmlib.call_wrappers import (
@@ -1246,7 +1248,11 @@ def deploy_daemon_units(
         call_throws(ctx, ['systemctl', 'enable', unit_name])
     if start:
         clean_cgroup(ctx, ident.fsid, unit_name)
-        call_throws(ctx, ['systemctl', 'start', unit_name])
+        try:
+            call_throws(ctx, ['systemctl', 'start', unit_name])
+        except Exception as e:
+            logger.error(f'systemctl start failed for {unit_name}: {str(e)}')
+            raise DaemonStartException()
 
 
 def _osd_unit_run_commands(
@@ -3046,7 +3052,10 @@ def get_deployment_type(
 @deprecated_command
 def command_deploy(ctx):
     # type: (CephadmContext) -> None
-    _common_deploy(ctx)
+    try:
+        _common_deploy(ctx)
+    except DaemonStartException:
+        sys.exit(DAEMON_FAILED_ERROR)
 
 
 def apply_deploy_config_to_ctx(
@@ -3089,7 +3098,10 @@ def command_deploy_from(ctx: CephadmContext) -> None:
     config_data = read_configuration_source(ctx)
     logger.debug('Loaded deploy configuration: %r', config_data)
     apply_deploy_config_to_ctx(config_data, ctx)
-    _common_deploy(ctx)
+    try:
+        _common_deploy(ctx)
+    except DaemonStartException:
+        sys.exit(DAEMON_FAILED_ERROR)
 
 
 def _common_deploy(ctx: CephadmContext) -> None:
diff --git a/src/cephadm/cephadmlib/constants.py b/src/cephadm/cephadmlib/constants.py
index 354c3782398..6622f1eca55 100644
--- a/src/cephadm/cephadmlib/constants.py
+++ b/src/cephadm/cephadmlib/constants.py
@@ -54,3 +54,4 @@ QUIET_LOG_LEVEL = 9  # DEBUG is 10, so using 9 to be lower level than DEBUG
 NO_DEPRECATED = False
 UID_NOBODY = 65534
 GID_NOGROUP = 65534
+DAEMON_FAILED_ERROR = 17
diff --git a/src/cephadm/cephadmlib/exceptions.py b/src/cephadm/cephadmlib/exceptions.py
index 0d215fdd332..762ce782127 100644
--- a/src/cephadm/cephadmlib/exceptions.py
+++ b/src/cephadm/cephadmlib/exceptions.py
@@ -19,3 +19,16 @@ class UnauthorizedRegistryError(Error):
 
 class PortOccupiedError(Error):
     pass
+
+
+class DaemonStartException(Exception):
+    """
+    Special exception type we raise when the
+    systemctl start command fails during daemon
+    deployment. Necessary because the cephadm mgr module
+    needs to handle this case differently than a failure
+    earlier in the deploy process where no attempt was made
+    to actually start the daemon
+    """
+
+    pass
author	Adam King <adking@dhcp-41-165.bos.redhat.com>	2024-10-14 19:44:03 +0200
committer	Adam King <adking@redhat.com>	2024-10-24 20:51:34 +0200
commit	5818305e8094f88949a7a63c93c6d76d0efa03d9 (patch)
tree	9008ddc4fdde1f8d24137f3f94672e9e255d30f9 /src/cephadm
parent	Merge pull request #59982 from rkachach/fix_issue_mgmt_gw_high_availability (diff)
download	ceph-5818305e8094f88949a7a63c93c6d76d0efa03d9.tar.xz ceph-5818305e8094f88949a7a63c93c6d76d0efa03d9.zip