summary | refs | log | tree | commit | diff | stats
path: root/src/cephadm
diff options
context:
space:
mode:
author: Adam King <adking@dhcp-41-165.bos.redhat.com>, 2024-10-14 19:44:03 +0200
committer: Adam King <adking@redhat.com>, 2024-10-24 20:51:34 +0200
commit: 5818305e8094f88949a7a63c93c6d76d0efa03d9 (patch)
tree: 9008ddc4fdde1f8d24137f3f94672e9e255d30f9 /src/cephadm
parent: Merge pull request #59982 from rkachach/fix_issue_mgmt_gw_high_availability (diff)
download: ceph-5818305e8094f88949a7a63c93c6d76d0efa03d9.tar.xz
ceph-5818305e8094f88949a7a63c93c6d76d0efa03d9.zip
cephadm: handle "systemctl start" failures during deployment better
Previously, it was assumed that when the deploy command fails, the daemon we were trying to deploy does not exist on the host. However, in the specific case where deploy fails while trying to start the daemon's systemd unit, this is not the case. That assumption leads us to clean up the keyring for the daemon and also keeps us from triggering a refresh of the daemons on the host, which can make cephadm attempt to deploy another daemon instead of just reporting the existing one as failed. To get around this, we need to handle that specific failure as a success in the mgr module's deploy workflow so that we refresh the daemons and report the failure as intended.

https://tracker.ceph.com/issues/68536

Signed-off-by: Adam King <adking@redhat.com>
Diffstat (limited to 'src/cephadm')
-rwxr-xr-x src/cephadm/cephadm.py 18
-rw-r--r-- src/cephadm/cephadmlib/constants.py 1
-rw-r--r-- src/cephadm/cephadmlib/exceptions.py 13
3 files changed, 29 insertions, 3 deletions
diff --git a/src/cephadm/cephadm.py b/src/cephadm/cephadm.py
index e32e2bc49f3..5d11d4700ff 100755
--- a/src/cephadm/cephadm.py
+++ b/src/cephadm/cephadm.py
@@ -57,6 +57,7 @@ from cephadmlib.constants import (
LOG_DIR_MODE,
SYSCTL_DIR,
UNIT_DIR,
+ DAEMON_FAILED_ERROR,
)
from cephadmlib.context import CephadmContext
from cephadmlib.context_getters import (
@@ -72,6 +73,7 @@ from cephadmlib.exceptions import (
ClusterAlreadyExists,
Error,
UnauthorizedRegistryError,
+ DaemonStartException,
)
from cephadmlib.exe_utils import find_executable, find_program
from cephadmlib.call_wrappers import (
@@ -1246,7 +1248,11 @@ def deploy_daemon_units(
call_throws(ctx, ['systemctl', 'enable', unit_name])
if start:
clean_cgroup(ctx, ident.fsid, unit_name)
- call_throws(ctx, ['systemctl', 'start', unit_name])
+ try:
+ call_throws(ctx, ['systemctl', 'start', unit_name])
+ except Exception as e:
+ logger.error(f'systemctl start failed for {unit_name}: {str(e)}')
+ raise DaemonStartException()
def _osd_unit_run_commands(
@@ -3046,7 +3052,10 @@ def get_deployment_type(
@deprecated_command
def command_deploy(ctx):
# type: (CephadmContext) -> None
- _common_deploy(ctx)
+ try:
+ _common_deploy(ctx)
+ except DaemonStartException:
+ sys.exit(DAEMON_FAILED_ERROR)
def apply_deploy_config_to_ctx(
@@ -3089,7 +3098,10 @@ def command_deploy_from(ctx: CephadmContext) -> None:
config_data = read_configuration_source(ctx)
logger.debug('Loaded deploy configuration: %r', config_data)
apply_deploy_config_to_ctx(config_data, ctx)
- _common_deploy(ctx)
+ try:
+ _common_deploy(ctx)
+ except DaemonStartException:
+ sys.exit(DAEMON_FAILED_ERROR)
def _common_deploy(ctx: CephadmContext) -> None:
diff --git a/src/cephadm/cephadmlib/constants.py b/src/cephadm/cephadmlib/constants.py
index 354c3782398..6622f1eca55 100644
--- a/src/cephadm/cephadmlib/constants.py
+++ b/src/cephadm/cephadmlib/constants.py
@@ -54,3 +54,4 @@ QUIET_LOG_LEVEL = 9 # DEBUG is 10, so using 9 to be lower level than DEBUG
NO_DEPRECATED = False
UID_NOBODY = 65534
GID_NOGROUP = 65534
+DAEMON_FAILED_ERROR = 17
diff --git a/src/cephadm/cephadmlib/exceptions.py b/src/cephadm/cephadmlib/exceptions.py
index 0d215fdd332..762ce782127 100644
--- a/src/cephadm/cephadmlib/exceptions.py
+++ b/src/cephadm/cephadmlib/exceptions.py
@@ -19,3 +19,16 @@ class UnauthorizedRegistryError(Error):
class PortOccupiedError(Error):
pass
+
+
+class DaemonStartException(Exception):
+ """
+ Special exception type we raise when the
+ systemctl start command fails during daemon
+ deployment. Necessary because the cephadm mgr module
+ needs to handle this case differently than a failure
+ earlier in the deploy process where no attempt was made
+ to actually start the daemon
+ """
+
+ pass