diff options
author | Adam King <adking@dhcp-41-165.bos.redhat.com> | 2024-10-14 19:44:03 +0200 |
---|---|---|
committer | Adam King <adking@redhat.com> | 2024-10-24 20:51:34 +0200 |
commit | 5818305e8094f88949a7a63c93c6d76d0efa03d9 (patch) | |
tree | 9008ddc4fdde1f8d24137f3f94672e9e255d30f9 /src/cephadm | |
parent | Merge pull request #59982 from rkachach/fix_issue_mgmt_gw_high_availability (diff) | |
download | ceph-5818305e8094f88949a7a63c93c6d76d0efa03d9.tar.xz ceph-5818305e8094f88949a7a63c93c6d76d0efa03d9.zip |
cephadm: handle "systemctl start" failures during deployment better
Previously it was assumed that when the deploy command fails, whatever
daemon we were trying to deploy does not exist on the host. However,
in the specific case where deploy fails while trying to start the daemon's
systemd unit, this is not the case. This assumption leads us to clean up the
keyring for the daemon and also causes us to not trigger a refresh
of the daemons on the host, which can make cephadm attempt to
deploy another daemon instead of just reporting the existing one
as failed. To get around this, we need to handle that specific
failure as a success in the mgr module's deploy workflow so that
we refresh the daemons and report the failure as intended.
https://tracker.ceph.com/issues/68536
Signed-off-by: Adam King <adking@redhat.com>
Diffstat (limited to 'src/cephadm')
-rwxr-xr-x | src/cephadm/cephadm.py | 18 | ||||
-rw-r--r-- | src/cephadm/cephadmlib/constants.py | 1 | ||||
-rw-r--r-- | src/cephadm/cephadmlib/exceptions.py | 13 |
3 files changed, 29 insertions, 3 deletions
diff --git a/src/cephadm/cephadm.py b/src/cephadm/cephadm.py index e32e2bc49f3..5d11d4700ff 100755 --- a/src/cephadm/cephadm.py +++ b/src/cephadm/cephadm.py @@ -57,6 +57,7 @@ from cephadmlib.constants import ( LOG_DIR_MODE, SYSCTL_DIR, UNIT_DIR, + DAEMON_FAILED_ERROR, ) from cephadmlib.context import CephadmContext from cephadmlib.context_getters import ( @@ -72,6 +73,7 @@ from cephadmlib.exceptions import ( ClusterAlreadyExists, Error, UnauthorizedRegistryError, + DaemonStartException, ) from cephadmlib.exe_utils import find_executable, find_program from cephadmlib.call_wrappers import ( @@ -1246,7 +1248,11 @@ def deploy_daemon_units( call_throws(ctx, ['systemctl', 'enable', unit_name]) if start: clean_cgroup(ctx, ident.fsid, unit_name) - call_throws(ctx, ['systemctl', 'start', unit_name]) + try: + call_throws(ctx, ['systemctl', 'start', unit_name]) + except Exception as e: + logger.error(f'systemctl start failed for {unit_name}: {str(e)}') + raise DaemonStartException() def _osd_unit_run_commands( @@ -3046,7 +3052,10 @@ def get_deployment_type( @deprecated_command def command_deploy(ctx): # type: (CephadmContext) -> None - _common_deploy(ctx) + try: + _common_deploy(ctx) + except DaemonStartException: + sys.exit(DAEMON_FAILED_ERROR) def apply_deploy_config_to_ctx( @@ -3089,7 +3098,10 @@ def command_deploy_from(ctx: CephadmContext) -> None: config_data = read_configuration_source(ctx) logger.debug('Loaded deploy configuration: %r', config_data) apply_deploy_config_to_ctx(config_data, ctx) - _common_deploy(ctx) + try: + _common_deploy(ctx) + except DaemonStartException: + sys.exit(DAEMON_FAILED_ERROR) def _common_deploy(ctx: CephadmContext) -> None: diff --git a/src/cephadm/cephadmlib/constants.py b/src/cephadm/cephadmlib/constants.py index 354c3782398..6622f1eca55 100644 --- a/src/cephadm/cephadmlib/constants.py +++ b/src/cephadm/cephadmlib/constants.py @@ -54,3 +54,4 @@ QUIET_LOG_LEVEL = 9 # DEBUG is 10, so using 9 to be lower level than DEBUG NO_DEPRECATED = 
False UID_NOBODY = 65534 GID_NOGROUP = 65534 +DAEMON_FAILED_ERROR = 17 diff --git a/src/cephadm/cephadmlib/exceptions.py b/src/cephadm/cephadmlib/exceptions.py index 0d215fdd332..762ce782127 100644 --- a/src/cephadm/cephadmlib/exceptions.py +++ b/src/cephadm/cephadmlib/exceptions.py @@ -19,3 +19,16 @@ class UnauthorizedRegistryError(Error): class PortOccupiedError(Error): pass + + +class DaemonStartException(Exception): + """ + Special exception type we raise when the + systemctl start command fails during daemon + deployment. Necessary because the cephadm mgr module + needs to handle this case differently than a failure + earlier in the deploy process where no attempt was made + to actually start the daemon + """ + + pass |