diff options
author | Sage Weil <sage@redhat.com> | 2020-03-12 19:13:11 +0100 |
---|---|---|
committer | Sage Weil <sage@redhat.com> | 2020-03-12 19:13:11 +0100 |
commit | 0c62be9275e4b07e80190a81426263b6527f4420 (patch) | |
tree | f43a4bd20a7d1229d65285b4aa91f205bbe97fa9 | |
parent | mgr/orch: add pause/resume hooks (diff) | |
download | ceph-0c62be9275e4b07e80190a81426263b6527f4420.tar.xz ceph-0c62be9275e4b07e80190a81426263b6527f4420.zip |
mgr/cephadm: implement pause/resume to suspend non-monitoring background work
If the user does 'orch pause', suspend all background work that makes
actual changes.
Continue to do read-only operations, like checking host connectivity
and scraping daemon and device status.
Signed-off-by: Sage Weil <sage@redhat.com>
-rw-r--r-- | doc/cephadm/administration.rst | 12 | ||||
-rw-r--r-- | src/pybind/mgr/cephadm/module.py | 50 |
2 files changed, 55 insertions, 7 deletions
diff --git a/doc/cephadm/administration.rst b/doc/cephadm/administration.rst index 855f2df9402..afc1dd21cfe 100644 --- a/doc/cephadm/administration.rst +++ b/doc/cephadm/administration.rst @@ -31,6 +31,18 @@ To clear this value use the command: Health checks ============= +CEPHADM_PAUSED +-------------- + +Cephadm background work has been paused with ``ceph orch pause``. Cephadm +will continue to perform passive monitoring activities (like checking +host and daemon status), but it will not make any changes (like deploying +or removing daemons). + +You can resume cephadm work with:: + + ceph orch resume + CEPHADM_STRAY_HOST ------------------ diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py index 5d20c393e74..6d5c0c139d4 100644 --- a/src/pybind/mgr/cephadm/module.py +++ b/src/pybind/mgr/cephadm/module.py @@ -607,6 +607,11 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule): self.run = True self.event = Event() + if self.get_store('pause'): + self.paused = True + else: + self.paused = False + # for mypy which does not run the code if TYPE_CHECKING: self.ssh_config_file = None # type: Optional[str] @@ -1049,7 +1054,6 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule): self.log.debug("serve starting") while self.run: self._check_hosts() - self.rm_util._remove_osds_bg() # refresh daemons self.log.debug('refreshing hosts') @@ -1079,14 +1083,29 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule): self._check_for_strays() - if self._apply_all_services(): - continue # did something, refresh + if self.paused: + self.health_checks['CEPHADM_PAUSED'] = { + 'severity': 'warning', + 'summary': 'cephadm background work is paused', + 'count': 1, + 'detail': ["'ceph orch resume' to resume"], + } + self.set_health_checks(self.health_checks) + else: + if 'CEPHADM_PAUSED' in self.health_checks: + del self.health_checks['CEPHADM_PAUSED'] + self.set_health_checks(self.health_checks) + + 
self.rm_util._remove_osds_bg() - self._check_daemons() + if self._apply_all_services(): + continue # did something, refresh - if self.upgrade_state and not self.upgrade_state.get('paused'): - self._do_upgrade() - continue + self._check_daemons() + + if self.upgrade_state and not self.upgrade_state.get('paused'): + self._do_upgrade() + continue self._serve_sleep() self.log.debug("serve exit") @@ -1112,6 +1131,23 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule): def notify(self, notify_type, notify_id): pass + def pause(self): + if not self.paused: + self.log.info('Paused') + self.set_store('pause', 'true') + self.paused = True + # wake loop so we update the health status + self._kick_serve_loop() + + def resume(self): + if self.paused: + self.log.info('Resumed') + self.paused = False + self.set_store('pause', None) + # unconditionally wake loop so that 'orch resume' can be used to kick + # cephadm + self._kick_serve_loop() + def get_unique_name(self, daemon_type, host, existing, prefix=None, forcename=None): # type: (str, str, List[orchestrator.DaemonDescription], Optional[str], Optional[str]) -> str |