summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorSage Weil <sage@redhat.com>2020-03-12 19:13:11 +0100
committerSage Weil <sage@redhat.com>2020-03-12 19:13:11 +0100
commit0c62be9275e4b07e80190a81426263b6527f4420 (patch)
treef43a4bd20a7d1229d65285b4aa91f205bbe97fa9
parentmgr/orch: add pause/resume hooks (diff)
downloadceph-0c62be9275e4b07e80190a81426263b6527f4420.tar.xz
ceph-0c62be9275e4b07e80190a81426263b6527f4420.zip
mgr/cephadm: implement pause/resume to suspend non-monitoring background work
If the user does 'orch pause', suspend all background work that makes actual changes. Continue to do read-only operations, like checking host connectivity and scraping daemon and device status.

Signed-off-by: Sage Weil <sage@redhat.com>
-rw-r--r--doc/cephadm/administration.rst12
-rw-r--r--src/pybind/mgr/cephadm/module.py50
2 files changed, 55 insertions, 7 deletions
diff --git a/doc/cephadm/administration.rst b/doc/cephadm/administration.rst
index 855f2df9402..afc1dd21cfe 100644
--- a/doc/cephadm/administration.rst
+++ b/doc/cephadm/administration.rst
@@ -31,6 +31,18 @@ To clear this value use the command:
Health checks
=============
+CEPHADM_PAUSED
+--------------
+
+Cephadm background work has been paused with ``ceph orch pause``. Cephadm
+will continue to perform passive monitoring activities (like checking
+host and daemon status), but it will not make any changes (like deploying
+or removing daemons).
+
+You can resume cephadm work with::
+
+ ceph orch resume
+
CEPHADM_STRAY_HOST
------------------
diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py
index 5d20c393e74..6d5c0c139d4 100644
--- a/src/pybind/mgr/cephadm/module.py
+++ b/src/pybind/mgr/cephadm/module.py
@@ -607,6 +607,11 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule):
self.run = True
self.event = Event()
+ if self.get_store('pause'):
+ self.paused = True
+ else:
+ self.paused = False
+
# for mypy which does not run the code
if TYPE_CHECKING:
self.ssh_config_file = None # type: Optional[str]
@@ -1049,7 +1054,6 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule):
self.log.debug("serve starting")
while self.run:
self._check_hosts()
- self.rm_util._remove_osds_bg()
# refresh daemons
self.log.debug('refreshing hosts')
@@ -1079,14 +1083,29 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule):
self._check_for_strays()
- if self._apply_all_services():
- continue # did something, refresh
+ if self.paused:
+ self.health_checks['CEPHADM_PAUSED'] = {
+ 'severity': 'warning',
+ 'summary': 'cephadm background work is paused',
+ 'count': 1,
+ 'detail': ["'ceph orch resume' to resume"],
+ }
+ self.set_health_checks(self.health_checks)
+ else:
+ if 'CEPHADM_PAUSED' in self.health_checks:
+ del self.health_checks['CEPHADM_PAUSED']
+ self.set_health_checks(self.health_checks)
+
+ self.rm_util._remove_osds_bg()
- self._check_daemons()
+ if self._apply_all_services():
+ continue # did something, refresh
- if self.upgrade_state and not self.upgrade_state.get('paused'):
- self._do_upgrade()
- continue
+ self._check_daemons()
+
+ if self.upgrade_state and not self.upgrade_state.get('paused'):
+ self._do_upgrade()
+ continue
self._serve_sleep()
self.log.debug("serve exit")
@@ -1112,6 +1131,23 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule):
def notify(self, notify_type, notify_id):
pass
+ def pause(self):
+ if not self.paused:
+ self.log.info('Paused')
+ self.set_store('pause', 'true')
+ self.paused = True
+ # wake loop so we update the health status
+ self._kick_serve_loop()
+
+ def resume(self):
+ if self.paused:
+ self.log.info('Resumed')
+ self.paused = False
+ self.set_store('pause', None)
+ # unconditionally wake loop so that 'orch resume' can be used to kick
+ # cephadm
+ self._kick_serve_loop()
+
def get_unique_name(self, daemon_type, host, existing, prefix=None,
forcename=None):
# type: (str, str, List[orchestrator.DaemonDescription], Optional[str], Optional[str]) -> str