diff options
author | Sage Weil <sage@newdream.net> | 2021-06-19 18:21:47 +0200 |
---|---|---|
committer | Sage Weil <sage@newdream.net> | 2021-06-23 19:00:49 +0200 |
commit | 24ac3088b6dcbcfd845ee35ddde382b9c8251053 (patch) | |
tree | fd304a9bc2021805217dd7096fb727a32eac0968 /src/pybind/mgr/crash/module.py | |
parent | mgr: generate crash dump for python exceptions (diff) | |
download | ceph-24ac3088b6dcbcfd845ee35ddde382b9c8251053.tar.xz ceph-24ac3088b6dcbcfd845ee35ddde382b9c8251053.zip |
mgr/crash: separate RECENT_MGR_MODULE_CRASH error for mgr module crashes
Generate a different warning for crashes in mgr module python code, as
they do not mean that the entire mgr daemon crashed. Document.
Signed-off-by: Sage Weil <sage@newdream.net>
Diffstat (limited to 'src/pybind/mgr/crash/module.py')
-rw-r--r-- | src/pybind/mgr/crash/module.py | 60 |
1 files changed, 45 insertions, 15 deletions
diff --git a/src/pybind/mgr/crash/module.py b/src/pybind/mgr/crash/module.py
index 1cbe2026fb9..739555e6270 100644
--- a/src/pybind/mgr/crash/module.py
+++ b/src/pybind/mgr/crash/module.py
@@ -100,25 +100,55 @@ class Module(MgrModule):
             if (self.time_from_string(cast(str, crash['timestamp'])) > cutoff
                 and 'archived' not in crash)
         }
-        num = len(recent)
-        health_checks: Dict[str, Dict[str, Union[int, str, List[str]]]] = {}
-        if recent:
-            detail = [
-                '%s crashed on host %s at %s' % (
-                    crash.get('entity_name', 'unidentified daemon'),
-                    crash.get('utsname_hostname', '(unknown)'),
-                    crash.get('timestamp', 'unknown time'))
-                for crash in recent.values()]
+
+        def prune_detail(ls: List[str]) -> int:
+            num = len(ls)
             if num > 30:
-                detail = detail[0:30]
-                detail.append('and %d more' % (num - 30))
-            self.log.debug('detail %s' % detail)
+                ls = ls[0:30]
+                ls.append('and %d more' % (num - 30))
+            return num
+
+        daemon_crashes = []
+        module_crashes = []
+        for c in recent.values():
+            if 'mgr_module' in c:
+                module_crashes.append(c)
+            else:
+                daemon_crashes.append(c)
+        daemon_detail = [
+            '%s crashed on host %s at %s' % (
+                crash.get('entity_name', 'unidentified daemon'),
+                crash.get('utsname_hostname', '(unknown)'),
+                crash.get('timestamp', 'unknown time'))
+            for crash in daemon_crashes]
+        module_detail = [
+            'mgr module %s crashed in daemon %s on host %s at %s' % (
+                crash.get('mgr_module', 'unidentified module'),
+                crash.get('entity_name', 'unidentified daemon'),
+                crash.get('utsname_hostname', '(unknown)'),
+                crash.get('timestamp', 'unknown time'))
+            for crash in module_crashes]
+        daemon_num = prune_detail(daemon_detail)
+        module_num = prune_detail(module_detail)
+
+        health_checks: Dict[str, Dict[str, Union[int, str, List[str]]]] = {}
+        if daemon_detail:
+            self.log.debug('daemon detail %s' % daemon_detail)
             health_checks['RECENT_CRASH'] = {
                 'severity': 'warning',
-                'summary': '%d daemons have recently crashed' % (num),
-                'count': num,
-                'detail': detail,
+                'summary': '%d daemons have recently crashed' % (daemon_num),
+                'count': daemon_num,
+                'detail': daemon_detail,
             }
+        if module_detail:
+            self.log.debug('module detail %s' % module_detail)
+            health_checks['RECENT_MGR_MODULE_CRASH'] = {
+                'severity': 'warning',
+                'summary': '%d mgr modules have recently crashed' % (module_num),
+                'count': module_num,
+                'detail': module_detail,
+            }
+
         self.set_health_checks(health_checks)
 
     def time_from_string(self, timestr: str) -> datetime.datetime: