summary | refs | log | tree | commit | diff | stats
path: root/src/pybind/mgr/crash/module.py
diff options
context:
space:
mode:
author: Sage Weil <sage@newdream.net>, 2021-06-19 18:21:47 +0200
committer: Sage Weil <sage@newdream.net>, 2021-06-23 19:00:49 +0200
commit: 24ac3088b6dcbcfd845ee35ddde382b9c8251053 (patch)
tree: fd304a9bc2021805217dd7096fb727a32eac0968 /src/pybind/mgr/crash/module.py
parent: mgr: generate crash dump for python exceptions (diff)
download: ceph-24ac3088b6dcbcfd845ee35ddde382b9c8251053.tar.xz
ceph-24ac3088b6dcbcfd845ee35ddde382b9c8251053.zip
mgr/crash: separate RECENT_MGR_MODULE_CRASH error for mgr module crashes
Generate a separate warning for crashes in mgr module Python code, since they do not mean that the entire mgr daemon crashed. Also document the new warning.
Signed-off-by: Sage Weil <sage@newdream.net>
Diffstat (limited to 'src/pybind/mgr/crash/module.py')
-rw-r--r-- src/pybind/mgr/crash/module.py | 60
1 files changed, 45 insertions, 15 deletions
diff --git a/src/pybind/mgr/crash/module.py b/src/pybind/mgr/crash/module.py
index 1cbe2026fb9..739555e6270 100644
--- a/src/pybind/mgr/crash/module.py
+++ b/src/pybind/mgr/crash/module.py
@@ -100,25 +100,55 @@ class Module(MgrModule):
if (self.time_from_string(cast(str, crash['timestamp'])) > cutoff
and 'archived' not in crash)
}
- num = len(recent)
- health_checks: Dict[str, Dict[str, Union[int, str, List[str]]]] = {}
- if recent:
- detail = [
- '%s crashed on host %s at %s' % (
- crash.get('entity_name', 'unidentified daemon'),
- crash.get('utsname_hostname', '(unknown)'),
- crash.get('timestamp', 'unknown time'))
- for crash in recent.values()]
+
+ def prune_detail(ls: List[str]) -> int:
+ num = len(ls)
if num > 30:
- detail = detail[0:30]
- detail.append('and %d more' % (num - 30))
- self.log.debug('detail %s' % detail)
+ ls = ls[0:30]
+ ls.append('and %d more' % (num - 30))
+ return num
+
+ daemon_crashes = []
+ module_crashes = []
+ for c in recent.values():
+ if 'mgr_module' in c:
+ module_crashes.append(c)
+ else:
+ daemon_crashes.append(c)
+ daemon_detail = [
+ '%s crashed on host %s at %s' % (
+ crash.get('entity_name', 'unidentified daemon'),
+ crash.get('utsname_hostname', '(unknown)'),
+ crash.get('timestamp', 'unknown time'))
+ for crash in daemon_crashes]
+ module_detail = [
+ 'mgr module %s crashed in daemon %s on host %s at %s' % (
+ crash.get('mgr_module', 'unidentified module'),
+ crash.get('entity_name', 'unidentified daemon'),
+ crash.get('utsname_hostname', '(unknown)'),
+ crash.get('timestamp', 'unknown time'))
+ for crash in module_crashes]
+ daemon_num = prune_detail(daemon_detail)
+ module_num = prune_detail(module_detail)
+
+ health_checks: Dict[str, Dict[str, Union[int, str, List[str]]]] = {}
+ if daemon_detail:
+ self.log.debug('daemon detail %s' % daemon_detail)
health_checks['RECENT_CRASH'] = {
'severity': 'warning',
- 'summary': '%d daemons have recently crashed' % (num),
- 'count': num,
- 'detail': detail,
+ 'summary': '%d daemons have recently crashed' % (daemon_num),
+ 'count': daemon_num,
+ 'detail': daemon_detail,
}
+ if module_detail:
+ self.log.debug('module detail %s' % module_detail)
+ health_checks['RECENT_MGR_MODULE_CRASH'] = {
+ 'severity': 'warning',
+ 'summary': '%d mgr modules have recently crashed' % (module_num),
+ 'count': module_num,
+ 'detail': module_detail,
+ }
+
self.set_health_checks(health_checks)
def time_from_string(self, timestr: str) -> datetime.datetime: