diff options
-rw-r--r-- | ceph.spec.in | 1 | ||||
-rw-r--r-- | debian/ceph-mgr.install | 1 | ||||
-rw-r--r-- | doc/mgr/alerts.rst | 58 | ||||
-rw-r--r-- | doc/mgr/index.rst | 1 | ||||
-rw-r--r-- | src/pybind/mgr/alerts/__init__.py | 1 | ||||
-rw-r--r-- | src/pybind/mgr/alerts/module.py | 240 |
6 files changed, 302 insertions, 0 deletions
diff --git a/ceph.spec.in b/ceph.spec.in index d9c236e8369..c5128cdbf4d 100644 --- a/ceph.spec.in +++ b/ceph.spec.in @@ -1609,6 +1609,7 @@ fi %files mgr %{_bindir}/ceph-mgr %dir %{_datadir}/ceph/mgr +%{_datadir}/ceph/mgr/alerts %{_datadir}/ceph/mgr/ansible %{_datadir}/ceph/mgr/balancer %{_datadir}/ceph/mgr/crash diff --git a/debian/ceph-mgr.install b/debian/ceph-mgr.install index 4fe43c41d23..aa372cea5cd 100644 --- a/debian/ceph-mgr.install +++ b/debian/ceph-mgr.install @@ -1,5 +1,6 @@ lib/systemd/system/ceph-mgr* usr/bin/ceph-mgr +usr/share/ceph/mgr/alerts usr/share/ceph/mgr/ansible usr/share/ceph/mgr/balancer usr/share/ceph/mgr/crash diff --git a/doc/mgr/alerts.rst b/doc/mgr/alerts.rst new file mode 100644 index 00000000000..319d9d92787 --- /dev/null +++ b/doc/mgr/alerts.rst @@ -0,0 +1,58 @@ +Alerts module +============= + +The alerts module can send simple alert messages about cluster health +via e-mail. In the future, it will support other notification methods +as well. + +:note: This module is *not* intended to be a robust monitoring + solution. The fact that it is run as part of the Ceph cluster + itself is fundamentally limiting in that a failure of the + ceph-mgr daemon prevents alerts from being sent. This module + can, however, be useful for standalone clusters that exist in + environments where existing monitoring infrastructure does not + exist. + +Enabling +-------- + +The *alerts* module is enabled with:: + + ceph mgr module enable alerts + +Configuration +------------- + +To configure SMTP, all of the following config options must be set:: + + ceph config set mgr mgr/alerts/smtp_host *<smtp-server>* + ceph config set mgr mgr/alerts/smtp_destination *<email-address-to-send-to>* + ceph config set mgr mgr/alerts/smtp_sender *<from-email-address>* + +By default, the module will use SSL and port 465. 
To change that,:: + + ceph config set mgr mgr/alerts/smtp_ssl false # if not SSL + ceph config set mgr mgr/alerts/smtp_port *<port-number>* # if not 465 + +To authenticate to the SMTP server, you must set the user and password:: + + ceph config set mgr mgr/alerts/smtp_user *<username>* + ceph config set mgr mgr/alerts/smtp_password *<password>* + +By default, the name in the ``From:`` line is simply ``Ceph``. To +change that (e.g., to identify which cluster this is),:: + + ceph config set mgr mgr/alerts/smtp_from_name 'Ceph Cluster Foo' + +By default, the module will check the cluster health once per minute +and, if there is a change, send a message. To change that +frequency,:: + + ceph config set mgr mgr/alerts/interval *<interval>* # e.g., "5m" for 5 minutes + +Commands +-------- + +To force an alert to be sent immediately,:: + + ceph alerts send diff --git a/doc/mgr/index.rst b/doc/mgr/index.rst index 78fefc11f05..6b377d1bd01 100644 --- a/doc/mgr/index.rst +++ b/doc/mgr/index.rst @@ -29,6 +29,7 @@ sensible. Writing modules <modules> Writing orchestrator plugins <orchestrator_modules> Dashboard module <dashboard> + Alerts module <alerts> DiskPrediction module <diskprediction> Local pool module <localpool> RESTful module <restful> diff --git a/src/pybind/mgr/alerts/__init__.py b/src/pybind/mgr/alerts/__init__.py new file mode 100644 index 00000000000..e4c185ca95e --- /dev/null +++ b/src/pybind/mgr/alerts/__init__.py @@ -0,0 +1 @@ +from .module import Alerts diff --git a/src/pybind/mgr/alerts/module.py b/src/pybind/mgr/alerts/module.py new file mode 100644 index 00000000000..c063235683c --- /dev/null +++ b/src/pybind/mgr/alerts/module.py @@ -0,0 +1,240 @@ + +""" +A simple cluster health alerting module. 
+""" + +from mgr_module import MgrModule, HandleCommandResult +from threading import Event +import errno +import json +import smtplib + +class Alerts(MgrModule): + COMMANDS = [ + { + "cmd": "alerts send", + "desc": "(re)send alerts immediately", + "perm": "r" + }, + ] + + MODULE_OPTIONS = [ + { + 'name': 'interval', + 'type': 'seconds', + 'default': 60, + 'desc': 'How frequently to reexamine health status', + 'runtime': True, + }, + # smtp + { + 'name': 'smtp_host', + 'desc': 'SMTP server', + 'runtime': True, + }, + { + 'name': 'smtp_destination', + 'default': '', + 'desc': 'Email address to send alerts to', + 'runtime': True, + }, + { + 'name': 'smtp_port', + 'type': 'int', + 'default': 465, + 'desc': 'SMTP port', + 'runtime': True, + }, + { + 'name': 'smtp_ssl', + 'type': 'bool', + 'default': True, + 'desc': 'Use SSL to connect to SMTP server', + 'runtime': True, + }, + { + 'name': 'smtp_user', + 'default': '', + 'desc': 'User to authenticate as', + 'runtime': True, + }, + { + 'name': 'smtp_password', + 'default': '', + 'desc': 'Password to authenticate with', + 'runtime': True, + }, + { + 'name': 'smtp_sender', + 'default': '', + 'desc': 'SMTP envelope sender', + 'runtime': True, + }, + { + 'name': 'smtp_from_name', + 'default': 'Ceph', + 'desc': 'Email From: name', + 'runtime': True, + }, + ] + + # These are "native" Ceph options that this module cares about. + NATIVE_OPTIONS = [ + ] + + def __init__(self, *args, **kwargs): + super(Alerts, self).__init__(*args, **kwargs) + + # set up some members to enable the serve() method and shutdown() + self.run = True + self.event = Event() + + # ensure config options members are initialized; see config_notify() + self.config_notify() + + self.log.info("Init") + + + def config_notify(self): + """ + This method is called whenever one of our config options is changed. 
+ """ + # This is some boilerplate that stores MODULE_OPTIONS in a class + # member, so that, for instance, the 'emphatic' option is always + # available as 'self.emphatic'. + for opt in self.MODULE_OPTIONS: + setattr(self, + opt['name'], + self.get_module_option(opt['name']) or opt['default']) + self.log.debug(' mgr option %s = %s', + opt['name'], getattr(self, opt['name'])) + # Do the same for the native options. + for opt in self.NATIVE_OPTIONS: + setattr(self, + opt, + self.get_ceph_option(opt)) + self.log.debug(' native option %s = %s', opt, getattr(self, opt)) + + def handle_command(self, inbuf, cmd): + ret = 0 + out = '' + err = '' + if cmd['prefix'] == 'alerts send': + status = json.loads(self.get('health')['json']) + self._send_alert(status, {}) + return HandleCommandResult( + retval=ret, # exit code + stdout=out, # stdout + stderr=err) + + def _diff(self, last, new): + d = {} + for code, alert in new.get('checks', {}).items(): + self.log.debug('new code %s alert %s' % (code, alert)) + if code not in last.get('checks', {}): + if 'new' not in d: + d['new'] = {} + d['new'][code] = alert + elif alert['summary'].get('count', 0) > \ + last['checks'][code]['summary'].get('count', 0): + if 'updated' not in d: + d['updated'] = {} + d['updated'][code] = alert + for code, alert in last.get('checks', {}).items(): + self.log.debug('old code %s alert %s' % (code, alert)) + if code not in new.get('checks', {}): + if 'cleared' not in d: + d['cleared'] = {} + d['cleared'][code] = alert + return d + + def _send_alert(self, status, diff): + if self.smtp_host: + self._send_alert_smtp(status, diff) + + def serve(self): + """ + This method is called by the mgr when the module starts and can be + used for any background activity. + """ + self.log.info("Starting") + last_status = {} + while self.run: + # Do some useful background work here. 
+ new_status = json.loads(self.get('health')['json']) + if new_status != last_status: + self.log.debug('last_status %s' % last_status) + self.log.debug('new_status %s' % new_status) + diff = self._diff(last_status, + new_status) + self.log.debug('diff %s' % diff) + if diff: + self._send_alert(new_status, diff) + last_status = new_status + + self.log.debug('Sleeping for %d seconds', self.interval) + ret = self.event.wait(self.interval) + self.event.clear() + + def shutdown(self): + """ + This method is called by the mgr when the module needs to shut + down (i.e., when the serve() function needs to exit). + """ + self.log.info('Stopping') + self.run = False + self.event.set() + + # SMTP + def _smtp_format_alert(self, code, alert): + r = '[{sev}] {code}: {summary}\n'.format( + code=code, + sev=alert['severity'].split('_')[1], + summary=alert['summary']['message']) + for detail in alert['detail']: + r += ' {message}\n'.format( + message=detail['message']) + return r + + def _send_alert_smtp(self, status, diff): + # message + self.log.debug('_send_alert_smtp') + message = ('From: {from_name} <{sender}>\n' + 'Subject: {status}\n' + 'To: {target}\n' + '\n' + '{status}\n'.format( + sender=self.smtp_sender, + from_name=self.smtp_from_name, + status=status['status'], + target=self.smtp_destination)) + + if 'new' in diff: + message += ('\n--- New ---\n') + for code, alert in diff['new'].items(): + message += self._smtp_format_alert(code, alert) + if 'updated' in diff: + message += ('\n--- Updated ---\n') + for code, alert in diff['updated'].items(): + message += self._smtp_format_alert(code, alert) + if 'cleared' in diff: + message += ('\n--- Cleared ---\n') + for code, alert in diff['cleared'].items(): + message += self._smtp_format_alert(code, alert) + + message += ('\n\n=== Full health status ===\n') + for code, alert in status['checks'].items(): + message += self._smtp_format_alert(code, alert) + + self.log.debug('message: %s' % message) + + # send + if self.smtp_ssl: + 
server = smtplib.SMTP_SSL(self.smtp_host, self.smtp_port) + else: + server = smtplib.SMTP(self.smtp_host, self.smtp_port) + if self.smtp_password: + server.login(self.smtp_user, self.smtp_password) + server.sendmail(self.smtp_sender, self.smtp_destination, message) + server.quit() + self.log.debug('Sent email to %s' % self.smtp_destination) |