summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--ceph.spec.in1
-rw-r--r--debian/ceph-mgr.install1
-rw-r--r--doc/mgr/alerts.rst58
-rw-r--r--doc/mgr/index.rst1
-rw-r--r--src/pybind/mgr/alerts/__init__.py1
-rw-r--r--src/pybind/mgr/alerts/module.py240
6 files changed, 302 insertions, 0 deletions
diff --git a/ceph.spec.in b/ceph.spec.in
index d9c236e8369..c5128cdbf4d 100644
--- a/ceph.spec.in
+++ b/ceph.spec.in
@@ -1609,6 +1609,7 @@ fi
%files mgr
%{_bindir}/ceph-mgr
%dir %{_datadir}/ceph/mgr
+%{_datadir}/ceph/mgr/alerts
%{_datadir}/ceph/mgr/ansible
%{_datadir}/ceph/mgr/balancer
%{_datadir}/ceph/mgr/crash
diff --git a/debian/ceph-mgr.install b/debian/ceph-mgr.install
index 4fe43c41d23..aa372cea5cd 100644
--- a/debian/ceph-mgr.install
+++ b/debian/ceph-mgr.install
@@ -1,5 +1,6 @@
lib/systemd/system/ceph-mgr*
usr/bin/ceph-mgr
+usr/share/ceph/mgr/alerts
usr/share/ceph/mgr/ansible
usr/share/ceph/mgr/balancer
usr/share/ceph/mgr/crash
diff --git a/doc/mgr/alerts.rst b/doc/mgr/alerts.rst
new file mode 100644
index 00000000000..319d9d92787
--- /dev/null
+++ b/doc/mgr/alerts.rst
@@ -0,0 +1,58 @@
+Alerts module
+=============
+
+The alerts module can send simple alert messages about cluster health
+via e-mail. In the future, it will support other notification methods
+as well.
+
+.. note:: This module is *not* intended to be a robust monitoring
+   solution. The fact that it is run as part of the Ceph cluster
+   itself is fundamentally limiting in that a failure of the
+   ceph-mgr daemon prevents alerts from being sent. This module
+   can, however, be useful for standalone clusters that exist in
+   environments where no monitoring infrastructure is already in
+   place.
+
+Enabling
+--------
+
+The *alerts* module is enabled with::
+
+ ceph mgr module enable alerts
+
+Configuration
+-------------
+
+To configure SMTP, all of the following config options must be set::
+
+ ceph config set mgr mgr/alerts/smtp_host *<smtp-server>*
+ ceph config set mgr mgr/alerts/smtp_destination *<email-address-to-send-to>*
+ ceph config set mgr mgr/alerts/smtp_sender *<from-email-address>*
+
+By default, the module will use SSL and port 465. To change that::
+
+ ceph config set mgr mgr/alerts/smtp_ssl false # if not SSL
+ ceph config set mgr mgr/alerts/smtp_port *<port-number>* # if not 465
+
+To authenticate to the SMTP server, you must set the user and password::
+
+ ceph config set mgr mgr/alerts/smtp_user *<username>*
+ ceph config set mgr mgr/alerts/smtp_password *<password>*
+
+By default, the name in the ``From:`` line is simply ``Ceph``. To
+change that (e.g., to identify which cluster this is)::
+
+ ceph config set mgr mgr/alerts/smtp_from_name 'Ceph Cluster Foo'
+
+By default, the module will check the cluster health once per minute
+and, if there is a change, send a message. To change that
+frequency::
+
+ ceph config set mgr mgr/alerts/interval *<interval>* # e.g., "5m" for 5 minutes
+
+Commands
+--------
+
+To force an alert to be sent immediately::
+
+ ceph alerts send
diff --git a/doc/mgr/index.rst b/doc/mgr/index.rst
index 78fefc11f05..6b377d1bd01 100644
--- a/doc/mgr/index.rst
+++ b/doc/mgr/index.rst
@@ -29,6 +29,7 @@ sensible.
Writing modules <modules>
Writing orchestrator plugins <orchestrator_modules>
Dashboard module <dashboard>
+ Alerts module <alerts>
DiskPrediction module <diskprediction>
Local pool module <localpool>
RESTful module <restful>
diff --git a/src/pybind/mgr/alerts/__init__.py b/src/pybind/mgr/alerts/__init__.py
new file mode 100644
index 00000000000..e4c185ca95e
--- /dev/null
+++ b/src/pybind/mgr/alerts/__init__.py
@@ -0,0 +1 @@
+from .module import Alerts
diff --git a/src/pybind/mgr/alerts/module.py b/src/pybind/mgr/alerts/module.py
new file mode 100644
index 00000000000..c063235683c
--- /dev/null
+++ b/src/pybind/mgr/alerts/module.py
@@ -0,0 +1,240 @@
+
+"""
+A simple cluster health alerting module.
+"""
+
+from mgr_module import MgrModule, HandleCommandResult
+from threading import Event
+import errno
+import json
+import smtplib
+
+class Alerts(MgrModule):
+    """
+    ceph-mgr module that e-mails a summary when cluster health changes.
+
+    serve() polls the 'health' report every ``interval`` seconds and
+    compares it with the previous one.  When checks were added, grew in
+    count, or cleared, a message is sent via SMTP (only if ``smtp_host``
+    is configured).  The 'alerts send' command sends the current status
+    immediately.
+    """
+    COMMANDS = [
+        {
+            "cmd": "alerts send",
+            "desc": "(re)send alerts immediately",
+            "perm": "r"
+        },
+    ]
+
+    MODULE_OPTIONS = [
+        {
+            'name': 'interval',
+            'type': 'seconds',
+            'default': 60,
+            'desc': 'How frequently to reexamine health status',
+            'runtime': True,
+        },
+        # smtp
+        {
+            'name': 'smtp_host',
+            # An empty host means "SMTP not configured"; see _send_alert().
+            'default': '',
+            'desc': 'SMTP server',
+            'runtime': True,
+        },
+        {
+            'name': 'smtp_destination',
+            'default': '',
+            'desc': 'Email address to send alerts to',
+            'runtime': True,
+        },
+        {
+            'name': 'smtp_port',
+            'type': 'int',
+            'default': 465,
+            'desc': 'SMTP port',
+            'runtime': True,
+        },
+        {
+            'name': 'smtp_ssl',
+            'type': 'bool',
+            'default': True,
+            'desc': 'Use SSL to connect to SMTP server',
+            'runtime': True,
+        },
+        {
+            'name': 'smtp_user',
+            'default': '',
+            'desc': 'User to authenticate as',
+            'runtime': True,
+        },
+        {
+            'name': 'smtp_password',
+            'default': '',
+            'desc': 'Password to authenticate with',
+            'runtime': True,
+        },
+        {
+            'name': 'smtp_sender',
+            'default': '',
+            'desc': 'SMTP envelope sender',
+            'runtime': True,
+        },
+        {
+            'name': 'smtp_from_name',
+            'default': 'Ceph',
+            'desc': 'Email From: name',
+            'runtime': True,
+        },
+    ]
+
+    # These are "native" Ceph options that this module cares about.
+    NATIVE_OPTIONS = [
+    ]
+
+    def __init__(self, *args, **kwargs):
+        super(Alerts, self).__init__(*args, **kwargs)
+
+        # set up some members to enable the serve() method and shutdown()
+        self.run = True
+        self.event = Event()
+
+        # ensure config options members are initialized; see config_notify()
+        self.config_notify()
+
+        self.log.info("Init")
+
+    def config_notify(self):
+        """
+        This method is called whenever one of our config options is changed.
+
+        It is also called once from __init__(), so every option is
+        available as an attribute before serve() starts.
+        """
+        # Mirror each MODULE_OPTIONS entry into an attribute of the same
+        # name, so that, for instance, the 'smtp_host' option is always
+        # available as 'self.smtp_host'.
+        for opt in self.MODULE_OPTIONS:
+            name = opt['name']
+            value = self.get_module_option(name)
+            # Unset/empty values fall back to the declared default, but an
+            # explicit boolean False (e.g. smtp_ssl=false) is a real
+            # setting: "value or default" silently turned it back to True.
+            # NOTE(review): assumes bool options arrive as Python bools.
+            if not value and value is not False:
+                # .get(): an option without a 'default' key must not raise.
+                value = opt.get('default')
+            setattr(self, name, value)
+            # Never write credentials to the log.
+            shown = '********' if 'password' in name else value
+            self.log.debug(' mgr option %s = %s', name, shown)
+        # Do the same for the native options.
+        for opt in self.NATIVE_OPTIONS:
+            setattr(self,
+                    opt,
+                    self.get_ceph_option(opt))
+            self.log.debug(' native option %s = %s', opt, getattr(self, opt))
+
+    def handle_command(self, inbuf, cmd):
+        """
+        Handle 'alerts send': send the current health status right away.
+
+        Returns a HandleCommandResult; retval is 0 on success and -EINVAL
+        for a command prefix this module does not implement.
+        """
+        if cmd['prefix'] != 'alerts send':
+            return HandleCommandResult(
+                retval=-errno.EINVAL,
+                stdout='',
+                stderr='unrecognized command: %s' % cmd['prefix'])
+        status = json.loads(self.get('health')['json'])
+        # An empty diff: the message carries only the full health status.
+        self._send_alert(status, {})
+        return HandleCommandResult(
+            retval=0,   # exit code
+            stdout='',  # stdout
+            stderr='')
+
+    def _diff(self, last, new):
+        """
+        Compare two health reports.
+
+        Returns a dict with up to three keys, each mapping check code to
+        alert: 'new' (code absent from `last`), 'updated' (summary count
+        increased) and 'cleared' (code absent from `new`).  Keys with no
+        entries are omitted, so an empty dict means nothing to report.
+        """
+        old_checks = last.get('checks', {})
+        new_checks = new.get('checks', {})
+        d = {}
+        for code, alert in new_checks.items():
+            self.log.debug('new code %s alert %s', code, alert)
+            if code not in old_checks:
+                d.setdefault('new', {})[code] = alert
+            elif alert['summary'].get('count', 0) > \
+                    old_checks[code]['summary'].get('count', 0):
+                d.setdefault('updated', {})[code] = alert
+        for code, alert in old_checks.items():
+            self.log.debug('old code %s alert %s', code, alert)
+            if code not in new_checks:
+                d.setdefault('cleared', {})[code] = alert
+        return d
+
+    def _send_alert(self, status, diff):
+        """Dispatch an alert to every configured method (SMTP only)."""
+        if self.smtp_host:
+            self._send_alert_smtp(status, diff)
+
+    def serve(self):
+        """
+        This method is called by the mgr when the module starts and can be
+        used for any background activity.
+
+        Polls cluster health until shutdown() is called, sending an alert
+        whenever _diff() reports a change.
+        """
+        self.log.info("Starting")
+        last_status = {}
+        while self.run:
+            new_status = json.loads(self.get('health')['json'])
+            if new_status != last_status:
+                self.log.debug('last_status %s', last_status)
+                self.log.debug('new_status %s', new_status)
+                diff = self._diff(last_status,
+                                  new_status)
+                self.log.debug('diff %s', diff)
+                if diff:
+                    self._send_alert(new_status, diff)
+                last_status = new_status
+
+            self.log.debug('Sleeping for %d seconds', self.interval)
+            # shutdown() sets the event, which ends the wait early.
+            self.event.wait(self.interval)
+            self.event.clear()
+
+    def shutdown(self):
+        """
+        This method is called by the mgr when the module needs to shut
+        down (i.e., when the serve() function needs to exit).
+        """
+        self.log.info('Stopping')
+        self.run = False
+        self.event.set()
+
+    # SMTP
+    def _smtp_format_alert(self, code, alert):
+        """
+        Render one health check as text: a '[SEV] CODE: summary' line
+        followed by one indented line per detail message.
+        """
+        r = '[{sev}] {code}: {summary}\n'.format(
+            code=code,
+            # The part after the first '_' (e.g. HEALTH_WARN -> WARN).
+            sev=alert['severity'].split('_')[1],
+            summary=alert['summary']['message'])
+        for detail in alert['detail']:
+            r += ' {message}\n'.format(
+                message=detail['message'])
+        return r
+
+    def _send_alert_smtp(self, status, diff):
+        """
+        Build the alert e-mail and send it through the configured server.
+
+        The body lists the 'new', 'updated' and 'cleared' sections present
+        in `diff`, followed by every check in `status`.  Errors raised by
+        smtplib propagate to the caller; the connection is released first.
+        """
+        # message
+        self.log.debug('_send_alert_smtp')
+        message = ('From: {from_name} <{sender}>\n'
+                   'Subject: {status}\n'
+                   'To: {target}\n'
+                   '\n'
+                   '{status}\n'.format(
+                       sender=self.smtp_sender,
+                       from_name=self.smtp_from_name,
+                       status=status['status'],
+                       target=self.smtp_destination))
+
+        sections = (('new', 'New'),
+                    ('updated', 'Updated'),
+                    ('cleared', 'Cleared'))
+        for key, title in sections:
+            if key in diff:
+                message += '\n--- %s ---\n' % title
+                for code, alert in diff[key].items():
+                    message += self._smtp_format_alert(code, alert)
+
+        message += ('\n\n=== Full health status ===\n')
+        # .get(): tolerate a report without 'checks', as _diff() does.
+        for code, alert in status.get('checks', {}).items():
+            message += self._smtp_format_alert(code, alert)
+
+        self.log.debug('message: %s', message)
+
+        # send
+        if self.smtp_ssl:
+            server = smtplib.SMTP_SSL(self.smtp_host, self.smtp_port)
+        else:
+            server = smtplib.SMTP(self.smtp_host, self.smtp_port)
+        try:
+            if self.smtp_password:
+                server.login(self.smtp_user, self.smtp_password)
+            server.sendmail(self.smtp_sender, self.smtp_destination, message)
+            server.quit()
+        except Exception:
+            # Drop the socket without issuing QUIT, so a second failure
+            # cannot mask the original error, then re-raise it.
+            server.close()
+            raise
+        self.log.debug('Sent email to %s', self.smtp_destination)