summaryrefslogtreecommitdiffstats
path: root/watchfrr
diff options
context:
space:
mode:
authorDonald Sharp <sharpd@nvidia.com>2022-04-06 15:13:51 +0200
committerDonald Sharp <sharpd@nvidia.com>2022-04-08 21:56:17 +0200
commit6d0fa5c228859850b0b0d3b8e9be1880258b4c7f (patch)
tree4813be73fb10beebd37307b895cfb4f71e72d224 /watchfrr
parentMerge pull request #10982 from donaldsharp/enum_watchfrr (diff)
downloadfrr-6d0fa5c228859850b0b0d3b8e9be1880258b4c7f.tar.xz
frr-6d0fa5c228859850b0b0d3b8e9be1880258b4c7f.zip
watchfrr: Send operational state to systemd
When watchfrr has noticed issues, send operational state to systemd so that operators issuing `systemctl status frr` can see a more nuanced state of the daemon. Add the `--operational-timeout X` option to the CLI. After a daemon has been restarted and communication re-established, wait this long before reporting to systemd that the daemon is up and running. The default value of 60 seconds was chosen to allow a small delay in reporting so that, if the daemon is in a crash loop, the status will not ping-pong. Signed-off-by: Donald Sharp <sharpd@nvidia.com>
Diffstat (limited to 'watchfrr')
-rw-r--r--watchfrr/watchfrr.c49
1 files changed, 42 insertions, 7 deletions
diff --git a/watchfrr/watchfrr.c b/watchfrr/watchfrr.c
index 3a5919ede..e5afa6898 100644
--- a/watchfrr/watchfrr.c
+++ b/watchfrr/watchfrr.c
@@ -54,6 +54,7 @@
#define DEFAULT_LOGLEVEL LOG_INFO
#define DEFAULT_MIN_RESTART 60
#define DEFAULT_MAX_RESTART 600
+#define DEFAULT_OPERATIONAL_TIMEOUT 60
#define DEFAULT_RESTART_CMD WATCHFRR_SH_PATH " restart %s"
#define DEFAULT_START_CMD WATCHFRR_SH_PATH " start %s"
@@ -106,12 +107,14 @@ static struct global_state {
enum restart_phase phase;
struct thread *t_phase_hanging;
struct thread *t_startup_timeout;
+ struct thread *t_operational;
const char *vtydir;
long period;
long timeout;
long restart_timeout;
long min_restart_interval;
long max_restart_interval;
+ long operational_timeout;
struct daemon *daemons;
const char *restart_command;
const char *start_command;
@@ -131,6 +134,7 @@ static struct global_state {
.loglevel = DEFAULT_LOGLEVEL,
.min_restart_interval = DEFAULT_MIN_RESTART,
.max_restart_interval = DEFAULT_MAX_RESTART,
+ .operational_timeout = DEFAULT_OPERATIONAL_TIMEOUT,
.restart_command = DEFAULT_RESTART_CMD,
.start_command = DEFAULT_START_CMD,
.stop_command = DEFAULT_STOP_CMD,
@@ -177,6 +181,7 @@ struct daemon {
#define OPTION_MAXRESTART 2001
#define OPTION_DRY 2002
#define OPTION_NETNS 2003
+#define OPTION_MAXOPERATIONAL 2004
static const struct option longopts[] = {
{"daemon", no_argument, NULL, 'd'},
@@ -191,6 +196,7 @@ static const struct option longopts[] = {
{"dry", no_argument, NULL, OPTION_DRY},
{"min-restart-interval", required_argument, NULL, OPTION_MINRESTART},
{"max-restart-interval", required_argument, NULL, OPTION_MAXRESTART},
+ {"operational-timeout", required_argument, NULL, OPTION_MAXOPERATIONAL},
{"pid-file", required_argument, NULL, 'p'},
{"blank-string", required_argument, NULL, 'b'},
#ifdef GNU_LINUX
@@ -265,6 +271,9 @@ Otherwise, the interval is doubled (but capped at the -M value).\n\n",
--max-restart-interval\n\
Set the maximum seconds to wait between invocations of daemon\n\
restart commands (default is %d).\n\
+ --operational-timeout\n\
+ Set the time before systemd is notified that we are considered\n\
+ operational again after a daemon restart (default is %d).\n\
-i, --interval Set the status polling interval in seconds (default is %d)\n\
-t, --timeout Set the unresponsiveness timeout in seconds (default is %d)\n\
-T, --restart-timeout\n\
@@ -296,10 +305,10 @@ Otherwise, the interval is doubled (but capped at the -M value).\n\n",
-v, --version Print program version\n\
-h, --help Display this help and exit\n",
frr_vtydir, DEFAULT_LOGLEVEL, LOG_EMERG, LOG_DEBUG, LOG_DEBUG,
- DEFAULT_MIN_RESTART, DEFAULT_MAX_RESTART, DEFAULT_PERIOD,
- DEFAULT_TIMEOUT, DEFAULT_RESTART_TIMEOUT,
- DEFAULT_RESTART_CMD, DEFAULT_START_CMD, DEFAULT_STOP_CMD,
- frr_vtydir);
+ DEFAULT_MIN_RESTART, DEFAULT_MAX_RESTART,
+ DEFAULT_OPERATIONAL_TIMEOUT, DEFAULT_PERIOD, DEFAULT_TIMEOUT,
+ DEFAULT_RESTART_TIMEOUT, DEFAULT_RESTART_CMD, DEFAULT_START_CMD,
+ DEFAULT_STOP_CMD, frr_vtydir);
}
static pid_t run_background(char *shell_cmd)
@@ -502,8 +511,6 @@ static int run_job(struct restart_info *restart, const char *cmdtype,
restart->pid = 0;
}
- systemd_send_status("FRR Operational");
-
/* Calculate the new restart interval. */
if (update_interval) {
if (delay.tv_sec > 2 * gs.max_restart_interval)
@@ -584,6 +591,11 @@ static void restart_done(struct daemon *dmn)
SET_WAKEUP_DOWN(dmn);
}
+/*
+ * Timer callback scheduled from daemon_up() once gs.numdown reaches zero:
+ * after gs.operational_timeout seconds, report "FRR Operational" to systemd.
+ * The thread argument is unused.
+ */
+static void daemon_restarting_operational(struct thread *thread)
+{
+ /*
+ * A daemon may have gone down again while this timer was pending; in
+ * that case leave the status reported by daemon_down() in place.
+ * daemon_up() re-arms the timer when everything is back up.
+ */
+ if (gs.numdown != 0)
+ return;
+
+ systemd_send_status("FRR Operational");
+}
+
static void daemon_down(struct daemon *dmn, const char *why)
{
if (IS_UP(dmn) || (dmn->state == DAEMON_INIT))
@@ -603,6 +615,8 @@ static void daemon_down(struct daemon *dmn, const char *why)
THREAD_OFF(dmn->t_wakeup);
if (try_connect(dmn) < 0)
SET_WAKEUP_DOWN(dmn);
+
+ systemd_send_status("FRR partially operational");
phase_check();
}
@@ -721,8 +735,15 @@ static void daemon_up(struct daemon *dmn, const char *why)
gs.numdown--;
dmn->connect_tries = 0;
zlog_notice("%s state -> up : %s", dmn->name, why);
- if (gs.numdown == 0)
+ if (gs.numdown == 0) {
daemon_send_ready(0);
+
+ THREAD_OFF(gs.t_operational);
+
+ thread_add_timer(master, daemon_restarting_operational, NULL,
+ gs.operational_timeout, &gs.t_operational);
+ }
+
SET_WAKEUP_ECHO(dmn);
phase_check();
}
@@ -889,6 +910,7 @@ static void phase_check(void)
case PHASE_WAITING_DOWN:
if (gs.numdown + IS_UP(gs.special) < gs.numdaemons)
break;
+ systemd_send_status("Phased Restart");
zlog_info("Phased restart: all routing daemons now down.");
run_job(&gs.special->restart, "restart", gs.restart_command, 1,
1);
@@ -898,6 +920,7 @@ static void phase_check(void)
case PHASE_ZEBRA_RESTART_PENDING:
if (gs.special->restart.pid)
break;
+ systemd_send_status("Zebra Restarting");
zlog_info("Phased restart: %s restart job completed.",
gs.special->name);
set_phase(PHASE_WAITING_ZEBRA_UP);
@@ -1395,6 +1418,18 @@ int main(int argc, char **argv)
frr_help_exit(1);
}
} break;
+ case OPTION_MAXOPERATIONAL: {
+ char garbage[3]; /* catches trailing non-numeric characters */
+
+ if ((sscanf(optarg, "%ld%1s", &gs.operational_timeout,
+ garbage) != 1) ||
+ (gs.operational_timeout < 0)) { /* reject negative timeouts */
+ fprintf(stderr,
+ "Invalid Operational_timeout argument: %s\n",
+ optarg);
+ frr_help_exit(1);
+ }
+ } break;
case OPTION_NETNS:
netns_en = true;
if (optarg && strchr(optarg, '/')) {