summaryrefslogtreecommitdiffstats
path: root/lib
diff options
context:
space:
mode:
authorDonald Sharp <sharpd@nvidia.com>2021-02-03 15:13:59 +0100
committerDonald Sharp <sharpd@nvidia.com>2022-01-20 17:56:27 +0100
commitab01a00176db60080047731ab548136e773b6c51 (patch)
treedbfb8df8a6273dbbf259ec19f89e2484508f1921 /lib
parentMerge pull request #10360 from opensourcerouting/clippy-rel-endian (diff)
downloadfrr-ab01a00176db60080047731ab548136e773b6c51.tar.xz
frr-ab01a00176db60080047731ab548136e773b6c51.zip
lib: Figure out if we are being starved for cpu
If a thread timer should have popped more than CPU_CONSUMED_CHECK seconds in the past and we are only handling it now, consider the thread starved and log a warning about it. Signed-off-by: Donald Sharp <sharpd@nvidia.com>
Diffstat (limited to 'lib')
-rw-r--r--lib/lib_errors.c6
-rw-r--r--lib/lib_errors.h1
-rw-r--r--lib/thread.c19
3 files changed, 26 insertions, 0 deletions
diff --git a/lib/lib_errors.c b/lib/lib_errors.c
index a139b9a14..acc9a05c3 100644
--- a/lib/lib_errors.c
+++ b/lib/lib_errors.c
@@ -57,6 +57,12 @@ static struct log_ref ferr_lib_warn[] = {
.suggestion = "Gather log data and open an Issue",
},
{
+ .code = EC_LIB_STARVE_THREAD,
+ .title = "The Event subsystem has detected a thread starvation issue",
+ .description = "The event subsystem has detected a thread starvation issue. This typically indicates that the system FRR is running on is heavily loaded and this load might be impacting FRR's ability to handle events in a timely fashion",
+ .suggestion = "Gather log data and open an Issue",
+ },
+ {
.code = EC_LIB_NO_THREAD,
.title = "The Event subsystem has detected an internal FD problem",
.description = "The Event subsystem has detected a file descriptor read/write event without an associated handling function. This is a bug, please collect log data and open an issue.",
diff --git a/lib/lib_errors.h b/lib/lib_errors.h
index 9f0f58d20..64ac6c1ce 100644
--- a/lib/lib_errors.h
+++ b/lib/lib_errors.h
@@ -46,6 +46,7 @@ enum lib_log_refs {
EC_LIB_LINUX_NS,
EC_LIB_SLOW_THREAD_CPU,
EC_LIB_SLOW_THREAD_WALL,
+ EC_LIB_STARVE_THREAD,
EC_LIB_NO_THREAD,
EC_LIB_RMAP_RECURSION_LIMIT,
EC_LIB_BACKUP_CONFIG,
diff --git a/lib/thread.c b/lib/thread.c
index 77e34f48f..73e0e4887 100644
--- a/lib/thread.c
+++ b/lib/thread.c
@@ -1651,12 +1651,31 @@ static void thread_process_io(struct thread_master *m, unsigned int num)
static unsigned int thread_process_timers(struct thread_master *m,
struct timeval *timenow)
{
+ struct timeval prev = *timenow;
+ bool displayed = false;
struct thread *thread;
unsigned int ready = 0;
while ((thread = thread_timer_list_first(&m->timer))) {
if (timercmp(timenow, &thread->u.sands, <))
break;
+ prev = thread->u.sands;
+ prev.tv_sec += 4;
+ /*
+ * If the timer would have popped 4 seconds in the
+ * past then we are in a situation where we are
+ * really getting behind on handling of events.
+ * Let's log it and do the right thing with it.
+ */
+ if (timercmp(timenow, &prev, >)) {
+ if (!displayed)
+ flog_warn(
+ EC_LIB_STARVE_THREAD,
+ "Thread Starvation: %pTHD was scheduled to pop greater than 4s ago",
+ thread);
+ displayed = true;
+ }
+
thread_timer_list_pop(&m->timer);
thread->type = THREAD_READY;
thread_list_add_tail(&m->ready, thread);