Merge tag 'sched-core-2023-06-27' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler updates from Ingo Molnar: "Scheduler SMP load-balancer improvements: - Avoid unnecessary migrations within SMT domains on hybrid systems. Problem: On hybrid CPU systems, (processors with a mixture of higher-frequency SMT cores and lower-frequency non-SMT cores), under the old code lower-priority CPUs pulled tasks from the higher-priority cores if more than one SMT sibling was busy - resulting in many unnecessary task migrations. Solution: The new code improves the load balancer to recognize SMT cores with more than one busy sibling and allows lower-priority CPUs to pull tasks, which avoids superfluous migrations and lets lower-priority cores inspect all SMT siblings for the busiest queue. - Implement the 'runnable boosting' feature in the EAS balancer: consider CPU contention in frequency, EAS max util & load-balance busiest CPU selection. This improves CPU utilization for certain workloads, while leaves other key workloads unchanged. Scheduler infrastructure improvements: - Rewrite the scheduler topology setup code by consolidating it into the build_sched_topology() helper function and building it dynamically on the fly. - Resolve the local_clock() vs. noinstr complications by rewriting the code: provide separate sched_clock_noinstr() and local_clock_noinstr() functions to be used in instrumentation code, and make sure it is all instrumentation-safe. Fixes: - Fix a kthread_park() race with wait_woken() - Fix misc wait_task_inactive() bugs unearthed by the -rt merge: - Fix UP PREEMPT bug by unifying the SMP and UP implementations - Fix task_struct::saved_state handling - Fix various rq clock update bugs, unearthed by turning on the rq clock debugging code. - Fix the PSI WINDOW_MIN_US trigger limit, which was easy to trigger by creating enough cgroups, by removing the warnign and restricting window size triggers to PSI file write-permission or CAP_SYS_RESOURCE. - Propagate SMT flags in the topology when removing degenerate domain - Fix grub_reclaim() calculation bug in the deadline scheduler code - Avoid resetting the min update period when it is unnecessary, in psi_trigger_destroy(). - Don't balance a task to its current running CPU in load_balance(), which was possible on certain NUMA topologies with overlapping groups. - Fix the sched-debug printing of rq->nr_uninterruptible Cleanups: - Address various -Wmissing-prototype warnings, as a preparation to (maybe) enable this warning in the future. - Remove unused code - Mark more functions __init - Fix shadow-variable warnings" * tag 'sched-core-2023-06-27' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (50 commits) sched/core: Avoid multiple calling update_rq_clock() in __cfsb_csd_unthrottle() sched/core: Avoid double calling update_rq_clock() in __balance_push_cpu_stop() sched/core: Fixed missing rq clock update before calling set_rq_offline() sched/deadline: Update GRUB description in the documentation sched/deadline: Fix bandwidth reclaim equation in GRUB sched/wait: Fix a kthread_park race with wait_woken() sched/topology: Mark set_sched_topology() __init sched/fair: Rename variable cpu_util eff_util arm64/arch_timer: Fix MMIO byteswap sched/fair, cpufreq: Introduce 'runnable boosting' sched/fair: Refactor CPU utilization functions cpuidle: Use local_clock_noinstr() sched/clock: Provide local_clock_noinstr() x86/tsc: Provide sched_clock_noinstr() clocksource: hyper-v: Provide noinstr sched_clock() clocksource: hyper-v: Adjust hv_read_tsc_page_tsc() to avoid special casing U64_MAX x86/vdso: Fix gettimeofday masking math64: Always inline u128 version of mul_u64_u64_shr() s390/time: Provide sched_clock_noinstr() loongarch: Provide noinstr sched_clock_read() ...
author: Linus Torvalds <torvalds@linux-foundation.org> 2023-06-27 23:03:21 +0200
committer: Linus Torvalds <torvalds@linux-foundation.org> 2023-06-27 23:03:21 +0200
commit: ed3b7923a816ded62dccef377c9ee346c7d3b1b4 (patch)
tree: 41d46fc399c231088a370e7b3e488a93342fa681 /drivers/clocksource/hyperv_timer.c
parent: Merge tag 'x86_sgx_for_v6.5' of git://git.kernel.org/pub/scm/linux/kernel/git... (diff)
parent: sched/core: Avoid multiple calling update_rq_clock() in __cfsb_csd_unthrottle() (diff)
download: linux-ed3b7923a816ded62dccef377c9ee346c7d3b1b4.tar.xz
linux-ed3b7923a816ded62dccef377c9ee346c7d3b1b4.zip
1 files changed, 26 insertions, 16 deletions
diff --git a/drivers/clocksource/hyperv_timer.c b/drivers/clocksource/hyperv_timer.c
index 9fc008c16636..e56307a81f4d 100644
--- a/drivers/clocksource/hyperv_timer.c
+++ b/drivers/clocksource/hyperv_timer.c
@@ -365,6 +365,20 @@ void hv_stimer_global_cleanup(void)
 }
 EXPORT_SYMBOL_GPL(hv_stimer_global_cleanup);
 
+static __always_inline u64 read_hv_clock_msr(void)
+{
+	/*
+	 * Read the partition counter to get the current tick count. This count
+	 * is set to 0 when the partition is created and is incremented in 100
+	 * nanosecond units.
+	 *
+	 * Use hv_raw_get_register() because this function is used from
+	 * noinstr. Notable; while HV_REGISTER_TIME_REF_COUNT is a synthetic
+	 * register it doesn't need the GHCB path.
+	 */
+	return hv_raw_get_register(HV_REGISTER_TIME_REF_COUNT);
+}
+
 /*
  * Code and definitions for the Hyper-V clocksources.  Two
  * clocksources are defined: one that reads the Hyper-V defined MSR, and
@@ -393,14 +407,20 @@ struct ms_hyperv_tsc_page *hv_get_tsc_page(void)
 }
 EXPORT_SYMBOL_GPL(hv_get_tsc_page);
 
-static u64 notrace read_hv_clock_tsc(void)
+static __always_inline u64 read_hv_clock_tsc(void)
 {
-	u64 current_tick = hv_read_tsc_page(hv_get_tsc_page());
+	u64 cur_tsc, time;
 
-	if (current_tick == U64_MAX)
-		current_tick = hv_get_register(HV_REGISTER_TIME_REF_COUNT);
+	/*
+	 * The Hyper-V Top-Level Function Spec (TLFS), section Timers,
+	 * subsection Refererence Counter, guarantees that the TSC and MSR
+	 * times are in sync and monotonic. Therefore we can fall back
+	 * to the MSR in case the TSC page indicates unavailability.
+	 */
+	if (!hv_read_tsc_page_tsc(tsc_page, &cur_tsc, &time))
+		time = read_hv_clock_msr();
 
-	return current_tick;
+	return time;
 }
 
 static u64 notrace read_hv_clock_tsc_cs(struct clocksource *arg)
@@ -408,7 +428,7 @@ static u64 notrace read_hv_clock_tsc_cs(struct clocksource *arg)
 	return read_hv_clock_tsc();
 }
 
-static u64 notrace read_hv_sched_clock_tsc(void)
+static u64 noinstr read_hv_sched_clock_tsc(void)
 {
 	return (read_hv_clock_tsc() - hv_sched_clock_offset) *
 		(NSEC_PER_SEC / HV_CLOCK_HZ);
@@ -460,16 +480,6 @@ static struct clocksource hyperv_cs_tsc = {
 #endif
 };
 
-static u64 notrace read_hv_clock_msr(void)
-{
-	/*
-	 * Read the partition counter to get the current tick count. This count
-	 * is set to 0 when the partition is created and is incremented in
-	 * 100 nanosecond units.
-	 */
-	return hv_get_register(HV_REGISTER_TIME_REF_COUNT);
-}
-
 static u64 notrace read_hv_clock_msr_cs(struct clocksource *arg)
 {
 	return read_hv_clock_msr();
author	Linus Torvalds <torvalds@linux-foundation.org>	2023-06-27 23:03:21 +0200
committer	Linus Torvalds <torvalds@linux-foundation.org>	2023-06-27 23:03:21 +0200
commit	ed3b7923a816ded62dccef377c9ee346c7d3b1b4 (patch)
tree	41d46fc399c231088a370e7b3e488a93342fa681 /drivers/clocksource/hyperv_timer.c
parent	Merge tag 'x86_sgx_for_v6.5' of git://git.kernel.org/pub/scm/linux/kernel/git... (diff)
parent	sched/core: Avoid multiple calling update_rq_clock() in __cfsb_csd_unthrottle() (diff)
download	linux-ed3b7923a816ded62dccef377c9ee346c7d3b1b4.tar.xz linux-ed3b7923a816ded62dccef377c9ee346c7d3b1b4.zip