diff options
40 files changed, 1333 insertions, 713 deletions
@@ -207,6 +207,10 @@ Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it> Patrick Mochel <mochel@digitalimplant.org> Paul Burton <paulburton@kernel.org> <paul.burton@imgtec.com> Paul Burton <paulburton@kernel.org> <paul.burton@mips.com> +Paul E. McKenney <paulmck@kernel.org> <paulmck@linux.ibm.com> +Paul E. McKenney <paulmck@kernel.org> <paulmck@linux.vnet.ibm.com> +Paul E. McKenney <paulmck@kernel.org> <paul.mckenney@linaro.org> +Paul E. McKenney <paulmck@kernel.org> <paulmck@us.ibm.com> Peter A Jonsson <pj@ludd.ltu.se> Peter Oruba <peter@oruba.de> Peter Oruba <peter.oruba@amd.com> diff --git a/Documentation/RCU/NMI-RCU.txt b/Documentation/RCU/NMI-RCU.rst index 881353fd5bff..180958388ff9 100644 --- a/Documentation/RCU/NMI-RCU.txt +++ b/Documentation/RCU/NMI-RCU.rst @@ -1,4 +1,7 @@ +.. _NMI_rcu_doc: + Using RCU to Protect Dynamic NMI Handlers +========================================= Although RCU is usually used to protect read-mostly data structures, @@ -9,7 +12,7 @@ work in "arch/x86/oprofile/nmi_timer_int.c" and in "arch/x86/kernel/traps.c". The relevant pieces of code are listed below, each followed by a -brief explanation. +brief explanation:: static int dummy_nmi_callback(struct pt_regs *regs, int cpu) { @@ -18,12 +21,12 @@ brief explanation. The dummy_nmi_callback() function is a "dummy" NMI handler that does nothing, but returns zero, thus saying that it did nothing, allowing -the NMI handler to take the default machine-specific action. +the NMI handler to take the default machine-specific action:: static nmi_callback_t nmi_callback = dummy_nmi_callback; This nmi_callback variable is a global function pointer to the current -NMI handler. +NMI handler:: void do_nmi(struct pt_regs * regs, long error_code) { @@ -53,11 +56,12 @@ anyway. However, in practice it is a good documentation aid, particularly for anyone attempting to do something similar on Alpha or on systems with aggressive optimizing compilers. -Quick Quiz: Why might the rcu_dereference_sched() be necessary on Alpha, - given that the code referenced by the pointer is read-only? +Quick Quiz: + Why might the rcu_dereference_sched() be necessary on Alpha, given that the code referenced by the pointer is read-only? +:ref:`Answer to Quick Quiz <answer_quick_quiz_NMI>` -Back to the discussion of NMI and RCU... +Back to the discussion of NMI and RCU:: void set_nmi_callback(nmi_callback_t callback) { @@ -68,7 +72,7 @@ The set_nmi_callback() function registers an NMI handler. Note that any data that is to be used by the callback must be initialized up -before- the call to set_nmi_callback(). On architectures that do not order writes, the rcu_assign_pointer() ensures that the NMI handler sees the -initialized values. +initialized values:: void unset_nmi_callback(void) { @@ -82,7 +86,7 @@ up any data structures used by the old NMI handler until execution of it completes on all other CPUs. One way to accomplish this is via synchronize_rcu(), perhaps as -follows: +follows:: unset_nmi_callback(); synchronize_rcu(); @@ -98,24 +102,23 @@ to free up the handler's data as soon as synchronize_rcu() returns. Important note: for this to work, the architecture in question must invoke nmi_enter() and nmi_exit() on NMI entry and exit, respectively. +.. _answer_quick_quiz_NMI: -Answer to Quick Quiz - - Why might the rcu_dereference_sched() be necessary on Alpha, given - that the code referenced by the pointer is read-only? +Answer to Quick Quiz: + Why might the rcu_dereference_sched() be necessary on Alpha, given that the code referenced by the pointer is read-only? - Answer: The caller to set_nmi_callback() might well have - initialized some data that is to be used by the new NMI - handler. In this case, the rcu_dereference_sched() would - be needed, because otherwise a CPU that received an NMI - just after the new handler was set might see the pointer - to the new NMI handler, but the old pre-initialized - version of the handler's data. + The caller to set_nmi_callback() might well have + initialized some data that is to be used by the new NMI + handler. In this case, the rcu_dereference_sched() would + be needed, because otherwise a CPU that received an NMI + just after the new handler was set might see the pointer + to the new NMI handler, but the old pre-initialized + version of the handler's data. - This same sad story can happen on other CPUs when using - a compiler with aggressive pointer-value speculation - optimizations. + This same sad story can happen on other CPUs when using + a compiler with aggressive pointer-value speculation + optimizations. - More important, the rcu_dereference_sched() makes it - clear to someone reading the code that the pointer is - being protected by RCU-sched. + More important, the rcu_dereference_sched() makes it + clear to someone reading the code that the pointer is + being protected by RCU-sched. diff --git a/Documentation/RCU/arrayRCU.txt b/Documentation/RCU/arrayRCU.rst index f05a9afb2c39..4051ea3871ef 100644 --- a/Documentation/RCU/arrayRCU.txt +++ b/Documentation/RCU/arrayRCU.rst @@ -1,19 +1,21 @@ -Using RCU to Protect Read-Mostly Arrays +.. _array_rcu_doc: +Using RCU to Protect Read-Mostly Arrays +======================================= Although RCU is more commonly used to protect linked lists, it can also be used to protect arrays. Three situations are as follows: -1. Hash Tables +1. :ref:`Hash Tables <hash_tables>` -2. Static Arrays +2. :ref:`Static Arrays <static_arrays>` -3. Resizeable Arrays +3. :ref:`Resizable Arrays <resizable_arrays>` Each of these three situations involves an RCU-protected pointer to an array that is separately indexed. It might be tempting to consider use of RCU to instead protect the index into an array, however, this use -case is -not- supported. The problem with RCU-protected indexes into +case is **not** supported. The problem with RCU-protected indexes into arrays is that compilers can play way too many optimization games with integers, which means that the rules governing handling of these indexes are far more trouble than they are worth. If RCU-protected indexes into @@ -24,16 +26,20 @@ to be safely used. That aside, each of the three RCU-protected pointer situations are described in the following sections. +.. _hash_tables: Situation 1: Hash Tables +------------------------ Hash tables are often implemented as an array, where each array entry has a linked-list hash chain. Each hash chain can be protected by RCU as described in the listRCU.txt document. This approach also applies to other array-of-list situations, such as radix trees. +.. _static_arrays: Situation 2: Static Arrays +-------------------------- Static arrays, where the data (rather than a pointer to the data) is located in each array element, and where the array is never resized, @@ -41,13 +47,17 @@ have not been used with RCU. Rik van Riel recommends using seqlock in this situation, which would also have minimal read-side overhead as long as updates are rare. -Quick Quiz: Why is it so important that updates be rare when - using seqlock? +Quick Quiz: + Why is it so important that updates be rare when using seqlock? + +:ref:`Answer to Quick Quiz <answer_quick_quiz_seqlock>` +.. _resizable_arrays: -Situation 3: Resizeable Arrays +Situation 3: Resizable Arrays +------------------------------ -Use of RCU for resizeable arrays is demonstrated by the grow_ary() +Use of RCU for resizable arrays is demonstrated by the grow_ary() function formerly used by the System V IPC code. The array is used to map from semaphore, message-queue, and shared-memory IDs to the data structure that represents the corresponding IPC construct. The grow_ary() @@ -60,7 +70,7 @@ the remainder of the new, updates the ids->entries pointer to point to the new array, and invokes ipc_rcu_putref() to free up the old array. Note that rcu_assign_pointer() is used to update the ids->entries pointer, which includes any memory barriers required on whatever architecture -you are running on. +you are running on:: static int grow_ary(struct ipc_ids* ids, int newsize) { @@ -112,7 +122,7 @@ a simple check suffices. The pointer to the structure corresponding to the desired IPC object is placed in "out", with NULL indicating a non-existent entry. After acquiring "out->lock", the "out->deleted" flag indicates whether the IPC object is in the process of being -deleted, and, if not, the pointer is returned. +deleted, and, if not, the pointer is returned:: struct kern_ipc_perm* ipc_lock(struct ipc_ids* ids, int id) { @@ -144,8 +154,10 @@ deleted, and, if not, the pointer is returned. return out; } +.. _answer_quick_quiz_seqlock: Answer to Quick Quiz: + Why is it so important that updates be rare when using seqlock? The reason that it is important that updates be rare when using seqlock is that frequent updates can livelock readers. diff --git a/Documentation/RCU/index.rst b/Documentation/RCU/index.rst index 5c99185710fa..81a0a1e5f767 100644 --- a/Documentation/RCU/index.rst +++ b/Documentation/RCU/index.rst @@ -7,8 +7,13 @@ RCU concepts .. toctree:: :maxdepth: 3 + arrayRCU + rcubarrier + rcu_dereference + whatisRCU rcu listRCU + NMI-RCU UP Design/Memory-Ordering/Tree-RCU-Memory-Ordering diff --git a/Documentation/RCU/lockdep-splat.txt b/Documentation/RCU/lockdep-splat.txt index 9c015976b174..b8096316fd11 100644 --- a/Documentation/RCU/lockdep-splat.txt +++ b/Documentation/RCU/lockdep-splat.txt @@ -99,7 +99,7 @@ With this change, the rcu_dereference() is always within an RCU read-side critical section, which again would have suppressed the above lockdep-RCU splat. -But in this particular case, we don't actually deference the pointer +But in this particular case, we don't actually dereference the pointer returned from rcu_dereference(). Instead, that pointer is just compared to the cic pointer, which means that the rcu_dereference() can be replaced by rcu_access_pointer() as follows: diff --git a/Documentation/RCU/rcu_dereference.txt b/Documentation/RCU/rcu_dereference.rst index bf699e8cfc75..c9667eb0d444 100644 --- a/Documentation/RCU/rcu_dereference.txt +++ b/Documentation/RCU/rcu_dereference.rst @@ -1,4 +1,7 @@ +.. _rcu_dereference_doc: + PROPER CARE AND FEEDING OF RETURN VALUES FROM rcu_dereference() +=============================================================== Most of the time, you can use values from rcu_dereference() or one of the similar primitives without worries. Dereferencing (prefix "*"), @@ -8,7 +11,7 @@ subtraction of constants, and casts all work quite naturally and safely. It is nevertheless possible to get into trouble with other operations. Follow these rules to keep your RCU code working properly: -o You must use one of the rcu_dereference() family of primitives +- You must use one of the rcu_dereference() family of primitives to load an RCU-protected pointer, otherwise CONFIG_PROVE_RCU will complain. Worse yet, your code can see random memory-corruption bugs due to games that compilers and DEC Alpha can play. @@ -25,24 +28,24 @@ o You must use one of the rcu_dereference() family of primitives for an example where the compiler can in fact deduce the exact value of the pointer, and thus cause misordering. -o You are only permitted to use rcu_dereference on pointer values. +- You are only permitted to use rcu_dereference on pointer values. The compiler simply knows too much about integral values to trust it to carry dependencies through integer operations. There are a very few exceptions, namely that you can temporarily cast the pointer to uintptr_t in order to: - o Set bits and clear bits down in the must-be-zero low-order + - Set bits and clear bits down in the must-be-zero low-order bits of that pointer. This clearly means that the pointer must have alignment constraints, for example, this does -not- work in general for char* pointers. - o XOR bits to translate pointers, as is done in some + - XOR bits to translate pointers, as is done in some classic buddy-allocator algorithms. It is important to cast the value back to pointer before doing much of anything else with it. -o Avoid cancellation when using the "+" and "-" infix arithmetic +- Avoid cancellation when using the "+" and "-" infix arithmetic operators. For example, for a given variable "x", avoid "(x-(uintptr_t)x)" for char* pointers. The compiler is within its rights to substitute zero for this sort of expression, so that @@ -54,16 +57,16 @@ o Avoid cancellation when using the "+" and "-" infix arithmetic "p+a-b" is safe because its value still necessarily depends on the rcu_dereference(), thus maintaining proper ordering. -o If you are using RCU to protect JITed functions, so that the +- If you are using RCU to protect JITed functions, so that the "()" function-invocation operator is applied to a value obtained (directly or indirectly) from rcu_dereference(), you may need to interact directly with the hardware to flush instruction caches. This issue arises on some systems when a newly JITed function is using the same memory that was used by an earlier JITed function. -o Do not use the results from relational operators ("==", "!=", +- Do not use the results from relational operators ("==", "!=", ">", ">=", "<", or "<=") when dereferencing. For example, - the following (quite strange) code is buggy: + the following (quite strange) code is buggy:: int *p; int *q; @@ -81,11 +84,11 @@ o Do not use the results from relational operators ("==", "!=", after such branches, but can speculate loads, which can again result in misordering bugs. -o Be very careful about comparing pointers obtained from +- Be very careful about comparing pointers obtained from rcu_dereference() against non-NULL values. As Linus Torvalds explained, if the two pointers are equal, the compiler could substitute the pointer you are comparing against for the pointer - obtained from rcu_dereference(). For example: + obtained from rcu_dereference(). For example:: p = rcu_dereference(gp); if (p == &default_struct) @@ -93,7 +96,7 @@ o Be very careful about comparing pointers obtained from Because the compiler now knows that the value of "p" is exactly the address of the variable "default_struct", it is free to - transform this code into the following: + transform this code into the following:: p = rcu_dereference(gp); if (p == &default_struct) @@ -105,14 +108,14 @@ o Be very careful about comparing pointers obtained from However, comparisons are OK in the following cases: - o The comparison was against the NULL pointer. If the + - The comparison was against the NULL pointer. If the compiler knows that the pointer is NULL, you had better not be dereferencing it anyway. If the comparison is non-equal, the compiler is none the wiser. Therefore, it is safe to compare pointers from rcu_dereference() against NULL pointers. - o The pointer is never dereferenced after being compared. + - The pointer is never dereferenced after being compared. Since there are no subsequent dereferences, the compiler cannot use anything it learned from the comparison to reorder the non-existent subsequent dereferences. @@ -124,31 +127,31 @@ o Be very careful about comparing pointers obtained from dereferenced, rcu_access_pointer() should be used in place of rcu_dereference(). - o The comparison is against a pointer that references memory + - The comparison is against a pointer that references memory that was initialized "a long time ago." The reason this is safe is that even if misordering occurs, the misordering will not affect the accesses that follow the comparison. So exactly how long ago is "a long time ago"? Here are some possibilities: - o Compile time. + - Compile time. - o Boot time. + - Boot time. - o Module-init time for module code. + - Module-init time for module code. - o Prior to kthread creation for kthread code. + - Prior to kthread creation for kthread code. - o During some prior acquisition of the lock that + - During some prior acquisition of the lock that we now hold. - o Before mod_timer() time for a timer handler. + - Before mod_timer() time for a timer handler. There are many other possibilities involving the Linux kernel's wide array of primitives that cause code to be invoked at a later time. - o The pointer being compared against also came from + - The pointer being compared against also came from rcu_dereference(). In this case, both pointers depend on one rcu_dereference() or another, so you get proper ordering either way. @@ -159,13 +162,13 @@ o Be very careful about comparing pointers obtained from of such an RCU usage bug is shown in the section titled "EXAMPLE OF AMPLIFIED RCU-USAGE BUG". - o All of the accesses following the comparison are stores, + - All of the accesses following the comparison are stores, so that a control dependency preserves the needed ordering. That said, it is easy to get control dependencies wrong. Please see the "CONTROL DEPENDENCIES" section of Documentation/memory-barriers.txt for more details. - o The pointers are not equal -and- the compiler does + - The pointers are not equal -and- the compiler does not have enough information to deduce the value of the pointer. Note that the volatile cast in rcu_dereference() will normally prevent the compiler from knowing too much. @@ -175,7 +178,7 @@ o Be very careful about comparing pointers obtained from comparison will provide exactly the information that the compiler needs to deduce the value of the pointer. -o Disable any value-speculation optimizations that your compiler +- Disable any value-speculation optimizations that your compiler might provide, especially if you are making use of feedback-based optimizations that take data collected from prior runs. Such value-speculation optimizations reorder operations by design. @@ -188,11 +191,12 @@ o Disable any value-speculation optimizations that your compiler EXAMPLE OF AMPLIFIED RCU-USAGE BUG +---------------------------------- Because updaters can run concurrently with RCU readers, RCU readers can see stale and/or inconsistent values. If RCU readers need fresh or consistent values, which they sometimes do, they need to take proper -precautions. To see this, consider the following code fragment: +precautions. To see this, consider the following code fragment:: struct foo { int a; @@ -244,7 +248,7 @@ to some reordering from the compiler and CPUs is beside the point. But suppose that the reader needs a consistent view? -Then one approach is to use locking, for example, as follows: +Then one approach is to use locking, for example, as follows:: struct foo { int a; @@ -299,6 +303,7 @@ As always, use the right tool for the job! EXAMPLE WHERE THE COMPILER KNOWS TOO MUCH +----------------------------------------- If a pointer obtained from rcu_dereference() compares not-equal to some other pointer, the compiler normally has no clue what the value of the @@ -308,7 +313,7 @@ guarantees that RCU depends on. And the volatile cast in rcu_dereference() should prevent the compiler from guessing the value. But without rcu_dereference(), the compiler knows more than you might -expect. Consider the following code fragment: +expect. Consider the following code fragment:: struct foo { int a; @@ -354,6 +359,7 @@ dereference the resulting pointer. WHICH MEMBER OF THE rcu_dereference() FAMILY SHOULD YOU USE? +------------------------------------------------------------ First, please avoid using rcu_dereference_raw() and also please avoid using rcu_dereference_check() and rcu_dereference_protected() with a @@ -370,7 +376,7 @@ member of the rcu_dereference() to use in various situations: 2. If the access might be within an RCU read-side critical section on the one hand, or protected by (say) my_lock on the other, - use rcu_dereference_check(), for example: + use rcu_dereference_check(), for example:: p1 = rcu_dereference_check(p->rcu_protected_pointer, lockdep_is_held(&my_lock)); @@ -378,14 +384,14 @@ member of the rcu_dereference() to use in various situations: 3. If the access might be within an RCU read-side critical section on the one hand, or protected by either my_lock or your_lock on - the other, again use rcu_dereference_check(), for example: + the other, again use rcu_dereference_check(), for example:: p1 = rcu_dereference_check(p->rcu_protected_pointer, lockdep_is_held(&my_lock) || lockdep_is_held(&your_lock)); 4. If the access is on the update side, so that it is always protected - by my_lock, use rcu_dereference_protected(): + by my_lock, use rcu_dereference_protected():: p1 = rcu_dereference_protected(p->rcu_protected_pointer, lockdep_is_held(&my_lock)); @@ -410,18 +416,19 @@ member of the rcu_dereference() to use in various situations: SPARSE CHECKING OF RCU-PROTECTED POINTERS +----------------------------------------- The sparse static-analysis tool checks for direct access to RCU-protected pointers, which can result in "interesting" bugs due to compiler optimizations involving invented loads and perhaps also load tearing. -For example, suppose someone mistakenly does something like this: +For example, suppose someone mistakenly does something like this:: p = q->rcu_protected_pointer; do_something_with(p->a); do_something_else_with(p->b); If register pressure is high, the compiler might optimize "p" out -of existence, transforming the code to something like this: +of existence, transforming the code to something like this:: do_something_with(q->rcu_protected_pointer->a); do_something_else_with(q->rcu_protected_pointer->b); @@ -435,7 +442,7 @@ Load tearing could of course result in dereferencing a mashup of a pair of pointers, which also might fatally disappoint your code. These problems could have been avoided simply by making the code instead -read as follows: +read as follows:: p = rcu_dereference(q->rcu_protected_pointer); do_something_with(p->a); @@ -448,7 +455,7 @@ or as a formal parameter, with "__rcu", which tells sparse to complain if this pointer is accessed directly. It will also cause sparse to complain if a pointer not marked with "__rcu" is accessed using rcu_dereference() and friends. For example, ->rcu_protected_pointer might be declared as -follows: +follows:: struct foo __rcu *rcu_protected_pointer; diff --git a/Documentation/RCU/rcubarrier.txt b/Documentation/RCU/rcubarrier.rst index a2782df69732..f64f4413a47c 100644 --- a/Documentation/RCU/rcubarrier.txt +++ b/Documentation/RCU/rcubarrier.rst @@ -1,4 +1,7 @@ +.. _rcu_barrier: + RCU and Unloadable Modules +========================== [Originally published in LWN Jan. 14, 2007: http://lwn.net/Articles/217484/] @@ -21,7 +24,7 @@ given that readers might well leave absolutely no trace of their presence? There is a synchronize_rcu() primitive that blocks until all pre-existing readers have completed. An updater wishing to delete an element p from a linked list might do the following, while holding an -appropriate lock, of course: +appropriate lock, of course:: list_del_rcu(p); synchronize_rcu(); @@ -32,13 +35,13 @@ primitive must be used instead. This primitive takes a pointer to an rcu_head struct placed within the RCU-protected data structure and another pointer to a function that may be invoked later to free that structure. Code to delete an element p from the linked list from IRQ -context might then be as follows: +context might then be as follows:: list_del_rcu(p); call_rcu(&p->rcu, p_callback); Since call_rcu() never blocks, this code can safely be used from within -IRQ context. The function p_callback() might be defined as follows: +IRQ context. The function p_callback() might be defined as follows:: static void p_callback(struct rcu_head *rp) { @@ -49,6 +52,7 @@ IRQ context. The function p_callback() might be defined as follows: Unloading Modules That Use call_rcu() +------------------------------------- But what if p_callback is defined in an unloadable module? @@ -69,10 +73,11 @@ in realtime kernels in order to avoid excessive scheduling latencies. rcu_barrier() +------------- We instead need the rcu_barrier() primitive. Rather than waiting for a grace period to elapse, rcu_barrier() waits for all outstanding RCU -callbacks to complete. Please note that rcu_barrier() does -not- imply +callbacks to complete. Please note that rcu_barrier() does **not** imply synchronize_rcu(), in particular, if there are no RCU callbacks queued anywhere, rcu_barrier() is within its rights to return immediately, without waiting for a grace period to elapse. @@ -88,79 +93,79 @@ must match the flavor of rcu_barrier() with that of call_rcu(). If your module uses multiple flavors of call_rcu(), then it must also use multiple flavors of rcu_barrier() when unloading that module. For example, if it uses call_rcu(), call_srcu() on srcu_struct_1, and call_srcu() on -srcu_struct_2(), then the following three lines of code will be required -when unloading: +srcu_struct_2, then the following three lines of code will be required +when unloading:: 1 rcu_barrier(); 2 srcu_barrier(&srcu_struct_1); 3 srcu_barrier(&srcu_struct_2); The rcutorture module makes use of rcu_barrier() in its exit function -as follows: +as follows:: - 1 static void - 2 rcu_torture_cleanup(void) - 3 { - 4 int i; + 1 static void + 2 rcu_torture_cleanup(void) + 3 { + 4 int i; 5 - 6 fullstop = 1; - 7 if (shuffler_task != NULL) { + 6 fullstop = 1; + 7 if (shuffler_task != NULL) { 8 VERBOSE_PRINTK_STRING("Stopping rcu_torture_shuffle task"); 9 kthread_stop(shuffler_task); -10 } -11 shuffler_task = NULL; -12 -13 if (writer_task != NULL) { -14 VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task"); -15 kthread_stop(writer_task); -16 } -17 writer_task = NULL; -18 -19 if (reader_tasks != NULL) { -20 for (i = 0; i < nrealreaders; i++) { -21 if (reader_tasks[i] != NULL) { -22 VERBOSE_PRINTK_STRING( -23 "Stopping rcu_torture_reader task"); -24 kthread_stop(reader_tasks[i]); -25 } -26 reader_tasks[i] = NULL; -27 } -28 kfree(reader_tasks); -29 reader_tasks = NULL; -30 } -31 rcu_torture_current = NULL; -32 -33 if (fakewriter_tasks != NULL) { -34 for (i = 0; i < nfakewriters; i++) { -35 if (fakewriter_tasks[i] != NULL) { -36 VERBOSE_PRINTK_STRING( -37 "Stopping rcu_torture_fakewriter task"); -38 kthread_stop(fakewriter_tasks[i]); -39 } -40 fakewriter_tasks[i] = NULL; -41 } -42 kfree(fakewriter_tasks); -43 fakewriter_tasks = NULL; -44 } -45 -46 if (stats_task != NULL) { -47 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stats task"); -48 kthread_stop(stats_task); -49 } -50 stats_task = NULL; -51 -52 /* Wait for all RCU callbacks to fire. */ -53 rcu_barrier(); -54 -55 rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ -56 -57 if (cur_ops->cleanup != NULL) -58 cur_ops->cleanup(); -59 if (atomic_read(&n_rcu_torture_error)) -60 rcu_torture_print_module_parms("End of test: FAILURE"); -61 else -62 rcu_torture_print_module_parms("End of test: SUCCESS"); -63 } + 10 } + 11 shuffler_task = NULL; + 12 + 13 if (writer_task != NULL) { + 14 VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task"); + 15 kthread_stop(writer_task); + 16 } + 17 writer_task = NULL; + 18 + 19 if (reader_tasks != NULL) { + 20 for (i = 0; i < nrealreaders; i++) { + 21 if (reader_tasks[i] != NULL) { + 22 VERBOSE_PRINTK_STRING( + 23 "Stopping rcu_torture_reader task"); + 24 kthread_stop(reader_tasks[i]); + 25 } + 26 reader_tasks[i] = NULL; + 27 } + 28 kfree(reader_tasks); + 29 reader_tasks = NULL; + 30 } + 31 rcu_torture_current = NULL; + 32 + 33 if (fakewriter_tasks != NULL) { + 34 for (i = 0; i < nfakewriters; i++) { + 35 if (fakewriter_tasks[i] != NULL) { + 36 VERBOSE_PRINTK_STRING( + 37 "Stopping rcu_torture_fakewriter task"); + 38 kthread_stop(fakewriter_tasks[i]); + 39 } + 40 fakewriter_tasks[i] = NULL; + 41 } + 42 kfree(fakewriter_tasks); + 43 fakewriter_tasks = NULL; + 44 } + 45 + 46 if (stats_task != NULL) { + 47 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stats task"); + 48 kthread_stop(stats_task); + 49 } + 50 stats_task = NULL; + 51 + 52 /* Wait for all RCU callbacks to fire. */ + 53 rcu_barrier(); + 54 + 55 rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ + 56 + 57 if (cur_ops->cleanup != NULL) + 58 cur_ops->cleanup(); + 59 if (atomic_read(&n_rcu_torture_error)) + 60 rcu_torture_print_module_parms("End of test: FAILURE"); + 61 else + 62 rcu_torture_print_module_parms("End of test: SUCCESS"); + 63 } Line 6 sets a global variable that prevents any RCU callbacks from re-posting themselves. This will not be necessary in most cases, since @@ -176,9 +181,14 @@ for any pre-existing callbacks to complete. Then lines 55-62 print status and do operation-specific cleanup, and then return, permitting the module-unload operation to be completed. -Quick Quiz #1: Is there any other situation where rcu_barrier() might +.. _rcubarrier_quiz_1: + +Quick Quiz #1: + Is there any other situation where rcu_barrier() might be required? +:ref:`Answer to Quick Quiz #1 <answer_rcubarrier_quiz_1>` + Your module might have additional complications. For example, if your module invokes call_rcu() from timers, you will need to first cancel all the timers, and only then invoke rcu_barrier() to wait for any remaining @@ -188,11 +198,12 @@ Of course, if you module uses call_rcu(), you will need to invoke rcu_barrier() before unloading. Similarly, if your module uses call_srcu(), you will need to invoke srcu_barrier() before unloading, and on the same srcu_struct structure. If your module uses call_rcu() --and- call_srcu(), then you will need to invoke rcu_barrier() -and- +**and** call_srcu(), then you will need to invoke rcu_barrier() **and** srcu_barrier(). Implementing rcu_barrier() +-------------------------- Dipankar Sarma's implementation of rcu_barrier() makes use of the fact that RCU callbacks are never reordered once queued on one of the per-CPU @@ -200,19 +211,19 @@ queues. His implementation queues an RCU callback on each of the per-CPU callback queues, and then waits until they have all started executing, at which point, all earlier RCU callbacks are guaranteed to have completed. -The original code for rcu_barrier() was as follows: +The original code for rcu_barrier() was as follows:: - 1 void rcu_barrier(void) - 2 { - 3 BUG_ON(in_interrupt()); - 4 /* Take cpucontrol mutex to protect against CPU hotplug */ - 5 mutex_lock(&rcu_barrier_mutex); - 6 init_completion(&rcu_barrier_completion); - 7 atomic_set(&rcu_barrier_cpu_count, 0); - 8 on_each_cpu(rcu_barrier_func, NULL, 0, 1); - 9 wait_for_completion(&rcu_barrier_completion); -10 mutex_unlock(&rcu_barrier_mutex); -11 } + 1 void rcu_barrier(void) + 2 { + 3 BUG_ON(in_interrupt()); + 4 /* Take cpucontrol mutex to protect against CPU hotplug */ + 5 mutex_lock(&rcu_barrier_mutex); + 6 init_completion(&rcu_barrier_completion); + 7 atomic_set(&rcu_barrier_cpu_count, 0); + 8 on_each_cpu(rcu_barrier_func, NULL, 0, 1); + 9 wait_for_completion(&rcu_barrier_completion); + 10 mutex_unlock(&rcu_barrier_mutex); + 11 } Line 3 verifies that the caller is in process context, and lines 5 and 10 use rcu_barrier_mutex to ensure that only one rcu_barrier() is using the @@ -226,18 +237,18 @@ This code was rewritten in 2008 and several times thereafter, but this still gives the general idea. The rcu_barrier_func() runs on each CPU, where it invokes call_rcu() -to post an RCU callback, as follows: +to post an RCU callback, as follows:: - 1 static void rcu_barrier_func(void *notused) - 2 { - 3 int cpu = smp_processor_id(); - 4 struct rcu_data *rdp = &per_cpu(rcu_data, cpu); - 5 struct rcu_head *head; + 1 static void rcu_barrier_func(void *notused) + 2 { + 3 int cpu = smp_processor_id(); + 4 struct rcu_data *rdp = &per_cpu(rcu_data, cpu); + 5 struct rcu_head *head; 6 - 7 head = &rdp->barrier; - 8 atomic_inc(&rcu_barrier_cpu_count); - 9 call_rcu(head, rcu_barrier_callback); -10 } + 7 head = &rdp->barrier; + 8 atomic_inc(&rcu_barrier_cpu_count); + 9 call_rcu(head, rcu_barrier_callback); + 10 } Lines 3 and 4 locate RCU's internal per-CPU rcu_data structure, which contains the struct rcu_head that needed for the later call to @@ -248,20 +259,25 @@ the current CPU's queue. The rcu_barrier_callback() function simply atomically decrements the rcu_barrier_cpu_count variable and finalizes the completion when it -reaches zero, as follows: +reaches zero, as follows:: 1 static void rcu_barrier_callback(struct rcu_head *notused) 2 { - 3 if (atomic_dec_and_test(&rcu_barrier_cpu_count)) - 4 complete(&rcu_barrier_completion); + 3 if (atomic_dec_and_test(&rcu_barrier_cpu_count)) + 4 complete(&rcu_barrier_completion); 5 } -Quick Quiz #2: What happens if CPU 0's rcu_barrier_func() executes +.. _rcubarrier_quiz_2: + +Quick Quiz #2: + What happens if CPU 0's rcu_barrier_func() executes immediately (thus incrementing rcu_barrier_cpu_count to the value one), but the other CPU's rcu_barrier_func() invocations are delayed for a full grace period? Couldn't this result in rcu_barrier() returning prematurely? +:ref:`Answer to Quick Quiz #2 <answer_rcubarrier_quiz_2>` + The current rcu_barrier() implementation is more complex, due to the need to avoid disturbing idle CPUs (especially on battery-powered systems) and the need to minimally disturb non-idle CPUs in real-time systems. @@ -269,6 +285,7 @@ However, the code above illustrates the concepts. rcu_barrier() Summary +--------------------- The rcu_barrier() primitive has seen relatively little use, since most code using RCU is in the core kernel rather than in modules. However, if @@ -277,8 +294,12 @@ so that your module may be safely unloaded. Answers to Quick Quizzes +------------------------ + +.. _answer_rcubarrier_quiz_1: -Quick Quiz #1: Is there any other situation where rcu_barrier() might +Quick Quiz #1: + Is there any other situation where rcu_barrier() might be required? Answer: Interestingly enough, rcu_barrier() was not originally @@ -292,7 +313,12 @@ Answer: Interestingly enough, rcu_barrier() was not originally implementing rcutorture, and found that rcu_barrier() solves this problem as well. -Quick Quiz #2: What happens if CPU 0's rcu_barrier_func() executes +:ref:`Back to Quick Quiz #1 <rcubarrier_quiz_1>` + +.. _answer_rcubarrier_quiz_2: + +Quick Quiz #2: + What happens if CPU 0's rcu_barrier_func() executes immediately (thus incrementing rcu_barrier_cpu_count to the value one), but the other CPU's rcu_barrier_func() invocations are delayed for a full grace period? Couldn't this result in @@ -323,3 +349,5 @@ Answer: This cannot happen. The reason is that on_each_cpu() has its last is to add an rcu_read_lock() before line 8 of rcu_barrier() and an rcu_read_unlock() after line 8 of this same function. If you can think of a better change, please let me know! + +:ref:`Back to Quick Quiz #2 <rcubarrier_quiz_2>` diff --git a/Documentation/RCU/stallwarn.txt b/Documentation/RCU/stallwarn.txt index f48f4621ccbc..a360a8796710 100644 --- a/Documentation/RCU/stallwarn.txt +++ b/Documentation/RCU/stallwarn.txt @@ -225,18 +225,13 @@ an estimate of the total number of RCU callbacks queued across all CPUs In kernels with CONFIG_RCU_FAST_NO_HZ, more information is printed for each CPU: - 0: (64628 ticks this GP) idle=dd5/3fffffffffffffff/0 softirq=82/543 last_accelerate: a345/d342 Nonlazy posted: ..D + 0: (64628 ticks this GP) idle=dd5/3fffffffffffffff/0 softirq=82/543 last_accelerate: a345/d342 dyntick_enabled: 1 The "last_accelerate:" prints the low-order 16 bits (in hex) of the jiffies counter when this CPU last invoked rcu_try_advance_all_cbs() from rcu_needs_cpu() or last invoked rcu_accelerate_cbs() from -rcu_prepare_for_idle(). The "Nonlazy posted:" indicates lazy-callback -status, so that an "l" indicates that all callbacks were lazy at the start -of the last idle period and an "L" indicates that there are currently -no non-lazy callbacks (in both cases, "." is printed otherwise, as -shown above) and "D" indicates that dyntick-idle processing is enabled -("." is printed otherwise, for example, if disabled via the "nohz=" -kernel boot parameter). +rcu_prepare_for_idle(). "dyntick_enabled: 1" indicates that dyntick-idle +processing is enabled. If the grace period ends just as the stall warning starts printing, there will be a spurious stall-warning message, which will include diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.rst index 58ba05c4d97f..c7f147b8034f 100644 --- a/Documentation/RCU/whatisRCU.txt +++ b/Documentation/RCU/whatisRCU.rst @@ -1,15 +1,18 @@ +.. _whatisrcu_doc: + What is RCU? -- "Read, Copy, Update" +====================================== Please note that the "What is RCU?" LWN series is an excellent place to start learning about RCU: -1. What is RCU, Fundamentally? http://lwn.net/Articles/262464/ -2. What is RCU? Part 2: Usage http://lwn.net/Articles/263130/ -3. RCU part 3: the RCU API http://lwn.net/Articles/264090/ -4. The RCU API, 2010 Edition http://lwn.net/Articles/418853/ - 2010 Big API Table http://lwn.net/Articles/419086/ -5. The RCU API, 2014 Edition http://lwn.net/Articles/609904/ - 2014 Big API Table http://lwn.net/Articles/609973/ +| 1. What is RCU, Fundamentally? http://lwn.net/Articles/262464/ +| 2. What is RCU? Part 2: Usage http://lwn.net/Articles/263130/ +| 3. RCU part 3: the RCU API http://lwn.net/Articles/264090/ +| 4. The RCU API, 2010 Edition http://lwn.net/Articles/418853/ +| 2010 Big API Table http://lwn.net/Articles/419086/ +| 5. The RCU API, 2014 Edition http://lwn.net/Articles/609904/ +| 2014 Big API Table http://lwn.net/Articles/609973/ What is RCU? @@ -24,14 +27,21 @@ the experience has been that different people must take different paths to arrive at an understanding of RCU. This document provides several different paths, as follows: -1. RCU OVERVIEW -2. WHAT IS RCU'S CORE API? -3. WHAT ARE SOME EXAMPLE USES OF CORE RCU API? -4. WHAT IF MY UPDATING THREAD CANNOT BLOCK? -5. WHAT ARE SOME SIMPLE IMPLEMENTATIONS OF RCU? -6. ANALOGY WITH READER-WRITER LOCKING -7. FULL LIST OF RCU APIs -8. ANSWERS TO QUICK QUIZZES +:ref:`1. RCU OVERVIEW <1_whatisRCU>` + +:ref:`2. WHAT IS RCU'S CORE API? <2_whatisRCU>` + +:ref:`3. WHAT ARE SOME EXAMPLE USES OF CORE RCU API? <3_whatisRCU>` + +:ref:`4. WHAT IF MY UPDATING THREAD CANNOT BLOCK? <4_whatisRCU>` + +:ref:`5. WHAT ARE SOME SIMPLE IMPLEMENTATIONS OF RCU? <5_whatisRCU>` + +:ref:`6. ANALOGY WITH READER-WRITER LOCKING <6_whatisRCU>` + +:ref:`7. FULL LIST OF RCU APIs <7_whatisRCU>` + +:ref:`8. ANSWERS TO QUICK QUIZZES <8_whatisRCU>` People who prefer starting with a conceptual overview should focus on Section 1, though most readers will profit by reading this section at @@ -49,8 +59,10 @@ everything, feel free to read the whole thing -- but if you are really that type of person, you have perused the source code and will therefore never need this document anyway. ;-) +.. _1_whatisRCU: 1. RCU OVERVIEW +---------------- The basic idea behind RCU is to split updates into "removal" and "reclamation" phases. The removal phase removes references to data items @@ -116,8 +128,10 @@ So how the heck can a reclaimer tell when a reader is done, given that readers are not doing any sort of synchronization operations??? Read on to learn about how RCU's API makes this easy. +.. _2_whatisRCU: 2. WHAT IS RCU'S CORE API? +--------------------------- The core RCU API is quite small: @@ -136,7 +150,7 @@ later. See the kernel docbook documentation for more info, or look directly at the function header comments. rcu_read_lock() - +^^^^^^^^^^^^^^^ void rcu_read_lock(void); Used by a reader to inform the reclaimer that the reader is @@ -150,7 +164,7 @@ rcu_read_lock() longer-term references to data structures. rcu_read_unlock() - +^^^^^^^^^^^^^^^^^ void rcu_read_unlock(void); Used by a reader to inform the reclaimer that the reader is @@ -158,15 +172,15 @@ rcu_read_unlock() read-side critical sections may be nested and/or overlapping. synchronize_rcu() - +^^^^^^^^^^^^^^^^^ void synchronize_rcu(void); Marks the end of updater code and the beginning of reclaimer code. It does this by blocking until all pre-existing RCU read-side critical sections on all CPUs have completed. - Note that synchronize_rcu() will -not- necessarily wait for + Note that synchronize_rcu() will **not** necessarily wait for any subsequent RCU read-side critical sections to complete. - For example, consider the following sequence of events: + For example, consider the following sequence of events:: CPU 0 CPU 1 CPU 2 ----------------- ------------------------- --------------- @@ -182,7 +196,7 @@ synchronize_rcu() any that begin after synchronize_rcu() is invoked. Of course, synchronize_rcu() does not necessarily return - -immediately- after the last pre-existing RCU read-side critical + **immediately** after the last pre-existing RCU read-side critical section completes. For one thing, there might well be scheduling delays. For another thing, many RCU implementations process requests in batches in order to improve efficiencies, which can @@ -211,10 +225,10 @@ synchronize_rcu() checklist.txt for some approaches to limiting the update rate. rcu_assign_pointer() - +^^^^^^^^^^^^^^^^^^^^ void rcu_assign_pointer(p, typeof(p) v); - Yes, rcu_assign_pointer() -is- implemented as a macro, though it + Yes, rcu_assign_pointer() **is** implemented as a macro, though it would be cool to be able to declare a function in this manner. (Compiler experts will no doubt disagree.) @@ -231,7 +245,7 @@ rcu_assign_pointer() the _rcu list-manipulation primitives such as list_add_rcu(). rcu_dereference() - +^^^^^^^^^^^^^^^^^ typeof(p) rcu_dereference(p); Like rcu_assign_pointer(), rcu_dereference() must be implemented @@ -248,13 +262,13 @@ rcu_dereference() Common coding practice uses rcu_dereference() to copy an RCU-protected pointer to a local variable, then dereferences - this local variable, for example as follows: + this local variable, for example as follows:: p = rcu_dereference(head.next); return p->data; However, in this case, one could just as easily combine these - into one statement: + into one statement:: return rcu_dereference(head.next)->data; @@ -266,8 +280,8 @@ rcu_dereference() unnecessary overhead on Alpha CPUs. Note that the value returned by rcu_dereference() is valid - only within the enclosing RCU read-side critical section [1]. - For example, the following is -not- legal: + only within the enclosing RCU read-side critical section [1]_. + For example, the following is **not** legal:: rcu_read_lock(); p = rcu_dereference(head.next); @@ -290,9 +304,9 @@ rcu_dereference() at any time, including immediately after the rcu_dereference(). And, again like rcu_assign_pointer(), rcu_dereference() is typically used indirectly, via the _rcu list-manipulation - primitives, such as list_for_each_entry_rcu() [2]. + primitives, such as list_for_each_entry_rcu() [2]_. - [1] The variant rcu_dereference_protected() can be used outside +.. [1] The variant rcu_dereference_protected() can be used outside of an RCU read-side critical section as long as the usage is protected by locks acquired by the update-side code. This variant avoids the lockdep warning that would happen when using (for @@ -305,7 +319,7 @@ rcu_dereference() a lockdep splat is emitted. See Documentation/RCU/Design/Requirements/Requirements.rst and the API's code comments for more details and example usage. - [2] If the list_for_each_entry_rcu() instance might be used by +.. [2] If the list_for_each_entry_rcu() instance might be used by update-side code as well as by RCU readers, then an additional lockdep expression can be added to its list of arguments. For example, given an additional "lock_is_held(&mylock)" argument, @@ -315,6 +329,7 @@ rcu_dereference() The following diagram shows how each API communicates among the reader, updater, and reclaimer. +:: rcu_assign_pointer() @@ -375,12 +390,16 @@ c. RCU applied to scheduler and interrupt/NMI-handler tasks. Again, most uses will be of (a). The (b) and (c) cases are important for specialized uses, but are relatively uncommon. +.. _3_whatisRCU: 3. WHAT ARE SOME EXAMPLE USES OF CORE RCU API? +----------------------------------------------- This section shows a simple use of the core RCU API to protect a global pointer to a dynamically allocated structure. More-typical -uses of RCU may be found in listRCU.txt, arrayRCU.txt, and NMI-RCU.txt. +uses of RCU may be found in :ref:`listRCU.rst <list_rcu_doc>`, +:ref:`arrayRCU.rst <array_rcu_doc>`, and :ref:`NMI-RCU.rst <NMI_rcu_doc>`. +:: struct foo { int a; @@ -440,40 +459,43 @@ uses of RCU may be found in listRCU.txt, arrayRCU.txt, and NMI-RCU.txt. So, to sum up: -o Use rcu_read_lock() and rcu_read_unlock() to guard RCU +- Use rcu_read_lock() and rcu_read_unlock() to guard RCU read-side critical sections. -o Within an RCU read-side critical section, use rcu_dereference() +- Within an RCU read-side critical section, use rcu_dereference() to dereference RCU-protected pointers. -o Use some solid scheme (such as locks or semaphores) to +- Use some solid scheme (such as locks or semaphores) to keep concurrent updates from interfering with each other. -o Use rcu_assign_pointer() to update an RCU-protected pointer. +- Use rcu_assign_pointer() to update an RCU-protected pointer. This primitive protects concurrent readers from the updater, - -not- concurrent updates from each other! You therefore still + **not** concurrent updates from each other! You therefore still need to use locking (or something similar) to keep concurrent rcu_assign_pointer() primitives from interfering with each other. -o Use synchronize_rcu() -after- removing a data element from an - RCU-protected data structure, but -before- reclaiming/freeing +- Use synchronize_rcu() **after** removing a data element from an + RCU-protected data structure, but **before** reclaiming/freeing the data element, in order to wait for the completion of all RCU read-side critical sections that might be referencing that data item. See checklist.txt for additional rules to follow when using RCU. -And again, more-typical uses of RCU may be found in listRCU.txt, -arrayRCU.txt, and NMI-RCU.txt. +And again, more-typical uses of RCU may be found in :ref:`listRCU.rst +<list_rcu_doc>`, :ref:`arrayRCU.rst <array_rcu_doc>`, and :ref:`NMI-RCU.rst +<NMI_rcu_doc>`. +.. _4_whatisRCU: 4. WHAT IF MY UPDATING THREAD CANNOT BLOCK? +-------------------------------------------- In the example above, foo_update_a() blocks until a grace period elapses. This is quite simple, but in some cases one cannot afford to wait so long -- there might be other high-priority work to be done. In such cases, one uses call_rcu() rather than synchronize_rcu(). -The call_rcu() API is as follows: +The call_rcu() API is as follows:: void call_rcu(struct rcu_head * head, void (*func)(struct rcu_head *head)); @@ -481,7 +503,7 @@ The call_rcu() API is as follows: This function invokes func(head) after a grace period has elapsed. This invocation might happen from either softirq or process context, so the function is not permitted to block. The foo struct needs to -have an rcu_head structure added, perhaps as follows: +have an rcu_head structure added, perhaps as follows:: struct foo { int a; @@ -490,7 +512,7 @@ have an rcu_head structure added, perhaps as follows: struct rcu_head rcu; }; -The foo_update_a() function might then be written as follows: +The foo_update_a() function might then be written as follows:: /* * Create a new struct foo that is the same as the one currently @@ -520,7 +542,7 @@ The foo_update_a() function might then be written as follows: call_rcu(&old_fp->rcu, foo_reclaim); } -The foo_reclaim() function might appear as follows: +The foo_reclaim() function might appear as follows:: void foo_reclaim(struct rcu_head *rp) { @@ -544,7 +566,7 @@ namely foo_reclaim(). The summary of advice is the same as for the previous section, except that we are now using call_rcu() rather than synchronize_rcu(): -o Use call_rcu() -after- removing a data element from an +- Use call_rcu() **after** removing a data element from an RCU-protected data structure in order to register a callback function that will be invoked after the completion of all RCU read-side critical sections that might be referencing that @@ -552,14 +574,16 @@ o Use call_rcu() -after- removing a data element from an If the callback for call_rcu() is not doing anything more than calling kfree() on the structure, you can use kfree_rcu() instead of call_rcu() -to avoid having to write your own callback: +to avoid having to write your own callback:: kfree_rcu(old_fp, rcu); Again, see checklist.txt for additional rules governing the use of RCU. +.. _5_whatisRCU: 5. WHAT ARE SOME SIMPLE IMPLEMENTATIONS OF RCU? +------------------------------------------------ One of the nice things about RCU is that it has extremely simple "toy" implementations that are a good first step towards understanding the @@ -579,7 +603,7 @@ more details on the current implementation as of early 2004. 5A. "TOY" IMPLEMENTATION #1: LOCKING - +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ This section presents a "toy" RCU implementation that is based on familiar locking primitives. Its overhead makes it a non-starter for real-life use, as does its lack of scalability. It is also unsuitable @@ -591,7 +615,7 @@ you allow nested rcu_read_lock() calls, you can deadlock. However, it is probably the easiest implementation to relate to, so is a good starting point. -It is extremely simple: +It is extremely simple:: static DEFINE_RWLOCK(rcu_gp_mutex); @@ -614,7 +638,7 @@ It is extremely simple: [You can ignore rcu_assign_pointer() and rcu_dereference() without missing much. But here are simplified versions anyway. And whatever you do, -don't forget about them when submitting patches making use of RCU!] +don't forget about them when submitting patches making use of RCU!]:: #define rcu_assign_pointer(p, v) \ ({ \ @@ -647,18 +671,23 @@ that the only thing that can block rcu_read_lock() is a synchronize_rcu(). But synchronize_rcu() does not acquire any locks while holding rcu_gp_mutex, so there can be no deadlock cycle. -Quick Quiz #1: Why is this argument naive? How could a deadlock +.. _quiz_1: + +Quick Quiz #1: + Why is this argument naive? How could a deadlock occur when using this algorithm in a real-world Linux kernel? How could this deadlock be avoided? +:ref:`Answers to Quick Quiz <8_whatisRCU>` 5B. "TOY" EXAMPLE #2: CLASSIC RCU - +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ This section presents a "toy" RCU implementation that is based on "classic RCU". It is also short on performance (but only for updates) and on features such as hotplug CPU and the ability to run in CONFIG_PREEMPT kernels. The definitions of rcu_dereference() and rcu_assign_pointer() are the same as those shown in the preceding section, so they are omitted. +:: void rcu_read_lock(void) { } @@ -683,14 +712,14 @@ CPU in turn. The run_on() primitive can be implemented straightforwardly in terms of the sched_setaffinity() primitive. Of course, a somewhat less "toy" implementation would restore the affinity upon completion rather than just leaving all tasks running on the last CPU, but when I said -"toy", I meant -toy-! +"toy", I meant **toy**! So how the heck is this supposed to work??? Remember that it is illegal to block while in an RCU read-side critical section. Therefore, if a given CPU executes a context switch, we know that it must have completed all preceding RCU read-side critical sections. -Once -all- CPUs have executed a context switch, then -all- preceding +Once **all** CPUs have executed a context switch, then **all** preceding RCU read-side critical sections will have completed. So, suppose that we remove a data item from its structure and then invoke @@ -698,19 +727,32 @@ synchronize_rcu(). Once synchronize_rcu() returns, we are guaranteed that there are no RCU read-side critical sections holding a reference to that data item, so we can safely reclaim it. -Quick Quiz #2: Give an example where Classic RCU's read-side - overhead is -negative-. +.. _quiz_2: + +Quick Quiz #2: + Give an example where Classic RCU's read-side + overhead is **negative**. + +:ref:`Answers to Quick Quiz <8_whatisRCU>` -Quick Quiz #3: If it is illegal to block in an RCU read-side +.. _quiz_3: + +Quick Quiz #3: + If it is illegal to block in an RCU read-side critical section, what the heck do you do in PREEMPT_RT, where normal spinlocks can block??? +:ref:`Answers to Quick Quiz <8_whatisRCU>` + +.. _6_whatisRCU: 6. ANALOGY WITH READER-WRITER LOCKING +-------------------------------------- Although RCU can be used in many different ways, a very common use of RCU is analogous to reader-writer locking. The following unified diff shows how closely related RCU and reader-writer locking can be. +:: @@ -5,5 +5,5 @@ struct el { int data; @@ -762,7 +804,7 @@ diff shows how closely related RCU and reader-writer locking can be. return 0; } -Or, for those who prefer a side-by-side listing: +Or, for those who prefer a side-by-side listing:: 1 struct el { 1 struct el { 2 struct list_head list; 2 struct list_head list; @@ -774,40 +816,44 @@ Or, for those who prefer a side-by-side listing: 8 rwlock_t listmutex; 8 spinlock_t listmutex; 9 struct el head; 9 struct el head; - 1 int search(long key, int *result) 1 int search(long key, int *result) - 2 { 2 { - 3 struct list_head *lp; 3 struct list_head *lp; - 4 struct el *p; 4 struct el *p; - 5 5 - 6 read_lock(&listmutex); 6 rcu_read_lock(); - 7 list_for_each_entry(p, head, lp) { 7 list_for_each_entry_rcu(p, head, lp) { - 8 if (p->key == key) { 8 if (p->key == key) { - 9 *result = p->data; 9 *result = p->data; -10 read_unlock(&listmutex); 10 rcu_read_unlock(); -11 return 1; 11 return 1; -12 } 12 } -13 } 13 } -14 read_unlock(&listmutex); 14 rcu_read_unlock(); -15 return 0; 15 return 0; -16 } 16 } - - 1 int delete(long key) 1 int delete(long key) - 2 { 2 { - 3 struct el *p; 3 struct el *p; - 4 4 - 5 write_lock(&listmutex); 5 spin_lock(&listmutex); - 6 list_for_each_entry(p, head, lp) { 6 list_for_each_entry(p, head, lp) { - 7 if (p->key == key) { 7 if (p->key == key) { - 8 list_del(&p->list); 8 list_del_rcu(&p->list); - 9 write_unlock(&listmutex); 9 spin_unlock(&listmutex); - 10 synchronize_rcu(); -10 kfree(p); 11 kfree(p); -11 return 1; 12 return 1; -12 } 13 } -13 } 14 } -14 write_unlock(&listmutex); 15 spin_unlock(&listmutex); -15 return 0; 16 return 0; -16 } 17 } +:: + + 1 int search(long key, int *result) 1 int search(long key, int *result) + 2 { 2 { + 3 struct list_head *lp; 3 struct list_head *lp; + 4 struct el *p; 4 struct el *p; + 5 5 + 6 read_lock(&listmutex); 6 rcu_read_lock(); + 7 list_for_each_entry(p, head, lp) { 7 list_for_each_entry_rcu(p, head, lp) { + 8 if (p->key == key) { 8 if (p->key == key) { + 9 *result = p->data; 9 *result = p->data; + 10 read_unlock(&listmutex); 10 rcu_read_unlock(); + 11 return 1; 11 return 1; + 12 } 12 } + 13 } 13 } + 14 read_unlock(&listmutex); 14 rcu_read_unlock(); + 15 return 0; 15 return 0; + 16 } 16 } + +:: + + 1 int delete(long key) 1 int delete(long key) + 2 { 2 { + 3 struct el *p; 3 struct el *p; + 4 4 + 5 write_lock(&listmutex); 5 spin_lock(&listmutex); + 6 list_for_each_entry(p, head, lp) { 6 list_for_each_entry(p, head, lp) { + 7 if (p->key == key) { 7 if (p->key == key) { + 8 list_del(&p->list); 8 list_del_rcu(&p->list); + 9 write_unlock(&listmutex); 9 spin_unlock(&listmutex); + 10 synchronize_rcu(); + 10 kfree(p); 11 kfree(p); + 11 return 1; 12 return 1; + 12 } 13 } + 13 } 14 } + 14 write_unlock(&listmutex); 15 spin_unlock(&listmutex); + 15 return 0; 16 return 0; + 16 } 17 } Either way, the differences are quite small. Read-side locking moves to rcu_read_lock() and rcu_read_unlock, update-side locking moves from @@ -825,22 +871,27 @@ delete() can now block. If this is a problem, there is a callback-based mechanism that never blocks, namely call_rcu() or kfree_rcu(), that can be used in place of synchronize_rcu(). +.. _7_whatisRCU: 7. FULL LIST OF RCU APIs +------------------------- The RCU APIs are documented in docbook-format header comments in the Linux-kernel source code, but it helps to have a full list of the APIs, since there does not appear to be a way to categorize them in docbook. Here is the list, by category. -RCU list traversal: +RCU list traversal:: list_entry_rcu + list_entry_lockless list_first_entry_rcu list_next_rcu list_for_each_entry_rcu list_for_each_entry_continue_rcu list_for_each_entry_from_rcu + list_first_or_null_rcu + list_next_or_null_rcu hlist_first_rcu hlist_next_rcu hlist_pprev_rcu @@ -854,7 +905,7 @@ RCU list traversal: hlist_bl_first_rcu hlist_bl_for_each_entry_rcu -RCU pointer/list update: +RCU pointer/list update:: rcu_assign_pointer list_add_rcu @@ -864,10 +915,12 @@ RCU pointer/list update: hlist_add_behind_rcu hlist_add_before_rcu hlist_add_head_rcu + hlist_add_tail_rcu hlist_del_rcu hlist_del_init_rcu hlist_replace_rcu - list_splice_init_rcu() + list_splice_init_rcu + list_splice_tail_init_rcu hlist_nulls_del_init_rcu hlist_nulls_del_rcu hlist_nulls_add_head_rcu @@ -876,7 +929,9 @@ RCU pointer/list update: hlist_bl_del_rcu hlist_bl_set_first_rcu -RCU: Critical sections Grace period Barrier +RCU:: + + Critical sections Grace period Barrier rcu_read_lock synchronize_net rcu_barrier rcu_read_unlock synchronize_rcu @@ -885,7 +940,9 @@ RCU: Critical sections Grace period Barrier rcu_dereference_check kfree_rcu rcu_dereference_protected -bh: Critical sections Grace period Barrier +bh:: + + Critical sections Grace period Barrier rcu_read_lock_bh call_rcu rcu_barrier rcu_read_unlock_bh synchronize_rcu @@ -896,7 +953,9 @@ bh: Critical sections Grace period Barrier rcu_dereference_bh_protected rcu_read_lock_bh_held -sched: Critical sections Grace period Barrier +sched:: + + Critical sections Grace period Barrier rcu_read_lock_sched call_rcu rcu_barrier rcu_read_unlock_sched synchronize_rcu @@ -910,7 +969,9 @@ sched: Critical sections Grace period Barrier rcu_read_lock_sched_held -SRCU: Critical sections Grace period Barrier +SRCU:: + + Critical sections Grace period Barrier srcu_read_lock call_srcu srcu_barrier srcu_read_unlock synchronize_srcu @@ -918,13 +979,14 @@ SRCU: Critical sections Grace period Barrier srcu_dereference_check srcu_read_lock_held -SRCU: Initialization/cleanup +SRCU: Initialization/cleanup:: + DEFINE_SRCU DEFINE_STATIC_SRCU init_srcu_struct cleanup_srcu_struct -All: lockdep-checked RCU-protected pointer access +All: lockdep-checked RCU-protected pointer access:: rcu_access_pointer rcu_dereference_raw @@ -974,15 +1036,19 @@ g. Otherwise, use RCU. Of course, this all assumes that you have determined that RCU is in fact the right tool for your job. +.. _8_whatisRCU: 8. ANSWERS TO QUICK QUIZZES +---------------------------- -Quick Quiz #1: Why is this argument naive? How could a deadlock +Quick Quiz #1: + Why is this argument naive? How could a deadlock occur when using this algorithm in a real-world Linux kernel? [Referring to the lock-based "toy" RCU algorithm.] -Answer: Consider the following sequence of events: +Answer: + Consider the following sequence of events: 1. CPU 0 acquires some unrelated lock, call it "problematic_lock", disabling irq via @@ -1021,10 +1087,14 @@ Answer: Consider the following sequence of events: approach where tasks in RCU read-side critical sections cannot be blocked by tasks executing synchronize_rcu(). -Quick Quiz #2: Give an example where Classic RCU's read-side - overhead is -negative-. +:ref:`Back to Quick Quiz #1 <quiz_1>` + +Quick Quiz #2: + Give an example where Classic RCU's read-side + overhead is **negative**. -Answer: Imagine a single-CPU system with a non-CONFIG_PREEMPT +Answer: + Imagine a single-CPU system with a non-CONFIG_PREEMPT kernel where a routing table is used by process-context code, but can be updated by irq-context code (for example, by an "ICMP REDIRECT" packet). The usual way of handling @@ -1046,11 +1116,15 @@ Answer: Imagine a single-CPU system with a non-CONFIG_PREEMPT even the theoretical possibility of negative overhead for a synchronization primitive is a bit unexpected. ;-) -Quick Quiz #3: If it is illegal to block in an RCU read-side +:ref:`Back to Quick Quiz #2 <quiz_2>` + +Quick Quiz #3: + If it is illegal to block in an RCU read-side critical section, what the heck do you do in PREEMPT_RT, where normal spinlocks can block??? -Answer: Just as PREEMPT_RT permits preemption of spinlock +Answer: + Just as PREEMPT_RT permits preemption of spinlock critical sections, it permits preemption of RCU read-side critical sections. It also permits spinlocks blocking while in RCU read-side critical @@ -1069,6 +1143,7 @@ Answer: Just as PREEMPT_RT permits preemption of spinlock Besides, how does the computer know what pizza parlor the human being went to??? +:ref:`Back to Quick Quiz #3 <quiz_3>` ACKNOWLEDGEMENTS diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index ade4e6ec23e0..ed83d6d90cc3 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3978,6 +3978,19 @@ test until boot completes in order to avoid interference. + rcuperf.kfree_rcu_test= [KNL] + Set to measure performance of kfree_rcu() flooding. + + rcuperf.kfree_nthreads= [KNL] + The number of threads running loops of kfree_rcu(). + + rcuperf.kfree_alloc_num= [KNL] + Number of allocations and frees done in an iteration. + + rcuperf.kfree_loops= [KNL] + Number of loops doing rcuperf.kfree_alloc_num number + of allocations and frees. + rcuperf.nreaders= [KNL] Set number of RCU readers. The value -1 selects N, where N is the number of CPUs. A value diff --git a/arch/powerpc/include/asm/barrier.h b/arch/powerpc/include/asm/barrier.h index fbe8df433019..123adcefd40f 100644 --- a/arch/powerpc/include/asm/barrier.h +++ b/arch/powerpc/include/asm/barrier.h @@ -18,8 +18,6 @@ * mb() prevents loads and stores being reordered across this point. * rmb() prevents loads being reordered across this point. * wmb() prevents stores being reordered across this point. - * read_barrier_depends() prevents data-dependent loads being reordered - * across this point (nop on PPC). * * *mb() variants without smp_ prefix must order all types of memory * operations with one another. sync is the only instruction sufficient diff --git a/drivers/net/wireless/mediatek/mt76/agg-rx.c b/drivers/net/wireless/mediatek/mt76/agg-rx.c index 53b5a4b2dcc5..59c187898132 100644 --- a/drivers/net/wireless/mediatek/mt76/agg-rx.c +++ b/drivers/net/wireless/mediatek/mt76/agg-rx.c @@ -281,8 +281,8 @@ void mt76_rx_aggr_stop(struct mt76_dev *dev, struct mt76_wcid *wcid, u8 tidno) { struct mt76_rx_tid *tid = NULL; - rcu_swap_protected(wcid->aggr[tidno], tid, - lockdep_is_held(&dev->mutex)); + tid = rcu_replace_pointer(wcid->aggr[tidno], tid, + lockdep_is_held(&dev->mutex)); if (tid) { mt76_rx_aggr_shutdown(dev, tid); kfree_rcu(tid, rcu_head); diff --git a/include/linux/list.h b/include/linux/list.h index 85c92555e31f..4f3b7f71bdfd 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -23,6 +23,13 @@ #define LIST_HEAD(name) \ struct list_head name = LIST_HEAD_INIT(name) +/** + * INIT_LIST_HEAD - Initialize a list_head structure + * @list: list_head structure to be initialized. + * + * Initializes the list_head to point to itself. If it is a list header, + * the result is an empty list. + */ static inline void INIT_LIST_HEAD(struct list_head *list) { WRITE_ONCE(list->next, list); @@ -120,12 +127,6 @@ static inline void __list_del_clearprev(struct list_head *entry) entry->prev = NULL; } -/** - * list_del - deletes entry from list. - * @entry: the element to delete from the list. - * Note: list_empty() on entry does not return true after this, the entry is - * in an undefined state. - */ static inline void __list_del_entry(struct list_head *entry) { if (!__list_del_entry_valid(entry)) @@ -134,6 +135,12 @@ static inline void __list_del_entry(struct list_head *entry) __list_del(entry->prev, entry->next); } +/** + * list_del - deletes entry from list. + * @entry: the element to delete from the list. + * Note: list_empty() on entry does not return true after this, the entry is + * in an undefined state. + */ static inline void list_del(struct list_head *entry) { __list_del_entry(entry); @@ -157,8 +164,15 @@ static inline void list_replace(struct list_head *old, new->prev->next = new; } +/** + * list_replace_init - replace old entry by new one and initialize the old one + * @old : the element to be replaced + * @new : the new element to insert + * + * If @old was empty, it will be overwritten. + */ static inline void list_replace_init(struct list_head *old, - struct list_head *new) + struct list_head *new) { list_replace(old, new); INIT_LIST_HEAD(old); @@ -744,11 +758,36 @@ static inline void INIT_HLIST_NODE(struct hlist_node *h) h->pprev = NULL; } +/** + * hlist_unhashed - Has node been removed from list and reinitialized? + * @h: Node to be checked + * + * Not that not all removal functions will leave a node in unhashed + * state. For example, hlist_nulls_del_init_rcu() does leave the + * node in unhashed state, but hlist_nulls_del() does not. + */ static inline int hlist_unhashed(const struct hlist_node *h) { return !h->pprev; } +/** + * hlist_unhashed_lockless - Version of hlist_unhashed for lockless use + * @h: Node to be checked + * + * This variant of hlist_unhashed() must be used in lockless contexts + * to avoid potential load-tearing. The READ_ONCE() is paired with the + * various WRITE_ONCE() in hlist helpers that are defined below. + */ +static inline int hlist_unhashed_lockless(const struct hlist_node *h) +{ + return !READ_ONCE(h->pprev); +} + +/** + * hlist_empty - Is the specified hlist_head structure an empty hlist? + * @h: Structure to check. + */ static inline int hlist_empty(const struct hlist_head *h) { return !READ_ONCE(h->first); @@ -761,9 +800,16 @@ static inline void __hlist_del(struct hlist_node *n) WRITE_ONCE(*pprev, next); if (next) - next->pprev = pprev; + WRITE_ONCE(next->pprev, pprev); } +/** + * hlist_del - Delete the specified hlist_node from its list + * @n: Node to delete. + * + * Note that this function leaves the node in hashed state. Use + * hlist_del_init() or similar instead to unhash @n. + */ static inline void hlist_del(struct hlist_node *n) { __hlist_del(n); @@ -771,6 +817,12 @@ static inline void hlist_del(struct hlist_node *n) n->pprev = LIST_POISON2; } +/** + * hlist_del_init - Delete the specified hlist_node from its list and initialize + * @n: Node to delete. + * + * Note that this function leaves the node in unhashed state. + */ static inline void hlist_del_init(struct hlist_node *n) { if (!hlist_unhashed(n)) { @@ -779,51 +831,83 @@ static inline void hlist_del_init(struct hlist_node *n) } } +/** + * hlist_add_head - add a new entry at the beginning of the hlist + * @n: new entry to be added + * @h: hlist head to add it after + * + * Insert a new entry after the specified head. + * This is good for implementing stacks. + */ static inline void hlist_add_head(struct hlist_node *n, struct hlist_head *h) { struct hlist_node *first = h->first; - n->next = first; + WRITE_ONCE(n->next, first); if (first) - first->pprev = &n->next; + WRITE_ONCE(first->pprev, &n->next); WRITE_ONCE(h->first, n); - n->pprev = &h->first; + WRITE_ONCE(n->pprev, &h->first); } -/* next must be != NULL */ +/** + * hlist_add_before - add a new entry before the one specified + * @n: new entry to be added + * @next: hlist node to add it before, which must be non-NULL + */ static inline void hlist_add_before(struct hlist_node *n, - struct hlist_node *next) + struct hlist_node *next) { - n->pprev = next->pprev; - n->next = next; - next->pprev = &n->next; + WRITE_ONCE(n->pprev, next->pprev); + WRITE_ONCE(n->next, next); + WRITE_ONCE(next->pprev, &n->next); WRITE_ONCE(*(n->pprev), n); } +/** + * hlist_add_behing - add a new entry after the one specified + * @n: new entry to be added + * @prev: hlist node to add it after, which must be non-NULL + */ static inline void hlist_add_behind(struct hlist_node *n, struct hlist_node *prev) { - n->next = prev->next; - prev->next = n; - n->pprev = &prev->next; + WRITE_ONCE(n->next, prev->next); + WRITE_ONCE(prev->next, n); + WRITE_ONCE(n->pprev, &prev->next); if (n->next) - n->next->pprev = &n->next; + WRITE_ONCE(n->next->pprev, &n->next); } -/* after that we'll appear to be on some hlist and hlist_del will work */ +/** + * hlist_add_fake - create a fake hlist consisting of a single headless node + * @n: Node to make a fake list out of + * + * This makes @n appear to be its own predecessor on a headless hlist. + * The point of this is to allow things like hlist_del() to work correctly + * in cases where there is no list. + */ static inline void hlist_add_fake(struct hlist_node *n) { n->pprev = &n->next; } +/** + * hlist_fake: Is this node a fake hlist? + * @h: Node to check for being a self-referential fake hlist. + */ static inline bool hlist_fake(struct hlist_node *h) { return h->pprev == &h->next; } -/* +/** + * hlist_is_singular_node - is node the only element of the specified hlist? + * @n: Node to check for singularity. + * @h: Header for potentially singular list. + * * Check whether the node is the only node of the head without - * accessing head: + * accessing head, thus avoiding unnecessary cache misses. */ static inline bool hlist_is_singular_node(struct hlist_node *n, struct hlist_head *h) @@ -831,7 +915,11 @@ hlist_is_singular_node(struct hlist_node *n, struct hlist_head *h) return !n->next && n->pprev == &h->first; } -/* +/** + * hlist_move_list - Move an hlist + * @old: hlist_head for old list. + * @new: hlist_head for new list. + * * Move a list from one list head to another. Fixup the pprev * reference of the first entry if it exists. */ diff --git a/include/linux/list_nulls.h b/include/linux/list_nulls.h index 3ef96743db8d..fa6e8471bd22 100644 --- a/include/linux/list_nulls.h +++ b/include/linux/list_nulls.h @@ -56,11 +56,33 @@ static inline unsigned long get_nulls_value(const struct hlist_nulls_node *ptr) return ((unsigned long)ptr) >> 1; } +/** + * hlist_nulls_unhashed - Has node been removed and reinitialized? + * @h: Node to be checked + * + * Not that not all removal functions will leave a node in unhashed state. + * For example, hlist_del_init_rcu() leaves the node in unhashed state, + * but hlist_nulls_del() does not. + */ static inline int hlist_nulls_unhashed(const struct hlist_nulls_node *h) { return !h->pprev; } +/** + * hlist_nulls_unhashed_lockless - Has node been removed and reinitialized? + * @h: Node to be checked + * + * Not that not all removal functions will leave a node in unhashed state. + * For example, hlist_del_init_rcu() leaves the node in unhashed state, + * but hlist_nulls_del() does not. Unlike hlist_nulls_unhashed(), this + * function may be used locklessly. + */ +static inline int hlist_nulls_unhashed_lockless(const struct hlist_nulls_node *h) +{ + return !READ_ONCE(h->pprev); +} + static inline int hlist_nulls_empty(const struct hlist_nulls_head *h) { return is_a_nulls(READ_ONCE(h->first)); @@ -72,10 +94,10 @@ static inline void hlist_nulls_add_head(struct hlist_nulls_node *n, struct hlist_nulls_node *first = h->first; n->next = first; - n->pprev = &h->first; + WRITE_ONCE(n->pprev, &h->first); h->first = n; if (!is_a_nulls(first)) - first->pprev = &n->next; + WRITE_ONCE(first->pprev, &n->next); } static inline void __hlist_nulls_del(struct hlist_nulls_node *n) @@ -85,13 +107,13 @@ static inline void __hlist_nulls_del(struct hlist_nulls_node *n) WRITE_ONCE(*pprev, next); if (!is_a_nulls(next)) - next->pprev = pprev; + WRITE_ONCE(next->pprev, pprev); } static inline void hlist_nulls_del(struct hlist_nulls_node *n) { __hlist_nulls_del(n); - n->pprev = LIST_POISON2; + WRITE_ONCE(n->pprev, LIST_POISON2); } /** diff --git a/include/linux/rcu_segcblist.h b/include/linux/rcu_segcblist.h index 646759042333..b36afe7b22c9 100644 --- a/include/linux/rcu_segcblist.h +++ b/include/linux/rcu_segcblist.h @@ -22,7 +22,6 @@ struct rcu_cblist { struct rcu_head *head; struct rcu_head **tail; long len; - long len_lazy; }; #define RCU_CBLIST_INITIALIZER(n) { .head = NULL, .tail = &n.head } @@ -73,7 +72,6 @@ struct rcu_segcblist { #else long len; #endif - long len_lazy; u8 enabled; u8 offloaded; }; diff --git a/include/linux/rculist.h b/include/linux/rculist.h index 4158b7212936..9f313e4999fe 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -40,6 +40,16 @@ static inline void INIT_LIST_HEAD_RCU(struct list_head *list) */ #define list_next_rcu(list) (*((struct list_head __rcu **)(&(list)->next))) +/** + * list_tail_rcu - returns the prev pointer of the head of the list + * @head: the head of the list + * + * Note: This should only be used with the list header, and even then + * only if list_del() and similar primitives are not also used on the + * list header. + */ +#define list_tail_rcu(head) (*((struct list_head __rcu **)(&(head)->prev))) + /* * Check during list traversal that we are within an RCU reader */ @@ -173,7 +183,7 @@ static inline void hlist_del_init_rcu(struct hlist_node *n) { if (!hlist_unhashed(n)) { __hlist_del(n); - n->pprev = NULL; + WRITE_ONCE(n->pprev, NULL); } } @@ -361,7 +371,7 @@ static inline void list_splice_tail_init_rcu(struct list_head *list, * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_head within the struct. - * @cond: optional lockdep expression if called from non-RCU protection. + * @cond...: optional lockdep expression if called from non-RCU protection. * * This list-traversal primitive may safely run concurrently with * the _rcu list-mutation primitives such as list_add_rcu() @@ -473,7 +483,7 @@ static inline void list_splice_tail_init_rcu(struct list_head *list, static inline void hlist_del_rcu(struct hlist_node *n) { __hlist_del(n); - n->pprev = LIST_POISON2; + WRITE_ONCE(n->pprev, LIST_POISON2); } /** @@ -489,11 +499,11 @@ static inline void hlist_replace_rcu(struct hlist_node *old, struct hlist_node *next = old->next; new->next = next; - new->pprev = old->pprev; + WRITE_ONCE(new->pprev, old->pprev); rcu_assign_pointer(*(struct hlist_node __rcu **)new->pprev, new); if (next) - new->next->pprev = &new->next; - old->pprev = LIST_POISON2; + WRITE_ONCE(new->next->pprev, &new->next); + WRITE_ONCE(old->pprev, LIST_POISON2); } /* @@ -528,10 +538,10 @@ static inline void hlist_add_head_rcu(struct hlist_node *n, struct hlist_node *first = h->first; n->next = first; - n->pprev = &h->first; + WRITE_ONCE(n->pprev, &h->first); rcu_assign_pointer(hlist_first_rcu(h), n); if (first) - first->pprev = &n->next; + WRITE_ONCE(first->pprev, &n->next); } /** @@ -564,7 +574,7 @@ static inline void hlist_add_tail_rcu(struct hlist_node *n, if (last) { n->next = last->next; - n->pprev = &last->next; + WRITE_ONCE(n->pprev, &last->next); rcu_assign_pointer(hlist_next_rcu(last), n); } else { hlist_add_head_rcu(n, h); @@ -592,10 +602,10 @@ static inline void hlist_add_tail_rcu(struct hlist_node *n, static inline void hlist_add_before_rcu(struct hlist_node *n, struct hlist_node *next) { - n->pprev = next->pprev; + WRITE_ONCE(n->pprev, next->pprev); n->next = next; rcu_assign_pointer(hlist_pprev_rcu(n), n); - next->pprev = &n->next; + WRITE_ONCE(next->pprev, &n->next); } /** @@ -620,10 +630,10 @@ static inline void hlist_add_behind_rcu(struct hlist_node *n, struct hlist_node *prev) { n->next = prev->next; - n->pprev = &prev->next; + WRITE_ONCE(n->pprev, &prev->next); rcu_assign_pointer(hlist_next_rcu(prev), n); if (n->next) - n->next->pprev = &n->next; + WRITE_ONCE(n->next->pprev, &n->next); } #define __hlist_for_each_rcu(pos, head) \ @@ -636,7 +646,7 @@ static inline void hlist_add_behind_rcu(struct hlist_node *n, * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the hlist_node within the struct. - * @cond: optional lockdep expression if called from non-RCU protection. + * @cond...: optional lockdep expression if called from non-RCU protection. * * This list-traversal primitive may safely run concurrently with * the _rcu list-mutation primitives such as hlist_add_head_rcu() diff --git a/include/linux/rculist_nulls.h b/include/linux/rculist_nulls.h index bc8206a8f30e..409a86bb5f25 100644 --- a/include/linux/rculist_nulls.h +++ b/include/linux/rculist_nulls.h @@ -34,13 +34,21 @@ static inline void hlist_nulls_del_init_rcu(struct hlist_nulls_node *n) { if (!hlist_nulls_unhashed(n)) { __hlist_nulls_del(n); - n->pprev = NULL; + WRITE_ONCE(n->pprev, NULL); } } +/** + * hlist_nulls_first_rcu - returns the first element of the hash list. + * @head: the head of the list. + */ #define hlist_nulls_first_rcu(head) \ (*((struct hlist_nulls_node __rcu __force **)&(head)->first)) +/** + * hlist_nulls_next_rcu - returns the element of the list after @node. + * @node: element of the list. + */ #define hlist_nulls_next_rcu(node) \ (*((struct hlist_nulls_node __rcu __force **)&(node)->next)) @@ -66,7 +74,7 @@ static inline void hlist_nulls_del_init_rcu(struct hlist_nulls_node *n) static inline void hlist_nulls_del_rcu(struct hlist_nulls_node *n) { __hlist_nulls_del(n); - n->pprev = LIST_POISON2; + WRITE_ONCE(n->pprev, LIST_POISON2); } /** @@ -94,17 +102,17 @@ static inline void hlist_nulls_add_head_rcu(struct hlist_nulls_node *n, struct hlist_nulls_node *first = h->first; n->next = first; - n->pprev = &h->first; + WRITE_ONCE(n->pprev, &h->first); rcu_assign_pointer(hlist_nulls_first_rcu(h), n); if (!is_a_nulls(first)) - first->pprev = &n->next; + WRITE_ONCE(first->pprev, &n->next); } /** * hlist_nulls_for_each_entry_rcu - iterate over rcu list of given type * @tpos: the type * to use as a loop cursor. * @pos: the &struct hlist_nulls_node to use as a loop cursor. - * @head: the head for your list. + * @head: the head of the list. * @member: the name of the hlist_nulls_node within the struct. * * The barrier() is needed to make sure compiler doesn't cache first element [1], @@ -124,7 +132,7 @@ static inline void hlist_nulls_add_head_rcu(struct hlist_nulls_node *n, * iterate over list of given type safe against removal of list entry * @tpos: the type * to use as a loop cursor. * @pos: the &struct hlist_nulls_node to use as a loop cursor. - * @head: the head for your list. + * @head: the head of the list. * @member: the name of the hlist_nulls_node within the struct. */ #define hlist_nulls_for_each_entry_safe(tpos, pos, head, member) \ diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 0b7506330c87..2678a37c3169 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -154,7 +154,7 @@ static inline void exit_tasks_rcu_finish(void) { } * * This macro resembles cond_resched(), except that it is defined to * report potential quiescent states to RCU-tasks even if the cond_resched() - * machinery were to be shut off, as some advocate for PREEMPT kernels. + * machinery were to be shut off, as some advocate for PREEMPTION kernels. */ #define cond_resched_tasks_rcu_qs() \ do { \ @@ -167,7 +167,7 @@ do { \ * TREE_RCU and rcu_barrier_() primitives in TINY_RCU. */ -#if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU) +#if defined(CONFIG_TREE_RCU) #include <linux/rcutree.h> #elif defined(CONFIG_TINY_RCU) #include <linux/rcutiny.h> @@ -401,22 +401,6 @@ do { \ }) /** - * rcu_swap_protected() - swap an RCU and a regular pointer - * @rcu_ptr: RCU pointer - * @ptr: regular pointer - * @c: the conditions under which the dereference will take place - * - * Perform swap(@rcu_ptr, @ptr) where @rcu_ptr is an RCU-annotated pointer and - * @c is the argument that is passed to the rcu_dereference_protected() call - * used to read that pointer. - */ -#define rcu_swap_protected(rcu_ptr, ptr, c) do { \ - typeof(ptr) __tmp = rcu_dereference_protected((rcu_ptr), (c)); \ - rcu_assign_pointer((rcu_ptr), (ptr)); \ - (ptr) = __tmp; \ -} while (0) - -/** * rcu_access_pointer() - fetch RCU pointer with no dereferencing * @p: The pointer to read * @@ -598,10 +582,10 @@ do { \ * * You can avoid reading and understanding the next paragraph by * following this rule: don't put anything in an rcu_read_lock() RCU - * read-side critical section that would block in a !PREEMPT kernel. + * read-side critical section that would block in a !PREEMPTION kernel. * But if you want the full story, read on! * - * In non-preemptible RCU implementations (TREE_RCU and TINY_RCU), + * In non-preemptible RCU implementations (pure TREE_RCU and TINY_RCU), * it is illegal to block while in an RCU read-side critical section. * In preemptible RCU implementations (PREEMPT_RCU) in CONFIG_PREEMPTION * kernel builds, RCU read-side critical sections may be preempted, @@ -912,4 +896,8 @@ rcu_head_after_call_rcu(struct rcu_head *rhp, rcu_callback_t f) return false; } +/* kernel/ksysfs.c definitions */ +extern int rcu_expedited; +extern int rcu_normal; + #endif /* __LINUX_RCUPDATE_H */ diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 37b6f0c2b79d..b2b2dc990da9 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -85,6 +85,7 @@ static inline void rcu_scheduler_starting(void) { } static inline void rcu_end_inkernel_boot(void) { } static inline bool rcu_is_watching(void) { return true; } static inline void rcu_momentary_dyntick_idle(void) { } +static inline void kfree_rcu_scheduler_running(void) { } /* Avoid RCU read-side critical sections leaking across. */ static inline void rcu_all_qs(void) { barrier(); } diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index c5147de885ec..2f787b9029d1 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -38,6 +38,7 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func); void rcu_barrier(void); bool rcu_eqs_special_set(int cpu); void rcu_momentary_dyntick_idle(void); +void kfree_rcu_scheduler_running(void); unsigned long get_state_synchronize_rcu(void); void cond_synchronize_rcu(unsigned long oldstate); diff --git a/include/linux/tick.h b/include/linux/tick.h index 7896f792d3b0..7340613c7eff 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -109,8 +109,10 @@ enum tick_dep_bits { TICK_DEP_BIT_PERF_EVENTS = 1, TICK_DEP_BIT_SCHED = 2, TICK_DEP_BIT_CLOCK_UNSTABLE = 3, - TICK_DEP_BIT_RCU = 4 + TICK_DEP_BIT_RCU = 4, + TICK_DEP_BIT_RCU_EXP = 5 }; +#define TICK_DEP_BIT_MAX TICK_DEP_BIT_RCU_EXP #define TICK_DEP_MASK_NONE 0 #define TICK_DEP_MASK_POSIX_TIMER (1 << TICK_DEP_BIT_POSIX_TIMER) @@ -118,6 +120,7 @@ enum tick_dep_bits { #define TICK_DEP_MASK_SCHED (1 << TICK_DEP_BIT_SCHED) #define TICK_DEP_MASK_CLOCK_UNSTABLE (1 << TICK_DEP_BIT_CLOCK_UNSTABLE) #define TICK_DEP_MASK_RCU (1 << TICK_DEP_BIT_RCU) +#define TICK_DEP_MASK_RCU_EXP (1 << TICK_DEP_BIT_RCU_EXP) #ifdef CONFIG_NO_HZ_COMMON extern bool tick_nohz_enabled; diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index 66122602bd08..5e49b06e8104 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -41,7 +41,7 @@ TRACE_EVENT(rcu_utilization, TP_printk("%s", __entry->s) ); -#if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU) +#if defined(CONFIG_TREE_RCU) /* * Tracepoint for grace-period events. Takes a string identifying the @@ -432,7 +432,7 @@ TRACE_EVENT_RCU(rcu_fqs, __entry->cpu, __entry->qsevent) ); -#endif /* #if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU) */ +#endif /* #if defined(CONFIG_TREE_RCU) */ /* * Tracepoint for dyntick-idle entry/exit events. These take a string @@ -449,7 +449,7 @@ TRACE_EVENT_RCU(rcu_fqs, */ TRACE_EVENT_RCU(rcu_dyntick, - TP_PROTO(const char *polarity, long oldnesting, long newnesting, atomic_t dynticks), + TP_PROTO(const char *polarity, long oldnesting, long newnesting, int dynticks), TP_ARGS(polarity, oldnesting, newnesting, dynticks), @@ -464,7 +464,7 @@ TRACE_EVENT_RCU(rcu_dyntick, __entry->polarity = polarity; __entry->oldnesting = oldnesting; __entry->newnesting = newnesting; - __entry->dynticks = atomic_read(&dynticks); + __entry->dynticks = dynticks; ), TP_printk("%s %lx %lx %#3x", __entry->polarity, @@ -481,16 +481,14 @@ TRACE_EVENT_RCU(rcu_dyntick, */ TRACE_EVENT_RCU(rcu_callback, - TP_PROTO(const char *rcuname, struct rcu_head *rhp, long qlen_lazy, - long qlen), + TP_PROTO(const char *rcuname, struct rcu_head *rhp, long qlen), - TP_ARGS(rcuname, rhp, qlen_lazy, qlen), + TP_ARGS(rcuname, rhp, qlen), TP_STRUCT__entry( __field(const char *, rcuname) __field(void *, rhp) __field(void *, func) - __field(long, qlen_lazy) __field(long, qlen) ), @@ -498,13 +496,12 @@ TRACE_EVENT_RCU(rcu_callback, __entry->rcuname = rcuname; __entry->rhp = rhp; __entry->func = rhp->func; - __entry->qlen_lazy = qlen_lazy; __entry->qlen = qlen; ), - TP_printk("%s rhp=%p func=%ps %ld/%ld", + TP_printk("%s rhp=%p func=%ps %ld", __entry->rcuname, __entry->rhp, __entry->func, - __entry->qlen_lazy, __entry->qlen) + __entry->qlen) ); /* @@ -518,15 +515,14 @@ TRACE_EVENT_RCU(rcu_callback, TRACE_EVENT_RCU(rcu_kfree_callback, TP_PROTO(const char *rcuname, struct rcu_head *rhp, unsigned long offset, - long qlen_lazy, long qlen), + long qlen), - TP_ARGS(rcuname, rhp, offset, qlen_lazy, qlen), + TP_ARGS(rcuname, rhp, offset, qlen), TP_STRUCT__entry( __field(const char *, rcuname) __field(void *, rhp) __field(unsigned long, offset) - __field(long, qlen_lazy) __field(long, qlen) ), @@ -534,13 +530,12 @@ TRACE_EVENT_RCU(rcu_kfree_callback, __entry->rcuname = rcuname; __entry->rhp = rhp; __entry->offset = offset; - __entry->qlen_lazy = qlen_lazy; __entry->qlen = qlen; ), - TP_printk("%s rhp=%p func=%ld %ld/%ld", + TP_printk("%s rhp=%p func=%ld %ld", __entry->rcuname, __entry->rhp, __entry->offset, - __entry->qlen_lazy, __entry->qlen) + __entry->qlen) ); /* @@ -552,27 +547,24 @@ TRACE_EVENT_RCU(rcu_kfree_callback, */ TRACE_EVENT_RCU(rcu_batch_start, - TP_PROTO(const char *rcuname, long qlen_lazy, long qlen, long blimit), + TP_PROTO(const char *rcuname, long qlen, long blimit), - TP_ARGS(rcuname, qlen_lazy, qlen, blimit), + TP_ARGS(rcuname, qlen, blimit), TP_STRUCT__entry( __field(const char *, rcuname) - __field(long, qlen_lazy) __field(long, qlen) __field(long, blimit) ), TP_fast_assign( __entry->rcuname = rcuname; - __entry->qlen_lazy = qlen_lazy; __entry->qlen = qlen; __entry->blimit = blimit; ), - TP_printk("%s CBs=%ld/%ld bl=%ld", - __entry->rcuname, __entry->qlen_lazy, __entry->qlen, - __entry->blimit) + TP_printk("%s CBs=%ld bl=%ld", + __entry->rcuname, __entry->qlen, __entry->blimit) ); /* diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index 7644eda17d62..1cc940fef17c 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -7,7 +7,7 @@ menu "RCU Subsystem" config TREE_RCU bool - default y if !PREEMPTION && SMP + default y if SMP help This option selects the RCU implementation that is designed for very large SMP system with hundreds or @@ -17,6 +17,7 @@ config TREE_RCU config PREEMPT_RCU bool default y if PREEMPTION + select TREE_RCU help This option selects the RCU implementation that is designed for very large SMP systems with hundreds or @@ -78,7 +79,7 @@ config TASKS_RCU user-mode execution as quiescent states. config RCU_STALL_COMMON - def_bool ( TREE_RCU || PREEMPT_RCU ) + def_bool TREE_RCU help This option enables RCU CPU stall code that is common between the TINY and TREE variants of RCU. The purpose is to allow @@ -86,13 +87,13 @@ config RCU_STALL_COMMON making these warnings mandatory for the tree variants. config RCU_NEED_SEGCBLIST - def_bool ( TREE_RCU || PREEMPT_RCU || TREE_SRCU ) + def_bool ( TREE_RCU || TREE_SRCU ) config RCU_FANOUT int "Tree-based hierarchical RCU fanout value" range 2 64 if 64BIT range 2 32 if !64BIT - depends on (TREE_RCU || PREEMPT_RCU) && RCU_EXPERT + depends on TREE_RCU && RCU_EXPERT default 64 if 64BIT default 32 if !64BIT help @@ -112,7 +113,7 @@ config RCU_FANOUT_LEAF int "Tree-based hierarchical RCU leaf-level fanout value" range 2 64 if 64BIT range 2 32 if !64BIT - depends on (TREE_RCU || PREEMPT_RCU) && RCU_EXPERT + depends on TREE_RCU && RCU_EXPERT default 16 help This option controls the leaf-level fanout of hierarchical @@ -187,7 +188,7 @@ config RCU_BOOST_DELAY config RCU_NOCB_CPU bool "Offload RCU callback processing from boot-selected CPUs" - depends on TREE_RCU || PREEMPT_RCU + depends on TREE_RCU depends on RCU_EXPERT || NO_HZ_FULL default n help @@ -200,8 +201,8 @@ config RCU_NOCB_CPU specified at boot time by the rcu_nocbs parameter. For each such CPU, a kthread ("rcuox/N") will be created to invoke callbacks, where the "N" is the CPU being offloaded, and where - the "p" for RCU-preempt (PREEMPT kernels) and "s" for RCU-sched - (!PREEMPT kernels). Nothing prevents this kthread from running + the "p" for RCU-preempt (PREEMPTION kernels) and "s" for RCU-sched + (!PREEMPTION kernels). Nothing prevents this kthread from running on the specified CPUs, but (1) the kthreads may be preempted between each callback, and (2) affinity or cgroups can be used to force the kthreads to run on whatever set of CPUs is desired. diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile index 020e8b6a644b..82d5fba48b2f 100644 --- a/kernel/rcu/Makefile +++ b/kernel/rcu/Makefile @@ -9,6 +9,5 @@ obj-$(CONFIG_TINY_SRCU) += srcutiny.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o obj-$(CONFIG_RCU_PERF_TEST) += rcuperf.o obj-$(CONFIG_TREE_RCU) += tree.o -obj-$(CONFIG_PREEMPT_RCU) += tree.o obj-$(CONFIG_TINY_RCU) += tiny.o obj-$(CONFIG_RCU_NEED_SEGCBLIST) += rcu_segcblist.o diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index ab504fbc76ca..05f936ed167a 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -198,33 +198,6 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head) } #endif /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ -void kfree(const void *); - -/* - * Reclaim the specified callback, either by invoking it (non-lazy case) - * or freeing it directly (lazy case). Return true if lazy, false otherwise. - */ -static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head) -{ - rcu_callback_t f; - unsigned long offset = (unsigned long)head->func; - - rcu_lock_acquire(&rcu_callback_map); - if (__is_kfree_rcu_offset(offset)) { - trace_rcu_invoke_kfree_callback(rn, head, offset); - kfree((void *)head - offset); - rcu_lock_release(&rcu_callback_map); - return true; - } else { - trace_rcu_invoke_callback(rn, head); - f = head->func; - WRITE_ONCE(head->func, (rcu_callback_t)0L); - f(head); - rcu_lock_release(&rcu_callback_map); - return false; - } -} - #ifdef CONFIG_RCU_STALL_COMMON extern int rcu_cpu_stall_ftrace_dump; @@ -281,7 +254,7 @@ void rcu_test_sync_prims(void); */ extern void resched_cpu(int cpu); -#if defined(SRCU) || !defined(TINY_RCU) +#if defined(CONFIG_SRCU) || !defined(CONFIG_TINY_RCU) #include <linux/rcu_node_tree.h> @@ -418,7 +391,7 @@ do { \ #define raw_lockdep_assert_held_rcu_node(p) \ lockdep_assert_held(&ACCESS_PRIVATE(p, lock)) -#endif /* #if defined(SRCU) || !defined(TINY_RCU) */ +#endif /* #if defined(CONFIG_SRCU) || !defined(CONFIG_TINY_RCU) */ #ifdef CONFIG_SRCU void srcu_init(void); @@ -454,7 +427,7 @@ enum rcutorture_type { INVALID_RCU_FLAVOR }; -#if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU) +#if defined(CONFIG_TREE_RCU) void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, unsigned long *gp_seq); void do_trace_rcu_torture_read(const char *rcutorturename, diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index cbc87b804db9..5f4fd3b8777c 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c @@ -20,14 +20,10 @@ void rcu_cblist_init(struct rcu_cblist *rclp) rclp->head = NULL; rclp->tail = &rclp->head; rclp->len = 0; - rclp->len_lazy = 0; } /* * Enqueue an rcu_head structure onto the specified callback list. - * This function assumes that the callback is non-lazy because it - * is intended for use by no-CBs CPUs, which do not distinguish - * between lazy and non-lazy RCU callbacks. */ void rcu_cblist_enqueue(struct rcu_cblist *rclp, struct rcu_head *rhp) { @@ -54,7 +50,6 @@ void rcu_cblist_flush_enqueue(struct rcu_cblist *drclp, else drclp->tail = &drclp->head; drclp->len = srclp->len; - drclp->len_lazy = srclp->len_lazy; if (!rhp) { rcu_cblist_init(srclp); } else { @@ -62,16 +57,12 @@ void rcu_cblist_flush_enqueue(struct rcu_cblist *drclp, srclp->head = rhp; srclp->tail = &rhp->next; WRITE_ONCE(srclp->len, 1); - srclp->len_lazy = 0; } } /* * Dequeue the oldest rcu_head structure from the specified callback - * list. This function assumes that the callback is non-lazy, but - * the caller can later invoke rcu_cblist_dequeued_lazy() if it - * finds otherwise (and if it cares about laziness). This allows - * different users to have different ways of determining laziness. + * list. */ struct rcu_head *rcu_cblist_dequeue(struct rcu_cblist *rclp) { @@ -161,7 +152,6 @@ void rcu_segcblist_init(struct rcu_segcblist *rsclp) for (i = 0; i < RCU_CBLIST_NSEGS; i++) rsclp->tails[i] = &rsclp->head; rcu_segcblist_set_len(rsclp, 0); - rsclp->len_lazy = 0; rsclp->enabled = 1; } @@ -173,7 +163,6 @@ void rcu_segcblist_disable(struct rcu_segcblist *rsclp) { WARN_ON_ONCE(!rcu_segcblist_empty(rsclp)); WARN_ON_ONCE(rcu_segcblist_n_cbs(rsclp)); - WARN_ON_ONCE(rcu_segcblist_n_lazy_cbs(rsclp)); rsclp->enabled = 0; } @@ -253,11 +242,9 @@ bool rcu_segcblist_nextgp(struct rcu_segcblist *rsclp, unsigned long *lp) * absolutely not OK for it to ever miss posting a callback. */ void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp, - struct rcu_head *rhp, bool lazy) + struct rcu_head *rhp) { rcu_segcblist_inc_len(rsclp); - if (lazy) - rsclp->len_lazy++; smp_mb(); /* Ensure counts are updated before callback is enqueued. */ rhp->next = NULL; WRITE_ONCE(*rsclp->tails[RCU_NEXT_TAIL], rhp); @@ -275,15 +262,13 @@ void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp, * period. You have been warned. */ bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp, - struct rcu_head *rhp, bool lazy) + struct rcu_head *rhp) { int i; if (rcu_segcblist_n_cbs(rsclp) == 0) return false; rcu_segcblist_inc_len(rsclp); - if (lazy) - rsclp->len_lazy++; smp_mb(); /* Ensure counts are updated before callback is entrained. */ rhp->next = NULL; for (i = RCU_NEXT_TAIL; i > RCU_DONE_TAIL; i--) @@ -307,8 +292,6 @@ bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp, void rcu_segcblist_extract_count(struct rcu_segcblist *rsclp, struct rcu_cblist *rclp) { - rclp->len_lazy += rsclp->len_lazy; - rsclp->len_lazy = 0; rclp->len = rcu_segcblist_xchg_len(rsclp, 0); } @@ -361,9 +344,7 @@ void rcu_segcblist_extract_pend_cbs(struct rcu_segcblist *rsclp, void rcu_segcblist_insert_count(struct rcu_segcblist *rsclp, struct rcu_cblist *rclp) { - rsclp->len_lazy += rclp->len_lazy; rcu_segcblist_add_len(rsclp, rclp->len); - rclp->len_lazy = 0; rclp->len = 0; } diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h index 815c2fdd3fcc..5c293afc07b8 100644 --- a/kernel/rcu/rcu_segcblist.h +++ b/kernel/rcu/rcu_segcblist.h @@ -15,15 +15,6 @@ static inline long rcu_cblist_n_cbs(struct rcu_cblist *rclp) return READ_ONCE(rclp->len); } -/* - * Account for the fact that a previously dequeued callback turned out - * to be marked as lazy. - */ -static inline void rcu_cblist_dequeued_lazy(struct rcu_cblist *rclp) -{ - rclp->len_lazy--; -} - void rcu_cblist_init(struct rcu_cblist *rclp); void rcu_cblist_enqueue(struct rcu_cblist *rclp, struct rcu_head *rhp); void rcu_cblist_flush_enqueue(struct rcu_cblist *drclp, @@ -59,18 +50,6 @@ static inline long rcu_segcblist_n_cbs(struct rcu_segcblist *rsclp) #endif } -/* Return number of lazy callbacks in segmented callback list. */ -static inline long rcu_segcblist_n_lazy_cbs(struct rcu_segcblist *rsclp) -{ - return rsclp->len_lazy; -} - -/* Return number of lazy callbacks in segmented callback list. */ -static inline long rcu_segcblist_n_nonlazy_cbs(struct rcu_segcblist *rsclp) -{ - return rcu_segcblist_n_cbs(rsclp) - rsclp->len_lazy; -} - /* * Is the specified rcu_segcblist enabled, for example, not corresponding * to an offline CPU? @@ -106,9 +85,9 @@ struct rcu_head *rcu_segcblist_first_cb(struct rcu_segcblist *rsclp); struct rcu_head *rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp); bool rcu_segcblist_nextgp(struct rcu_segcblist *rsclp, unsigned long *lp); void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp, - struct rcu_head *rhp, bool lazy); + struct rcu_head *rhp); bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp, - struct rcu_head *rhp, bool lazy); + struct rcu_head *rhp); void rcu_segcblist_extract_count(struct rcu_segcblist *rsclp, struct rcu_cblist *rclp); void rcu_segcblist_extract_done_cbs(struct rcu_segcblist *rsclp, diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index 5f884d560384..da94b89cd531 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -86,6 +86,7 @@ torture_param(bool, shutdown, RCUPERF_SHUTDOWN, "Shutdown at end of performance tests."); torture_param(int, verbose, 1, "Enable verbose debugging printk()s"); torture_param(int, writer_holdoff, 0, "Holdoff (us) between GPs, zero to disable"); +torture_param(int, kfree_rcu_test, 0, "Do we run a kfree_rcu() perf test?"); static char *perf_type = "rcu"; module_param(perf_type, charp, 0444); @@ -105,8 +106,8 @@ static atomic_t n_rcu_perf_writer_finished; static wait_queue_head_t shutdown_wq; static u64 t_rcu_perf_writer_started; static u64 t_rcu_perf_writer_finished; -static unsigned long b_rcu_perf_writer_started; -static unsigned long b_rcu_perf_writer_finished; +static unsigned long b_rcu_gp_test_started; +static unsigned long b_rcu_gp_test_finished; static DEFINE_PER_CPU(atomic_t, n_async_inflight); #define MAX_MEAS 10000 @@ -378,10 +379,10 @@ rcu_perf_writer(void *arg) if (atomic_inc_return(&n_rcu_perf_writer_started) >= nrealwriters) { t_rcu_perf_writer_started = t; if (gp_exp) { - b_rcu_perf_writer_started = + b_rcu_gp_test_started = cur_ops->exp_completed() / 2; } else { - b_rcu_perf_writer_started = cur_ops->get_gp_seq(); + b_rcu_gp_test_started = cur_ops->get_gp_seq(); } } @@ -429,10 +430,10 @@ retry: PERFOUT_STRING("Test complete"); t_rcu_perf_writer_finished = t; if (gp_exp) { - b_rcu_perf_writer_finished = + b_rcu_gp_test_finished = cur_ops->exp_completed() / 2; } else { - b_rcu_perf_writer_finished = + b_rcu_gp_test_finished = cur_ops->get_gp_seq(); } if (shutdown) { @@ -515,8 +516,8 @@ rcu_perf_cleanup(void) t_rcu_perf_writer_finished - t_rcu_perf_writer_started, ngps, - rcuperf_seq_diff(b_rcu_perf_writer_finished, - b_rcu_perf_writer_started)); + rcuperf_seq_diff(b_rcu_gp_test_finished, + b_rcu_gp_test_started)); for (i = 0; i < nrealwriters; i++) { if (!writer_durations) break; @@ -584,6 +585,159 @@ rcu_perf_shutdown(void *arg) return -EINVAL; } +/* + * kfree_rcu() performance tests: Start a kfree_rcu() loop on all CPUs for number + * of iterations and measure total time and number of GP for all iterations to complete. + */ + +torture_param(int, kfree_nthreads, -1, "Number of threads running loops of kfree_rcu()."); +torture_param(int, kfree_alloc_num, 8000, "Number of allocations and frees done in an iteration."); +torture_param(int, kfree_loops, 10, "Number of loops doing kfree_alloc_num allocations and frees."); + +static struct task_struct **kfree_reader_tasks; +static int kfree_nrealthreads; +static atomic_t n_kfree_perf_thread_started; +static atomic_t n_kfree_perf_thread_ended; + +struct kfree_obj { + char kfree_obj[8]; + struct rcu_head rh; +}; + +static int +kfree_perf_thread(void *arg) +{ + int i, loop = 0; + long me = (long)arg; + struct kfree_obj *alloc_ptr; + u64 start_time, end_time; + + VERBOSE_PERFOUT_STRING("kfree_perf_thread task started"); + set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); + set_user_nice(current, MAX_NICE); + + start_time = ktime_get_mono_fast_ns(); + + if (atomic_inc_return(&n_kfree_perf_thread_started) >= kfree_nrealthreads) { + if (gp_exp) + b_rcu_gp_test_started = cur_ops->exp_completed() / 2; + else + b_rcu_gp_test_started = cur_ops->get_gp_seq(); + } + + do { + for (i = 0; i < kfree_alloc_num; i++) { + alloc_ptr = kmalloc(sizeof(struct kfree_obj), GFP_KERNEL); + if (!alloc_ptr) + return -ENOMEM; + + kfree_rcu(alloc_ptr, rh); + } + + cond_resched(); + } while (!torture_must_stop() && ++loop < kfree_loops); + + if (atomic_inc_return(&n_kfree_perf_thread_ended) >= kfree_nrealthreads) { + end_time = ktime_get_mono_fast_ns(); + + if (gp_exp) + b_rcu_gp_test_finished = cur_ops->exp_completed() / 2; + else + b_rcu_gp_test_finished = cur_ops->get_gp_seq(); + + pr_alert("Total time taken by all kfree'ers: %llu ns, loops: %d, batches: %ld\n", + (unsigned long long)(end_time - start_time), kfree_loops, + rcuperf_seq_diff(b_rcu_gp_test_finished, b_rcu_gp_test_started)); + if (shutdown) { + smp_mb(); /* Assign before wake. */ + wake_up(&shutdown_wq); + } + } + + torture_kthread_stopping("kfree_perf_thread"); + return 0; +} + +static void +kfree_perf_cleanup(void) +{ + int i; + + if (torture_cleanup_begin()) + return; + + if (kfree_reader_tasks) { + for (i = 0; i < kfree_nrealthreads; i++) + torture_stop_kthread(kfree_perf_thread, + kfree_reader_tasks[i]); + kfree(kfree_reader_tasks); + } + + torture_cleanup_end(); +} + +/* + * shutdown kthread. Just waits to be awakened, then shuts down system. + */ +static int +kfree_perf_shutdown(void *arg) +{ + do { + wait_event(shutdown_wq, + atomic_read(&n_kfree_perf_thread_ended) >= + kfree_nrealthreads); + } while (atomic_read(&n_kfree_perf_thread_ended) < kfree_nrealthreads); + + smp_mb(); /* Wake before output. */ + + kfree_perf_cleanup(); + kernel_power_off(); + return -EINVAL; +} + +static int __init +kfree_perf_init(void) +{ + long i; + int firsterr = 0; + + kfree_nrealthreads = compute_real(kfree_nthreads); + /* Start up the kthreads. */ + if (shutdown) { + init_waitqueue_head(&shutdown_wq); + firsterr = torture_create_kthread(kfree_perf_shutdown, NULL, + shutdown_task); + if (firsterr) + goto unwind; + schedule_timeout_uninterruptible(1); + } + + kfree_reader_tasks = kcalloc(kfree_nrealthreads, sizeof(kfree_reader_tasks[0]), + GFP_KERNEL); + if (kfree_reader_tasks == NULL) { + firsterr = -ENOMEM; + goto unwind; + } + + for (i = 0; i < kfree_nrealthreads; i++) { + firsterr = torture_create_kthread(kfree_perf_thread, (void *)i, + kfree_reader_tasks[i]); + if (firsterr) + goto unwind; + } + + while (atomic_read(&n_kfree_perf_thread_started) < kfree_nrealthreads) + schedule_timeout_uninterruptible(1); + + torture_init_end(); + return 0; + +unwind: + torture_init_end(); + kfree_perf_cleanup(); + return firsterr; +} + static int __init rcu_perf_init(void) { @@ -616,6 +770,9 @@ rcu_perf_init(void) if (cur_ops->init) cur_ops->init(); + if (kfree_rcu_test) + return kfree_perf_init(); + nrealwriters = compute_real(nwriters); nrealreaders = compute_real(nreaders); atomic_set(&n_rcu_perf_reader_started, 0); diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index f77f4d886cc1..1aeecc165b21 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1740,7 +1740,7 @@ static void rcu_torture_fwd_cb_cr(struct rcu_head *rhp) // Give the scheduler a chance, even on nohz_full CPUs. static void rcu_torture_fwd_prog_cond_resched(unsigned long iter) { - if (IS_ENABLED(CONFIG_PREEMPT) && IS_ENABLED(CONFIG_NO_HZ_FULL)) { + if (IS_ENABLED(CONFIG_PREEMPTION) && IS_ENABLED(CONFIG_NO_HZ_FULL)) { // Real call_rcu() floods hit userspace, so emulate that. if (need_resched() || (iter & 0xfff)) schedule(); diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index 44d6606b8325..6208c1dae5c9 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -103,7 +103,7 @@ EXPORT_SYMBOL_GPL(__srcu_read_unlock); /* * Workqueue handler to drive one grace period and invoke any callbacks - * that become ready as a result. Single-CPU and !PREEMPT operation + * that become ready as a result. Single-CPU and !PREEMPTION operation * means that we get away with murder on synchronization. ;-) */ void srcu_drive_gp(struct work_struct *wp) diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 5dffade2d7cd..657e6a7d1c03 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -530,7 +530,7 @@ static void srcu_gp_end(struct srcu_struct *ssp) idx = rcu_seq_state(ssp->srcu_gp_seq); WARN_ON_ONCE(idx != SRCU_STATE_SCAN2); cbdelay = srcu_get_delay(ssp); - ssp->srcu_last_gp_end = ktime_get_mono_fast_ns(); + WRITE_ONCE(ssp->srcu_last_gp_end, ktime_get_mono_fast_ns()); rcu_seq_end(&ssp->srcu_gp_seq); gpseq = rcu_seq_current(&ssp->srcu_gp_seq); if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, gpseq)) @@ -762,6 +762,7 @@ static bool srcu_might_be_idle(struct srcu_struct *ssp) unsigned long flags; struct srcu_data *sdp; unsigned long t; + unsigned long tlast; /* If the local srcu_data structure has callbacks, not idle. */ local_irq_save(flags); @@ -780,9 +781,9 @@ static bool srcu_might_be_idle(struct srcu_struct *ssp) /* First, see if enough time has passed since the last GP. */ t = ktime_get_mono_fast_ns(); + tlast = READ_ONCE(ssp->srcu_last_gp_end); if (exp_holdoff == 0 || - time_in_range_open(t, ssp->srcu_last_gp_end, - ssp->srcu_last_gp_end + exp_holdoff)) + time_in_range_open(t, tlast, tlast + exp_holdoff)) return false; /* Too soon after last GP. */ /* Next, check for probable idleness. */ @@ -853,7 +854,7 @@ static void __call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp, local_irq_save(flags); sdp = this_cpu_ptr(ssp->sda); spin_lock_rcu_node(sdp); - rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp, false); + rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp); rcu_segcblist_advance(&sdp->srcu_cblist, rcu_seq_current(&ssp->srcu_gp_seq)); s = rcu_seq_snap(&ssp->srcu_gp_seq); @@ -1052,7 +1053,7 @@ void srcu_barrier(struct srcu_struct *ssp) sdp->srcu_barrier_head.func = srcu_barrier_cb; debug_rcu_head_queue(&sdp->srcu_barrier_head); if (!rcu_segcblist_entrain(&sdp->srcu_cblist, - &sdp->srcu_barrier_head, 0)) { + &sdp->srcu_barrier_head)) { debug_rcu_head_unqueue(&sdp->srcu_barrier_head); atomic_dec(&ssp->srcu_barrier_cpu_cnt); } diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 477b4eb44af5..dd572ce7c747 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -22,6 +22,7 @@ #include <linux/time.h> #include <linux/cpu.h> #include <linux/prefetch.h> +#include <linux/slab.h> #include "rcu.h" @@ -73,6 +74,31 @@ void rcu_sched_clock_irq(int user) } } +/* + * Reclaim the specified callback, either by invoking it for non-kfree cases or + * freeing it directly (for kfree). Return true if kfreeing, false otherwise. + */ +static inline bool rcu_reclaim_tiny(struct rcu_head *head) +{ + rcu_callback_t f; + unsigned long offset = (unsigned long)head->func; + + rcu_lock_acquire(&rcu_callback_map); + if (__is_kfree_rcu_offset(offset)) { + trace_rcu_invoke_kfree_callback("", head, offset); + kfree((void *)head - offset); + rcu_lock_release(&rcu_callback_map); + return true; + } + + trace_rcu_invoke_callback("", head); + f = head->func; + WRITE_ONCE(head->func, (rcu_callback_t)0L); + f(head); + rcu_lock_release(&rcu_callback_map); + return false; +} + /* Invoke the RCU callbacks whose grace period has elapsed. */ static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused) { @@ -100,7 +126,7 @@ static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused prefetch(next); debug_rcu_head_unqueue(list); local_bh_disable(); - __rcu_reclaim("", list); + rcu_reclaim_tiny(list); local_bh_enable(); list = next; } diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 1694a6b57ad8..d91c9156fab2 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -43,7 +43,6 @@ #include <uapi/linux/sched/types.h> #include <linux/prefetch.h> #include <linux/delay.h> -#include <linux/stop_machine.h> #include <linux/random.h> #include <linux/trace_events.h> #include <linux/suspend.h> @@ -55,6 +54,7 @@ #include <linux/oom.h> #include <linux/smpboot.h> #include <linux/jiffies.h> +#include <linux/slab.h> #include <linux/sched/isolation.h> #include <linux/sched/clock.h> #include "../time/tick-internal.h" @@ -84,7 +84,7 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_data) = { .dynticks_nmi_nesting = DYNTICK_IRQ_NONIDLE, .dynticks = ATOMIC_INIT(RCU_DYNTICK_CTRL_CTR), }; -struct rcu_state rcu_state = { +static struct rcu_state rcu_state = { .level = { &rcu_state.node[0] }, .gp_state = RCU_GP_IDLE, .gp_seq = (0UL - 300UL) << RCU_SEQ_CTR_SHIFT, @@ -188,7 +188,7 @@ EXPORT_SYMBOL_GPL(rcu_get_gp_kthreads_prio); * held, but the bit corresponding to the current CPU will be stable * in most contexts. */ -unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp) +static unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp) { return READ_ONCE(rnp->qsmaskinitnext); } @@ -294,7 +294,7 @@ static void rcu_dynticks_eqs_online(void) * * No ordering, as we are sampling CPU-local information. */ -bool rcu_dynticks_curr_cpu_in_eqs(void) +static bool rcu_dynticks_curr_cpu_in_eqs(void) { struct rcu_data *rdp = this_cpu_ptr(&rcu_data); @@ -305,7 +305,7 @@ bool rcu_dynticks_curr_cpu_in_eqs(void) * Snapshot the ->dynticks counter with full ordering so as to allow * stable comparison of this counter with past and future snapshots. */ -int rcu_dynticks_snap(struct rcu_data *rdp) +static int rcu_dynticks_snap(struct rcu_data *rdp) { int snap = atomic_add_return(0, &rdp->dynticks); @@ -529,16 +529,6 @@ static struct rcu_node *rcu_get_root(void) } /* - * Convert a ->gp_state value to a character string. - */ -static const char *gp_state_getname(short gs) -{ - if (gs < 0 || gs >= ARRAY_SIZE(gp_state_names)) - return "???"; - return gp_state_names[gs]; -} - -/* * Send along grace-period-related data for rcutorture diagnostics. */ void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, @@ -577,7 +567,7 @@ static void rcu_eqs_enter(bool user) } lockdep_assert_irqs_disabled(); - trace_rcu_dyntick(TPS("Start"), rdp->dynticks_nesting, 0, rdp->dynticks); + trace_rcu_dyntick(TPS("Start"), rdp->dynticks_nesting, 0, atomic_read(&rdp->dynticks)); WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)); rdp = this_cpu_ptr(&rcu_data); do_nocb_deferred_wakeup(rdp); @@ -650,14 +640,15 @@ static __always_inline void rcu_nmi_exit_common(bool irq) * leave it in non-RCU-idle state. */ if (rdp->dynticks_nmi_nesting != 1) { - trace_rcu_dyntick(TPS("--="), rdp->dynticks_nmi_nesting, rdp->dynticks_nmi_nesting - 2, rdp->dynticks); + trace_rcu_dyntick(TPS("--="), rdp->dynticks_nmi_nesting, rdp->dynticks_nmi_nesting - 2, + atomic_read(&rdp->dynticks)); WRITE_ONCE(rdp->dynticks_nmi_nesting, /* No store tearing. */ rdp->dynticks_nmi_nesting - 2); return; } /* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */ - trace_rcu_dyntick(TPS("Startirq"), rdp->dynticks_nmi_nesting, 0, rdp->dynticks); + trace_rcu_dyntick(TPS("Startirq"), rdp->dynticks_nmi_nesting, 0, atomic_read(&rdp->dynticks)); WRITE_ONCE(rdp->dynticks_nmi_nesting, 0); /* Avoid store tearing. */ if (irq) @@ -744,7 +735,7 @@ static void rcu_eqs_exit(bool user) rcu_dynticks_task_exit(); rcu_dynticks_eqs_exit(); rcu_cleanup_after_idle(); - trace_rcu_dyntick(TPS("End"), rdp->dynticks_nesting, 1, rdp->dynticks); + trace_rcu_dyntick(TPS("End"), rdp->dynticks_nesting, 1, atomic_read(&rdp->dynticks)); WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)); WRITE_ONCE(rdp->dynticks_nesting, 1); WARN_ON_ONCE(rdp->dynticks_nmi_nesting); @@ -800,8 +791,8 @@ void rcu_user_exit(void) */ static __always_inline void rcu_nmi_enter_common(bool irq) { - struct rcu_data *rdp = this_cpu_ptr(&rcu_data); long incby = 2; + struct rcu_data *rdp = this_cpu_ptr(&rcu_data); /* Complain about underflow. */ WARN_ON_ONCE(rdp->dynticks_nmi_nesting < 0); @@ -828,12 +819,17 @@ static __always_inline void rcu_nmi_enter_common(bool irq) } else if (tick_nohz_full_cpu(rdp->cpu) && rdp->dynticks_nmi_nesting == DYNTICK_IRQ_NONIDLE && READ_ONCE(rdp->rcu_urgent_qs) && !rdp->rcu_forced_tick) { - rdp->rcu_forced_tick = true; - tick_dep_set_cpu(rdp->cpu, TICK_DEP_BIT_RCU); + raw_spin_lock_rcu_node(rdp->mynode); + // Recheck under lock. + if (rdp->rcu_urgent_qs && !rdp->rcu_forced_tick) { + rdp->rcu_forced_tick = true; + tick_dep_set_cpu(rdp->cpu, TICK_DEP_BIT_RCU); + } + raw_spin_unlock_rcu_node(rdp->mynode); } trace_rcu_dyntick(incby == 1 ? TPS("Endirq") : TPS("++="), rdp->dynticks_nmi_nesting, - rdp->dynticks_nmi_nesting + incby, rdp->dynticks); + rdp->dynticks_nmi_nesting + incby, atomic_read(&rdp->dynticks)); WRITE_ONCE(rdp->dynticks_nmi_nesting, /* Prevent store tearing. */ rdp->dynticks_nmi_nesting + incby); barrier(); @@ -898,6 +894,7 @@ void rcu_irq_enter_irqson(void) */ static void rcu_disable_urgency_upon_qs(struct rcu_data *rdp) { + raw_lockdep_assert_held_rcu_node(rdp->mynode); WRITE_ONCE(rdp->rcu_urgent_qs, false); WRITE_ONCE(rdp->rcu_need_heavy_qs, false); if (tick_nohz_full_cpu(rdp->cpu) && rdp->rcu_forced_tick) { @@ -1934,7 +1931,7 @@ rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) struct rcu_node *rnp_p; raw_lockdep_assert_held_rcu_node(rnp); - if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPTION)) || + if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT_RCU)) || WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)) || rnp->qsmask != 0) { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); @@ -2146,7 +2143,6 @@ static void rcu_do_batch(struct rcu_data *rdp) /* If no callbacks are ready, just return. */ if (!rcu_segcblist_ready_cbs(&rdp->cblist)) { trace_rcu_batch_start(rcu_state.name, - rcu_segcblist_n_lazy_cbs(&rdp->cblist), rcu_segcblist_n_cbs(&rdp->cblist), 0); trace_rcu_batch_end(rcu_state.name, 0, !rcu_segcblist_empty(&rdp->cblist), @@ -2168,7 +2164,6 @@ static void rcu_do_batch(struct rcu_data *rdp) if (unlikely(bl > 100)) tlimit = local_clock() + rcu_resched_ns; trace_rcu_batch_start(rcu_state.name, - rcu_segcblist_n_lazy_cbs(&rdp->cblist), rcu_segcblist_n_cbs(&rdp->cblist), bl); rcu_segcblist_extract_done_cbs(&rdp->cblist, &rcl); if (offloaded) @@ -2179,9 +2174,19 @@ static void rcu_do_batch(struct rcu_data *rdp) tick_dep_set_task(current, TICK_DEP_BIT_RCU); rhp = rcu_cblist_dequeue(&rcl); for (; rhp; rhp = rcu_cblist_dequeue(&rcl)) { + rcu_callback_t f; + debug_rcu_head_unqueue(rhp); - if (__rcu_reclaim(rcu_state.name, rhp)) - rcu_cblist_dequeued_lazy(&rcl); + + rcu_lock_acquire(&rcu_callback_map); + trace_rcu_invoke_callback(rcu_state.name, rhp); + + f = rhp->func; + WRITE_ONCE(rhp->func, (rcu_callback_t)0L); + f(rhp); + + rcu_lock_release(&rcu_callback_map); + /* * Stop only if limit reached and CPU has something to do. * Note: The rcl structure counts down from zero. @@ -2294,7 +2299,7 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp)) mask = 0; raw_spin_lock_irqsave_rcu_node(rnp, flags); if (rnp->qsmask == 0) { - if (!IS_ENABLED(CONFIG_PREEMPTION) || + if (!IS_ENABLED(CONFIG_PREEMPT_RCU) || rcu_preempt_blocked_readers_cgp(rnp)) { /* * No point in scanning bits because they @@ -2308,14 +2313,11 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp)) raw_spin_unlock_irqrestore_rcu_node(rnp, flags); continue; } - for_each_leaf_node_possible_cpu(rnp, cpu) { - unsigned long bit = leaf_node_cpu_bit(rnp, cpu); - if ((rnp->qsmask & bit) != 0) { - rdp = per_cpu_ptr(&rcu_data, cpu); - if (f(rdp)) { - mask |= bit; - rcu_disable_urgency_upon_qs(rdp); - } + for_each_leaf_node_cpu_mask(rnp, cpu, rnp->qsmask) { + rdp = per_cpu_ptr(&rcu_data, cpu); + if (f(rdp)) { + mask |= rdp->grpmask; + rcu_disable_urgency_upon_qs(rdp); } } if (mask != 0) { @@ -2474,8 +2476,8 @@ static void rcu_cpu_kthread(unsigned int cpu) char work, *workp = this_cpu_ptr(&rcu_data.rcu_cpu_has_work); int spincnt; + trace_rcu_utilization(TPS("Start CPU kthread@rcu_run")); for (spincnt = 0; spincnt < 10; spincnt++) { - trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait")); local_bh_disable(); *statusp = RCU_KTHREAD_RUNNING; local_irq_disable(); @@ -2583,7 +2585,7 @@ static void rcu_leak_callback(struct rcu_head *rhp) * is expected to specify a CPU. */ static void -__call_rcu(struct rcu_head *head, rcu_callback_t func, bool lazy) +__call_rcu(struct rcu_head *head, rcu_callback_t func) { unsigned long flags; struct rcu_data *rdp; @@ -2618,18 +2620,17 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, bool lazy) if (rcu_segcblist_empty(&rdp->cblist)) rcu_segcblist_init(&rdp->cblist); } + if (rcu_nocb_try_bypass(rdp, head, &was_alldone, flags)) return; // Enqueued onto ->nocb_bypass, so just leave. /* If we get here, rcu_nocb_try_bypass() acquired ->nocb_lock. */ - rcu_segcblist_enqueue(&rdp->cblist, head, lazy); + rcu_segcblist_enqueue(&rdp->cblist, head); if (__is_kfree_rcu_offset((unsigned long)func)) trace_rcu_kfree_callback(rcu_state.name, head, (unsigned long)func, - rcu_segcblist_n_lazy_cbs(&rdp->cblist), rcu_segcblist_n_cbs(&rdp->cblist)); else trace_rcu_callback(rcu_state.name, head, - rcu_segcblist_n_lazy_cbs(&rdp->cblist), rcu_segcblist_n_cbs(&rdp->cblist)); /* Go handle any RCU core processing required. */ @@ -2679,28 +2680,230 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, bool lazy) */ void call_rcu(struct rcu_head *head, rcu_callback_t func) { - __call_rcu(head, func, 0); + __call_rcu(head, func); } EXPORT_SYMBOL_GPL(call_rcu); + +/* Maximum number of jiffies to wait before draining a batch. */ +#define KFREE_DRAIN_JIFFIES (HZ / 50) +#define KFREE_N_BATCHES 2 + +/** + * struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests + * @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period + * @head_free: List of kfree_rcu() objects waiting for a grace period + * @krcp: Pointer to @kfree_rcu_cpu structure + */ + +struct kfree_rcu_cpu_work { + struct rcu_work rcu_work; + struct rcu_head *head_free; + struct kfree_rcu_cpu *krcp; +}; + +/** + * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period + * @head: List of kfree_rcu() objects not yet waiting for a grace period + * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period + * @lock: Synchronize access to this structure + * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES + * @monitor_todo: Tracks whether a @monitor_work delayed work is pending + * @initialized: The @lock and @rcu_work fields have been initialized + * + * This is a per-CPU structure. The reason that it is not included in + * the rcu_data structure is to permit this code to be extracted from + * the RCU files. Such extraction could allow further optimization of + * the interactions with the slab allocators. + */ +struct kfree_rcu_cpu { + struct rcu_head *head; + struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES]; + spinlock_t lock; + struct delayed_work monitor_work; + bool monitor_todo; + bool initialized; +}; + +static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc); + +/* + * This function is invoked in workqueue context after a grace period. + * It frees all the objects queued on ->head_free. + */ +static void kfree_rcu_work(struct work_struct *work) +{ + unsigned long flags; + struct rcu_head *head, *next; + struct kfree_rcu_cpu *krcp; + struct kfree_rcu_cpu_work *krwp; + + krwp = container_of(to_rcu_work(work), + struct kfree_rcu_cpu_work, rcu_work); + krcp = krwp->krcp; + spin_lock_irqsave(&krcp->lock, flags); + head = krwp->head_free; + krwp->head_free = NULL; + spin_unlock_irqrestore(&krcp->lock, flags); + + // List "head" is now private, so traverse locklessly. + for (; head; head = next) { + unsigned long offset = (unsigned long)head->func; + + next = head->next; + // Potentially optimize with kfree_bulk in future. + debug_rcu_head_unqueue(head); + rcu_lock_acquire(&rcu_callback_map); + trace_rcu_invoke_kfree_callback(rcu_state.name, head, offset); + + if (!WARN_ON_ONCE(!__is_kfree_rcu_offset(offset))) { + /* Could be optimized with kfree_bulk() in future. */ + kfree((void *)head - offset); + } + + rcu_lock_release(&rcu_callback_map); + cond_resched_tasks_rcu_qs(); + } +} + /* - * Queue an RCU callback for lazy invocation after a grace period. - * This will likely be later named something like "call_rcu_lazy()", - * but this change will require some way of tagging the lazy RCU - * callbacks in the list of pending callbacks. Until then, this - * function may only be called from __kfree_rcu(). + * Schedule the kfree batch RCU work to run in workqueue context after a GP. + * + * This function is invoked by kfree_rcu_monitor() when the KFREE_DRAIN_JIFFIES + * timeout has been reached. + */ +static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp) +{ + int i; + struct kfree_rcu_cpu_work *krwp = NULL; + + lockdep_assert_held(&krcp->lock); + for (i = 0; i < KFREE_N_BATCHES; i++) + if (!krcp->krw_arr[i].head_free) { + krwp = &(krcp->krw_arr[i]); + break; + } + + // If a previous RCU batch is in progress, we cannot immediately + // queue another one, so return false to tell caller to retry. + if (!krwp) + return false; + + krwp->head_free = krcp->head; + krcp->head = NULL; + INIT_RCU_WORK(&krwp->rcu_work, kfree_rcu_work); + queue_rcu_work(system_wq, &krwp->rcu_work); + return true; +} + +static inline void kfree_rcu_drain_unlock(struct kfree_rcu_cpu *krcp, + unsigned long flags) +{ + // Attempt to start a new batch. + krcp->monitor_todo = false; + if (queue_kfree_rcu_work(krcp)) { + // Success! Our job is done here. + spin_unlock_irqrestore(&krcp->lock, flags); + return; + } + + // Previous RCU batch still in progress, try again later. + krcp->monitor_todo = true; + schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES); + spin_unlock_irqrestore(&krcp->lock, flags); +} + +/* + * This function is invoked after the KFREE_DRAIN_JIFFIES timeout. + * It invokes kfree_rcu_drain_unlock() to attempt to start another batch. + */ +static void kfree_rcu_monitor(struct work_struct *work) +{ + unsigned long flags; + struct kfree_rcu_cpu *krcp = container_of(work, struct kfree_rcu_cpu, + monitor_work.work); + + spin_lock_irqsave(&krcp->lock, flags); + if (krcp->monitor_todo) + kfree_rcu_drain_unlock(krcp, flags); + else + spin_unlock_irqrestore(&krcp->lock, flags); +} + +/* + * Queue a request for lazy invocation of kfree() after a grace period. + * + * Each kfree_call_rcu() request is added to a batch. The batch will be drained + * every KFREE_DRAIN_JIFFIES number of jiffies. All the objects in the batch + * will be kfree'd in workqueue context. This allows us to: + * + * 1. Batch requests together to reduce the number of grace periods during + * heavy kfree_rcu() load. + * + * 2. It makes it possible to use kfree_bulk() on a large number of + * kfree_rcu() requests thus reducing cache misses and the per-object + * overhead of kfree(). */ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) { - __call_rcu(head, func, 1); + unsigned long flags; + struct kfree_rcu_cpu *krcp; + + local_irq_save(flags); // For safely calling this_cpu_ptr(). + krcp = this_cpu_ptr(&krc); + if (krcp->initialized) + spin_lock(&krcp->lock); + + // Queue the object but don't yet schedule the batch. + if (debug_rcu_head_queue(head)) { + // Probable double kfree_rcu(), just leak. + WARN_ONCE(1, "%s(): Double-freed call. rcu_head %p\n", + __func__, head); + goto unlock_return; + } + head->func = func; + head->next = krcp->head; + krcp->head = head; + + // Set timer to drain after KFREE_DRAIN_JIFFIES. + if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING && + !krcp->monitor_todo) { + krcp->monitor_todo = true; + schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES); + } + +unlock_return: + if (krcp->initialized) + spin_unlock(&krcp->lock); + local_irq_restore(flags); } EXPORT_SYMBOL_GPL(kfree_call_rcu); +void __init kfree_rcu_scheduler_running(void) +{ + int cpu; + unsigned long flags; + + for_each_online_cpu(cpu) { + struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); + + spin_lock_irqsave(&krcp->lock, flags); + if (!krcp->head || krcp->monitor_todo) { + spin_unlock_irqrestore(&krcp->lock, flags); + continue; + } + krcp->monitor_todo = true; + schedule_delayed_work_on(cpu, &krcp->monitor_work, + KFREE_DRAIN_JIFFIES); + spin_unlock_irqrestore(&krcp->lock, flags); + } +} + /* * During early boot, any blocking grace-period wait automatically - * implies a grace period. Later on, this is never the case for PREEMPT. + * implies a grace period. Later on, this is never the case for PREEMPTION. * - * Howevr, because a context switch is a grace period for !PREEMPT, any + * Howevr, because a context switch is a grace period for !PREEMPTION, any * blocking grace-period wait automatically implies a grace period if * there is only one CPU online at any point time during execution of * either synchronize_rcu() or synchronize_rcu_expedited(). It is OK to @@ -2896,7 +3099,7 @@ static void rcu_barrier_func(void *unused) debug_rcu_head_queue(&rdp->barrier_head); rcu_nocb_lock(rdp); WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies)); - if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head, 0)) { + if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head)) { atomic_inc(&rcu_state.barrier_cpu_count); } else { debug_rcu_head_unqueue(&rdp->barrier_head); @@ -3557,12 +3760,29 @@ static void __init rcu_dump_rcu_node_tree(void) struct workqueue_struct *rcu_gp_wq; struct workqueue_struct *rcu_par_gp_wq; +static void __init kfree_rcu_batch_init(void) +{ + int cpu; + int i; + + for_each_possible_cpu(cpu) { + struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); + + spin_lock_init(&krcp->lock); + for (i = 0; i < KFREE_N_BATCHES; i++) + krcp->krw_arr[i].krcp = krcp; + INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor); + krcp->initialized = true; + } +} + void __init rcu_init(void) { int cpu; rcu_early_boot_tests(); + kfree_rcu_batch_init(); rcu_bootup_announce(); rcu_init_geometry(); rcu_init_one(); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 055c31781d3a..0c87e4c161c2 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -16,7 +16,6 @@ #include <linux/cpumask.h> #include <linux/seqlock.h> #include <linux/swait.h> -#include <linux/stop_machine.h> #include <linux/rcu_node_tree.h> #include "rcu_segcblist.h" @@ -182,8 +181,8 @@ struct rcu_data { bool rcu_need_heavy_qs; /* GP old, so heavy quiescent state! */ bool rcu_urgent_qs; /* GP old need light quiescent state. */ bool rcu_forced_tick; /* Forced tick to provide QS. */ + bool rcu_forced_tick_exp; /* ... provide QS to expedited GP. */ #ifdef CONFIG_RCU_FAST_NO_HZ - bool all_lazy; /* All CPU's CBs lazy at idle start? */ unsigned long last_accelerate; /* Last jiffy CBs were accelerated. */ unsigned long last_advance_all; /* Last jiffy CBs were all advanced. */ int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */ @@ -368,18 +367,6 @@ struct rcu_state { #define RCU_GP_CLEANUP 7 /* Grace-period cleanup started. */ #define RCU_GP_CLEANED 8 /* Grace-period cleanup complete. */ -static const char * const gp_state_names[] = { - "RCU_GP_IDLE", - "RCU_GP_WAIT_GPS", - "RCU_GP_DONE_GPS", - "RCU_GP_ONOFF", - "RCU_GP_INIT", - "RCU_GP_WAIT_FQS", - "RCU_GP_DOING_FQS", - "RCU_GP_CLEANUP", - "RCU_GP_CLEANED", -}; - /* * In order to export the rcu_state name to the tracing tools, it * needs to be added in the __tracepoint_string section. @@ -403,8 +390,6 @@ static const char *tp_rcu_varname __used __tracepoint_string = rcu_name; #define RCU_NAME rcu_name #endif /* #else #ifdef CONFIG_TRACING */ -int rcu_dynticks_snap(struct rcu_data *rdp); - /* Forward declarations for tree_plugin.h */ static void rcu_bootup_announce(void); static void rcu_qs(void); @@ -415,7 +400,6 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp); static int rcu_print_task_exp_stall(struct rcu_node *rnp); static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); static void rcu_flavor_sched_clock_irq(int user); -void call_rcu(struct rcu_head *head, rcu_callback_t func); static void dump_blkd_tasks(struct rcu_node *rnp, int ncheck); static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index d632cd019597..6935a9e2b094 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -21,7 +21,7 @@ static void rcu_exp_gp_seq_start(void) } /* - * Return then value that expedited-grace-period counter will have + * Return the value that the expedited-grace-period counter will have * at the end of the current grace period. */ static __maybe_unused unsigned long rcu_exp_gp_seq_endval(void) @@ -39,7 +39,9 @@ static void rcu_exp_gp_seq_end(void) } /* - * Take a snapshot of the expedited-grace-period counter. + * Take a snapshot of the expedited-grace-period counter, which is the + * earliest value that will indicate that a full grace period has + * elapsed since the current time. */ static unsigned long rcu_exp_gp_seq_snap(void) { @@ -134,7 +136,7 @@ static void __maybe_unused sync_exp_reset_tree(void) rcu_for_each_node_breadth_first(rnp) { raw_spin_lock_irqsave_rcu_node(rnp, flags); WARN_ON_ONCE(rnp->expmask); - rnp->expmask = rnp->expmaskinit; + WRITE_ONCE(rnp->expmask, rnp->expmaskinit); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } } @@ -143,31 +145,26 @@ static void __maybe_unused sync_exp_reset_tree(void) * Return non-zero if there is no RCU expedited grace period in progress * for the specified rcu_node structure, in other words, if all CPUs and * tasks covered by the specified rcu_node structure have done their bit - * for the current expedited grace period. Works only for preemptible - * RCU -- other RCU implementation use other means. - * - * Caller must hold the specificed rcu_node structure's ->lock + * for the current expedited grace period. */ -static bool sync_rcu_preempt_exp_done(struct rcu_node *rnp) +static bool sync_rcu_exp_done(struct rcu_node *rnp) { raw_lockdep_assert_held_rcu_node(rnp); - return rnp->exp_tasks == NULL && READ_ONCE(rnp->expmask) == 0; } /* - * Like sync_rcu_preempt_exp_done(), but this function assumes the caller - * doesn't hold the rcu_node's ->lock, and will acquire and release the lock - * itself + * Like sync_rcu_exp_done(), but where the caller does not hold the + * rcu_node's ->lock. */ -static bool sync_rcu_preempt_exp_done_unlocked(struct rcu_node *rnp) +static bool sync_rcu_exp_done_unlocked(struct rcu_node *rnp) { unsigned long flags; bool ret; raw_spin_lock_irqsave_rcu_node(rnp, flags); - ret = sync_rcu_preempt_exp_done(rnp); + ret = sync_rcu_exp_done(rnp); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return ret; @@ -181,8 +178,6 @@ static bool sync_rcu_preempt_exp_done_unlocked(struct rcu_node *rnp) * which the task was queued or to one of that rcu_node structure's ancestors, * recursively up the tree. (Calm down, calm down, we do the recursion * iteratively!) - * - * Caller must hold the specified rcu_node structure's ->lock. */ static void __rcu_report_exp_rnp(struct rcu_node *rnp, bool wake, unsigned long flags) @@ -190,8 +185,9 @@ static void __rcu_report_exp_rnp(struct rcu_node *rnp, { unsigned long mask; + raw_lockdep_assert_held_rcu_node(rnp); for (;;) { - if (!sync_rcu_preempt_exp_done(rnp)) { + if (!sync_rcu_exp_done(rnp)) { if (!rnp->expmask) rcu_initiate_boost(rnp, flags); else @@ -211,7 +207,7 @@ static void __rcu_report_exp_rnp(struct rcu_node *rnp, rnp = rnp->parent; raw_spin_lock_rcu_node(rnp); /* irqs already disabled */ WARN_ON_ONCE(!(rnp->expmask & mask)); - rnp->expmask &= ~mask; + WRITE_ONCE(rnp->expmask, rnp->expmask & ~mask); } } @@ -234,14 +230,23 @@ static void __maybe_unused rcu_report_exp_rnp(struct rcu_node *rnp, bool wake) static void rcu_report_exp_cpu_mult(struct rcu_node *rnp, unsigned long mask, bool wake) { + int cpu; unsigned long flags; + struct rcu_data *rdp; raw_spin_lock_irqsave_rcu_node(rnp, flags); if (!(rnp->expmask & mask)) { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } - rnp->expmask &= ~mask; + WRITE_ONCE(rnp->expmask, rnp->expmask & ~mask); + for_each_leaf_node_cpu_mask(rnp, cpu, mask) { + rdp = per_cpu_ptr(&rcu_data, cpu); + if (!IS_ENABLED(CONFIG_NO_HZ_FULL) || !rdp->rcu_forced_tick_exp) + continue; + rdp->rcu_forced_tick_exp = false; + tick_dep_clear_cpu(cpu, TICK_DEP_BIT_RCU_EXP); + } __rcu_report_exp_rnp(rnp, wake, flags); /* Releases rnp->lock. */ } @@ -345,8 +350,8 @@ static void sync_rcu_exp_select_node_cpus(struct work_struct *wp) /* Each pass checks a CPU for identity, offline, and idle. */ mask_ofl_test = 0; for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) { - unsigned long mask = leaf_node_cpu_bit(rnp, cpu); struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + unsigned long mask = rdp->grpmask; int snap; if (raw_smp_processor_id() == cpu || @@ -372,12 +377,10 @@ static void sync_rcu_exp_select_node_cpus(struct work_struct *wp) raw_spin_unlock_irqrestore_rcu_node(rnp, flags); /* IPI the remaining CPUs for expedited quiescent state. */ - for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) { - unsigned long mask = leaf_node_cpu_bit(rnp, cpu); + for_each_leaf_node_cpu_mask(rnp, cpu, mask_ofl_ipi) { struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + unsigned long mask = rdp->grpmask; - if (!(mask_ofl_ipi & mask)) - continue; retry_ipi: if (rcu_dynticks_in_eqs_since(rdp, rdp->exp_dynticks_snap)) { mask_ofl_test |= mask; @@ -389,10 +392,10 @@ retry_ipi: } ret = smp_call_function_single(cpu, rcu_exp_handler, NULL, 0); put_cpu(); - if (!ret) { - mask_ofl_ipi &= ~mask; + /* The CPU will report the QS in response to the IPI. */ + if (!ret) continue; - } + /* Failed, raced with CPU hotplug operation. */ raw_spin_lock_irqsave_rcu_node(rnp, flags); if ((rnp->qsmaskinitnext & mask) && @@ -403,13 +406,12 @@ retry_ipi: schedule_timeout_uninterruptible(1); goto retry_ipi; } - /* CPU really is offline, so we can ignore it. */ - if (!(rnp->expmask & mask)) - mask_ofl_ipi &= ~mask; + /* CPU really is offline, so we must report its QS. */ + if (rnp->expmask & mask) + mask_ofl_test |= mask; raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } /* Report quiescent states for those that went offline. */ - mask_ofl_test |= mask_ofl_ipi; if (mask_ofl_test) rcu_report_exp_cpu_mult(rnp, mask_ofl_test, false); } @@ -456,29 +458,62 @@ static void sync_rcu_exp_select_cpus(void) flush_work(&rnp->rew.rew_work); } -static void synchronize_sched_expedited_wait(void) +/* + * Wait for the expedited grace period to elapse, within time limit. + * If the time limit is exceeded without the grace period elapsing, + * return false, otherwise return true. + */ +static bool synchronize_rcu_expedited_wait_once(long tlimit) +{ + int t; + struct rcu_node *rnp_root = rcu_get_root(); + + t = swait_event_timeout_exclusive(rcu_state.expedited_wq, + sync_rcu_exp_done_unlocked(rnp_root), + tlimit); + // Workqueues should not be signaled. + if (t > 0 || sync_rcu_exp_done_unlocked(rnp_root)) + return true; + WARN_ON(t < 0); /* workqueues should not be signaled. */ + return false; +} + +/* + * Wait for the expedited grace period to elapse, issuing any needed + * RCU CPU stall warnings along the way. + */ +static void synchronize_rcu_expedited_wait(void) { int cpu; unsigned long jiffies_stall; unsigned long jiffies_start; unsigned long mask; int ndetected; + struct rcu_data *rdp; struct rcu_node *rnp; struct rcu_node *rnp_root = rcu_get_root(); - int ret; trace_rcu_exp_grace_period(rcu_state.name, rcu_exp_gp_seq_endval(), TPS("startwait")); jiffies_stall = rcu_jiffies_till_stall_check(); jiffies_start = jiffies; + if (IS_ENABLED(CONFIG_NO_HZ_FULL)) { + if (synchronize_rcu_expedited_wait_once(1)) + return; + rcu_for_each_leaf_node(rnp) { + for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) { + rdp = per_cpu_ptr(&rcu_data, cpu); + if (rdp->rcu_forced_tick_exp) + continue; + rdp->rcu_forced_tick_exp = true; + tick_dep_set_cpu(cpu, TICK_DEP_BIT_RCU_EXP); + } + } + WARN_ON_ONCE(1); + } for (;;) { - ret = swait_event_timeout_exclusive( - rcu_state.expedited_wq, - sync_rcu_preempt_exp_done_unlocked(rnp_root), - jiffies_stall); - if (ret > 0 || sync_rcu_preempt_exp_done_unlocked(rnp_root)) + if (synchronize_rcu_expedited_wait_once(jiffies_stall)) return; - WARN_ON(ret < 0); /* workqueues should not be signaled. */ if (rcu_cpu_stall_suppress) continue; panic_on_rcu_stall(); @@ -491,7 +526,7 @@ static void synchronize_sched_expedited_wait(void) struct rcu_data *rdp; mask = leaf_node_cpu_bit(rnp, cpu); - if (!(rnp->expmask & mask)) + if (!(READ_ONCE(rnp->expmask) & mask)) continue; ndetected++; rdp = per_cpu_ptr(&rcu_data, cpu); @@ -503,17 +538,18 @@ static void synchronize_sched_expedited_wait(void) } pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n", jiffies - jiffies_start, rcu_state.expedited_sequence, - rnp_root->expmask, ".T"[!!rnp_root->exp_tasks]); + READ_ONCE(rnp_root->expmask), + ".T"[!!rnp_root->exp_tasks]); if (ndetected) { pr_err("blocking rcu_node structures:"); rcu_for_each_node_breadth_first(rnp) { if (rnp == rnp_root) continue; /* printed unconditionally */ - if (sync_rcu_preempt_exp_done_unlocked(rnp)) + if (sync_rcu_exp_done_unlocked(rnp)) continue; pr_cont(" l=%u:%d-%d:%#lx/%c", rnp->level, rnp->grplo, rnp->grphi, - rnp->expmask, + READ_ONCE(rnp->expmask), ".T"[!!rnp->exp_tasks]); } pr_cont("\n"); @@ -521,7 +557,7 @@ static void synchronize_sched_expedited_wait(void) rcu_for_each_leaf_node(rnp) { for_each_leaf_node_possible_cpu(rnp, cpu) { mask = leaf_node_cpu_bit(rnp, cpu); - if (!(rnp->expmask & mask)) + if (!(READ_ONCE(rnp->expmask) & mask)) continue; dump_cpu_task(cpu); } @@ -540,15 +576,14 @@ static void rcu_exp_wait_wake(unsigned long s) { struct rcu_node *rnp; - synchronize_sched_expedited_wait(); - rcu_exp_gp_seq_end(); - trace_rcu_exp_grace_period(rcu_state.name, s, TPS("end")); + synchronize_rcu_expedited_wait(); - /* - * Switch over to wakeup mode, allowing the next GP, but -only- the - * next GP, to proceed. - */ + // Switch over to wakeup mode, allowing the next GP to proceed. + // End the previous grace period only after acquiring the mutex + // to ensure that only one GP runs concurrently with wakeups. mutex_lock(&rcu_state.exp_wake_mutex); + rcu_exp_gp_seq_end(); + trace_rcu_exp_grace_period(rcu_state.name, s, TPS("end")); rcu_for_each_node_breadth_first(rnp) { if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s)) { @@ -559,7 +594,7 @@ static void rcu_exp_wait_wake(unsigned long s) spin_unlock(&rnp->exp_lock); } smp_mb(); /* All above changes before wakeup. */ - wake_up_all(&rnp->exp_wq[rcu_seq_ctr(rcu_state.expedited_sequence) & 0x3]); + wake_up_all(&rnp->exp_wq[rcu_seq_ctr(s) & 0x3]); } trace_rcu_exp_grace_period(rcu_state.name, s, TPS("endwake")); mutex_unlock(&rcu_state.exp_wake_mutex); @@ -610,7 +645,7 @@ static void rcu_exp_handler(void *unused) * critical section. If also enabled or idle, immediately * report the quiescent state, otherwise defer. */ - if (!t->rcu_read_lock_nesting) { + if (!rcu_preempt_depth()) { if (!(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK)) || rcu_dynticks_curr_cpu_in_eqs()) { rcu_report_exp_rdp(rdp); @@ -634,7 +669,7 @@ static void rcu_exp_handler(void *unused) * can have caused this quiescent state to already have been * reported, so we really do need to check ->expmask. */ - if (t->rcu_read_lock_nesting > 0) { + if (rcu_preempt_depth() > 0) { raw_spin_lock_irqsave_rcu_node(rnp, flags); if (rnp->expmask & rdp->grpmask) { rdp->exp_deferred_qs = true; @@ -670,7 +705,7 @@ static void rcu_exp_handler(void *unused) } } -/* PREEMPT=y, so no PREEMPT=n expedited grace period to clean up after. */ +/* PREEMPTION=y, so no PREEMPTION=n expedited grace period to clean up after. */ static void sync_sched_exp_online_cleanup(int cpu) { } @@ -785,7 +820,7 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp) * implementations, it is still unfriendly to real-time workloads, so is * thus not recommended for any sort of common-case code. In fact, if * you are using synchronize_rcu_expedited() in a loop, please restructure - * your code to batch your updates, and then Use a single synchronize_rcu() + * your code to batch your updates, and then use a single synchronize_rcu() * instead. * * This has the same semantics as (but is more brutal than) synchronize_rcu(). diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index fa08d55f7040..c6ea81cd4189 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -220,7 +220,7 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) * blocked tasks. */ if (!rnp->gp_tasks && (blkd_state & RCU_GP_BLKD)) { - rnp->gp_tasks = &t->rcu_node_entry; + WRITE_ONCE(rnp->gp_tasks, &t->rcu_node_entry); WARN_ON_ONCE(rnp->completedqs == rnp->gp_seq); } if (!rnp->exp_tasks && (blkd_state & RCU_EXP_BLKD)) @@ -290,8 +290,8 @@ void rcu_note_context_switch(bool preempt) trace_rcu_utilization(TPS("Start context switch")); lockdep_assert_irqs_disabled(); - WARN_ON_ONCE(!preempt && t->rcu_read_lock_nesting > 0); - if (t->rcu_read_lock_nesting > 0 && + WARN_ON_ONCE(!preempt && rcu_preempt_depth() > 0); + if (rcu_preempt_depth() > 0 && !t->rcu_read_unlock_special.b.blocked) { /* Possibly blocking in an RCU read-side critical section. */ @@ -340,7 +340,7 @@ EXPORT_SYMBOL_GPL(rcu_note_context_switch); */ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp) { - return rnp->gp_tasks != NULL; + return READ_ONCE(rnp->gp_tasks) != NULL; } /* Bias and limit values for ->rcu_read_lock_nesting. */ @@ -348,6 +348,21 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp) #define RCU_NEST_NMAX (-INT_MAX / 2) #define RCU_NEST_PMAX (INT_MAX / 2) +static void rcu_preempt_read_enter(void) +{ + current->rcu_read_lock_nesting++; +} + +static void rcu_preempt_read_exit(void) +{ + current->rcu_read_lock_nesting--; +} + +static void rcu_preempt_depth_set(int val) +{ + current->rcu_read_lock_nesting = val; +} + /* * Preemptible RCU implementation for rcu_read_lock(). * Just increment ->rcu_read_lock_nesting, shared state will be updated @@ -355,9 +370,9 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp) */ void __rcu_read_lock(void) { - current->rcu_read_lock_nesting++; + rcu_preempt_read_enter(); if (IS_ENABLED(CONFIG_PROVE_LOCKING)) - WARN_ON_ONCE(current->rcu_read_lock_nesting > RCU_NEST_PMAX); + WARN_ON_ONCE(rcu_preempt_depth() > RCU_NEST_PMAX); barrier(); /* critical section after entry code. */ } EXPORT_SYMBOL_GPL(__rcu_read_lock); @@ -373,19 +388,19 @@ void __rcu_read_unlock(void) { struct task_struct *t = current; - if (t->rcu_read_lock_nesting != 1) { - --t->rcu_read_lock_nesting; + if (rcu_preempt_depth() != 1) { + rcu_preempt_read_exit(); } else { barrier(); /* critical section before exit code. */ - t->rcu_read_lock_nesting = -RCU_NEST_BIAS; + rcu_preempt_depth_set(-RCU_NEST_BIAS); barrier(); /* assign before ->rcu_read_unlock_special load */ if (unlikely(READ_ONCE(t->rcu_read_unlock_special.s))) rcu_read_unlock_special(t); barrier(); /* ->rcu_read_unlock_special load before assign */ - t->rcu_read_lock_nesting = 0; + rcu_preempt_depth_set(0); } if (IS_ENABLED(CONFIG_PROVE_LOCKING)) { - int rrln = t->rcu_read_lock_nesting; + int rrln = rcu_preempt_depth(); WARN_ON_ONCE(rrln < 0 && rrln > RCU_NEST_NMAX); } @@ -444,15 +459,9 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) local_irq_restore(flags); return; } - t->rcu_read_unlock_special.b.deferred_qs = false; - if (special.b.need_qs) { + t->rcu_read_unlock_special.s = 0; + if (special.b.need_qs) rcu_qs(); - t->rcu_read_unlock_special.b.need_qs = false; - if (!t->rcu_read_unlock_special.s && !rdp->exp_deferred_qs) { - local_irq_restore(flags); - return; - } - } /* * Respond to a request by an expedited grace period for a @@ -460,17 +469,11 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) * tasks are handled when removing the task from the * blocked-tasks list below. */ - if (rdp->exp_deferred_qs) { + if (rdp->exp_deferred_qs) rcu_report_exp_rdp(rdp); - if (!t->rcu_read_unlock_special.s) { - local_irq_restore(flags); - return; - } - } /* Clean up if blocked during RCU read-side critical section. */ if (special.b.blocked) { - t->rcu_read_unlock_special.b.blocked = false; /* * Remove this task from the list it blocked on. The task @@ -485,7 +488,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) empty_norm = !rcu_preempt_blocked_readers_cgp(rnp); WARN_ON_ONCE(rnp->completedqs == rnp->gp_seq && (!empty_norm || rnp->qsmask)); - empty_exp = sync_rcu_preempt_exp_done(rnp); + empty_exp = sync_rcu_exp_done(rnp); smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ np = rcu_next_node_entry(t, rnp); list_del_init(&t->rcu_node_entry); @@ -493,7 +496,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) trace_rcu_unlock_preempted_task(TPS("rcu_preempt"), rnp->gp_seq, t->pid); if (&t->rcu_node_entry == rnp->gp_tasks) - rnp->gp_tasks = np; + WRITE_ONCE(rnp->gp_tasks, np); if (&t->rcu_node_entry == rnp->exp_tasks) rnp->exp_tasks = np; if (IS_ENABLED(CONFIG_RCU_BOOST)) { @@ -509,7 +512,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) * Note that rcu_report_unblock_qs_rnp() releases rnp->lock, * so we must take a snapshot of the expedited state. */ - empty_exp_now = sync_rcu_preempt_exp_done(rnp); + empty_exp_now = sync_rcu_exp_done(rnp); if (!empty_norm && !rcu_preempt_blocked_readers_cgp(rnp)) { trace_rcu_quiescent_state_report(TPS("preempt_rcu"), rnp->gp_seq, @@ -551,7 +554,7 @@ static bool rcu_preempt_need_deferred_qs(struct task_struct *t) { return (__this_cpu_read(rcu_data.exp_deferred_qs) || READ_ONCE(t->rcu_read_unlock_special.s)) && - t->rcu_read_lock_nesting <= 0; + rcu_preempt_depth() <= 0; } /* @@ -564,16 +567,16 @@ static bool rcu_preempt_need_deferred_qs(struct task_struct *t) static void rcu_preempt_deferred_qs(struct task_struct *t) { unsigned long flags; - bool couldrecurse = t->rcu_read_lock_nesting >= 0; + bool couldrecurse = rcu_preempt_depth() >= 0; if (!rcu_preempt_need_deferred_qs(t)) return; if (couldrecurse) - t->rcu_read_lock_nesting -= RCU_NEST_BIAS; + rcu_preempt_depth_set(rcu_preempt_depth() - RCU_NEST_BIAS); local_irq_save(flags); rcu_preempt_deferred_qs_irqrestore(t, flags); if (couldrecurse) - t->rcu_read_lock_nesting += RCU_NEST_BIAS; + rcu_preempt_depth_set(rcu_preempt_depth() + RCU_NEST_BIAS); } /* @@ -610,9 +613,8 @@ static void rcu_read_unlock_special(struct task_struct *t) struct rcu_data *rdp = this_cpu_ptr(&rcu_data); struct rcu_node *rnp = rdp->mynode; - t->rcu_read_unlock_special.b.exp_hint = false; exp = (t->rcu_blocked_node && t->rcu_blocked_node->exp_tasks) || - (rdp->grpmask & rnp->expmask) || + (rdp->grpmask & READ_ONCE(rnp->expmask)) || tick_nohz_full_cpu(rdp->cpu); // Need to defer quiescent state until everything is enabled. if (irqs_were_disabled && use_softirq && @@ -640,7 +642,6 @@ static void rcu_read_unlock_special(struct task_struct *t) local_irq_restore(flags); return; } - WRITE_ONCE(t->rcu_read_unlock_special.b.exp_hint, false); rcu_preempt_deferred_qs_irqrestore(t, flags); } @@ -648,8 +649,7 @@ static void rcu_read_unlock_special(struct task_struct *t) * Check that the list of blocked tasks for the newly completed grace * period is in fact empty. It is a serious bug to complete a grace * period that still has RCU readers blocked! This function must be - * invoked -before- updating this rnp's ->gp_seq, and the rnp's ->lock - * must be held by the caller. + * invoked -before- updating this rnp's ->gp_seq. * * Also, if there are blocked tasks on the list, they automatically * block the newly created grace period, so set up ->gp_tasks accordingly. @@ -659,11 +659,12 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) struct task_struct *t; RCU_LOCKDEP_WARN(preemptible(), "rcu_preempt_check_blocked_tasks() invoked with preemption enabled!!!\n"); + raw_lockdep_assert_held_rcu_node(rnp); if (WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp))) dump_blkd_tasks(rnp, 10); if (rcu_preempt_has_tasks(rnp) && (rnp->qsmaskinit || rnp->wait_blkd_tasks)) { - rnp->gp_tasks = rnp->blkd_tasks.next; + WRITE_ONCE(rnp->gp_tasks, rnp->blkd_tasks.next); t = container_of(rnp->gp_tasks, struct task_struct, rcu_node_entry); trace_rcu_unlock_preempted_task(TPS("rcu_preempt-GPS"), @@ -686,7 +687,7 @@ static void rcu_flavor_sched_clock_irq(int user) if (user || rcu_is_cpu_rrupt_from_idle()) { rcu_note_voluntary_context_switch(current); } - if (t->rcu_read_lock_nesting > 0 || + if (rcu_preempt_depth() > 0 || (preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK))) { /* No QS, force context switch if deferred. */ if (rcu_preempt_need_deferred_qs(t)) { @@ -696,13 +697,13 @@ static void rcu_flavor_sched_clock_irq(int user) } else if (rcu_preempt_need_deferred_qs(t)) { rcu_preempt_deferred_qs(t); /* Report deferred QS. */ return; - } else if (!t->rcu_read_lock_nesting) { + } else if (!rcu_preempt_depth()) { rcu_qs(); /* Report immediate QS. */ return; } /* If GP is oldish, ask for help from rcu_read_unlock_special(). */ - if (t->rcu_read_lock_nesting > 0 && + if (rcu_preempt_depth() > 0 && __this_cpu_read(rcu_data.core_needs_qs) && __this_cpu_read(rcu_data.cpu_no_qs.b.norm) && !t->rcu_read_unlock_special.b.need_qs && @@ -723,11 +724,11 @@ void exit_rcu(void) struct task_struct *t = current; if (unlikely(!list_empty(¤t->rcu_node_entry))) { - t->rcu_read_lock_nesting = 1; + rcu_preempt_depth_set(1); barrier(); WRITE_ONCE(t->rcu_read_unlock_special.b.blocked, true); - } else if (unlikely(t->rcu_read_lock_nesting)) { - t->rcu_read_lock_nesting = 1; + } else if (unlikely(rcu_preempt_depth())) { + rcu_preempt_depth_set(1); } else { return; } @@ -757,7 +758,8 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck) pr_info("%s: %d:%d ->qsmask %#lx ->qsmaskinit %#lx ->qsmaskinitnext %#lx\n", __func__, rnp1->grplo, rnp1->grphi, rnp1->qsmask, rnp1->qsmaskinit, rnp1->qsmaskinitnext); pr_info("%s: ->gp_tasks %p ->boost_tasks %p ->exp_tasks %p\n", - __func__, rnp->gp_tasks, rnp->boost_tasks, rnp->exp_tasks); + __func__, READ_ONCE(rnp->gp_tasks), rnp->boost_tasks, + rnp->exp_tasks); pr_info("%s: ->blkd_tasks", __func__); i = 0; list_for_each(lhp, &rnp->blkd_tasks) { @@ -788,7 +790,7 @@ static void __init rcu_bootup_announce(void) } /* - * Note a quiescent state for PREEMPT=n. Because we do not need to know + * Note a quiescent state for PREEMPTION=n. Because we do not need to know * how many quiescent states passed, just if there was at least one since * the start of the grace period, this just sets a flag. The caller must * have disabled preemption. @@ -838,7 +840,7 @@ void rcu_all_qs(void) EXPORT_SYMBOL_GPL(rcu_all_qs); /* - * Note a PREEMPT=n context switch. The caller must have disabled interrupts. + * Note a PREEMPTION=n context switch. The caller must have disabled interrupts. */ void rcu_note_context_switch(bool preempt) { @@ -1262,10 +1264,9 @@ static void rcu_prepare_for_idle(void) /* * This code is invoked when a CPU goes idle, at which point we want * to have the CPU do everything required for RCU so that it can enter - * the energy-efficient dyntick-idle mode. This is handled by a - * state machine implemented by rcu_prepare_for_idle() below. + * the energy-efficient dyntick-idle mode. * - * The following three proprocessor symbols control this state machine: + * The following preprocessor symbol controls this: * * RCU_IDLE_GP_DELAY gives the number of jiffies that a CPU is permitted * to sleep in dyntick-idle mode with RCU callbacks pending. This @@ -1274,21 +1275,15 @@ static void rcu_prepare_for_idle(void) * number, be warned: Setting RCU_IDLE_GP_DELAY too high can hang your * system. And if you are -that- concerned about energy efficiency, * just power the system down and be done with it! - * RCU_IDLE_LAZY_GP_DELAY gives the number of jiffies that a CPU is - * permitted to sleep in dyntick-idle mode with only lazy RCU - * callbacks pending. Setting this too high can OOM your system. * - * The values below work well in practice. If future workloads require + * The value below works well in practice. If future workloads require * adjustment, they can be converted into kernel config parameters, though * making the state machine smarter might be a better option. */ #define RCU_IDLE_GP_DELAY 4 /* Roughly one grace period. */ -#define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */ static int rcu_idle_gp_delay = RCU_IDLE_GP_DELAY; module_param(rcu_idle_gp_delay, int, 0644); -static int rcu_idle_lazy_gp_delay = RCU_IDLE_LAZY_GP_DELAY; -module_param(rcu_idle_lazy_gp_delay, int, 0644); /* * Try to advance callbacks on the current CPU, but only if it has been @@ -1327,8 +1322,7 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void) /* * Allow the CPU to enter dyntick-idle mode unless it has callbacks ready * to invoke. If the CPU has callbacks, try to advance them. Tell the - * caller to set the timeout based on whether or not there are non-lazy - * callbacks. + * caller about what to set the timeout. * * The caller must have disabled interrupts. */ @@ -1354,25 +1348,18 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt) } rdp->last_accelerate = jiffies; - /* Request timer delay depending on laziness, and round. */ - rdp->all_lazy = !rcu_segcblist_n_nonlazy_cbs(&rdp->cblist); - if (rdp->all_lazy) { - dj = round_jiffies(rcu_idle_lazy_gp_delay + jiffies) - jiffies; - } else { - dj = round_up(rcu_idle_gp_delay + jiffies, - rcu_idle_gp_delay) - jiffies; - } + /* Request timer and round. */ + dj = round_up(rcu_idle_gp_delay + jiffies, rcu_idle_gp_delay) - jiffies; + *nextevt = basemono + dj * TICK_NSEC; return 0; } /* - * Prepare a CPU for idle from an RCU perspective. The first major task - * is to sense whether nohz mode has been enabled or disabled via sysfs. - * The second major task is to check to see if a non-lazy callback has - * arrived at a CPU that previously had only lazy callbacks. The third - * major task is to accelerate (that is, assign grace-period numbers to) - * any recently arrived callbacks. + * Prepare a CPU for idle from an RCU perspective. The first major task is to + * sense whether nohz mode has been enabled or disabled via sysfs. The second + * major task is to accelerate (that is, assign grace-period numbers to) any + * recently arrived callbacks. * * The caller must have disabled interrupts. */ @@ -1399,17 +1386,6 @@ static void rcu_prepare_for_idle(void) return; /* - * If a non-lazy callback arrived at a CPU having only lazy - * callbacks, invoke RCU core for the side-effect of recalculating - * idle duration on re-entry to idle. - */ - if (rdp->all_lazy && rcu_segcblist_n_nonlazy_cbs(&rdp->cblist)) { - rdp->all_lazy = false; - invoke_rcu_core(); - return; - } - - /* * If we have not yet accelerated this jiffy, accelerate all * callbacks on this CPU. */ @@ -2321,6 +2297,8 @@ static void __init rcu_organize_nocb_kthreads(void) { int cpu; bool firsttime = true; + bool gotnocbs = false; + bool gotnocbscbs = true; int ls = rcu_nocb_gp_stride; int nl = 0; /* Next GP kthread. */ struct rcu_data *rdp; @@ -2343,21 +2321,31 @@ static void __init rcu_organize_nocb_kthreads(void) rdp = per_cpu_ptr(&rcu_data, cpu); if (rdp->cpu >= nl) { /* New GP kthread, set up for CBs & next GP. */ + gotnocbs = true; nl = DIV_ROUND_UP(rdp->cpu + 1, ls) * ls; rdp->nocb_gp_rdp = rdp; rdp_gp = rdp; - if (!firsttime && dump_tree) - pr_cont("\n"); - firsttime = false; - pr_alert("%s: No-CB GP kthread CPU %d:", __func__, cpu); + if (dump_tree) { + if (!firsttime) + pr_cont("%s\n", gotnocbscbs + ? "" : " (self only)"); + gotnocbscbs = false; + firsttime = false; + pr_alert("%s: No-CB GP kthread CPU %d:", + __func__, cpu); + } } else { /* Another CB kthread, link to previous GP kthread. */ + gotnocbscbs = true; rdp->nocb_gp_rdp = rdp_gp; rdp_prev->nocb_next_cb_rdp = rdp; - pr_alert(" %d", cpu); + if (dump_tree) + pr_cont(" %d", cpu); } rdp_prev = rdp; } + if (gotnocbs && dump_tree) + pr_cont("%s\n", gotnocbscbs ? "" : " (self only)"); } /* diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index c0b8c458d8a6..55f9b84790d3 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -163,7 +163,7 @@ static void rcu_iw_handler(struct irq_work *iwp) // // Printing RCU CPU stall warnings -#ifdef CONFIG_PREEMPTION +#ifdef CONFIG_PREEMPT_RCU /* * Dump detailed information for all tasks blocking the current RCU @@ -215,7 +215,7 @@ static int rcu_print_task_stall(struct rcu_node *rnp) return ndetected; } -#else /* #ifdef CONFIG_PREEMPTION */ +#else /* #ifdef CONFIG_PREEMPT_RCU */ /* * Because preemptible RCU does not exist, we never have to check for @@ -233,7 +233,7 @@ static int rcu_print_task_stall(struct rcu_node *rnp) { return 0; } -#endif /* #else #ifdef CONFIG_PREEMPTION */ +#endif /* #else #ifdef CONFIG_PREEMPT_RCU */ /* * Dump stacks of all tasks running on stalled CPUs. First try using @@ -263,11 +263,9 @@ static void print_cpu_stall_fast_no_hz(char *cp, int cpu) { struct rcu_data *rdp = &per_cpu(rcu_data, cpu); - sprintf(cp, "last_accelerate: %04lx/%04lx, Nonlazy posted: %c%c%c", + sprintf(cp, "last_accelerate: %04lx/%04lx dyntick_enabled: %d", rdp->last_accelerate & 0xffff, jiffies & 0xffff, - ".l"[rdp->all_lazy], - ".L"[!rcu_segcblist_n_nonlazy_cbs(&rdp->cblist)], - ".D"[!!rdp->tick_nohz_enabled_snap]); + !!rdp->tick_nohz_enabled_snap); } #else /* #ifdef CONFIG_RCU_FAST_NO_HZ */ @@ -279,6 +277,28 @@ static void print_cpu_stall_fast_no_hz(char *cp, int cpu) #endif /* #else #ifdef CONFIG_RCU_FAST_NO_HZ */ +static const char * const gp_state_names[] = { + [RCU_GP_IDLE] = "RCU_GP_IDLE", + [RCU_GP_WAIT_GPS] = "RCU_GP_WAIT_GPS", + [RCU_GP_DONE_GPS] = "RCU_GP_DONE_GPS", + [RCU_GP_ONOFF] = "RCU_GP_ONOFF", + [RCU_GP_INIT] = "RCU_GP_INIT", + [RCU_GP_WAIT_FQS] = "RCU_GP_WAIT_FQS", + [RCU_GP_DOING_FQS] = "RCU_GP_DOING_FQS", + [RCU_GP_CLEANUP] = "RCU_GP_CLEANUP", + [RCU_GP_CLEANED] = "RCU_GP_CLEANED", +}; + +/* + * Convert a ->gp_state value to a character string. + */ +static const char *gp_state_getname(short gs) +{ + if (gs < 0 || gs >= ARRAY_SIZE(gp_state_names)) + return "???"; + return gp_state_names[gs]; +} + /* * Print out diagnostic information for the specified stalled CPU. * diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 1861103662db..6c4b862f57d6 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -40,6 +40,7 @@ #include <linux/rcupdate_wait.h> #include <linux/sched/isolation.h> #include <linux/kprobes.h> +#include <linux/slab.h> #define CREATE_TRACE_POINTS @@ -51,9 +52,7 @@ #define MODULE_PARAM_PREFIX "rcupdate." #ifndef CONFIG_TINY_RCU -extern int rcu_expedited; /* from sysctl */ module_param(rcu_expedited, int, 0); -extern int rcu_normal; /* from sysctl */ module_param(rcu_normal, int, 0); static int rcu_normal_after_boot; module_param(rcu_normal_after_boot, int, 0); @@ -218,6 +217,7 @@ static int __init rcu_set_runtime_mode(void) { rcu_test_sync_prims(); rcu_scheduler_active = RCU_SCHEDULER_RUNNING; + kfree_rcu_scheduler_running(); rcu_test_sync_prims(); return 0; } @@ -435,7 +435,7 @@ struct debug_obj_descr rcuhead_debug_descr = { EXPORT_SYMBOL_GPL(rcuhead_debug_descr); #endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ -#if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE) +#if defined(CONFIG_TREE_RCU) || defined(CONFIG_RCU_TRACE) void do_trace_rcu_torture_read(const char *rcutorturename, struct rcu_head *rhp, unsigned long secs, unsigned long c_old, unsigned long c) @@ -853,14 +853,22 @@ static void test_callback(struct rcu_head *r) DEFINE_STATIC_SRCU(early_srcu); +struct early_boot_kfree_rcu { + struct rcu_head rh; +}; + static void early_boot_test_call_rcu(void) { static struct rcu_head head; static struct rcu_head shead; + struct early_boot_kfree_rcu *rhp; call_rcu(&head, test_callback); if (IS_ENABLED(CONFIG_SRCU)) call_srcu(&early_srcu, &shead, test_callback); + rhp = kmalloc(sizeof(*rhp), GFP_KERNEL); + if (!WARN_ON_ONCE(!rhp)) + kfree_rcu(rhp, rh); } void rcu_early_boot_tests(void) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 70665934d53e..d396aaaf19a3 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1268,7 +1268,7 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_do_static_key, }, #endif -#if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU) +#if defined(CONFIG_TREE_RCU) { .procname = "panic_on_rcu_stall", .data = &sysctl_panic_on_rcu_stall, diff --git a/net/tipc/crypto.c b/net/tipc/crypto.c index 990a872cec46..c8c47fc72653 100644 --- a/net/tipc/crypto.c +++ b/net/tipc/crypto.c @@ -257,9 +257,6 @@ static char *tipc_key_change_dump(struct tipc_key old, struct tipc_key new, #define tipc_aead_rcu_ptr(rcu_ptr, lock) \ rcu_dereference_protected((rcu_ptr), lockdep_is_held(lock)) -#define tipc_aead_rcu_swap(rcu_ptr, ptr, lock) \ - rcu_swap_protected((rcu_ptr), (ptr), lockdep_is_held(lock)) - #define tipc_aead_rcu_replace(rcu_ptr, ptr, lock) \ do { \ typeof(rcu_ptr) __tmp = rcu_dereference_protected((rcu_ptr), \ @@ -1189,7 +1186,7 @@ static bool tipc_crypto_key_try_align(struct tipc_crypto *rx, u8 new_pending) /* Move passive key if any */ if (key.passive) { - tipc_aead_rcu_swap(rx->aead[key.passive], tmp2, &rx->lock); + tmp2 = rcu_replace_pointer(rx->aead[key.passive], tmp2, lockdep_is_held(&rx->lock)); x = (key.passive - key.pending + new_pending) % KEY_MAX; new_passive = (x <= 0) ? x + KEY_MAX : x; } |