From 559f9badd11ddf399f88b18b4c0f110fd511ae53 Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Wed, 14 Mar 2012 22:17:39 -0400 Subject: rcu: List-debug variants of rcu list routines. * Make __list_add_rcu check the next->prev and prev->next pointers just like __list_add does. * Make list_del_rcu use __list_del_entry, which does the same checking at deletion time. Has been running for a week here without anything being tripped up, but it seems worth adding for completeness just in case something ever does corrupt those lists. Signed-off-by: Dave Jones Signed-off-by: Paul E. McKenney --- include/linux/rculist.h | 7 ++++++- lib/list_debug.c | 22 ++++++++++++++++++++++ 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/include/linux/rculist.h b/include/linux/rculist.h index d079290843a9..a20c05096231 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -30,6 +30,7 @@ * This is only for internal list manipulation where we know * the prev/next entries already! */ +#ifndef CONFIG_DEBUG_LIST static inline void __list_add_rcu(struct list_head *new, struct list_head *prev, struct list_head *next) { @@ -38,6 +39,10 @@ static inline void __list_add_rcu(struct list_head *new, rcu_assign_pointer(list_next_rcu(prev), new); next->prev = new; } +#else +extern void __list_add_rcu(struct list_head *new, + struct list_head *prev, struct list_head *next); +#endif /** * list_add_rcu - add a new entry to rcu-protected list @@ -108,7 +113,7 @@ static inline void list_add_tail_rcu(struct list_head *new, */ static inline void list_del_rcu(struct list_head *entry) { - __list_del(entry->prev, entry->next); + __list_del_entry(entry); entry->prev = LIST_POISON2; } diff --git a/lib/list_debug.c b/lib/list_debug.c index 982b850d4e7a..3810b481f940 100644 --- a/lib/list_debug.c +++ b/lib/list_debug.c @@ -10,6 +10,7 @@ #include #include #include +#include /* * Insert a new entry between two known consecutive entries. @@ -75,3 +76,24 @@ void list_del(struct list_head *entry) entry->prev = LIST_POISON2; } EXPORT_SYMBOL(list_del); + +/* + * RCU variants. + */ +void __list_add_rcu(struct list_head *new, + struct list_head *prev, struct list_head *next) +{ + WARN(next->prev != prev, + "list_add_rcu corruption. next->prev should be " + "prev (%p), but was %p. (next=%p).\n", + prev, next->prev, next); + WARN(prev->next != next, + "list_add_rcu corruption. prev->next should be " + "next (%p), but was %p. (prev=%p).\n", + next, prev->next, prev); + new->next = next; + new->prev = prev; + rcu_assign_pointer(list_next_rcu(prev), new); + next->prev = new; +} +EXPORT_SYMBOL(__list_add_rcu); -- cgit v1.2.3 From f88022a4f650ac1778cafcc17d2e522283bdf590 Mon Sep 17 00:00:00 2001 From: Michel Machado Date: Tue, 10 Apr 2012 14:07:40 -0400 Subject: rcu: Replace list_first_entry_rcu() with list_first_or_null_rcu() The list_first_entry_rcu() macro is inherently unsafe because it cannot be applied to an empty list. But because RCU readers do not exclude updaters, a list might become empty between the time that list_empty() claimed it was non-empty and the time that list_first_entry_rcu() is invoked. Therefore, the list_empty() test cannot be separated from the list_first_entry_rcu() call. This commit therefore combines these to macros to create a new list_first_or_null_rcu() macro that replaces the old (and unsafe) list_first_entry_rcu() macro. This patch incorporates Paul's review comments on the previous version of this patch available here: https://lkml.org/lkml/2012/4/2/536 This patch cannot break any upstream code because list_first_entry_rcu() is not being used anywhere in the kernel (tested with grep(1)), and any external code using it is probably broken as a result of using it. Signed-off-by: Michel Machado CC: "Paul E. McKenney" CC: Dipankar Sarma Signed-off-by: Paul E. McKenney --- include/linux/rculist.h | 33 +++++++++++++++++++++++++++++---- 1 file changed, 29 insertions(+), 4 deletions(-) diff --git a/include/linux/rculist.h b/include/linux/rculist.h index a20c05096231..e0f0fab20415 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -233,18 +233,43 @@ static inline void list_splice_init_rcu(struct list_head *list, }) /** - * list_first_entry_rcu - get the first element from a list + * Where are list_empty_rcu() and list_first_entry_rcu()? + * + * Implementing those functions following their counterparts list_empty() and + * list_first_entry() is not advisable because they lead to subtle race + * conditions as the following snippet shows: + * + * if (!list_empty_rcu(mylist)) { + * struct foo *bar = list_first_entry_rcu(mylist, struct foo, list_member); + * do_something(bar); + * } + * + * The list may not be empty when list_empty_rcu checks it, but it may be when + * list_first_entry_rcu rereads the ->next pointer. + * + * Rereading the ->next pointer is not a problem for list_empty() and + * list_first_entry() because they would be protected by a lock that blocks + * writers. + * + * See list_first_or_null_rcu for an alternative. + */ + +/** + * list_first_or_null_rcu - get the first element from a list * @ptr: the list head to take the element from. * @type: the type of the struct this is embedded in. * @member: the name of the list_struct within the struct. * - * Note, that list is expected to be not empty. + * Note that if the list is empty, it returns NULL. * * This primitive may safely run concurrently with the _rcu list-mutation * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock(). */ -#define list_first_entry_rcu(ptr, type, member) \ - list_entry_rcu((ptr)->next, type, member) +#define list_first_or_null_rcu(ptr, type, member) \ + ({struct list_head *__ptr = (ptr); \ + struct list_head __rcu *__next = list_next_rcu(__ptr); \ + likely(__ptr != __next) ? container_of(__next, type, member) : NULL; \ + }) /** * list_for_each_entry_rcu - iterate over rcu list of given type -- cgit v1.2.3 From c9336643e1440f4dfc89ad4ac6185813619abb8c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 18 Apr 2012 16:20:18 -0700 Subject: rcu: Clarify help text for RCU_BOOST_PRIO MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The old text confused real-time applications with real-time threads, so that you pretty much needed to understand how this kernel configuration parameter worked to understand the help text. This commit therefore attempts to make the help text human-readable. Reported-by: Jörn Engel Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- init/Kconfig | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/init/Kconfig b/init/Kconfig index 6cfd71d06463..85c6870ed476 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -515,10 +515,25 @@ config RCU_BOOST_PRIO depends on RCU_BOOST default 1 help - This option specifies the real-time priority to which preempted - RCU readers are to be boosted. If you are working with CPU-bound - real-time applications, you should specify a priority higher then - the highest-priority CPU-bound application. + This option specifies the real-time priority to which long-term + preempted RCU readers are to be boosted. If you are working + with a real-time application that has one or more CPU-bound + threads running at a real-time priority level, you should set + RCU_BOOST_PRIO to a priority higher then the highest-priority + real-time CPU-bound thread. The default RCU_BOOST_PRIO value + of 1 is appropriate in the common case, which is real-time + applications that do not have any CPU-bound threads. + + Some real-time applications might not have a single real-time + thread that saturates a given CPU, but instead might have + multiple real-time threads that, taken together, fully utilize + that CPU. In this case, you should set RCU_BOOST_PRIO to + a priority higher than the lowest-priority thread that is + conspiring to prevent the CPU from running any non-real-time + tasks. For example, if one thread at priority 10 and another + thread at priority 5 are between themselves fully consuming + the CPU time on a given CPU, then RCU_BOOST_PRIO should be + set to priority 6 or higher. Specify the real-time priority, or take the default if unsure. -- cgit v1.2.3 From d8169d4c369e8aa2fda10df705a4957331b5a4db Mon Sep 17 00:00:00 2001 From: Jan Engelhardt Date: Thu, 19 Apr 2012 11:44:39 -0700 Subject: rcu: Make __kfree_rcu() less dependent on compiler choices Currently, __kfree_rcu() is implemented as an inline function, and contains a BUILD_BUG_ON() that malfunctions if __kfree_rcu() is compiled as an out-of-line function. Unfortunately, there are compiler settings (e.g., -O0) that can result in __kfree_rcu() being compiled out of line, resulting in annoying build breakage. This commit therefore converts both __kfree_rcu() and __is_kfree_rcu_offset() from inline functions to macros to prevent such misbehavior on the part of the compiler. Signed-off-by: Jan Engelhardt Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- include/linux/rcupdate.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 20fb776a1d4a..d5dfb109dfe1 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -922,6 +922,21 @@ void __kfree_rcu(struct rcu_head *head, unsigned long offset) kfree_call_rcu(head, (rcu_callback)offset); } +/* + * Does the specified offset indicate that the corresponding rcu_head + * structure can be handled by kfree_rcu()? + */ +#define __is_kfree_rcu_offset(offset) ((offset) < 4096) + +/* + * Helper macro for kfree_rcu() to prevent argument-expansion eyestrain. + */ +#define __kfree_rcu(head, offset) \ + do { \ + BUILD_BUG_ON(!__is_kfree_rcu_offset(offset)); \ + call_rcu(head, (void (*)(struct rcu_head *))(unsigned long)(offset)); \ + } while (0) + /** * kfree_rcu() - kfree an object after a grace period. * @ptr: pointer to kfree @@ -944,6 +959,9 @@ void __kfree_rcu(struct rcu_head *head, unsigned long offset) * * Note that the allowable offset might decrease in the future, for example, * to allow something like kmem_cache_free_rcu(). + * + * The BUILD_BUG_ON check must not involve any function calls, hence the + * checks are done in macros here. */ #define kfree_rcu(ptr, rcu_head) \ __kfree_rcu(&((ptr)->rcu_head), offsetof(typeof(*(ptr)), rcu_head)) -- cgit v1.2.3 From 8932a63d5edb02f714d50c26583152fe0a97a69c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 19 Apr 2012 12:20:14 -0700 Subject: rcu: Reduce cache-miss initialization latencies for large systems Commit #0209f649 (rcu: limit rcu_node leaf-level fanout) set an upper limit of 16 on the leaf-level fanout for the rcu_node tree. This was needed to reduce lock contention that was induced by the synchronization of scheduling-clock interrupts, which was in turn needed to improve energy efficiency for moderate-sized lightly loaded servers. However, reducing the leaf-level fanout means that there are more leaf-level rcu_node structures in the tree, which in turn means that RCU's grace-period initialization incurs more cache misses. This is not a problem on moderate-sized servers with only a few tens of CPUs, but becomes a major source of real-time latency spikes on systems with many hundreds of CPUs. In addition, the workloads running on these large systems tend to be CPU-bound, which eliminates the energy-efficiency advantages of synchronizing scheduling-clock interrupts. Therefore, these systems need maximal values for the rcu_node leaf-level fanout. This commit addresses this problem by introducing a new kernel parameter named RCU_FANOUT_LEAF that directly controls the leaf-level fanout. This parameter defaults to 16 to handle the common case of a moderate sized lightly loaded servers, but may be set higher on larger systems. Reported-by: Mike Galbraith Reported-by: Dimitri Sivanich Signed-off-by: Paul E. McKenney --- init/Kconfig | 27 +++++++++++++++++++++++++++ kernel/rcutree.c | 2 +- kernel/rcutree.h | 10 +++------- 3 files changed, 31 insertions(+), 8 deletions(-) diff --git a/init/Kconfig b/init/Kconfig index 85c6870ed476..6d18ef8071b5 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -458,6 +458,33 @@ config RCU_FANOUT Select a specific number if testing RCU itself. Take the default if unsure. +config RCU_FANOUT_LEAF + int "Tree-based hierarchical RCU leaf-level fanout value" + range 2 RCU_FANOUT if 64BIT + range 2 RCU_FANOUT if !64BIT + depends on TREE_RCU || TREE_PREEMPT_RCU + default 16 + help + This option controls the leaf-level fanout of hierarchical + implementations of RCU, and allows trading off cache misses + against lock contention. Systems that synchronize their + scheduling-clock interrupts for energy-efficiency reasons will + want the default because the smaller leaf-level fanout keeps + lock contention levels acceptably low. Very large systems + (hundreds or thousands of CPUs) will instead want to set this + value to the maximum value possible in order to reduce the + number of cache misses incurred during RCU's grace-period + initialization. These systems tend to run CPU-bound, and thus + are not helped by synchronized interrupts, and thus tend to + skew them, which reduces lock contention enough that large + leaf-level fanouts work well. + + Select a specific number if testing RCU itself. + + Select the maximum permissible value for large systems. + + Take the default if unsure. + config RCU_FANOUT_EXACT bool "Disable tree-based hierarchical RCU auto-balancing" depends on TREE_RCU || TREE_PREEMPT_RCU diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 1050d6d3922c..780acf8e15e9 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -2418,7 +2418,7 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp) for (i = NUM_RCU_LVLS - 1; i > 0; i--) rsp->levelspread[i] = CONFIG_RCU_FANOUT; - rsp->levelspread[0] = RCU_FANOUT_LEAF; + rsp->levelspread[0] = CONFIG_RCU_FANOUT_LEAF; } #else /* #ifdef CONFIG_RCU_FANOUT_EXACT */ static void __init rcu_init_levelspread(struct rcu_state *rsp) diff --git a/kernel/rcutree.h b/kernel/rcutree.h index cdd1be0a4072..a905c200405c 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -29,18 +29,14 @@ #include /* - * Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT. + * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and + * CONFIG_RCU_FANOUT_LEAF. * In theory, it should be possible to add more levels straightforwardly. * In practice, this did work well going from three levels to four. * Of course, your mileage may vary. */ #define MAX_RCU_LVLS 4 -#if CONFIG_RCU_FANOUT > 16 -#define RCU_FANOUT_LEAF 16 -#else /* #if CONFIG_RCU_FANOUT > 16 */ -#define RCU_FANOUT_LEAF (CONFIG_RCU_FANOUT) -#endif /* #else #if CONFIG_RCU_FANOUT > 16 */ -#define RCU_FANOUT_1 (RCU_FANOUT_LEAF) +#define RCU_FANOUT_1 (CONFIG_RCU_FANOUT_LEAF) #define RCU_FANOUT_2 (RCU_FANOUT_1 * CONFIG_RCU_FANOUT) #define RCU_FANOUT_3 (RCU_FANOUT_2 * CONFIG_RCU_FANOUT) #define RCU_FANOUT_4 (RCU_FANOUT_3 * CONFIG_RCU_FANOUT) -- cgit v1.2.3 From dabb8aa96020bde8359bc73e76c484dd7ff9b7f2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 23 Apr 2012 10:54:45 -0700 Subject: rcu: Document kernel command-line parameters Bring RCU's kernel command-line parameter documentation up to date. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- Documentation/kernel-parameters.txt | 88 +++++++++++++++++++++++++++++++++++-- 1 file changed, 85 insertions(+), 3 deletions(-) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index c1601e5a8b71..ab84a01c8d68 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -2330,18 +2330,100 @@ bytes respectively. Such letter suffixes can also be entirely omitted. ramdisk_size= [RAM] Sizes of RAM disks in kilobytes See Documentation/blockdev/ramdisk.txt. - rcupdate.blimit= [KNL,BOOT] + rcutree.blimit= [KNL,BOOT] Set maximum number of finished RCU callbacks to process in one batch. - rcupdate.qhimark= [KNL,BOOT] + rcutree.qhimark= [KNL,BOOT] Set threshold of queued RCU callbacks over which batch limiting is disabled. - rcupdate.qlowmark= [KNL,BOOT] + rcutree.qlowmark= [KNL,BOOT] Set threshold of queued RCU callbacks below which batch limiting is re-enabled. + rcutree.rcu_cpu_stall_suppress= [KNL,BOOT] + Suppress RCU CPU stall warning messages. + + rcutree.rcu_cpu_stall_timeout= [KNL,BOOT] + Set timeout for RCU CPU stall warning messages. + + rcutorture.fqs_duration= [KNL,BOOT] + Set duration of force_quiescent_state bursts. + + rcutorture.fqs_holdoff= [KNL,BOOT] + Set holdoff time within force_quiescent_state bursts. + + rcutorture.fqs_stutter= [KNL,BOOT] + Set wait time between force_quiescent_state bursts. + + rcutorture.irqreader= [KNL,BOOT] + Test RCU readers from irq handlers. + + rcutorture.n_barrier_cbs= [KNL,BOOT] + Set callbacks/threads for rcu_barrier() testing. + + rcutorture.nfakewriters= [KNL,BOOT] + Set number of concurrent RCU writers. These just + stress RCU, they don't participate in the actual + test, hence the "fake". + + rcutorture.nreaders= [KNL,BOOT] + Set number of RCU readers. + + rcutorture.onoff_holdoff= [KNL,BOOT] + Set time (s) after boot for CPU-hotplug testing. + + rcutorture.onoff_interval= [KNL,BOOT] + Set time (s) between CPU-hotplug operations, or + zero to disable CPU-hotplug testing. + + rcutorture.shuffle_interval= [KNL,BOOT] + Set task-shuffle interval (s). Shuffling tasks + allows some CPUs to go into dyntick-idle mode + during the rcutorture test. + + rcutorture.shutdown_secs= [KNL,BOOT] + Set time (s) after boot system shutdown. This + is useful for hands-off automated testing. + + rcutorture.stall_cpu= [KNL,BOOT] + Duration of CPU stall (s) to test RCU CPU stall + warnings, zero to disable. + + rcutorture.stall_cpu_holdoff= [KNL,BOOT] + Time to wait (s) after boot before inducing stall. + + rcutorture.stat_interval= [KNL,BOOT] + Time (s) between statistics printk()s. + + rcutorture.stutter= [KNL,BOOT] + Time (s) to stutter testing, for example, specifying + five seconds causes the test to run for five seconds, + wait for five seconds, and so on. This tests RCU's + ability to transition abruptly to and from idle. + + rcutorture.test_boost= [KNL,BOOT] + Test RCU priority boosting? 0=no, 1=maybe, 2=yes. + "Maybe" means test if the RCU implementation + under test support RCU priority boosting. + + rcutorture.test_boost_duration= [KNL,BOOT] + Duration (s) of each individual boost test. + + rcutorture.test_boost_interval= [KNL,BOOT] + Interval (s) between each boost test. + + rcutorture.test_no_idle_hz= [KNL,BOOT] + Test RCU's dyntick-idle handling. See also the + rcutorture.shuffle_interval parameter. + + rcutorture.torture_type= [KNL,BOOT] + Specify the RCU implementation to test. + + rcutorture.verbose= [KNL,BOOT] + Enable additional printk() statements. + rdinit= [KNL] Format: Run specified binary instead of /init from the ramdisk, -- cgit v1.2.3 From 6d8133919bac4270883b24328500875a49e71b36 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 23 Feb 2012 13:30:16 -0800 Subject: rcu: Document why rcu_blocking_is_gp() is safe The rcu_blocking_is_gp() function tests to see if there is only one online CPU, and if so, synchronize_sched() and friends become no-ops. However, for larger systems, num_online_cpus() scans a large vector, and might be preempted while doing so. While preempted, any number of CPUs might come online and go offline, potentially resulting in num_online_cpus() returning 1 when there never had only been one CPU online. This could result in a too-short RCU grace period, which could in turn result in total failure, except that the only way that the grace period is too short is if there is an RCU read-side critical section spanning it. For RCU-sched and RCU-bh (which are the only cases using rcu_blocking_is_gp()), RCU read-side critical sections have either preemption or bh disabled, which prevents CPUs from going offline. This in turn prevents actual failures from occurring. This commit therefore adds a large block comment to rcu_blocking_is_gp() documenting why it is safe. This commit also moves rcu_blocking_is_gp() into kernel/rcutree.c, which should help prevent unwary developers from mistaking it for a generally useful function. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- include/linux/rcutree.h | 7 ------- kernel/rcutree.c | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 32 insertions(+), 7 deletions(-) diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index e8ee5dd0854c..b06363055ef8 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -98,13 +98,6 @@ extern void rcu_force_quiescent_state(void); extern void rcu_bh_force_quiescent_state(void); extern void rcu_sched_force_quiescent_state(void); -/* A context switch is a grace period for RCU-sched and RCU-bh. */ -static inline int rcu_blocking_is_gp(void) -{ - might_sleep(); /* Check for RCU read-side critical section. */ - return num_online_cpus() == 1; -} - extern void rcu_scheduler_starting(void); extern int rcu_scheduler_active __read_mostly; diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 780acf8e15e9..8f6a344306e6 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1894,6 +1894,38 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) } EXPORT_SYMBOL_GPL(call_rcu_bh); +/* + * Because a context switch is a grace period for RCU-sched and RCU-bh, + * any blocking grace-period wait automatically implies a grace period + * if there is only one CPU online at any point time during execution + * of either synchronize_sched() or synchronize_rcu_bh(). It is OK to + * occasionally incorrectly indicate that there are multiple CPUs online + * when there was in fact only one the whole time, as this just adds + * some overhead: RCU still operates correctly. + * + * Of course, sampling num_online_cpus() with preemption enabled can + * give erroneous results if there are concurrent CPU-hotplug operations. + * For example, given a demonic sequence of preemptions in num_online_cpus() + * and CPU-hotplug operations, there could be two or more CPUs online at + * all times, but num_online_cpus() might well return one (or even zero). + * + * However, all such demonic sequences require at least one CPU-offline + * operation. Furthermore, rcu_blocking_is_gp() giving the wrong answer + * is only a problem if there is an RCU read-side critical section executing + * throughout. But RCU-sched and RCU-bh read-side critical sections + * disable either preemption or bh, which prevents a CPU from going offline. + * Therefore, the only way that rcu_blocking_is_gp() can incorrectly return + * that there is only one CPU when in fact there was more than one throughout + * is when there were no RCU readers in the system. If there are no + * RCU readers, the grace period by definition can be of zero length, + * regardless of the number of online CPUs. + */ +static inline int rcu_blocking_is_gp(void) +{ + might_sleep(); /* Check for RCU read-side critical section. */ + return num_online_cpus() <= 1; +} + /** * synchronize_sched - wait until an rcu-sched grace period has elapsed. * -- cgit v1.2.3 From 2fdbb31b662787f78bb78b3e4e18f1a072058ffc Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 23 Feb 2012 15:58:29 -0800 Subject: rcu: Add RCU_FAST_NO_HZ tracing for idle exit Traces of rcu_prep_idle events can be confusing because rcu_cleanup_after_idle() does no tracing. This commit therefore adds this tracing. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- include/trace/events/rcu.h | 1 + kernel/rcutree_plugin.h | 1 + 2 files changed, 2 insertions(+) diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index 337099783f37..aaa55e1b8c48 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -292,6 +292,7 @@ TRACE_EVENT(rcu_dyntick, * "More callbacks": Still more callbacks, try again to clear them out. * "Callbacks drained": All callbacks processed, off to dyntick idle! * "Timer": Timer fired to cause CPU to continue processing callbacks. + * "Cleanup after idle": Idle exited, timer canceled. */ TRACE_EVENT(rcu_prep_idle, diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index c023464816be..1e561ab952a3 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -2085,6 +2085,7 @@ static void rcu_prepare_for_idle_init(int cpu) static void rcu_cleanup_after_idle(int cpu) { hrtimer_cancel(&per_cpu(rcu_idle_gp_timer, cpu)); + trace_rcu_prep_idle("Cleanup after idle"); } /* -- cgit v1.2.3 From 2ee3dc80660ac8285a37e662fd91b2e45c46f06a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 23 Feb 2012 17:13:19 -0800 Subject: rcu: Make RCU_FAST_NO_HZ use timer rather than hrtimer The RCU_FAST_NO_HZ facility uses an hrtimer to wake up a CPU when it is allowed to go into dyntick-idle mode, which is almost always cancelled soon after. This is not what hrtimers are good at, so this commit switches to the timer wheel. Reported-by: Steven Rostedt Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutree_plugin.h | 40 ++++++++++++---------------------------- 1 file changed, 12 insertions(+), 28 deletions(-) diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 1e561ab952a3..0f007b363dba 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -1980,9 +1980,7 @@ static void rcu_prepare_for_idle(int cpu) static DEFINE_PER_CPU(int, rcu_dyntick_drain); static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff); -static DEFINE_PER_CPU(struct hrtimer, rcu_idle_gp_timer); -static ktime_t rcu_idle_gp_wait; /* If some non-lazy callbacks. */ -static ktime_t rcu_idle_lazy_gp_wait; /* If only lazy callbacks. */ +static DEFINE_PER_CPU(struct timer_list, rcu_idle_gp_timer); /* * Allow the CPU to enter dyntick-idle mode if either: (1) There are no @@ -2051,10 +2049,9 @@ static bool rcu_cpu_has_nonlazy_callbacks(int cpu) * real work is done upon re-entry to idle, or by the next scheduling-clock * interrupt should idle not be re-entered. */ -static enum hrtimer_restart rcu_idle_gp_timer_func(struct hrtimer *hrtp) +static void rcu_idle_gp_timer_func(unsigned long unused) { trace_rcu_prep_idle("Timer"); - return HRTIMER_NORESTART; } /* @@ -2062,19 +2059,8 @@ static enum hrtimer_restart rcu_idle_gp_timer_func(struct hrtimer *hrtp) */ static void rcu_prepare_for_idle_init(int cpu) { - static int firsttime = 1; - struct hrtimer *hrtp = &per_cpu(rcu_idle_gp_timer, cpu); - - hrtimer_init(hrtp, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - hrtp->function = rcu_idle_gp_timer_func; - if (firsttime) { - unsigned int upj = jiffies_to_usecs(RCU_IDLE_GP_DELAY); - - rcu_idle_gp_wait = ns_to_ktime(upj * (u64)1000); - upj = jiffies_to_usecs(RCU_IDLE_LAZY_GP_DELAY); - rcu_idle_lazy_gp_wait = ns_to_ktime(upj * (u64)1000); - firsttime = 0; - } + setup_timer(&per_cpu(rcu_idle_gp_timer, cpu), + rcu_idle_gp_timer_func, 0); } /* @@ -2084,7 +2070,7 @@ static void rcu_prepare_for_idle_init(int cpu) */ static void rcu_cleanup_after_idle(int cpu) { - hrtimer_cancel(&per_cpu(rcu_idle_gp_timer, cpu)); + del_timer(&per_cpu(rcu_idle_gp_timer, cpu)); trace_rcu_prep_idle("Cleanup after idle"); } @@ -2141,11 +2127,11 @@ static void rcu_prepare_for_idle(int cpu) per_cpu(rcu_dyntick_drain, cpu) = 0; per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; if (rcu_cpu_has_nonlazy_callbacks(cpu)) - hrtimer_start(&per_cpu(rcu_idle_gp_timer, cpu), - rcu_idle_gp_wait, HRTIMER_MODE_REL); + mod_timer(&per_cpu(rcu_idle_gp_timer, cpu), + jiffies + RCU_IDLE_GP_DELAY); else - hrtimer_start(&per_cpu(rcu_idle_gp_timer, cpu), - rcu_idle_lazy_gp_wait, HRTIMER_MODE_REL); + mod_timer(&per_cpu(rcu_idle_gp_timer, cpu), + jiffies + RCU_IDLE_LAZY_GP_DELAY); return; /* Nothing more to do immediately. */ } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) { /* We have hit the limit, so time to give up. */ @@ -2193,14 +2179,12 @@ static void rcu_prepare_for_idle(int cpu) static void print_cpu_stall_fast_no_hz(char *cp, int cpu) { - struct hrtimer *hrtp = &per_cpu(rcu_idle_gp_timer, cpu); + struct timer_list *tltp = &per_cpu(rcu_idle_gp_timer, cpu); - sprintf(cp, "drain=%d %c timer=%lld", + sprintf(cp, "drain=%d %c timer=%lu", per_cpu(rcu_dyntick_drain, cpu), per_cpu(rcu_dyntick_holdoff, cpu) == jiffies ? 'H' : '.', - hrtimer_active(hrtp) - ? ktime_to_us(hrtimer_get_remaining(hrtp)) - : -1); + timer_pending(tltp) ? tltp->expires - jiffies : -1); } #else /* #ifdef CONFIG_RCU_FAST_NO_HZ */ -- cgit v1.2.3 From c57afe80db4e169135eb675acc2d241e26cc064e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 28 Feb 2012 11:02:21 -0800 Subject: rcu: Make RCU_FAST_NO_HZ account for pauses out of idle Both Steven Rostedt's new idle-capable trace macros and the RCU_NONIDLE() macro can cause RCU to momentarily pause out of idle without the rest of the system being involved. This can cause rcu_prepare_for_idle() to run through its state machine too quickly, which can in turn result in needless scheduling-clock interrupts. This commit therefore adds code to enable rcu_prepare_for_idle() to distinguish between an initial entry to idle on the one hand (which needs to advance the rcu_prepare_for_idle() state machine) and an idle reentry due to idle-capable trace macros and RCU_NONIDLE() on the other hand (which should avoid advancing the rcu_prepare_for_idle() state machine). Additional state is maintained to allow the timer to be correctly reposted when returning after a momentary pause out of idle, and even more state is maintained to detect when new non-lazy callbacks have been enqueued (which may require re-evaluation of the approach to idleness). Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 2 ++ kernel/rcutree.h | 1 + kernel/rcutree_plugin.h | 57 +++++++++++++++++++++++++++++++++++++++++++++---- 3 files changed, 56 insertions(+), 4 deletions(-) diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 1050d6d3922c..403306b86e78 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1829,6 +1829,8 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), rdp->qlen++; if (lazy) rdp->qlen_lazy++; + else + rcu_idle_count_callbacks_posted(); if (__is_kfree_rcu_offset((unsigned long)func)) trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func, diff --git a/kernel/rcutree.h b/kernel/rcutree.h index cdd1be0a4072..36ca28ecedc6 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -471,6 +471,7 @@ static void __cpuinit rcu_prepare_kthreads(int cpu); static void rcu_prepare_for_idle_init(int cpu); static void rcu_cleanup_after_idle(int cpu); static void rcu_prepare_for_idle(int cpu); +static void rcu_idle_count_callbacks_posted(void); static void print_cpu_stall_info_begin(void); static void print_cpu_stall_info(struct rcu_state *rsp, int cpu); static void print_cpu_stall_info_end(void); diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 0f007b363dba..50c17975d4f4 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -1938,6 +1938,14 @@ static void rcu_prepare_for_idle(int cpu) { } +/* + * Don't bother keeping a running count of the number of RCU callbacks + * posted because CONFIG_RCU_FAST_NO_HZ=n. + */ +static void rcu_idle_count_callbacks_posted(void) +{ +} + #else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */ /* @@ -1981,6 +1989,10 @@ static void rcu_prepare_for_idle(int cpu) static DEFINE_PER_CPU(int, rcu_dyntick_drain); static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff); static DEFINE_PER_CPU(struct timer_list, rcu_idle_gp_timer); +static DEFINE_PER_CPU(unsigned long, rcu_idle_gp_timer_expires); +static DEFINE_PER_CPU(bool, rcu_idle_first_pass); +static DEFINE_PER_CPU(unsigned long, rcu_nonlazy_posted); +static DEFINE_PER_CPU(unsigned long, rcu_nonlazy_posted_snap); /* * Allow the CPU to enter dyntick-idle mode if either: (1) There are no @@ -1993,6 +2005,8 @@ static DEFINE_PER_CPU(struct timer_list, rcu_idle_gp_timer); */ int rcu_needs_cpu(int cpu) { + /* Flag a new idle sojourn to the idle-entry state machine. */ + per_cpu(rcu_idle_first_pass, cpu) = 1; /* If no callbacks, RCU doesn't need the CPU. */ if (!rcu_cpu_has_callbacks(cpu)) return 0; @@ -2095,6 +2109,26 @@ static void rcu_cleanup_after_idle(int cpu) */ static void rcu_prepare_for_idle(int cpu) { + /* + * If this is an idle re-entry, for example, due to use of + * RCU_NONIDLE() or the new idle-loop tracing API within the idle + * loop, then don't take any state-machine actions, unless the + * momentary exit from idle queued additional non-lazy callbacks. + * Instead, repost the rcu_idle_gp_timer if this CPU has callbacks + * pending. + */ + if (!per_cpu(rcu_idle_first_pass, cpu) && + (per_cpu(rcu_nonlazy_posted, cpu) == + per_cpu(rcu_nonlazy_posted_snap, cpu))) { + if (rcu_cpu_has_callbacks(cpu)) + mod_timer(&per_cpu(rcu_idle_gp_timer, cpu), + per_cpu(rcu_idle_gp_timer_expires, cpu)); + return; + } + per_cpu(rcu_idle_first_pass, cpu) = 0; + per_cpu(rcu_nonlazy_posted_snap, cpu) = + per_cpu(rcu_nonlazy_posted, cpu) - 1; + /* * If there are no callbacks on this CPU, enter dyntick-idle mode. * Also reset state to avoid prejudicing later attempts. @@ -2127,11 +2161,15 @@ static void rcu_prepare_for_idle(int cpu) per_cpu(rcu_dyntick_drain, cpu) = 0; per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; if (rcu_cpu_has_nonlazy_callbacks(cpu)) - mod_timer(&per_cpu(rcu_idle_gp_timer, cpu), - jiffies + RCU_IDLE_GP_DELAY); + per_cpu(rcu_idle_gp_timer_expires, cpu) = + jiffies + RCU_IDLE_GP_DELAY; else - mod_timer(&per_cpu(rcu_idle_gp_timer, cpu), - jiffies + RCU_IDLE_LAZY_GP_DELAY); + per_cpu(rcu_idle_gp_timer_expires, cpu) = + jiffies + RCU_IDLE_LAZY_GP_DELAY; + mod_timer(&per_cpu(rcu_idle_gp_timer, cpu), + per_cpu(rcu_idle_gp_timer_expires, cpu)); + per_cpu(rcu_nonlazy_posted_snap, cpu) = + per_cpu(rcu_nonlazy_posted, cpu); return; /* Nothing more to do immediately. */ } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) { /* We have hit the limit, so time to give up. */ @@ -2171,6 +2209,17 @@ static void rcu_prepare_for_idle(int cpu) trace_rcu_prep_idle("Callbacks drained"); } +/* + * Keep a running count of callbacks posted so that rcu_prepare_for_idle() + * can detect when something out of the idle loop posts a callback. + * Of course, it had better do so either from a trace event designed to + * be called from idle or from within RCU_NONIDLE(). + */ +static void rcu_idle_count_callbacks_posted(void) +{ + __this_cpu_add(rcu_nonlazy_posted, 1); +} + #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ #ifdef CONFIG_RCU_CPU_STALL_INFO -- cgit v1.2.3 From 37e377d2823e03528cb64f435d7c0e30b1c668eb Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 17 Feb 2012 22:12:18 -0800 Subject: rcu: Fixes to rcutorture error handling and cleanup The rcutorture initialization code ignored the error returns from rcu_torture_onoff_init() and rcu_torture_stall_init(). The rcutorture cleanup code failed to NULL out a number of pointers. These bugs will normally have no effect, but this commit fixes them nevertheless. Signed-off-by: Paul E. McKenney --- kernel/rcutorture.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index a89b381a8c6e..1463a0636443 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -1337,6 +1337,7 @@ static void rcutorture_booster_cleanup(int cpu) /* This must be outside of the mutex, otherwise deadlock! */ kthread_stop(t); + boost_tasks[cpu] = NULL; } static int rcutorture_booster_init(int cpu) @@ -1484,13 +1485,15 @@ static void rcu_torture_onoff_cleanup(void) return; VERBOSE_PRINTK_STRING("Stopping rcu_torture_onoff task"); kthread_stop(onoff_task); + onoff_task = NULL; } #else /* #ifdef CONFIG_HOTPLUG_CPU */ -static void +static int rcu_torture_onoff_init(void) { + return 0; } static void rcu_torture_onoff_cleanup(void) @@ -1554,6 +1557,7 @@ static void rcu_torture_stall_cleanup(void) return; VERBOSE_PRINTK_STRING("Stopping rcu_torture_stall_task."); kthread_stop(stall_task); + stall_task = NULL; } static int rcutorture_cpu_notify(struct notifier_block *self, @@ -1665,6 +1669,7 @@ rcu_torture_cleanup(void) VERBOSE_PRINTK_STRING("Stopping rcu_torture_shutdown task"); kthread_stop(shutdown_task); } + shutdown_task = NULL; rcu_torture_onoff_cleanup(); /* Wait for all RCU callbacks to fire. */ @@ -1897,9 +1902,17 @@ rcu_torture_init(void) goto unwind; } } - rcu_torture_onoff_init(); + i = rcu_torture_onoff_init(); + if (i != 0) { + firsterr = i; + goto unwind; + } register_reboot_notifier(&rcutorture_shutdown_nb); - rcu_torture_stall_init(); + i = rcu_torture_stall_init(); + if (i != 0) { + firsterr = i; + goto unwind; + } rcutorture_record_test_transition(); mutex_unlock(&fullstop_mutex); return 0; -- cgit v1.2.3 From 79b9a75fb703b6a2670e46b9dc495af5bc7029b3 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 23 Apr 2012 10:06:39 -0700 Subject: rcu: Add warning for RCU_FAST_NO_HZ timer firing RCU_FAST_NO_HZ uses a timer to limit the time that a CPU with callbacks can remain in dyntick-idle mode. This timer is cancelled when the CPU exits idle, and therefore should never fire. However, if the timer were migrated to some other CPU for whatever reason (1) the timer could actually fire and (2) firing on some other CPU would fail to wake up the CPU with callbacks, possibly resulting in sluggishness or a system hang. This commit therfore adds a WARN_ON_ONCE() to the timer handler in order to detect this condition. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutree_plugin.h | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 50c17975d4f4..ad61da79b311 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -2065,6 +2065,7 @@ static bool rcu_cpu_has_nonlazy_callbacks(int cpu) */ static void rcu_idle_gp_timer_func(unsigned long unused) { + WARN_ON_ONCE(1); /* Getting here can hang the system... */ trace_rcu_prep_idle("Timer"); } -- cgit v1.2.3 From 048a0e8f5e1d94c01a5fc70f5b2f2fd2f4527326 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 26 Apr 2012 10:52:27 -0700 Subject: timer: Fix mod_timer_pinned() header comment The mod_timer_pinned() header comment states that it prevents timers from being migrated to a different CPU. This is not the case, instead, it ensures that the timer is posted to the current CPU, but does nothing to prevent CPU-hotplug operations from migrating the timer. This commit therefore brings the comment header into alignment with reality. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Acked-by: Steven Rostedt --- kernel/timer.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/kernel/timer.c b/kernel/timer.c index a297ffcf888e..837c552fe838 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -861,7 +861,13 @@ EXPORT_SYMBOL(mod_timer); * * mod_timer_pinned() is a way to update the expire field of an * active timer (if the timer is inactive it will be activated) - * and not allow the timer to be migrated to a different CPU. + * and to ensure that the timer is scheduled on the current CPU. + * + * Note that this does not prevent the timer from being migrated + * when the current CPU goes offline. If this is a problem for + * you, use CPU-hotplug notifiers to handle it correctly, for + * example, cancelling the timer when the corresponding CPU goes + * offline. * * mod_timer_pinned(timer, expires) is equivalent to: * -- cgit v1.2.3 From fae4b54f28f034d228fa3bfc98858c698b64e89c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 20 Feb 2012 17:51:45 -0800 Subject: rcu: Introduce rcutorture testing for rcu_barrier() Although rcutorture does invoke rcu_barrier() and friends, it cannot really be called a torture test given that it invokes them only once at the end of the test. This commit therefore introduces heavy-duty rcutorture testing for rcu_barrier(), which may be carried out concurrently with normal rcutorture testing. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- Documentation/RCU/torture.txt | 15 +++- kernel/rcutorture.c | 194 ++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 200 insertions(+), 9 deletions(-) diff --git a/Documentation/RCU/torture.txt b/Documentation/RCU/torture.txt index 375d3fb71437..4ddf3913fd8c 100644 --- a/Documentation/RCU/torture.txt +++ b/Documentation/RCU/torture.txt @@ -47,6 +47,16 @@ irqreader Says to invoke RCU readers from irq level. This is currently permit this. (Or, more accurately, variants of RCU that do -not- permit this know to ignore this variable.) +n_barrier_cbs If this is nonzero, RCU barrier testing will be conducted, + in which case n_barrier_cbs specifies the number of + RCU callbacks (and corresponding kthreads) to use for + this testing. The value cannot be negative. If you + specify this to be non-zero when torture_type indicates a + synchronous RCU implementation (one for which a member of + the synchronize_rcu() rather than the call_rcu() family is + used -- see the documentation for torture_type below), an + error will be reported and no testing will be carried out. + nfakewriters This is the number of RCU fake writer threads to run. Fake writer threads repeatedly use the synchronous "wait for current readers" function of the interface selected by @@ -188,7 +198,7 @@ OUTPUT The statistics output is as follows: rcu-torture:--- Start of test: nreaders=16 nfakewriters=4 stat_interval=30 verbose=0 test_no_idle_hz=1 shuffle_interval=3 stutter=5 irqreader=1 fqs_duration=0 fqs_holdoff=0 fqs_stutter=3 test_boost=1/0 test_boost_interval=7 test_boost_duration=4 - rcu-torture: rtc: (null) ver: 155441 tfle: 0 rta: 155441 rtaf: 8884 rtf: 155440 rtmbe: 0 rtbke: 0 rtbre: 0 rtbf: 0 rtb: 0 nt: 3055767 + rcu-torture: rtc: (null) ver: 155441 tfle: 0 rta: 155441 rtaf: 8884 rtf: 155440 rtmbe: 0 rtbe: 0 rtbke: 0 rtbre: 0 rtbf: 0 rtb: 0 nt: 3055767 rcu-torture: Reader Pipe: 727860534 34213 0 0 0 0 0 0 0 0 0 rcu-torture: Reader Batch: 727877838 17003 0 0 0 0 0 0 0 0 0 rcu-torture: Free-Block Circulation: 155440 155440 155440 155440 155440 155440 155440 155440 155440 155440 0 @@ -230,6 +240,9 @@ o "rtmbe": A non-zero value indicates that rcutorture believes that rcu_assign_pointer() and rcu_dereference() are not working correctly. This value should be zero. +o "rtbe": A non-zero value indicates that one of the rcu_barrier() + family of functions is not working correctly. + o "rtbke": rcutorture was unable to create the real-time kthreads used to force RCU priority inversion. This value should be zero. diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 1463a0636443..8cd262b41499 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -64,6 +64,7 @@ static int irqreader = 1; /* RCU readers from irq (timers). */ static int fqs_duration; /* Duration of bursts (us), 0 to disable. */ static int fqs_holdoff; /* Hold time within burst (us). */ static int fqs_stutter = 3; /* Wait time between bursts (s). */ +static int n_barrier_cbs; /* Number of callbacks to test RCU barriers. */ static int onoff_interval; /* Wait time between CPU hotplugs, 0=disable. */ static int onoff_holdoff; /* Seconds after boot before CPU hotplugs. */ static int shutdown_secs; /* Shutdown time (s). <=0 for no shutdown. */ @@ -96,6 +97,8 @@ module_param(fqs_holdoff, int, 0444); MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); module_param(fqs_stutter, int, 0444); MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); +module_param(n_barrier_cbs, int, 0444); +MODULE_PARM_DESC(n_barrier_cbs, "# of callbacks/kthreads for barrier testing"); module_param(onoff_interval, int, 0444); MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable"); module_param(onoff_holdoff, int, 0444); @@ -139,6 +142,8 @@ static struct task_struct *shutdown_task; static struct task_struct *onoff_task; #endif /* #ifdef CONFIG_HOTPLUG_CPU */ static struct task_struct *stall_task; +static struct task_struct **barrier_cbs_tasks; +static struct task_struct *barrier_task; #define RCU_TORTURE_PIPE_LEN 10 @@ -164,6 +169,7 @@ static atomic_t n_rcu_torture_alloc_fail; static atomic_t n_rcu_torture_free; static atomic_t n_rcu_torture_mberror; static atomic_t n_rcu_torture_error; +static long n_rcu_torture_barrier_error; static long n_rcu_torture_boost_ktrerror; static long n_rcu_torture_boost_rterror; static long n_rcu_torture_boost_failure; @@ -173,6 +179,8 @@ static long n_offline_attempts; static long n_offline_successes; static long n_online_attempts; static long n_online_successes; +static long n_barrier_attempts; +static long n_barrier_successes; static struct list_head rcu_torture_removed; static cpumask_var_t shuffle_tmp_mask; @@ -197,6 +205,10 @@ static unsigned long shutdown_time; /* jiffies to system shutdown. */ static unsigned long boost_starttime; /* jiffies of next boost test start. */ DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ /* and boost task create/destroy. */ +static atomic_t barrier_cbs_count; /* Barrier callbacks registered. */ +static atomic_t barrier_cbs_invoked; /* Barrier callbacks invoked. */ +static wait_queue_head_t *barrier_cbs_wq; /* Coordinate barrier testing. */ +static DECLARE_WAIT_QUEUE_HEAD(barrier_wq); /* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */ @@ -327,6 +339,7 @@ struct rcu_torture_ops { int (*completed)(void); void (*deferred_free)(struct rcu_torture *p); void (*sync)(void); + void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); void (*cb_barrier)(void); void (*fqs)(void); int (*stats)(char *page); @@ -417,6 +430,7 @@ static struct rcu_torture_ops rcu_ops = { .completed = rcu_torture_completed, .deferred_free = rcu_torture_deferred_free, .sync = synchronize_rcu, + .call = call_rcu, .cb_barrier = rcu_barrier, .fqs = rcu_force_quiescent_state, .stats = NULL, @@ -460,6 +474,7 @@ static struct rcu_torture_ops rcu_sync_ops = { .completed = rcu_torture_completed, .deferred_free = rcu_sync_torture_deferred_free, .sync = synchronize_rcu, + .call = NULL, .cb_barrier = NULL, .fqs = rcu_force_quiescent_state, .stats = NULL, @@ -477,6 +492,7 @@ static struct rcu_torture_ops rcu_expedited_ops = { .completed = rcu_no_completed, .deferred_free = rcu_sync_torture_deferred_free, .sync = synchronize_rcu_expedited, + .call = NULL, .cb_barrier = NULL, .fqs = rcu_force_quiescent_state, .stats = NULL, @@ -519,6 +535,7 @@ static struct rcu_torture_ops rcu_bh_ops = { .completed = rcu_bh_torture_completed, .deferred_free = rcu_bh_torture_deferred_free, .sync = synchronize_rcu_bh, + .call = call_rcu_bh, .cb_barrier = rcu_barrier_bh, .fqs = rcu_bh_force_quiescent_state, .stats = NULL, @@ -535,6 +552,7 @@ static struct rcu_torture_ops rcu_bh_sync_ops = { .completed = rcu_bh_torture_completed, .deferred_free = rcu_sync_torture_deferred_free, .sync = synchronize_rcu_bh, + .call = NULL, .cb_barrier = NULL, .fqs = rcu_bh_force_quiescent_state, .stats = NULL, @@ -551,6 +569,7 @@ static struct rcu_torture_ops rcu_bh_expedited_ops = { .completed = rcu_bh_torture_completed, .deferred_free = rcu_sync_torture_deferred_free, .sync = synchronize_rcu_bh_expedited, + .call = NULL, .cb_barrier = NULL, .fqs = rcu_bh_force_quiescent_state, .stats = NULL, @@ -637,6 +656,7 @@ static struct rcu_torture_ops srcu_ops = { .completed = srcu_torture_completed, .deferred_free = rcu_sync_torture_deferred_free, .sync = srcu_torture_synchronize, + .call = NULL, .cb_barrier = NULL, .stats = srcu_torture_stats, .name = "srcu" @@ -661,6 +681,7 @@ static struct rcu_torture_ops srcu_raw_ops = { .completed = srcu_torture_completed, .deferred_free = rcu_sync_torture_deferred_free, .sync = srcu_torture_synchronize, + .call = NULL, .cb_barrier = NULL, .stats = srcu_torture_stats, .name = "srcu_raw" @@ -680,6 +701,7 @@ static struct rcu_torture_ops srcu_expedited_ops = { .completed = srcu_torture_completed, .deferred_free = rcu_sync_torture_deferred_free, .sync = srcu_torture_synchronize_expedited, + .call = NULL, .cb_barrier = NULL, .stats = srcu_torture_stats, .name = "srcu_expedited" @@ -1129,7 +1151,8 @@ rcu_torture_printk(char *page) "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d " "rtmbe: %d rtbke: %ld rtbre: %ld " "rtbf: %ld rtb: %ld nt: %ld " - "onoff: %ld/%ld:%ld/%ld", + "onoff: %ld/%ld:%ld/%ld " + "barrier: %ld/%ld:%ld", rcu_torture_current, rcu_torture_current_version, list_empty(&rcu_torture_freelist), @@ -1145,14 +1168,17 @@ rcu_torture_printk(char *page) n_online_successes, n_online_attempts, n_offline_successes, - n_offline_attempts); + n_offline_attempts, + n_barrier_successes, + n_barrier_attempts, + n_rcu_torture_barrier_error); + cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); if (atomic_read(&n_rcu_torture_mberror) != 0 || + n_rcu_torture_barrier_error != 0 || n_rcu_torture_boost_ktrerror != 0 || n_rcu_torture_boost_rterror != 0 || - n_rcu_torture_boost_failure != 0) - cnt += sprintf(&page[cnt], " !!!"); - cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); - if (i > 1) { + n_rcu_torture_boost_failure != 0 || + i > 1) { cnt += sprintf(&page[cnt], "!!! "); atomic_inc(&n_rcu_torture_error); WARN_ON_ONCE(1); @@ -1560,6 +1586,151 @@ static void rcu_torture_stall_cleanup(void) stall_task = NULL; } +/* Callback function for RCU barrier testing. */ +void rcu_torture_barrier_cbf(struct rcu_head *rcu) +{ + atomic_inc(&barrier_cbs_invoked); +} + +/* kthread function to register callbacks used to test RCU barriers. */ +static int rcu_torture_barrier_cbs(void *arg) +{ + long myid = (long)arg; + struct rcu_head rcu; + + init_rcu_head_on_stack(&rcu); + VERBOSE_PRINTK_STRING("rcu_torture_barrier_cbs task started"); + set_user_nice(current, 19); + do { + wait_event(barrier_cbs_wq[myid], + atomic_read(&barrier_cbs_count) == n_barrier_cbs || + kthread_should_stop() || + fullstop != FULLSTOP_DONTSTOP); + if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP) + break; + cur_ops->call(&rcu, rcu_torture_barrier_cbf); + if (atomic_dec_and_test(&barrier_cbs_count)) + wake_up(&barrier_wq); + } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); + VERBOSE_PRINTK_STRING("rcu_torture_barrier_cbs task stopping"); + rcutorture_shutdown_absorb("rcu_torture_barrier_cbs"); + while (!kthread_should_stop()) + schedule_timeout_interruptible(1); + cur_ops->cb_barrier(); + destroy_rcu_head_on_stack(&rcu); + return 0; +} + +/* kthread function to drive and coordinate RCU barrier testing. */ +static int rcu_torture_barrier(void *arg) +{ + int i; + + VERBOSE_PRINTK_STRING("rcu_torture_barrier task starting"); + do { + atomic_set(&barrier_cbs_invoked, 0); + atomic_set(&barrier_cbs_count, n_barrier_cbs); + /* wake_up() path contains the required barriers. */ + for (i = 0; i < n_barrier_cbs; i++) + wake_up(&barrier_cbs_wq[i]); + wait_event(barrier_wq, + atomic_read(&barrier_cbs_count) == 0 || + kthread_should_stop() || + fullstop != FULLSTOP_DONTSTOP); + if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP) + break; + n_barrier_attempts++; + cur_ops->cb_barrier(); + if (atomic_read(&barrier_cbs_invoked) != n_barrier_cbs) { + n_rcu_torture_barrier_error++; + WARN_ON_ONCE(1); + } + n_barrier_successes++; + schedule_timeout_interruptible(HZ / 10); + } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); + VERBOSE_PRINTK_STRING("rcu_torture_barrier task stopping"); + rcutorture_shutdown_absorb("rcu_torture_barrier_cbs"); + while (!kthread_should_stop()) + schedule_timeout_interruptible(1); + return 0; +} + +/* Initialize RCU barrier testing. */ +static int rcu_torture_barrier_init(void) +{ + int i; + int ret; + + if (n_barrier_cbs == 0) + return 0; + if (cur_ops->call == NULL || cur_ops->cb_barrier == NULL) { + printk(KERN_ALERT "%s" TORTURE_FLAG + " Call or barrier ops missing for %s,\n", + torture_type, cur_ops->name); + printk(KERN_ALERT "%s" TORTURE_FLAG + " RCU barrier testing omitted from run.\n", + torture_type); + return 0; + } + atomic_set(&barrier_cbs_count, 0); + atomic_set(&barrier_cbs_invoked, 0); + barrier_cbs_tasks = + kzalloc(n_barrier_cbs * sizeof(barrier_cbs_tasks[0]), + GFP_KERNEL); + barrier_cbs_wq = + kzalloc(n_barrier_cbs * sizeof(barrier_cbs_wq[0]), + GFP_KERNEL); + if (barrier_cbs_tasks == NULL || barrier_cbs_wq == 0) + return -ENOMEM; + for (i = 0; i < n_barrier_cbs; i++) { + init_waitqueue_head(&barrier_cbs_wq[i]); + barrier_cbs_tasks[i] = kthread_run(rcu_torture_barrier_cbs, + (void *)i, + "rcu_torture_barrier_cbs"); + if (IS_ERR(barrier_cbs_tasks[i])) { + ret = PTR_ERR(barrier_cbs_tasks[i]); + VERBOSE_PRINTK_ERRSTRING("Failed to create rcu_torture_barrier_cbs"); + barrier_cbs_tasks[i] = NULL; + return ret; + } + } + barrier_task = kthread_run(rcu_torture_barrier, NULL, + "rcu_torture_barrier"); + if (IS_ERR(barrier_task)) { + ret = PTR_ERR(barrier_task); + VERBOSE_PRINTK_ERRSTRING("Failed to create rcu_torture_barrier"); + barrier_task = NULL; + } + return 0; +} + +/* Clean up after RCU barrier testing. */ +static void rcu_torture_barrier_cleanup(void) +{ + int i; + + if (barrier_task != NULL) { + VERBOSE_PRINTK_STRING("Stopping rcu_torture_barrier task"); + kthread_stop(barrier_task); + barrier_task = NULL; + } + if (barrier_cbs_tasks != NULL) { + for (i = 0; i < n_barrier_cbs; i++) { + if (barrier_cbs_tasks[i] != NULL) { + VERBOSE_PRINTK_STRING("Stopping rcu_torture_barrier_cbs task"); + kthread_stop(barrier_cbs_tasks[i]); + barrier_cbs_tasks[i] = NULL; + } + } + kfree(barrier_cbs_tasks); + barrier_cbs_tasks = NULL; + } + if (barrier_cbs_wq != NULL) { + kfree(barrier_cbs_wq); + barrier_cbs_wq = NULL; + } +} + static int rcutorture_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { @@ -1602,6 +1773,7 @@ rcu_torture_cleanup(void) fullstop = FULLSTOP_RMMOD; mutex_unlock(&fullstop_mutex); unregister_reboot_notifier(&rcutorture_shutdown_nb); + rcu_torture_barrier_cleanup(); rcu_torture_stall_cleanup(); if (stutter_task) { VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task"); @@ -1681,7 +1853,7 @@ rcu_torture_cleanup(void) if (cur_ops->cleanup) cur_ops->cleanup(); - if (atomic_read(&n_rcu_torture_error)) + if (atomic_read(&n_rcu_torture_error) || n_rcu_torture_barrier_error) rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE"); else if (n_online_successes != n_online_attempts || n_offline_successes != n_offline_attempts) @@ -1697,6 +1869,7 @@ rcu_torture_init(void) int i; int cpu; int firsterr = 0; + int retval; static struct rcu_torture_ops *torture_ops[] = { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops, &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops, @@ -1754,6 +1927,7 @@ rcu_torture_init(void) atomic_set(&n_rcu_torture_free, 0); atomic_set(&n_rcu_torture_mberror, 0); atomic_set(&n_rcu_torture_error, 0); + n_rcu_torture_barrier_error = 0; n_rcu_torture_boost_ktrerror = 0; n_rcu_torture_boost_rterror = 0; n_rcu_torture_boost_failure = 0; @@ -1877,7 +2051,6 @@ rcu_torture_init(void) test_boost_duration = 2; if ((test_boost == 1 && cur_ops->can_boost) || test_boost == 2) { - int retval; boost_starttime = jiffies + test_boost_interval * HZ; register_cpu_notifier(&rcutorture_cpu_nb); @@ -1913,6 +2086,11 @@ rcu_torture_init(void) firsterr = i; goto unwind; } + retval = rcu_torture_barrier_init(); + if (retval != 0) { + firsterr = retval; + goto unwind; + } rcutorture_record_test_transition(); mutex_unlock(&fullstop_mutex); return 0; -- cgit v1.2.3 From cef50120b61c2af4ce34bc165e19cad66296f93d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 5 Feb 2012 07:42:44 -0800 Subject: rcu: Direct algorithmic SRCU implementation The current implementation of synchronize_srcu_expedited() can cause severe OS jitter due to its use of synchronize_sched(), which in turn invokes try_stop_cpus(), which causes each CPU to be sent an IPI. This can result in severe performance degradation for real-time workloads and especially for short-interation-length HPC workloads. Furthermore, because only one instance of try_stop_cpus() can be making forward progress at a given time, only one instance of synchronize_srcu_expedited() can make forward progress at a time, even if they are all operating on distinct srcu_struct structures. This commit, inspired by an earlier implementation by Peter Zijlstra (https://lkml.org/lkml/2012/1/31/211) and by further offline discussions, takes a strictly algorithmic bits-in-memory approach. This has the disadvantage of requiring one explicit memory-barrier instruction in each of srcu_read_lock() and srcu_read_unlock(), but on the other hand completely dispenses with OS jitter and furthermore allows SRCU to be used freely by CPUs that RCU believes to be idle or offline. The update-side implementation handles the single read-side memory barrier by rechecking the per-CPU counters after summing them and by running through the update-side state machine twice. This implementation has passed moderate rcutorture testing on both x86 and Power. Also updated to use this_cpu_ptr() instead of per_cpu_ptr(), as suggested by Peter Zijlstra. Reported-by: Peter Zijlstra Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Acked-by: Peter Zijlstra Reviewed-by: Lai Jiangshan --- include/linux/srcu.h | 10 +- kernel/rcutorture.c | 2 +- kernel/srcu.c | 284 ++++++++++++++++++++++++++++++++++----------------- 3 files changed, 198 insertions(+), 98 deletions(-) diff --git a/include/linux/srcu.h b/include/linux/srcu.h index d3d5fa54f25e..a478c8eb8479 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -31,13 +31,19 @@ #include struct srcu_struct_array { - int c[2]; + unsigned long c[2]; }; +/* Bit definitions for field ->c above and ->snap below. */ +#define SRCU_USAGE_BITS 2 +#define SRCU_REF_MASK (ULONG_MAX >> SRCU_USAGE_BITS) +#define SRCU_USAGE_COUNT (SRCU_REF_MASK + 1) + struct srcu_struct { - int completed; + unsigned completed; struct srcu_struct_array __percpu *per_cpu_ref; struct mutex mutex; + unsigned long snap[NR_CPUS]; #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 8cd262b41499..d10b179dea83 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -639,7 +639,7 @@ static int srcu_torture_stats(char *page) cnt += sprintf(&page[cnt], "%s%s per-CPU(idx=%d):", torture_type, TORTURE_FLAG, idx); for_each_possible_cpu(cpu) { - cnt += sprintf(&page[cnt], " %d(%d,%d)", cpu, + cnt += sprintf(&page[cnt], " %d(%lu,%lu)", cpu, per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx], per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]); } diff --git a/kernel/srcu.c b/kernel/srcu.c index ba35f3a4a1f4..84c9b97dc3d9 100644 --- a/kernel/srcu.c +++ b/kernel/srcu.c @@ -73,19 +73,102 @@ EXPORT_SYMBOL_GPL(init_srcu_struct); #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ /* - * srcu_readers_active_idx -- returns approximate number of readers - * active on the specified rank of per-CPU counters. + * Returns approximate number of readers active on the specified rank + * of per-CPU counters. Also snapshots each counter's value in the + * corresponding element of sp->snap[] for later use validating + * the sum. */ +static unsigned long srcu_readers_active_idx(struct srcu_struct *sp, int idx) +{ + int cpu; + unsigned long sum = 0; + unsigned long t; -static int srcu_readers_active_idx(struct srcu_struct *sp, int idx) + for_each_possible_cpu(cpu) { + t = ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]); + sum += t; + sp->snap[cpu] = t; + } + return sum & SRCU_REF_MASK; +} + +/* + * To be called from the update side after an index flip. Returns true + * if the modulo sum of the counters is stably zero, false if there is + * some possibility of non-zero. + */ +static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx) { int cpu; - int sum; - sum = 0; + /* + * Note that srcu_readers_active_idx() can incorrectly return + * zero even though there is a pre-existing reader throughout. + * To see this, suppose that task A is in a very long SRCU + * read-side critical section that started on CPU 0, and that + * no other reader exists, so that the modulo sum of the counters + * is equal to one. Then suppose that task B starts executing + * srcu_readers_active_idx(), summing up to CPU 1, and then that + * task C starts reading on CPU 0, so that its increment is not + * summed, but finishes reading on CPU 2, so that its decrement + * -is- summed. Then when task B completes its sum, it will + * incorrectly get zero, despite the fact that task A has been + * in its SRCU read-side critical section the whole time. + * + * We therefore do a validation step should srcu_readers_active_idx() + * return zero. + */ + if (srcu_readers_active_idx(sp, idx) != 0) + return false; + + /* + * Since the caller recently flipped ->completed, we can see at + * most one increment of each CPU's counter from this point + * forward. The reason for this is that the reader CPU must have + * fetched the index before srcu_readers_active_idx checked + * that CPU's counter, but not yet incremented its counter. + * Its eventual counter increment will follow the read in + * srcu_readers_active_idx(), and that increment is immediately + * followed by smp_mb() B. Because smp_mb() D is between + * the ->completed flip and srcu_readers_active_idx()'s read, + * that CPU's subsequent load of ->completed must see the new + * value, and therefore increment the counter in the other rank. + */ + smp_mb(); /* A */ + + /* + * Now, we check the ->snap array that srcu_readers_