From b934b7ff5ea755473d9737d79ab303722ceba22e Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Wed, 12 Jul 2023 23:15:56 +0800 Subject: [PATCH 01/56] rcu: Delete a redundant check in rcu_check_gp_kthread_starvation() The rcu_check_gp_kthread_starvation() function uses task_cpu() to sample the last CPU that the grace-period kthread ran on, and task_cpu() samples the thread_info structure's ->cpu field. But this field will always contain a number corresponding to a CPU that was online some time in the past, thus never a negative number. This invariant is checked by a WARN_ON_ONCE() in set_task_cpu(). This means that if the grace-period kthread exists, that is, if the "gpk" local variable is non-NULL, the "cpu" local variable will be non-negative. This in turn means that the existing check for non-negative "cpu" is redundant with the enclosing check for non-NULL "gpk". This commit threefore removes the redundant check of "cpu". Signed-off-by: Zhen Lei Signed-off-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker --- kernel/rcu/tree_stall.h | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 6f06dc12904a..b5ce0580074e 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -537,13 +537,11 @@ static void rcu_check_gp_kthread_starvation(void) pr_err("\tUnless %s kthread gets sufficient CPU time, OOM is now expected behavior.\n", rcu_state.name); pr_err("RCU grace-period kthread stack dump:\n"); sched_show_task(gpk); - if (cpu >= 0) { - if (cpu_is_offline(cpu)) { - pr_err("RCU GP kthread last ran on offline CPU %d.\n", cpu); - } else { - pr_err("Stack dump where RCU GP kthread last ran:\n"); - dump_cpu_task(cpu); - } + if (cpu_is_offline(cpu)) { + pr_err("RCU GP kthread last ran on offline CPU %d.\n", cpu); + } else { + pr_err("Stack dump where RCU GP kthread last ran:\n"); + dump_cpu_task(cpu); } wake_up_process(gpk); } From f3efe02fd56e2498ad8da4c4f444a34aa1456a46 Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Wed, 12 Jul 2023 23:15:57 +0800 Subject: [PATCH 02/56] rcu: Don't redump the stalled CPU where RCU GP kthread last ran The stacks of all stalled CPUs will be dumped in rcu_dump_cpu_stacks(). If the CPU on where RCU GP kthread last ran is stalled, its stack does not need to be dumped again. We can search the corresponding backtrace based on the printed CPU ID. For example: [ 87.328275] rcu: rcu_sched kthread starved for ... ->cpu=3 <--------| ... ... | [ 89.385007] NMI backtrace for cpu 3 <--------| [ 89.385179] CPU: 3 PID: 0 Comm: swapper/3 Not tainted 5.10.0+ #22 <--| [ 89.385188] Hardware name: linux,dummy-virt (DT) [ 89.385196] pstate: 60000005 (nZCv daif -PAN -UAO -TCO BTYPE=--) [ 89.385204] pc : arch_cpu_idle+0x40/0xc0 [ 89.385211] lr : arch_cpu_idle+0x2c/0xc0 ... ... [ 89.385566] Call trace: [ 89.385574] arch_cpu_idle+0x40/0xc0 [ 89.385581] default_idle_call+0x100/0x450 [ 89.385589] cpuidle_idle_call+0x2f8/0x460 [ 89.385596] do_idle+0x1dc/0x3d0 [ 89.385604] cpu_startup_entry+0x5c/0xb0 [ 89.385613] secondary_start_kernel+0x35c/0x520 Signed-off-by: Zhen Lei Reviewed-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker --- kernel/rcu/tree_stall.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index b5ce0580074e..fc04a8d7ce96 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -534,12 +534,14 @@ static void rcu_check_gp_kthread_starvation(void) data_race(READ_ONCE(rcu_state.gp_state)), gpk ? data_race(READ_ONCE(gpk->__state)) : ~0, cpu); if (gpk) { + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + pr_err("\tUnless %s kthread gets sufficient CPU time, OOM is now expected behavior.\n", rcu_state.name); pr_err("RCU grace-period kthread stack dump:\n"); sched_show_task(gpk); if (cpu_is_offline(cpu)) { pr_err("RCU GP kthread last ran on offline CPU %d.\n", cpu); - } else { + } else if (!(data_race(READ_ONCE(rdp->mynode->qsmask)) & rdp->grpmask)) { pr_err("Stack dump where RCU GP kthread last ran:\n"); dump_cpu_task(cpu); } From 243d5ab34446854ceca55146fc0d837655657f8e Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Mon, 24 Jul 2023 10:26:51 +0800 Subject: [PATCH 03/56] rcu: Eliminate check_cpu_stall() duplicate code The code and comments of self-detected and other-detected RCU CPU stall warnings are identical except the output function. This commit therefore refactors so as to consolidate the duplicate code. Signed-off-by: Zhen Lei Signed-off-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker --- kernel/rcu/tree_stall.h | 42 +++++++++++++++-------------------------- 1 file changed, 15 insertions(+), 27 deletions(-) diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index fc04a8d7ce96..2443d1d4a6dc 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -711,7 +711,7 @@ static void print_cpu_stall(unsigned long gps) static void check_cpu_stall(struct rcu_data *rdp) { - bool didstall = false; + bool self_detected; unsigned long gs1; unsigned long gs2; unsigned long gps; @@ -758,10 +758,10 @@ static void check_cpu_stall(struct rcu_data *rdp) return; /* No stall or GP completed since entering function. */ rnp = rdp->mynode; jn = jiffies + ULONG_MAX / 2; + self_detected = READ_ONCE(rnp->qsmask) & rdp->grpmask; if (rcu_gp_in_progress() && - (READ_ONCE(rnp->qsmask) & rdp->grpmask) && + (self_detected || ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY)) && cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) { - /* * If a virtual machine is stopped by the host it can look to * the watchdog like an RCU stall. Check to see if the host @@ -770,33 +770,21 @@ static void check_cpu_stall(struct rcu_data *rdp) if (kvm_check_and_clear_guest_paused()) return; - /* We haven't checked in, so go dump stack. */ - print_cpu_stall(gps); + if (self_detected) { + /* We haven't checked in, so go dump stack. */ + print_cpu_stall(gps); + } else { + /* They had a few time units to dump stack, so complain. */ + print_other_cpu_stall(gs2, gps); + } + if (READ_ONCE(rcu_cpu_stall_ftrace_dump)) rcu_ftrace_dump(DUMP_ALL); - didstall = true; - } else if (rcu_gp_in_progress() && - ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY) && - cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) { - - /* - * If a virtual machine is stopped by the host it can look to - * the watchdog like an RCU stall. Check to see if the host - * stopped the vm. - */ - if (kvm_check_and_clear_guest_paused()) - return; - - /* They had a few time units to dump stack, so complain. */ - print_other_cpu_stall(gs2, gps); - if (READ_ONCE(rcu_cpu_stall_ftrace_dump)) - rcu_ftrace_dump(DUMP_ALL); - didstall = true; - } - if (didstall && READ_ONCE(rcu_state.jiffies_stall) == jn) { - jn = jiffies + 3 * rcu_jiffies_till_stall_check() + 3; - WRITE_ONCE(rcu_state.jiffies_stall, jn); + if (READ_ONCE(rcu_state.jiffies_stall) == jn) { + jn = jiffies + 3 * rcu_jiffies_till_stall_check() + 3; + WRITE_ONCE(rcu_state.jiffies_stall, jn); + } } } From 5b404fdabacf4bee92d8c66013402a85f18a6a10 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 15 Aug 2023 15:46:32 -0700 Subject: [PATCH 04/56] rcu: Add RCU CPU stall notifier It is sometimes helpful to have a way for the subsystem causing the stall to dump its state when an RCU CPU stall occurs. This commit therefore bases rcu_stall_chain_notifier_register() and rcu_stall_chain_notifier_unregister() on atomic notifiers in order to provide this functionality. Signed-off-by: Paul E. McKenney Cc: Steven Rostedt Signed-off-by: Frederic Weisbecker --- include/linux/rcu_notifier.h | 32 +++++++++++++++++++ kernel/rcu/rcu.h | 6 ++++ kernel/rcu/tree_exp.h | 6 +++- kernel/rcu/tree_stall.h | 59 +++++++++++++++++++++++++++++++++++- 4 files changed, 101 insertions(+), 2 deletions(-) create mode 100644 include/linux/rcu_notifier.h diff --git a/include/linux/rcu_notifier.h b/include/linux/rcu_notifier.h new file mode 100644 index 000000000000..ebf371364581 --- /dev/null +++ b/include/linux/rcu_notifier.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * Read-Copy Update notifiers, initially RCU CPU stall notifier. + * Separate from rcupdate.h to avoid #include loops. + * + * Copyright (C) 2023 Paul E. McKenney. + */ + +#ifndef __LINUX_RCU_NOTIFIER_H +#define __LINUX_RCU_NOTIFIER_H + +// Actions for RCU CPU stall notifier calls. +#define RCU_STALL_NOTIFY_NORM 1 +#define RCU_STALL_NOTIFY_EXP 2 + +#ifdef CONFIG_RCU_STALL_COMMON + +#include +#include + +int rcu_stall_chain_notifier_register(struct notifier_block *n); +int rcu_stall_chain_notifier_unregister(struct notifier_block *n); + +#else // #ifdef CONFIG_RCU_STALL_COMMON + +// No RCU CPU stall warnings in Tiny RCU. +static inline int rcu_stall_chain_notifier_register(struct notifier_block *n) { return -EEXIST; } +static inline int rcu_stall_chain_notifier_unregister(struct notifier_block *n) { return -ENOENT; } + +#endif // #else // #ifdef CONFIG_RCU_STALL_COMMON + +#endif /* __LINUX_RCU_NOTIFIER_H */ diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 98e13be411af..ef3bab977407 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -654,4 +654,10 @@ static inline bool rcu_cpu_beenfullyonline(int cpu) { return true; } bool rcu_cpu_beenfullyonline(int cpu); #endif +#ifdef CONFIG_RCU_STALL_COMMON +int rcu_stall_notifier_call_chain(unsigned long val, void *v); +#else // #ifdef CONFIG_RCU_STALL_COMMON +static inline int rcu_stall_notifier_call_chain(unsigned long val, void *v) { return NOTIFY_DONE; } +#endif // #else // #ifdef CONFIG_RCU_STALL_COMMON + #endif /* __LINUX_RCU_H */ diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 8239b39d945b..6d7cea5d591f 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -621,10 +621,14 @@ static void synchronize_rcu_expedited_wait(void) } for (;;) { + unsigned long j; + if (synchronize_rcu_expedited_wait_once(jiffies_stall)) return; if (rcu_stall_is_suppressed()) continue; + j = jiffies; + rcu_stall_notifier_call_chain(RCU_STALL_NOTIFY_EXP, (void *)(j - jiffies_start)); trace_rcu_stall_warning(rcu_state.name, TPS("ExpeditedStall")); pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {", rcu_state.name); @@ -647,7 +651,7 @@ static void synchronize_rcu_expedited_wait(void) } } pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n", - jiffies - jiffies_start, rcu_state.expedited_sequence, + j - jiffies_start, rcu_state.expedited_sequence, data_race(rnp_root->expmask), ".T"[!!data_race(rnp_root->exp_tasks)]); if (ndetected) { diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 2443d1d4a6dc..49544f932279 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -8,6 +8,7 @@ */ #include +#include ////////////////////////////////////////////////////////////////////////////// // @@ -770,6 +771,7 @@ static void check_cpu_stall(struct rcu_data *rdp) if (kvm_check_and_clear_guest_paused()) return; + rcu_stall_notifier_call_chain(RCU_STALL_NOTIFY_NORM, (void *)j - gps); if (self_detected) { /* We haven't checked in, so go dump stack. */ print_cpu_stall(gps); @@ -790,7 +792,7 @@ static void check_cpu_stall(struct rcu_data *rdp) ////////////////////////////////////////////////////////////////////////////// // -// RCU forward-progress mechanisms, including of callback invocation. +// RCU forward-progress mechanisms, including for callback invocation. /* @@ -1042,3 +1044,58 @@ static int __init rcu_sysrq_init(void) return 0; } early_initcall(rcu_sysrq_init); + + +////////////////////////////////////////////////////////////////////////////// +// +// RCU CPU stall-warning notifiers + +static ATOMIC_NOTIFIER_HEAD(rcu_cpu_stall_notifier_list); + +/** + * rcu_stall_chain_notifier_register - Add an RCU CPU stall notifier + * @n: Entry to add. + * + * Adds an RCU CPU stall notifier to an atomic notifier chain. + * The @action passed to a notifier will be @RCU_STALL_NOTIFY_NORM or + * friends. The @data will be the duration of the stalled grace period, + * in jiffies, coerced to a void* pointer. + * + * Returns 0 on success, %-EEXIST on error. + */ +int rcu_stall_chain_notifier_register(struct notifier_block *n) +{ + return atomic_notifier_chain_register(&rcu_cpu_stall_notifier_list, n); +} +EXPORT_SYMBOL_GPL(rcu_stall_chain_notifier_register); + +/** + * rcu_stall_chain_notifier_unregister - Remove an RCU CPU stall notifier + * @n: Entry to add. + * + * Removes an RCU CPU stall notifier from an atomic notifier chain. + * + * Returns zero on success, %-ENOENT on failure. + */ +int rcu_stall_chain_notifier_unregister(struct notifier_block *n) +{ + return atomic_notifier_chain_unregister(&rcu_cpu_stall_notifier_list, n); +} +EXPORT_SYMBOL_GPL(rcu_stall_chain_notifier_unregister); + +/* + * rcu_stall_notifier_call_chain - Call functions in an RCU CPU stall notifier chain + * @val: Value passed unmodified to notifier function + * @v: Pointer passed unmodified to notifier function + * + * Calls each function in the RCU CPU stall notifier chain in turn, which + * is an atomic call chain. See atomic_notifier_call_chain() for more + * information. + * + * This is for use within RCU, hence the omission of the extra asterisk + * to indicate a non-kerneldoc format header comment. + */ +int rcu_stall_notifier_call_chain(unsigned long val, void *v) +{ + return atomic_notifier_call_chain(&rcu_cpu_stall_notifier_list, val, v); +} From 7c1b3e0c988f2902695ef6175ab8ad00c0e8b65f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 15 Aug 2023 15:53:32 -0700 Subject: [PATCH 05/56] rcutorture: Add test of RCU CPU stall notifiers This commit registers an RCU CPU stall notifier when testing RCU CPU stalls. The notifier logs a message similar to the following: rcu_torture_stall_nf: v=1, duration=21001. Signed-off-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker --- kernel/rcu/rcutorture.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index ade42d6a9d9b..1830cacac01d 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -2428,6 +2429,16 @@ static int rcutorture_booster_init(unsigned int cpu) return 0; } +static int rcu_torture_stall_nf(struct notifier_block *nb, unsigned long v, void *ptr) +{ + pr_info("%s: v=%lu, duration=%lu.\n", __func__, v, (unsigned long)ptr); + return NOTIFY_OK; +} + +static struct notifier_block rcu_torture_stall_block = { + .notifier_call = rcu_torture_stall_nf, +}; + /* * CPU-stall kthread. It waits as specified by stall_cpu_holdoff, then * induces a CPU stall for the time specified by stall_cpu. @@ -2435,9 +2446,14 @@ static int rcutorture_booster_init(unsigned int cpu) static int rcu_torture_stall(void *args) { int idx; + int ret; unsigned long stop_at; VERBOSE_TOROUT_STRING("rcu_torture_stall task started"); + ret = rcu_stall_chain_notifier_register(&rcu_torture_stall_block); + if (ret) + pr_info("%s: rcu_stall_chain_notifier_register() returned %d, %sexpected.\n", + __func__, ret, !IS_ENABLED(CONFIG_RCU_STALL_COMMON) ? "un" : ""); if (stall_cpu_holdoff > 0) { VERBOSE_TOROUT_STRING("rcu_torture_stall begin holdoff"); schedule_timeout_interruptible(stall_cpu_holdoff * HZ); @@ -2481,6 +2497,11 @@ static int rcu_torture_stall(void *args) cur_ops->readunlock(idx); } pr_alert("%s end.\n", __func__); + if (!ret) { + ret = rcu_stall_chain_notifier_unregister(&rcu_torture_stall_block); + if (ret) + pr_info("%s: rcu_stall_chain_notifier_unregister() returned %d.\n", __func__, ret); + } torture_shutdown_absorb("rcu_torture_stall"); while (!kthread_should_stop()) schedule_timeout_interruptible(10 * HZ); From b96e7a5fa0ba9cda32888e04f8f4bac42d49a7f8 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Tue, 5 Sep 2023 00:02:11 +0000 Subject: [PATCH 06/56] rcu/tree: Defer setting of jiffies during stall reset There are instances where rcu_cpu_stall_reset() is called when jiffies did not get a chance to update for a long time. Before jiffies is updated, the CPU stall detector can go off triggering false-positives where a just-started grace period appears to be ages old. In the past, we disabled stall detection in rcu_cpu_stall_reset() however this got changed [1]. This is resulting in false-positives in KGDB usecase [2]. Fix this by deferring the update of jiffies to the third run of the FQS loop. This is more robust, as, even if rcu_cpu_stall_reset() is called just before jiffies is read, we would end up pushing out the jiffies read by 3 more FQS loops. Meanwhile the CPU stall detection will be delayed and we will not get any false positives. [1] https://lore.kernel.org/all/20210521155624.174524-2-senozhatsky@chromium.org/ [2] https://lore.kernel.org/all/20230814020045.51950-2-chenhuacai@loongson.cn/ Tested with rcutorture.cpu_stall option as well to verify stall behavior with/without patch. Tested-by: Huacai Chen Reported-by: Binbin Zhou Closes: https://lore.kernel.org/all/20230814020045.51950-2-chenhuacai@loongson.cn/ Suggested-by: Paul McKenney Cc: Sergey Senozhatsky Cc: Thomas Gleixner Cc: stable@vger.kernel.org Fixes: a80be428fbc1 ("rcu: Do not disable GP stall detection in rcu_cpu_stall_reset()") Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker --- kernel/rcu/tree.c | 12 ++++++++++++ kernel/rcu/tree.h | 4 ++++ kernel/rcu/tree_stall.h | 20 ++++++++++++++++++-- 3 files changed, 34 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index cb1caefa8bd0..d85779f67aea 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1556,10 +1556,22 @@ static bool rcu_gp_fqs_check_wake(int *gfp) */ static void rcu_gp_fqs(bool first_time) { + int nr_fqs = READ_ONCE(rcu_state.nr_fqs_jiffies_stall); struct rcu_node *rnp = rcu_get_root(); WRITE_ONCE(rcu_state.gp_activity, jiffies); WRITE_ONCE(rcu_state.n_force_qs, rcu_state.n_force_qs + 1); + + WARN_ON_ONCE(nr_fqs > 3); + /* Only countdown nr_fqs for stall purposes if jiffies moves. */ + if (nr_fqs) { + if (nr_fqs == 1) { + WRITE_ONCE(rcu_state.jiffies_stall, + jiffies + rcu_jiffies_till_stall_check()); + } + WRITE_ONCE(rcu_state.nr_fqs_jiffies_stall, --nr_fqs); + } + if (first_time) { /* Collect dyntick-idle snapshots. */ force_qs_rnp(dyntick_save_progress_counter); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 192536916f9a..e9821a8422db 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -386,6 +386,10 @@ struct rcu_state { /* in jiffies. */ unsigned long jiffies_stall; /* Time at which to check */ /* for CPU stalls. */ + int nr_fqs_jiffies_stall; /* Number of fqs loops after + * which read jiffies and set + * jiffies_stall. Stall + * warnings disabled if !0. */ unsigned long jiffies_resched; /* Time at which to resched */ /* a reluctant CPU. */ unsigned long n_force_qs_gpstart; /* Snapshot of n_force_qs at */ diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 49544f932279..ac8e86babe44 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -150,12 +150,17 @@ static void panic_on_rcu_stall(void) /** * rcu_cpu_stall_reset - restart stall-warning timeout for current grace period * + * To perform the reset request from the caller, disable stall detection until + * 3 fqs loops have passed. This is required to ensure a fresh jiffies is + * loaded. It should be safe to do from the fqs loop as enough timer + * interrupts and context switches should have passed. + * * The caller must disable hard irqs. */ void rcu_cpu_stall_reset(void) { - WRITE_ONCE(rcu_state.jiffies_stall, - jiffies + rcu_jiffies_till_stall_check()); + WRITE_ONCE(rcu_state.nr_fqs_jiffies_stall, 3); + WRITE_ONCE(rcu_state.jiffies_stall, ULONG_MAX); } ////////////////////////////////////////////////////////////////////////////// @@ -171,6 +176,7 @@ static void record_gp_stall_check_time(void) WRITE_ONCE(rcu_state.gp_start, j); j1 = rcu_jiffies_till_stall_check(); smp_mb(); // ->gp_start before ->jiffies_stall and caller's ->gp_seq. + WRITE_ONCE(rcu_state.nr_fqs_jiffies_stall, 0); WRITE_ONCE(rcu_state.jiffies_stall, j + j1); rcu_state.jiffies_resched = j + j1 / 2; rcu_state.n_force_qs_gpstart = READ_ONCE(rcu_state.n_force_qs); @@ -726,6 +732,16 @@ static void check_cpu_stall(struct rcu_data *rdp) !rcu_gp_in_progress()) return; rcu_stall_kick_kthreads(); + + /* + * Check if it was requested (via rcu_cpu_stall_reset()) that the FQS + * loop has to set jiffies to ensure a non-stale jiffies value. This + * is required to have good jiffies value after coming out of long + * breaks of jiffies updates. Not doing so can cause false positives. + */ + if (READ_ONCE(rcu_state.nr_fqs_jiffies_stall) > 0) + return; + j = jiffies; /* From 92a708dc1fb8c33b0017ad77dc7ff6e434f96ee2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 27 Jul 2023 13:13:46 -0700 Subject: [PATCH 07/56] rcu-tasks: Add printk()s to localize boot-time self-test hang Currently, rcu_tasks_initiate_self_tests() prints a message and then initiates self tests on up to three different RCU Tasks flavors. If one of the flavors has a grace-period hang, it is not easy to work out which of the three hung. This commit therefore prints a message prior to each individual test. Reported-by: Guenter Roeck Signed-off-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker --- kernel/rcu/tasks.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 8d65f7d576a3..83049a893de5 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -1979,20 +1979,22 @@ static void test_rcu_tasks_callback(struct rcu_head *rhp) static void rcu_tasks_initiate_self_tests(void) { - pr_info("Running RCU-tasks wait API self tests\n"); #ifdef CONFIG_TASKS_RCU + pr_info("Running RCU Tasks wait API self tests\n"); tests[0].runstart = jiffies; synchronize_rcu_tasks(); call_rcu_tasks(&tests[0].rh, test_rcu_tasks_callback); #endif #ifdef CONFIG_TASKS_RUDE_RCU + pr_info("Running RCU Tasks Rude wait API self tests\n"); tests[1].runstart = jiffies; synchronize_rcu_tasks_rude(); call_rcu_tasks_rude(&tests[1].rh, test_rcu_tasks_callback); #endif #ifdef CONFIG_TASKS_TRACE_RCU + pr_info("Running RCU Tasks Trace wait API self tests\n"); tests[2].runstart = jiffies; synchronize_rcu_tasks_trace(); call_rcu_tasks_trace(&tests[2].rh, test_rcu_tasks_callback); From e62d8ae4620865411d1b2347980aa28ccf891a3d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 2 Aug 2023 13:42:00 -0700 Subject: [PATCH 08/56] rcu-tasks: Pull sampling of ->percpu_dequeue_lim out of loop The rcu_tasks_need_gpcb() samples ->percpu_dequeue_lim as part of the condition clause of a "for" loop, which is a bit confusing. This commit therefore hoists this sampling out of the loop, using the result loaded in the condition clause. So why does this work in the face of a concurrent switch from single-CPU queueing to per-CPU queueing? o The call_rcu_tasks_generic() that makes the change has already enqueued its callback, which means that all of the other CPU's callback queues are empty. o For the call_rcu_tasks_generic() that first notices the switch to per-CPU queues, the smp_store_release() used to update ->percpu_enqueue_lim pairs with the raw_spin_trylock_rcu_node()'s full barrier that is between the READ_ONCE(rtp->percpu_enqueue_shift) and the rcu_segcblist_enqueue() that enqueues the callback. o Because this CPU's queue is empty (unless it happens to be the original single queue, in which case there is no need for synchronization), this call_rcu_tasks_generic() will do an irq_work_queue() to schedule a handler for the needed rcuwait_wake_up() call. This call will be ordered after the first call_rcu_tasks_generic() function's change to ->percpu_dequeue_lim. o This rcuwait_wake_up() will either happen before or after the set_current_state() in rcuwait_wait_event(). If it happens before, the "condition" argument's call to rcu_tasks_need_gpcb() will be ordered after the original change, and all callbacks on all CPUs will be visible. Otherwise, if it happens after, then the grace-period kthread's state will be set back to running, which will result in a later call to rcuwait_wait_event() and thus to rcu_tasks_need_gpcb(), which will again see the change. So it all works out. Suggested-by: Linus Torvalds Signed-off-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker --- kernel/rcu/tasks.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 83049a893de5..94bb5abdbb37 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -432,6 +432,7 @@ static void rcu_barrier_tasks_generic(struct rcu_tasks *rtp) static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp) { int cpu; + int dequeue_limit; unsigned long flags; bool gpdone = poll_state_synchronize_rcu(rtp->percpu_dequeue_gpseq); long n; @@ -439,7 +440,8 @@ static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp) long ncbsnz = 0; int needgpcb = 0; - for (cpu = 0; cpu < smp_load_acquire(&rtp->percpu_dequeue_lim); cpu++) { + dequeue_limit = smp_load_acquire(&rtp->percpu_dequeue_lim); + for (cpu = 0; cpu < dequeue_limit; cpu++) { struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu); /* Advance and accelerate any new callbacks. */ From 0325e8a1282dfd30c3e44928c6384bb978649c63 Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Thu, 3 Aug 2023 16:06:59 +0800 Subject: [PATCH 09/56] rcu-tasks: Make rcu_tasks_lazy_ms static The rcu_tasks_lazy_ms variable is not used outside the file tasks.h, so this commit marks it static. kernel/rcu/tasks.h:1085:5: warning: symbol 'rcu_tasks_lazy_ms' was not declared. Should it be static? Reported-by: Abaci Robot Closes: https://bugzilla.openanolis.cn/show_bug.cgi?id=6086 Signed-off-by: Jiapeng Chong Reviewed-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker --- kernel/rcu/tasks.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 94bb5abdbb37..018f03f20629 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -1086,7 +1086,7 @@ void rcu_barrier_tasks(void) } EXPORT_SYMBOL_GPL(rcu_barrier_tasks); -int rcu_tasks_lazy_ms = -1; +static int rcu_tasks_lazy_ms = -1; module_param(rcu_tasks_lazy_ms, int, 0444); static int __init rcu_spawn_tasks_kthread(void) From 730c3ed4ba30feadfb20b4d4d18e869bc00348f6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 1 Aug 2023 09:30:18 -0700 Subject: [PATCH 10/56] refscale: Fix misplaced data re-read This commit fixes a misplaced data re-read in the typesafe code. The reason that this was not noticed is that this is a performance test with no writers, so a mismatch could not occur. Signed-off-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker --- kernel/rcu/refscale.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c index 91a0fd0d4d9a..750a63e99539 100644 --- a/kernel/rcu/refscale.c +++ b/kernel/rcu/refscale.c @@ -655,12 +655,12 @@ retry: goto retry; } un_delay(udl, ndl); + b = READ_ONCE(rtsp->a); // Remember, seqlock read-side release can fail. if (!rts_release(rtsp, start)) { rcu_read_unlock(); goto retry; } - b = READ_ONCE(rtsp->a); WARN_ONCE(a != b, "Re-read of ->a changed from %u to %u.\n", a, b); b = rtsp->b; rcu_read_unlock(); From d6fea1dde2064c8f298dbe872587c7ace7701b3f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 16 Aug 2023 11:27:26 -0700 Subject: [PATCH 11/56] refscale: Print out additional module parameters The refscale.verbose_batched and refscale.lookup_instances module parameters are omitted from the ref_scale_print_module_parms() beginning-of-test output. This commit therefore adds them. Signed-off-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker --- kernel/rcu/refscale.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c index 750a63e99539..2c2648a3ad30 100644 --- a/kernel/rcu/refscale.c +++ b/kernel/rcu/refscale.c @@ -1025,8 +1025,8 @@ static void ref_scale_print_module_parms(struct ref_scale_ops *cur_ops, const char *tag) { pr_alert("%s" SCALE_FLAG - "--- %s: verbose=%d shutdown=%d holdoff=%d loops=%ld nreaders=%d nruns=%d readdelay=%d\n", scale_type, tag, - verbose, shutdown, holdoff, loops, nreaders, nruns, readdelay); + "--- %s: verbose=%d verbose_batched=%d shutdown=%d holdoff=%d lookup_instances=%ld loops=%ld nreaders=%d nruns=%d readdelay=%d\n", scale_type, tag, + verbose, verbose_batched, shutdown, holdoff, lookup_instances, loops, nreaders, nruns, readdelay); } static void From 8a4c0c90f2796fd3ac96326e8e2f8b13f7bdc99c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 16 Aug 2023 11:29:05 -0700 Subject: [PATCH 12/56] doc: Add refscale.lookup_instances to kernel-parameters.txt This commit adds refscale.lookup_instances to kernel-parameters.txt. Signed-off-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker --- Documentation/admin-guide/kernel-parameters.txt | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 0a1731a0f0ef..cf4f3262ce31 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -5422,6 +5422,12 @@ test until boot completes in order to avoid interference. + refscale.lookup_instances= [KNL] + Number of data elements to use for the forms of + SLAB_TYPESAFE_BY_RCU testing. A negative number + is negated and multiplied by nr_cpu_ids, while + zero specifies nr_cpu_ids. + refscale.loops= [KNL] Set the number of loops over the synchronization primitive under test. Increasing this number From ebbb9d35fd8497591760779b3d5275fb4ce0e50d Mon Sep 17 00:00:00 2001 From: Wei Zhang Date: Tue, 8 Aug 2023 23:58:11 +0800 Subject: [PATCH 13/56] Documentation: RCU: Fix section numbers after adding Section 7 in whatisRCU.rst Signed-off-by: Wei Zhang Reviewed-by: Randy Dunlap Signed-off-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker --- Documentation/RCU/whatisRCU.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/RCU/whatisRCU.rst b/Documentation/RCU/whatisRCU.rst index e488c8e557a9..60ce02475142 100644 --- a/Documentation/RCU/whatisRCU.rst +++ b/Documentation/RCU/whatisRCU.rst @@ -59,8 +59,8 @@ experiment with should focus on Section 2. People who prefer to start with example uses should focus on Sections 3 and 4. People who need to understand the RCU implementation should focus on Section 5, then dive into the kernel source code. People who reason best by analogy should -focus on Section 6. Section 7 serves as an index to the docbook API -documentation, and Section 8 is the traditional answer key. +focus on Section 6 and 7. Section 8 serves as an index to the docbook +API documentation, and Section 9 is the traditional answer key. So, start with the section that makes the most sense to you and your preferred method of learning. If you need to know everything about From 082acfe39cb0090e97bf27057d0efdf1e89abbef Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 22 Aug 2023 21:04:02 +0100 Subject: [PATCH 14/56] rcu: Describe listRCU read-side guarantees More explicitly state what is, and what is not guaranteed to those who iterate a list while protected by RCU. [ paulmck: Apply Joel Fernandes feedback. ] Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker --- Documentation/RCU/listRCU.rst | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/Documentation/RCU/listRCU.rst b/Documentation/RCU/listRCU.rst index bdc4bcc5289f..ed5c9d8c9afe 100644 --- a/Documentation/RCU/listRCU.rst +++ b/Documentation/RCU/listRCU.rst @@ -8,6 +8,15 @@ One of the most common uses of RCU is protecting read-mostly linked lists that all of the required memory ordering is provided by the list macros. This document describes several list-based RCU use cases. +When iterating a list while holding the rcu_read_lock(), writers may +modify the list. The reader is guaranteed to see all of the elements +which were added to the list before they acquired the rcu_read_lock() +and are still on the list when they drop the rcu_read_unlock(). +Elements which are added to, or removed from the list may or may not +be seen. If the writer calls list_replace_rcu(), the reader may see +either the old element or the new element; they will not see both, +nor will they see neither. + Example 1: Read-mostly list: Deferred Destruction ------------------------------------------------- From ff18cb816427c3738d5d3fd23b7939a813935e15 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Tue, 25 Jul 2023 23:29:10 +0000 Subject: [PATCH 15/56] Revert "checkpatch: Error out if deprecated RCU API used" The definition for single-argument kfree_rcu() has been removed, so that any further attempt to use it will result in a build error. Because of this build error, there is no longer any need for a special check in checkpatch.pl. Therefore, revert commit 1eacac3255495be7502d406e2ba5444fb5c3607c. Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker --- scripts/checkpatch.pl | 9 --------- 1 file changed, 9 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 7d16f863edf1..25fdb7fda112 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -6427,15 +6427,6 @@ sub process { } } -# check for soon-to-be-deprecated single-argument k[v]free_rcu() API - if ($line =~ /\bk[v]?free_rcu\s*\([^(]+\)/) { - if ($line =~ /\bk[v]?free_rcu\s*\([^,]+\)/) { - ERROR("DEPRECATED_API", - "Single-argument k[v]free_rcu() API is deprecated, please pass rcu_head object or call k[v]free_rcu_mightsleep()." . $herecurr); - } - } - - # check for unnecessary "Out of Memory" messages if ($line =~ /^\+.*\b$logFunctions\s*\(/ && $prevline =~ /^[ \+]\s*if\s*\(\s*(\!\s*|NULL\s*==\s*)?($Lval)(\s*==\s*NULL\s*)?\s*\)/ && From f0a31b26be1f725e3f56beed76486c1b034120b3 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Sat, 29 Jul 2023 14:27:32 +0000 Subject: [PATCH 16/56] srcu: Fix error handling in init_srcu_struct_fields() The current error handling in init_srcu_struct_fields() is a bit inconsistent. If init_srcu_struct_nodes() fails, the function either returns -ENOMEM or 0 depending on whether ssp->sda_is_static is true or false. This can make init_srcu_struct_fields() return 0 even if memory allocation failed! Simplify the error handling by always returning -ENOMEM if either init_srcu_struct_nodes() or the per-CPU allocation fails. This makes the control flow easier to follow and avoids the inconsistent return values. Add goto labels to avoid duplicating the error cleanup code. Link: https://lore.kernel.org/r/20230404003508.GA254019@google.com Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker --- kernel/rcu/srcutree.c | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 20d7a238d675..f1a905200fc2 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -255,29 +255,31 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static) ssp->srcu_sup->sda_is_static = is_static; if (!is_static) ssp->sda = alloc_percpu(struct srcu_data); - if (!ssp->sda) { - if (!is_static) - kfree(ssp->srcu_sup); - return -ENOMEM; - } + if (!ssp->sda) + goto err_free_sup; init_srcu_struct_data(ssp); ssp->srcu_sup->srcu_gp_seq_needed_exp = 0; ssp->srcu_sup->srcu_last_gp_end = ktime_get_mono_fast_ns(); if (READ_ONCE(ssp->srcu_sup->srcu_size_state) == SRCU_SIZE_SMALL && SRCU_SIZING_IS_INIT()) { - if (!init_srcu_struct_nodes(ssp, GFP_ATOMIC)) { - if (!ssp->srcu_sup->sda_is_static) { - free_percpu(ssp->sda); - ssp->sda = NULL; - kfree(ssp->srcu_sup); - return -ENOMEM; - } - } else { - WRITE_ONCE(ssp->srcu_sup->srcu_size_state, SRCU_SIZE_BIG); - } + if (!init_srcu_struct_nodes(ssp, GFP_ATOMIC)) + goto err_free_sda; + WRITE_ONCE(ssp->srcu_sup->srcu_size_state, SRCU_SIZE_BIG); } ssp->srcu_sup->srcu_ssp = ssp; smp_store_release(&ssp->srcu_sup->srcu_gp_seq_needed, 0); /* Init done. */ return 0; + +err_free_sda: + if (!is_static) { + free_percpu(ssp->sda); + ssp->sda = NULL; + } +err_free_sup: + if (!is_static) { + kfree(ssp->srcu_sup); + ssp->srcu_sup = NULL; + } + return -ENOMEM; } #ifdef CONFIG_DEBUG_LOCK_ALLOC From 4502138acc8f4139c94ce0ec0eee926f8805fbbc Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Sat, 29 Jul 2023 14:27:36 +0000 Subject: [PATCH 17/56] rcu/tree: Remove superfluous return from void call_rcu* functions The return keyword is not needed here. Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker --- kernel/rcu/tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index cb1caefa8bd0..7c79480bfaa0 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2713,7 +2713,7 @@ __call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy_in) */ void call_rcu_hurry(struct rcu_head *head, rcu_callback_t func) { - return __call_rcu_common(head, func, false); + __call_rcu_common(head, func, false); } EXPORT_SYMBOL_GPL(call_rcu_hurry); #endif @@ -2764,7 +2764,7 @@ EXPORT_SYMBOL_GPL(call_rcu_hurry); */ void call_rcu(struct rcu_head *head, rcu_callback_t func) { - return __call_rcu_common(head, func, IS_ENABLED(CONFIG_RCU_LAZY)); + __call_rcu_common(head, func, IS_ENABLED(CONFIG_RCU_LAZY)); } EXPORT_SYMBOL_GPL(call_rcu); From 16128b1f8c823438dcd3f3b4a57cbe7267bcf82f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 1 Aug 2023 17:15:25 -0700 Subject: [PATCH 18/56] rcu: Add sysfs to provide throttled access to rcu_barrier() When running a series of stress tests all making heavy use of RCU, it is all too possible to OOM the system when the prior test's RCU callbacks don't get invoked until after the subsequent test starts. One way of handling this is just a timed wait, but this fails when a given CPU has so many callbacks queued that they take longer to invoke than allowed for by that timed wait. This commit therefore adds an rcutree.do_rcu_barrier module parameter that is accessible from sysfs. Writing one of the many synonyms for boolean "true" will cause an rcu_barrier() to be invoked, but will guarantee that no more than one rcu_barrier() will be invoked per sixteenth of a second via this mechanism. The flip side is that a given request might wait a second or three longer than absolutely necessary, but only when there are multiple uses of rcutree.do_rcu_barrier within a one-second time interval. This commit unnecessarily serializes the rcu_barrier() machinery, given that serialization is already provided by procfs. This has the advantage of allowing throttled rcu_barrier() from other sources within the kernel. Reported-by: Johannes Weiner Signed-off-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker --- .../admin-guide/kernel-parameters.txt | 7 ++ kernel/rcu/tree.c | 76 +++++++++++++++++++ 2 files changed, 83 insertions(+) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 0a1731a0f0ef..7ec8a406d419 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4769,6 +4769,13 @@ Set maximum number of finished RCU callbacks to process in one batch. + rcutree.do_rcu_barrier= [KNL] + Request a call to rcu_barrier(). This is + throttled so that userspace tests can safely + hammer on the sysfs variable if they so choose. + If triggered before the RCU grace-period machinery + is fully active, this will error out with EAGAIN. + rcutree.dump_tree= [KNL] Dump the structure of the rcu_node combining tree out at early boot. This is used for diagnostic diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 7c79480bfaa0..3c7281fc25a7 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4083,6 +4083,82 @@ retry: } EXPORT_SYMBOL_GPL(rcu_barrier); +static unsigned long rcu_barrier_last_throttle; + +/** + * rcu_barrier_throttled - Do rcu_barrier(), but limit to one per second + * + * This can be thought of as guard rails around rcu_barrier() that + * permits unrestricted userspace use, at least assuming the hardware's + * try_cmpxchg() is robust. There will be at most one call per second to + * rcu_barrier() system-wide from use of this function, which means that + * callers might needlessly wait a second or three. + * + * This is intended for use by test suites to avoid OOM by flushing RCU + * callbacks from the previous test before starting the next. See the + * rcutree.do_rcu_barrier module parameter for more information. + * + * Why not simply make rcu_barrier() more scalable? That might be + * the eventual endpoint, but let's keep it simple for the time being. + * Note that the module parameter infrastructure serializes calls to a + * given .set() function, but should concurrent .set() invocation ever be + * possible, we are ready! + */ +static void rcu_barrier_throttled(void) +{ + unsigned long j = jiffies; + unsigned long old = READ_ONCE(rcu_barrier_last_throttle); + unsigned long s = rcu_seq_snap(&rcu_state.barrier_sequence); + + while (time_in_range(j, old, old + HZ / 16) || + !try_cmpxchg(&rcu_barrier_last_throttle, &old, j)) { + schedule_timeout_idle(HZ / 16); + if (rcu_seq_done(&rcu_state.barrier_sequence, s)) { + smp_mb(); /* caller's subsequent code after above check. */ + return; + } + j = jiffies; + old = READ_ONCE(rcu_barrier_last_throttle); + } + rcu_barrier(); +} + +/* + * Invoke rcu_barrier_throttled() when a rcutree.do_rcu_barrier + * request arrives. We insist on a true value to allow for possible + * future expansion. + */ +static int param_set_do_rcu_barrier(const char *val, const struct kernel_param *kp) +{ + bool b; + int ret; + + if (rcu_scheduler_active != RCU_SCHEDULER_RUNNING) + return -EAGAIN; + ret = kstrtobool(val, &b); + if (!ret && b) { + atomic_inc((atomic_t *)kp->arg); + rcu_barrier_throttled(); + atomic_dec((atomic_t *)kp->arg); + } + return ret; +} + +/* + * Output the number of outstanding rcutree.do_rcu_barrier requests. + */ +static int param_get_do_rcu_barrier(char *buffer, const struct kernel_param *kp) +{ + return sprintf(buffer, "%d\n", atomic_read((atomic_t *)kp->arg)); +} + +static const struct kernel_param_ops do_rcu_barrier_ops = { + .set = param_set_do_rcu_barrier, + .get = param_get_do_rcu_barrier, +}; +static atomic_t do_rcu_barrier; +module_param_cb(do_rcu_barrier, &do_rcu_barrier_ops, &do_rcu_barrier, 0644); + /* * Compute the mask of online CPUs for the specified rcu_node structure. * This will not be stable unless the rcu_node structure's ->lock is From b93c5fe16e4aa177cd072c1c4652cbe1b19a7812 Mon Sep 17 00:00:00 2001 From: Yue Haibing Date: Thu, 3 Aug 2023 21:49:19 +0800 Subject: [PATCH 19/56] rcu: Remove unused function declaration rcu_eqs_special_set() Commit a86baa69c2b7 ("rcu: Remove special bit at the bottom of the ->dynticks counter") left behind this, remove it. Signed-off-by: Yue Haibing Signed-off-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker --- include/linux/rcutree.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 126f6b418f6a..153cfc7bbffd 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -37,7 +37,6 @@ void synchronize_rcu_expedited(void); void kvfree_call_rcu(struct rcu_head *head, void *ptr); void rcu_barrier(void); -bool rcu_eqs_special_set(int cpu); void rcu_momentary_dyntick_idle(void); void kfree_rcu_scheduler_running(void); bool rcu_gp_might_be_stalled(void); From 6e284c55fc0bef7d25fd34d29db11f483da60ea4 Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Sat, 5 Aug 2023 11:17:25 +0800 Subject: [PATCH 20/56] mm: Remove kmem_valid_obj() Function kmem_dump_obj() will splat if passed a pointer to a non-slab object. So nothing calls it directly, instead calling kmem_valid_obj() first to determine whether the passed pointer to a valid slab object. This means that merging kmem_valid_obj() into kmem_dump_obj() will make the code more concise. Therefore, convert kmem_dump_obj() to work the same way as vmalloc_dump_obj(), removing the need for the kmem_dump_obj() caller to check kmem_valid_obj(). After this, there are no remaining calls to kmem_valid_obj() anymore, and it can be safely removed. Suggested-by: Matthew Wilcox Signed-off-by: Zhen Lei Reviewed-by: Matthew Wilcox (Oracle) Acked-by: Vlastimil Babka Signed-off-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker --- include/linux/slab.h | 5 +++-- mm/slab_common.c | 41 +++++++++++------------------------------ mm/util.c | 4 +--- 3 files changed, 15 insertions(+), 35 deletions(-) diff --git a/include/linux/slab.h b/include/linux/slab.h index 8228d1276a2f..ff56ab804bf6 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -245,8 +245,9 @@ DEFINE_FREE(kfree, void *, if (_T) kfree(_T)) size_t ksize(const void *objp); #ifdef CONFIG_PRINTK -bool kmem_valid_obj(void *object); -void kmem_dump_obj(void *object); +bool kmem_dump_obj(void *object); +#else +static inline bool kmem_dump_obj(void *object) { return false; } #endif /* diff --git a/mm/slab_common.c b/mm/slab_common.c index cd71f9581e67..a425bedf2103 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -528,26 +528,6 @@ bool slab_is_available(void) } #ifdef CONFIG_PRINTK -/** - * kmem_valid_obj - does the pointer reference a valid slab object? - * @object: pointer to query. - * - * Return: %true if the pointer is to a not-yet-freed object from - * kmalloc() or kmem_cache_alloc(), either %true or %false if the pointer - * is to an already-freed object, and %false otherwise. - */ -bool kmem_valid_obj(void *object) -{ - struct folio *folio; - - /* Some arches consider ZERO_SIZE_PTR to be a valid address. */ - if (object < (void *)PAGE_SIZE || !virt_addr_valid(object)) - return false; - folio = virt_to_folio(object); - return folio_test_slab(folio); -} -EXPORT_SYMBOL_GPL(kmem_valid_obj); - static void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab) { if (__kfence_obj_info(kpp, object, slab)) @@ -566,11 +546,11 @@ static void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab * * and, if available, the slab name, return address, and stack trace from * the allocation and last free path of that object. * - * This function will splat if passed a pointer to a non-slab object. - * If you are not sure what type of object you have, you should instead - * use mem_dump_obj(). + * Return: %true if the pointer is to a not-yet-freed object from + * kmalloc() or kmem_cache_alloc(), either %true or %false if the pointer + * is to an already-freed object, and %false otherwise. */ -void kmem_dump_obj(void *object) +bool kmem_dump_obj(void *object) { char *cp = IS_ENABLED(CONFIG_MMU) ? "" : "/vmalloc"; int i; @@ -578,13 +558,13 @@ void kmem_dump_obj(void *object) unsigned long ptroffset; struct kmem_obj_info kp = { }; - if (WARN_ON_ONCE(!virt_addr_valid(object))) - return; + /* Some arches consider ZERO_SIZE_PTR to be a valid address. */ + if (object < (void *)PAGE_SIZE || !virt_addr_valid(object)) + return false; slab = virt_to_slab(object); - if (WARN_ON_ONCE(!slab)) { - pr_cont(" non-slab memory.\n"); - return; - } + if (!slab) + return false; + kmem_obj_info(&kp, object, slab); if (kp.kp_slab_cache) pr_cont(" slab%s %s", cp, kp.kp_slab_cache->name); @@ -621,6 +601,7 @@ void kmem_dump_obj(void *object) pr_info(" %pS\n", kp.kp_free_stack[i]); } + return true; } EXPORT_SYMBOL_GPL(kmem_dump_obj); #endif diff --git a/mm/util.c b/mm/util.c index 8cbbfd3a3d59..6eddd891198e 100644 --- a/mm/util.c +++ b/mm/util.c @@ -1060,10 +1060,8 @@ void mem_dump_obj(void *object) { const char *type; - if (kmem_valid_obj(object)) { - kmem_dump_obj(object); + if (kmem_dump_obj(object)) return; - } if (vmalloc_dump_obj(object)) return; From 2cbc482d325ee58001472c4359b311958c4efdd1 Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Sat, 5 Aug 2023 11:17:26 +0800 Subject: [PATCH 21/56] rcu: Dump memory object info if callback function is invalid When a structure containing an RCU callback rhp is (incorrectly) freed and reallocated after rhp is passed to call_rcu(), it is not unusual for rhp->func to be set to NULL. This defeats the debugging prints used by __call_rcu_common() in kernels built with CONFIG_DEBUG_OBJECTS_RCU_HEAD=y, which expect to identify the offending code using the identity of this function. And in kernels build without CONFIG_DEBUG_OBJECTS_RCU_HEAD=y, things are even worse, as can be seen from this splat: Unable to handle kernel NULL pointer dereference at virtual address 0 ... ... PC is at 0x0 LR is at rcu_do_batch+0x1c0/0x3b8 ... ... (rcu_do_batch) from (rcu_core+0x1d4/0x284) (rcu_core) from (__do_softirq+0x24c/0x344) (__do_softirq) from (__irq_exit_rcu+0x64/0x108) (__irq_exit_rcu) from (irq_exit+0x8/0x10) (irq_exit) from (__handle_domain_irq+0x74/0x9c) (__handle_domain_irq) from (gic_handle_irq+0x8c/0x98) (gic_handle_irq) from (__irq_svc+0x5c/0x94) (__irq_svc) from (arch_cpu_idle+0x20/0x3c) (arch_cpu_idle) from (default_idle_call+0x4c/0x78) (default_idle_call) from (do_idle+0xf8/0x150) (do_idle) from (cpu_startup_entry+0x18/0x20) (cpu_startup_entry) from (0xc01530) This commit therefore adds calls to mem_dump_obj(rhp) to output some information, for example: slab kmalloc-256 start ffff410c45019900 pointer offset 0 size 256 This provides the rough size of the memory block and the offset of the rcu_head structure, which as least provides at least a few clues to help locate the problem. If the problem is reproducible, additional slab debugging can be enabled, for example, CONFIG_DEBUG_SLAB=y, which can provide significantly more information. Signed-off-by: Zhen Lei Signed-off-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker --- kernel/rcu/rcu.h | 7 +++++++ kernel/rcu/srcutiny.c | 1 + kernel/rcu/srcutree.c | 1 + kernel/rcu/tasks.h | 1 + kernel/rcu/tiny.c | 1 + kernel/rcu/tree.c | 1 + 6 files changed, 12 insertions(+) diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 98e13be411af..d612731feea4 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -10,6 +10,7 @@ #ifndef __LINUX_RCU_H #define __LINUX_RCU_H +#include #include /* @@ -248,6 +249,12 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head) } #endif /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ +static inline void debug_rcu_head_callback(struct rcu_head *rhp) +{ + if (unlikely(!rhp->func)) + kmem_dump_obj(rhp); +} + extern int rcu_cpu_stall_suppress_at_boot; static inline bool rcu_stall_is_suppressed_at_boot(void) diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index 336af24e0fe3..c38e5933a5d6 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -138,6 +138,7 @@ void srcu_drive_gp(struct work_struct *wp) while (lh) { rhp = lh; lh = lh->next; + debug_rcu_head_callback(rhp); local_bh_disable(); rhp->func(rhp); local_bh_enable(); diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index f1a905200fc2..833a8f848a90 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -1710,6 +1710,7 @@ static void srcu_invoke_callbacks(struct work_struct *work) rhp = rcu_cblist_dequeue(&ready_cbs); for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) { debug_rcu_head_unqueue(rhp); + debug_rcu_head_callback(rhp); local_bh_disable(); rhp->func(rhp); local_bh_enable(); diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 8d65f7d576a3..7c845532a50a 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -538,6 +538,7 @@ static void rcu_tasks_invoke_cbs(struct rcu_tasks *rtp, struct rcu_tasks_percpu raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); len = rcl.len; for (rhp = rcu_cblist_dequeue(&rcl); rhp; rhp = rcu_cblist_dequeue(&rcl)) { + debug_rcu_head_callback(rhp); local_bh_disable(); rhp->func(rhp); local_bh_enable(); diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 42f7589e51e0..fec804b79080 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -97,6 +97,7 @@ static inline bool rcu_reclaim_tiny(struct rcu_head *head) trace_rcu_invoke_callback("", head); f = head->func; + debug_rcu_head_callback(head); WRITE_ONCE(head->func, (rcu_callback_t)0L); f(head); rcu_lock_release(&rcu_callback_map); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 3c7281fc25a7..aae515071ffd 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2135,6 +2135,7 @@ static void rcu_do_batch(struct rcu_data *rdp) trace_rcu_invoke_callback(rcu_state.name, rhp); f = rhp->func; + debug_rcu_head_callback(rhp); WRITE_ONCE(rhp->func, (rcu_callback_t)0L); f(rhp); From 0ae9942f03d0d034fdb0a4f44fc99f62a3107987 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 18 Aug 2023 08:53:58 -0700 Subject: [PATCH 22/56] rcu: Eliminate rcu_gp_slow_unregister() false positive When using rcutorture as a module, there are a number of conditions that can abort the modprobe operation, for example, when attempting to run both RCU CPU stall warning tests and forward-progress tests. This can cause rcu_torture_cleanup() to be invoked on the unwind path out of rcu_rcu_torture_init(), which will mean that rcu_gp_slow_unregister() is invoked without a matching rcu_gp_slow_register(). This will cause a splat because rcu_gp_slow_unregister() is passed rcu_fwd_cb_nodelay, which does not match a NULL pointer. This commit therefore forgives a mismatch involving a NULL pointer, thus avoiding this false-positive splat. Signed-off-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index aae515071ffd..a83ecab77917 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1260,7 +1260,7 @@ EXPORT_SYMBOL_GPL(rcu_gp_slow_register); /* Unregister a counter, with NULL for not caring which. */ void rcu_gp_slow_unregister(atomic_t *rgssp) { - WARN_ON_ONCE(rgssp && rgssp != rcu_gp_slow_suppress); + WARN_ON_ONCE(rgssp && rgssp != rcu_gp_slow_suppress && rcu_gp_slow_suppress != NULL); WRITE_ONCE(rcu_gp_slow_suppress, NULL); } From d0b654e19a83840e11d6ba87ea0be30e5fae343a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 19 Jul 2023 12:03:20 -0700 Subject: [PATCH 23/56] torture: Share torture_random_state with torture_shuffle_tasks() Both torture_shuffle_tasks() and its caller torture_shuffle() define a torture_random_state structure. This is suboptimal given that torture_shuffle_tasks() runs for a very short period of time. This commit therefore causes torture_shuffle() to pass a pointer to its torture_random_state structure down to torture_shuffle_tasks(). Signed-off-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker --- kernel/torture.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/kernel/torture.c b/kernel/torture.c index b28b05bbef02..68dba4ecab5c 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -520,9 +520,8 @@ static void torture_shuffle_task_unregister_all(void) * A special case is when shuffle_idle_cpu = -1, in which case we allow * the tasks to run on all CPUs. */ -static void torture_shuffle_tasks(void) +static void torture_shuffle_tasks(struct torture_random_state *trp) { - DEFINE_TORTURE_RANDOM(rand); struct shuffle_task *stp; cpumask_setall(shuffle_tmp_mask); @@ -543,7 +542,7 @@ static void torture_shuffle_tasks(void) mutex_lock(&shuffle_task_mutex); list_for_each_entry(stp, &shuffle_task_list, st_l) { - if (!random_shuffle || torture_random(&rand) & 0x1) + if (!random_shuffle || torture_random(trp) & 0x1) set_cpus_allowed_ptr(stp->st_t, shuffle_tmp_mask); } mutex_unlock(&shuffle_task_mutex); @@ -562,7 +561,7 @@ static int torture_shuffle(void *arg) VERBOSE_TOROUT_STRING("torture_shuffle task started"); do { torture_hrtimeout_jiffies(shuffle_interval, &rand); - torture_shuffle_tasks(); + torture_shuffle_tasks(&rand); torture_shutdown_absorb("torture_shuffle"); } while (!torture_must_stop()); torture_kthread_stopping("torture_shuffle"); @@ -673,7 +672,7 @@ int torture_shutdown_init(int ssecs, void (*cleanup)(void)) if (ssecs > 0) { shutdown_time = ktime_add(ktime_get(), ktime_set(ssecs, 0)); return torture_create_kthread(torture_shutdown, NULL, - shutdown_task); + shutdown_task); } return 0; } From 40baf39fc57c61554dc744d924257940574ac76b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 21 Jul 2023 14:45:08 -0700 Subject: [PATCH 24/56] torture: Make kvm-recheck.sh use mktemp This commit switches from the old "/tmp/kvm-recheck.sh.$$" approach to the newer and now reliable "mktemp" approach. Signed-off-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker --- tools/testing/selftests/rcutorture/bin/kvm-recheck.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh index 5be670dd4009..de65d77b47ff 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh @@ -13,7 +13,7 @@ # # Authors: Paul E. McKenney -T=/tmp/kvm-recheck.sh.$$ +T="`mktemp ${TMPDIR-/tmp}/kvm-recheck.sh.XXXXXX`" trap 'rm -f $T' 0 2 configerrors=0 From a741deac787f0d2d7068638c067db20af9e63752 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 26 Jul 2023 13:57:03 -0700 Subject: [PATCH 25/56] torture: Make torture_hrtimeout_ns() take an hrtimer mode parameter The current torture-test sleeps are waiting for a duration, but there are situations where it is better to wait for an absolute time, for example, when ending a stutter interval. This commit therefore adds an hrtimer mode parameter to torture_hrtimeout_ns(). Why not also the other torture_hrtimeout_*() functions? The theory is that most absolute times will be in nanoseconds, especially not (say) jiffies. Signed-off-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker --- include/linux/torture.h | 3 ++- kernel/torture.c | 13 +++++++------ 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/include/linux/torture.h b/include/linux/torture.h index bb466eec01e4..017f0f710815 100644 --- a/include/linux/torture.h +++ b/include/linux/torture.h @@ -81,7 +81,8 @@ static inline void torture_random_init(struct torture_random_state *trsp) } /* Definitions for high-resolution-timer sleeps. */ -int torture_hrtimeout_ns(ktime_t baset_ns, u32 fuzzt_ns, struct torture_random_state *trsp); +int torture_hrtimeout_ns(ktime_t baset_ns, u32 fuzzt_ns, const enum hrtimer_mode mode, + struct torture_random_state *trsp); int torture_hrtimeout_us(u32 baset_us, u32 fuzzt_ns, struct torture_random_state *trsp); int torture_hrtimeout_ms(u32 baset_ms, u32 fuzzt_us, struct torture_random_state *trsp); int torture_hrtimeout_jiffies(u32 baset_j, struct torture_random_state *trsp); diff --git a/kernel/torture.c b/kernel/torture.c index 68dba4ecab5c..6ba62e5993e7 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -87,14 +87,15 @@ EXPORT_SYMBOL_GPL(verbose_torout_sleep); * nanosecond random fuzz. This function and its friends desynchronize * testing from the timer wheel. */ -int torture_hrtimeout_ns(ktime_t baset_ns, u32 fuzzt_ns, struct torture_random_state *trsp) +int torture_hrtimeout_ns(ktime_t baset_ns, u32 fuzzt_ns, const enum hrtimer_mode mode, + struct torture_random_state *trsp) { ktime_t hto = baset_ns; if (trsp) hto += torture_random(trsp) % fuzzt_ns; set_current_state(TASK_IDLE); - return schedule_hrtimeout(&hto, HRTIMER_MODE_REL); + return schedule_hrtimeout(&hto, mode); } EXPORT_SYMBOL_GPL(torture_hrtimeout_ns); @@ -106,7 +107,7 @@ int torture_hrtimeout_us(u32 baset_us, u32 fuzzt_ns, struct torture_random_state { ktime_t baset_ns = baset_us * NSEC_PER_USEC; - return torture_hrtimeout_ns(baset_ns, fuzzt_ns, trsp); + return torture_hrtimeout_ns(baset_ns, fuzzt_ns, HRTIMER_MODE_REL, trsp); } EXPORT_SYMBOL_GPL(torture_hrtimeout_us); @@ -123,7 +124,7 @@ int torture_hrtimeout_ms(u32 baset_ms, u32 fuzzt_us, struct torture_random_state fuzzt_ns = (u32)~0U; else fuzzt_ns = fuzzt_us * NSEC_PER_USEC; - return torture_hrtimeout_ns(baset_ns, fuzzt_ns, trsp); + return torture_hrtimeout_ns(baset_ns, fuzzt_ns, HRTIMER_MODE_REL, trsp); } EXPORT_SYMBOL_GPL(torture_hrtimeout_ms); @@ -136,7 +137,7 @@ int torture_hrtimeout_jiffies(u32 baset_j, struct torture_random_state *trsp) { ktime_t baset_ns = jiffies_to_nsecs(baset_j); - return torture_hrtimeout_ns(baset_ns, jiffies_to_nsecs(1), trsp); + return torture_hrtimeout_ns(baset_ns, jiffies_to_nsecs(1), HRTIMER_MODE_REL, trsp); } EXPORT_SYMBOL_GPL(torture_hrtimeout_jiffies); @@ -153,7 +154,7 @@ int torture_hrtimeout_s(u32 baset_s, u32 fuzzt_ms, struct torture_random_state * fuzzt_ns = (u32)~0U; else fuzzt_ns = fuzzt_ms * NSEC_PER_MSEC; - return torture_hrtimeout_ns(baset_ns, fuzzt_ns, trsp); + return torture_hrtimeout_ns(baset_ns, fuzzt_ns, HRTIMER_MODE_REL, trsp); } EXPORT_SYMBOL_GPL(torture_hrtimeout_s); From 3853a720f8bce9c5facb593e817f97659cc0512a Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 3 Aug 2023 16:40:11 +0200 Subject: [PATCH 26/56] rcu: Include torture_sched_setaffinity() declaration The prototype for torture_sched_setaffinity() will be moved to a different header, which will need to be included from update.c to avoid this W=1 warning: kernel/rcu/update.c:529:6: error: no previous prototype for 'torture_sched_setaffinity' [-Werror=missing-prototypes] 529 | long torture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask) Signed-off-by: Arnd Bergmann Reviewed-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker --- kernel/rcu/update.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 19bf6fa3ee6a..9d3c2e6ba667 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include From 0cfecd7d754f2ef5d7e6b56ee656a8544ade920a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 31 Jul 2023 13:10:34 -0700 Subject: [PATCH 27/56] torture: Move rcutorture_sched_setaffinity() out of rcutorture The rcutorture_sched_setaffinity() function is needed by locktorture, so move its declaration from rcu.h to torture.h and rename it to the more generic torture_sched_setaffinity() name. Please note that use of this function is still restricted to torture tests, and of those, currently only rcutorture and locktorture. Signed-off-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker --- include/linux/torture.h | 5 +++++ kernel/rcu/rcu.h | 4 ---- kernel/rcu/rcutorture.c | 2 +- kernel/rcu/update.c | 8 ++++---- 4 files changed, 10 insertions(+), 9 deletions(-) diff --git a/include/linux/torture.h b/include/linux/torture.h index 017f0f710815..c98d0c83d117 100644 --- a/include/linux/torture.h +++ b/include/linux/torture.h @@ -121,10 +121,15 @@ void _torture_stop_kthread(char *m, struct task_struct **tp); #define torture_stop_kthread(n, tp) \ _torture_stop_kthread("Stopping " #n " task", &(tp)) +/* Scheduler-related definitions. */ #ifdef CONFIG_PREEMPTION #define torture_preempt_schedule() __preempt_schedule() #else #define torture_preempt_schedule() do { } while (0) #endif +#if IS_ENABLED(CONFIG_RCU_TORTURE_TEST) || IS_MODULE(CONFIG_RCU_TORTURE_TEST) || IS_ENABLED(CONFIG_LOCK_TORTURE_TEST) || IS_MODULE(CONFIG_LOCK_TORTURE_TEST) +long torture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask); +#endif + #endif /* __LINUX_TORTURE_H */ diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 98e13be411af..567bd3d72e39 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -568,10 +568,6 @@ void do_trace_rcu_torture_read(const char *rcutorturename, static inline void rcu_gp_set_torture_wait(int duration) { } #endif -#if IS_ENABLED(CONFIG_RCU_TORTURE_TEST) || IS_MODULE(CONFIG_RCU_TORTURE_TEST) -long rcutorture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask); -#endif - #ifdef CONFIG_TINY_SRCU static inline void srcutorture_get_gp_data(enum rcutorture_type test_type, diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index ade42d6a9d9b..7e82fb887d09 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -810,7 +810,7 @@ static void synchronize_rcu_trivial(void) int cpu; for_each_online_cpu(cpu) { - rcutorture_sched_setaffinity(current->pid, cpumask_of(cpu)); + torture_sched_setaffinity(current->pid, cpumask_of(cpu)); WARN_ON_ONCE(raw_smp_processor_id() != cpu); } } diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 9d3c2e6ba667..c534d6806d3d 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -525,17 +525,17 @@ EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read); do { } while (0) #endif -#if IS_ENABLED(CONFIG_RCU_TORTURE_TEST) || IS_MODULE(CONFIG_RCU_TORTURE_TEST) +#if IS_ENABLED(CONFIG_RCU_TORTURE_TEST) || IS_MODULE(CONFIG_RCU_TORTURE_TEST) || IS_ENABLED(CONFIG_LOCK_TORTURE_TEST) || IS_MODULE(CONFIG_LOCK_TORTURE_TEST) /* Get rcutorture access to sched_setaffinity(). */ -long rcutorture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask) +long torture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask) { int ret; ret = sched_setaffinity(pid, in_mask); - WARN_ONCE(ret, "%s: sched_setaffinity() returned %d\n", __func__, ret); + WARN_ONCE(ret, "%s: sched_setaffinity(%d) returned %d\n", __func__, pid, ret); return ret; } -EXPORT_SYMBOL_GPL(rcutorture_sched_setaffinity); +EXPORT_SYMBOL_GPL(torture_sched_setaffinity); #endif #ifdef CONFIG_RCU_STALL_COMMON From 73e3412424832f519004dd2176226982be35d34e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 27 Jul 2023 20:04:06 -0700 Subject: [PATCH 28/56] locktorture: Add readers_bind and writers_bind module parameters This commit adds readers_bind and writers_bind module parameters to locktorture in order to skew tests across socket boundaries. This skewing is intended to provide additional variable-latency stress on the primitive under test. Signed-off-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker --- kernel/locking/locktorture.c | 64 ++++++++++++++++++++++++++++++++++-- 1 file changed, 62 insertions(+), 2 deletions(-) diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 270c7f80ce84..441866259278 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -56,6 +56,55 @@ module_param(torture_type, charp, 0444); MODULE_PARM_DESC(torture_type, "Type of lock to torture (spin_lock, spin_lock_irq, mutex_lock, ...)"); +static cpumask_var_t readers_bind; // Bind the readers to the specified set of CPUs. +static cpumask_var_t writers_bind; // Bind the writers to the specified set of CPUs. + +// Parse a cpumask kernel parameter. If there are more users later on, +// this might need to got to a more central location. +static int param_set_cpumask(const char *val, const struct kernel_param *kp) +{ + cpumask_var_t *cm_bind = kp->arg; + int ret; + char *s; + + if (!alloc_cpumask_var(cm_bind, GFP_KERNEL)) { + s = "Out of memory"; + ret = -ENOMEM; + goto out_err; + } + ret = cpulist_parse(val, *cm_bind); + if (!ret) + return ret; + s = "Bad CPU range"; +out_err: + pr_warn("%s: %s, all CPUs set\n", kp->name, s); + cpumask_setall(*cm_bind); + return ret; +} + +// Output a cpumask kernel parameter. +static int param_get_cpumask(char *buffer, const struct kernel_param *kp) +{ + cpumask_var_t *cm_bind = kp->arg; + + return sprintf(buffer, "%*pbl", cpumask_pr_args(*cm_bind)); +} + +static bool cpumask_nonempty(cpumask_var_t mask) +{ + return cpumask_available(mask) && !cpumask_empty(mask); +} + +static const struct kernel_param_ops lt_bind_ops = { + .set = param_set_cpumask, + .get = param_get_cpumask, +}; + +module_param_cb(readers_bind, <_bind_ops, &readers_bind, 0644); +module_param_cb(writers_bind, <_bind_ops, &writers_bind, 0644); + +long torture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask); + static struct task_struct *stats_task; static struct task_struct **writer_tasks; static struct task_struct **reader_tasks; @@ -986,16 +1035,23 @@ static int lock_torture_stats(void *arg) return 0; } + static inline void lock_torture_print_module_parms(struct lock_torture_ops *cur_ops, const char *tag) { + static cpumask_t cpumask_all; + cpumask_t *rcmp = cpumask_nonempty(readers_bind) ? readers_bind : &cpumask_all; + cpumask_t *wcmp = cpumask_nonempty(writers_bind) ? writers_bind : &cpumask_all; + + cpumask_setall(&cpumask_all); pr_alert("%s" TORTURE_FLAG - "--- %s%s: nwriters_stress=%d nreaders_stress=%d nested_locks=%d stat_interval=%d verbose=%d shuffle_interval=%d stutter=%d shutdown_secs=%d onoff_interval=%d onoff_holdoff=%d\n", + "--- %s%s: nwriters_stress=%d nreaders_stress=%d nested_locks=%d stat_interval=%d verbose=%d shuffle_interval=%d stutter=%d shutdown_secs=%d onoff_interval=%d onoff_holdoff=%d readers_bind=%*pbl writers_bind=%*pbl\n", torture_type, tag, cxt.debug_lock ? " [debug]": "", cxt.nrealwriters_stress, cxt.nrealreaders_stress, nested_locks, stat_interval, verbose, shuffle_interval, - stutter, shutdown_secs, onoff_interval, onoff_holdoff); + stutter, shutdown_secs, onoff_interval, onoff_holdoff, + cpumask_pr_args(rcmp), cpumask_pr_args(wcmp)); } static void lock_torture_cleanup(void) @@ -1250,6 +1306,8 @@ static int __init lock_torture_init(void) writer_fifo ? sched_set_fifo : NULL); if (torture_init_error(firsterr)) goto unwind; + if (cpumask_nonempty(writers_bind)) + torture_sched_setaffinity(writer_tasks[i]->pid, writers_bind); create_reader: if (cxt.cur_ops->readlock == NULL || (j >= cxt.nrealreaders_stress)) @@ -1259,6 +1317,8 @@ static int __init lock_torture_init(void) reader_tasks[j]); if (torture_init_error(firsterr)) goto unwind; + if (cpumask_nonempty(readers_bind)) + torture_sched_setaffinity(reader_tasks[j]->pid, readers_bind); } if (stat_interval > 0) { firsterr = torture_create_kthread(lock_torture_stats, NULL, From 65b73f1ff6d98a4a0d28c4c6f6497686fef557fa Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 28 Jul 2023 13:42:34 -0700 Subject: [PATCH 29/56] rcutorture: Add CONFIG_DEBUG_OBJECTS to RCU Tasks testing This commit adds CONFIG_DEBUG_OBJECTS=y to the TRACE02 rcutorture scenario to catch any further RCU Tasks bugs involving this Kconfig option. Signed-off-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker --- tools/testing/selftests/rcutorture/configs/rcu/TRACE02 | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TRACE02 b/tools/testing/selftests/rcutorture/configs/rcu/TRACE02 index 093ea6e8e65c..9003c56cd764 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TRACE02 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TRACE02 @@ -11,3 +11,4 @@ CONFIG_FORCE_TASKS_TRACE_RCU=y #CHECK#CONFIG_TASKS_TRACE_RCU=y CONFIG_TASKS_TRACE_RCU_READ_MB=n CONFIG_RCU_EXPERT=y +CONFIG_DEBUG_OBJECTS=y From cca42bd8eb1b54a4c9bbf48c79d120e66619a3e4 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Sat, 29 Jul 2023 14:27:31 +0000 Subject: [PATCH 30/56] rcutorture: Fix stuttering races and other issues The stuttering code isn't functioning as expected. Ideally, it should pause the torture threads for a designated period before resuming. Yet, it fails to halt the test for the correct duration. Additionally, a race condition exists, potentially causing the stuttering code to pause for an extended period if the 'spt' variable is non-zero due to the stutter orchestration thread's inadequate CPU time. Moreover, over-stuttering can hinder RCU's progress on TREE07 kernels. This happens as the stuttering code may run within a softirq due to RCU callbacks. Consequently, ksoftirqd keeps a CPU busy for several seconds, thus obstructing RCU's progress. This situation triggers a warning message in the logs: [ 2169.481783] rcu_torture_writer: rtort_pipe_count: 9 This warning suggests that an RCU torture object, although invisible to RCU readers, couldn't make it past the pipe array and be freed -- a strong indication that there weren't enough grace periods during the stutter interval. To address these issues, this patch sets the "stutter end" time to an absolute point in the future set by the main stutter thread. This is then used for waiting in stutter_wait(). While the stutter thread still defines this absolute time, the waiters' waiting logic doesn't rely on the stutter thread receiving sufficient CPU time to halt the stuttering as the halting is now self-controlled. Cc: stable@vger.kernel.org Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker --- kernel/torture.c | 45 ++++++++++++--------------------------------- 1 file changed, 12 insertions(+), 33 deletions(-) diff --git a/kernel/torture.c b/kernel/torture.c index 6ba62e5993e7..fd353f98162f 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -720,7 +720,7 @@ static void torture_shutdown_cleanup(void) * suddenly applied to or removed from the system. */ static struct task_struct *stutter_task; -static int stutter_pause_test; +static ktime_t stutter_till_abs_time; static int stutter; static int stutter_gap; @@ -730,30 +730,16 @@ static int stutter_gap; */ bool stutter_wait(const char *title) { - unsigned int i = 0; bool ret = false; - int spt; + ktime_t till_ns; cond_resched_tasks_rcu_qs(); - spt = READ_ONCE(stutter_pause_test); - for (; spt; spt = READ_ONCE(stutter_pause_test)) { - if (!ret && !rt_task(current)) { - sched_set_normal(current, MAX_NICE); - ret = true; - } - if (spt == 1) { - torture_hrtimeout_jiffies(1, NULL); - } else if (spt == 2) { - while (READ_ONCE(stutter_pause_test)) { - if (!(i++ & 0xffff)) - torture_hrtimeout_us(10, 0, NULL); - cond_resched(); - } - } else { - torture_hrtimeout_jiffies(round_jiffies_relative(HZ), NULL); - } - torture_shutdown_absorb(title); + till_ns = READ_ONCE(stutter_till_abs_time); + if (till_ns && ktime_before(ktime_get(), till_ns)) { + torture_hrtimeout_ns(till_ns, 0, HRTIMER_MODE_ABS, NULL); + ret = true; } + torture_shutdown_absorb(title); return ret; } EXPORT_SYMBOL_GPL(stutter_wait); @@ -764,23 +750,16 @@ EXPORT_SYMBOL_GPL(stutter_wait); */ static int torture_stutter(void *arg) { - DEFINE_TORTURE_RANDOM(rand); - int wtime; + ktime_t till_ns; VERBOSE_TOROUT_STRING("torture_stutter task started"); do { if (!torture_must_stop() && stutter > 1) { - wtime = stutter; - if (stutter > 2) { - WRITE_ONCE(stutter_pause_test, 1); - wtime = stutter - 3; - torture_hrtimeout_jiffies(wtime, &rand); - wtime = 2; - } - WRITE_ONCE(stutter_pause_test, 2); - torture_hrtimeout_jiffies(wtime, NULL); + till_ns = ktime_add_ns(ktime_get(), + jiffies_to_nsecs(stutter)); + WRITE_ONCE(stutter_till_abs_time, till_ns); + torture_hrtimeout_jiffies(stutter - 1, NULL); } - WRITE_ONCE(stutter_pause_test, 0); if (!torture_must_stop()) torture_hrtimeout_jiffies(stutter_gap, NULL); torture_shutdown_absorb("torture_stutter"); From 31742a56c676b6014fde99a0cf07d3d12b822e9a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 2 Aug 2023 15:30:31 -0700 Subject: [PATCH 31/56] locktorture: Alphabetize torture_param() entries There are getting to be too many module parameters for a random list to be comfortable, so this commit alphabetizes the list. Strictly code motion. Signed-off-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker --- kernel/locking/locktorture.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 441866259278..57ee16cf879d 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -33,21 +33,21 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney "); -torture_param(int, nwriters_stress, -1, "Number of write-locking stress-test threads"); -torture_param(int, nreaders_stress, -1, "Number of read-locking stress-test threads"); torture_param(int, long_hold, 100, "Do occasional long hold of lock (ms), 0=disable"); +torture_param(int, nested_locks, 0, "Number of nested locks (max = 8)"); +torture_param(int, nreaders_stress, -1, "Number of read-locking stress-test threads"); +torture_param(int, nwriters_stress, -1, "Number of write-locking stress-test threads"); torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)"); torture_param(int, onoff_interval, 0, "Time between CPU hotplugs (s), 0=disable"); +torture_param(int, rt_boost, 2, + "Do periodic rt-boost. 0=Disable, 1=Only for rt_mutex, 2=For all lock types."); +torture_param(int, rt_boost_factor, 50, "A factor determining how often rt-boost happens."); torture_param(int, shuffle_interval, 3, "Number of jiffies between shuffles, 0=disable"); torture_param(int, shutdown_secs, 0, "Shutdown time (j), <= zero to disable."); torture_param(int, stat_interval, 60, "Number of seconds between stats printk()s"); torture_param(int, stutter, 5, "Number of jiffies to run/halt test, 0=disable"); -torture_param(int, rt_boost, 2, - "Do periodic rt-boost. 0=Disable, 1=Only for rt_mutex, 2=For all lock types."); -torture_param(int, rt_boost_factor, 50, "A factor determining how often rt-boost happens."); torture_param(int, writer_fifo, 0, "Run writers at sched_set_fifo() priority"); torture_param(int, verbose, 1, "Enable verbose debugging printk()s"); -torture_param(int, nested_locks, 0, "Number of nested locks (max = 8)"); /* Going much higher trips "BUG: MAX_LOCKDEP_CHAIN_HLOCKS too low!" errors */ #define MAX_NESTED_LOCKS 8 From 84cee9e72e15b65921f05d41788d66a79c1baa25 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 2 Aug 2023 15:42:03 -0700 Subject: [PATCH 32/56] locktorture: Consolidate "if" statements in lock_torture_writer() There is a pair of adjacent "if" statements with identical conditions in the lock_torture_writer() function. This commit therefore combines them. Signed-off-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker --- kernel/locking/locktorture.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 57ee16cf879d..c8c322e69a90 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -889,11 +889,10 @@ static int lock_torture_writer(void *arg) lock_is_write_held = true; if (WARN_ON_ONCE(atomic_read(&lock_is_read_held))) lwsp->n_lock_fail++; /* rare, but... */ - lwsp->n_lock_acquired++; - } - if (!skip_main_lock) { + cxt.cur_ops->write_delay(&rand); + lock_is_write_held = false; WRITE_ONCE(last_lock_release, jiffies); cxt.cur_ops->writeunlock(tid); From e3bdaefbccbd27b1829414604456611928a5e291 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 2 Aug 2023 16:32:06 -0700 Subject: [PATCH 33/56] locktorture: Add acq_writer_lim to complain about long acquistion times This commit adds a locktorture.acq_writer_lim module parameter that specifies the maximum number of jiffies that is expected to be consumed by write-side lock acquisition. If this limit is exceeded, a WARN_ONCE() causes a splat. Note that this limit applies to the main lock acquisition only, not to any nested acquisitions. Signed-off-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker --- kernel/locking/locktorture.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index c8c322e69a90..296815ef67ae 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -33,6 +33,7 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney "); +torture_param(int, acq_writer_lim, 0, "Write_acquisition time limit (jiffies)."); torture_param(int, long_hold, 100, "Do occasional long hold of lock (ms), 0=disable"); torture_param(int, nested_locks, 0, "Number of nested locks (max = 8)"); torture_param(int, nreaders_stress, -1, "Number of read-locking stress-test threads"); @@ -852,11 +853,13 @@ static struct lock_torture_ops percpu_rwsem_lock_ops = { */ static int lock_torture_writer(void *arg) { - struct lock_stress_stats *lwsp = arg; - int tid = lwsp - cxt.lwsa; - DEFINE_TORTURE_RANDOM(rand); + unsigned long j; + unsigned long j1; u32 lockset_mask; + struct lock_stress_stats *lwsp = arg; + DEFINE_TORTURE_RANDOM(rand); bool skip_main_lock; + int tid = lwsp - cxt.lwsa; VERBOSE_TOROUT_STRING("lock_torture_writer task started"); if (!rt_task(current)) @@ -883,12 +886,20 @@ static int lock_torture_writer(void *arg) cxt.cur_ops->nested_lock(tid, lockset_mask); if (!skip_main_lock) { + if (acq_writer_lim > 0) + j = jiffies; cxt.cur_ops->writelock(tid); if (WARN_ON_ONCE(lock_is_write_held)) lwsp->n_lock_fail++; lock_is_write_held = true; if (WARN_ON_ONCE(atomic_read(&lock_is_read_held))) lwsp->n_lock_fail++; /* rare, but... */ + if (acq_writer_lim > 0) { + j1 = jiffies; + WARN_ONCE(time_after(j1, j + acq_writer_lim), + "%s: Lock acquisition took %lu jiffies.\n", + __func__, j1 - j); + } lwsp->n_lock_acquired++; cxt.cur_ops->write_delay(&rand); From fcc7a329a7bfafe732586d876f16be5918b2b0e5 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Tue, 15 Aug 2023 19:09:49 +0000 Subject: [PATCH 34/56] rcutorture: Copy out ftrace into its own console file When debugging, it can be difficult to quickly find the ftrace dump within the console log, which in turn makes it difficult to process it independent of the rest of the console output. This commit therefore copies the contents of the buffers into its own file to make it easier to locate and process the ftrace dump. The original ftrace dump is still available in the console log in cases because it can be more convenient to process it in situ, for example, for scripts that process console output as well as ftrace-dump data. Also handle the case of multiple ftrace dumps potentially showing up in the log. Example for a file like [1], it will extract as [2]. [1]: foo foo Dumping ftrace buffer: --------------------------------- blah blah --------------------------------- more bar baz Dumping ftrace buffer: --------------------------------- blah2 blah2 --------------------------------- bleh bleh [2]: Ftrace dump 1: blah blah Ftrace dump 2: blah2 blah2 [ paulmck: Fixed awk indentation, input up front. ] Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker --- .../selftests/rcutorture/bin/functions.sh | 29 +++++++++++++++++++ .../selftests/rcutorture/bin/parse-console.sh | 7 +++++ 2 files changed, 36 insertions(+) mode change 100644 => 100755 tools/testing/selftests/rcutorture/bin/functions.sh diff --git a/tools/testing/selftests/rcutorture/bin/functions.sh b/tools/testing/selftests/rcutorture/bin/functions.sh old mode 100644 new mode 100755 index b8e2ea23cb3f..6e415ddb206f --- a/tools/testing/selftests/rcutorture/bin/functions.sh +++ b/tools/testing/selftests/rcutorture/bin/functions.sh @@ -331,3 +331,32 @@ specify_qemu_net () { echo $1 -net none fi } + +# Extract the ftrace output from the console log output +# The ftrace output in the original logs look like: +# Dumping ftrace buffer: +# --------------------------------- +# [...] +# --------------------------------- +extract_ftrace_from_console() { + awk < "$1" ' + + /Dumping ftrace buffer:/ { + buffer_count++ + print "Ftrace dump " buffer_count ":" + capture = 1 + next + } + + /---------------------------------/ { + if(capture == 1) { + capture = 2 + next + } else if(capture == 2) { + capture = 0 + print "" + } + } + + capture == 2' +} diff --git a/tools/testing/selftests/rcutorture/bin/parse-console.sh b/tools/testing/selftests/rcutorture/bin/parse-console.sh index 9ab0f6bc172c..e3d2f69ec0fb 100755 --- a/tools/testing/selftests/rcutorture/bin/parse-console.sh +++ b/tools/testing/selftests/rcutorture/bin/parse-console.sh @@ -182,3 +182,10 @@ if ! test -s $file.diags then rm -f $file.diags fi + +# Call extract_ftrace_from_console function, if the output is empty, +# don't create $file.ftrace. Otherwise output the results to $file.ftrace +extract_ftrace_from_console $file > $file.ftrace +if [ ! -s $file.ftrace ]; then + rm -f $file.ftrace +fi From 394473d876eaee0338a50249d5f807d4ae95e33c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 16 Aug 2023 12:24:44 -0700 Subject: [PATCH 35/56] torture: Print out torture module parameters The kernel/torture.c module now has several module parameters, so this commit causes them to be printed out. Signed-off-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker --- kernel/torture.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/kernel/torture.c b/kernel/torture.c index fd353f98162f..c72ab2d251f4 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -791,6 +791,13 @@ static void torture_stutter_cleanup(void) stutter_task = NULL; } +static void +torture_print_module_parms(void) +{ + pr_alert("torture module --- %s: disable_onoff_at_boot=%d ftrace_dump_at_shutdown=%d verbose_sleep_frequency=%d verbose_sleep_duration=%d random_shuffle=%d\n", + torture_type, disable_onoff_at_boot, ftrace_dump_at_shutdown, verbose_sleep_frequency, verbose_sleep_duration, random_shuffle); +} + /* * Initialize torture module. Please note that this is -not- invoked via * the usual module_init() mechanism, but rather by an explicit call from @@ -813,6 +820,7 @@ bool torture_init_begin(char *ttype, int v) torture_type = ttype; verbose = v; fullstop = FULLSTOP_DONTSTOP; + torture_print_module_parms(); return true; } EXPORT_SYMBOL_GPL(torture_init_begin); From 92776c62408f354d254495ea61b3377b450dc69f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 16 Aug 2023 14:20:49 -0700 Subject: [PATCH 36/56] torture: Make torture.sh refscale testing qualify verbose_batched In torture.sh, the testing of refscale incorrectly used verbose_batched as a kernel boot parameter, which causes this parameter to be passed to the init process. This commit therefore prefixes it with refscale, so that refscale.verbose_batched is passed to the kernel. Signed-off-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker --- tools/testing/selftests/rcutorture/bin/torture.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/rcutorture/bin/torture.sh b/tools/testing/selftests/rcutorture/bin/torture.sh index 12b50a4a881a..d5a0d8a33c27 100755 --- a/tools/testing/selftests/rcutorture/bin/torture.sh +++ b/tools/testing/selftests/rcutorture/bin/torture.sh @@ -472,7 +472,7 @@ do if test -n "$firsttime" then torture_bootargs="refscale.scale_type="$prim" refscale.nreaders=$HALF_ALLOTED_CPUS refscale.loops=10000 refscale.holdoff=20 torture.disable_onoff_at_boot" - torture_set "refscale-$prim" tools/testing/selftests/rcutorture/bin/kvm.sh --torture refscale --allcpus --duration 5 --kconfig "CONFIG_TASKS_TRACE_RCU=y CONFIG_NR_CPUS=$HALF_ALLOTED_CPUS" --bootargs "verbose_batched=$VERBOSE_BATCH_CPUS torture.verbose_sleep_frequency=8 torture.verbose_sleep_duration=$VERBOSE_BATCH_CPUS" --trust-make + torture_set "refscale-$prim" tools/testing/selftests/rcutorture/bin/kvm.sh --torture refscale --allcpus --duration 5 --kconfig "CONFIG_TASKS_TRACE_RCU=y CONFIG_NR_CPUS=$HALF_ALLOTED_CPUS" --bootargs "refscale.verbose_batched=$VERBOSE_BATCH_CPUS torture.verbose_sleep_frequency=8 torture.verbose_sleep_duration=$VERBOSE_BATCH_CPUS" --trust-make mv $T/last-resdir-nodebug $T/first-resdir-nodebug || : if test -f "$T/last-resdir-kasan" then From 00c24c9cfa7895d6e712bea6f7109911cfdd54d0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 21 Aug 2023 16:42:21 -0700 Subject: [PATCH 37/56] locktorture: Add new module parameters to lock_torture_print_module_parms() This commit adds new module parameters to lock_torture_print_module_parms, and alphabetizes things while in the area. This change makes locktorture test results more useful and self-contained. Signed-off-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker --- kernel/locking/locktorture.c | 64 +++++++++++++----------------------- 1 file changed, 23 insertions(+), 41 deletions(-) diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 296815ef67ae..d2a3a8cc1902 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -47,8 +47,8 @@ torture_param(int, shuffle_interval, 3, "Number of jiffies between shuffles, 0=d torture_param(int, shutdown_secs, 0, "Shutdown time (j), <= zero to disable."); torture_param(int, stat_interval, 60, "Number of seconds between stats printk()s"); torture_param(int, stutter, 5, "Number of jiffies to run/halt test, 0=disable"); -torture_param(int, writer_fifo, 0, "Run writers at sched_set_fifo() priority"); torture_param(int, verbose, 1, "Enable verbose debugging printk()s"); +torture_param(int, writer_fifo, 0, "Run writers at sched_set_fifo() priority"); /* Going much higher trips "BUG: MAX_LOCKDEP_CHAIN_HLOCKS too low!" errors */ #define MAX_NESTED_LOCKS 8 @@ -166,12 +166,9 @@ static int torture_lock_busted_write_lock(int tid __maybe_unused) static void torture_lock_busted_write_delay(struct torture_random_state *trsp) { - const unsigned long longdelay_ms = long_hold ? long_hold : ULONG_MAX; - /* We want a long delay occasionally to force massive contention. */ - if (!(torture_random(trsp) % - (cxt.nrealwriters_stress * 2000 * longdelay_ms))) - mdelay(longdelay_ms); + if (long_hold && !(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * long_hold))) + mdelay(long_hold); if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000))) torture_preempt_schedule(); /* Allow test to be preempted. */ } @@ -244,15 +241,14 @@ __acquires(torture_spinlock) static void torture_spin_lock_write_delay(struct torture_random_state *trsp) { const unsigned long shortdelay_us = 2; - const unsigned long longdelay_ms = long_hold ? long_hold : ULONG_MAX; unsigned long j; /* We want a short delay mostly to emulate likely code, and * we want a long delay occasionally to force massive contention. */ - if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * longdelay_ms))) { + if (long_hold && !(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * long_hold))) { j = jiffies; - mdelay(longdelay_ms); + mdelay(long_hold); pr_alert("%s: delay = %lu jiffies.\n", __func__, jiffies - j); } if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 200 * shortdelay_us))) @@ -370,14 +366,12 @@ __acquires(torture_rwlock) static void torture_rwlock_write_delay(struct torture_random_state *trsp) { const unsigned long shortdelay_us = 2; - const unsigned long longdelay_ms = long_hold ? long_hold : ULONG_MAX; /* We want a short delay mostly to emulate likely code, and * we want a long delay occasionally to force massive contention. */ - if (!(torture_random(trsp) % - (cxt.nrealwriters_stress * 2000 * longdelay_ms))) - mdelay(longdelay_ms); + if (long_hold && !(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * long_hold))) + mdelay(long_hold); else udelay(shortdelay_us); } @@ -398,14 +392,12 @@ __acquires(torture_rwlock) static void torture_rwlock_read_delay(struct torture_random_state *trsp) { const unsigned long shortdelay_us = 10; - const unsigned long longdelay_ms = 100; /* We want a short delay mostly to emulate likely code, and * we want a long delay occasionally to force massive contention. */ - if (!(torture_random(trsp) % - (cxt.nrealreaders_stress * 2000 * longdelay_ms))) - mdelay(longdelay_ms); + if (long_hold && !(torture_random(trsp) % (cxt.nrealreaders_stress * 2000 * long_hold))) + mdelay(long_hold); else udelay(shortdelay_us); } @@ -503,12 +495,9 @@ __acquires(torture_mutex) static void torture_mutex_delay(struct torture_random_state *trsp) { - const unsigned long longdelay_ms = long_hold ? long_hold : ULONG_MAX; - /* We want a long delay occasionally to force massive contention. */ - if (!(torture_random(trsp) % - (cxt.nrealwriters_stress * 2000 * longdelay_ms))) - mdelay(longdelay_ms * 5); + if (long_hold && !(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * long_hold))) + mdelay(long_hold * 5); if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000))) torture_preempt_schedule(); /* Allow test to be preempted. */ } @@ -676,15 +665,13 @@ __acquires(torture_rtmutex) static void torture_rtmutex_delay(struct torture_random_state *trsp) { const unsigned long shortdelay_us = 2; - const unsigned long longdelay_ms = long_hold ? long_hold : ULONG_MAX; /* * We want a short delay mostly to emulate likely code, and * we want a long delay occasionally to force massive contention. */ - if (!(torture_random(trsp) % - (cxt.nrealwriters_stress * 2000 * longdelay_ms))) - mdelay(longdelay_ms); + if (long_hold && !(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * long_hold))) + mdelay(long_hold); if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 200 * shortdelay_us))) udelay(shortdelay_us); @@ -741,12 +728,9 @@ __acquires(torture_rwsem) static void torture_rwsem_write_delay(struct torture_random_state *trsp) { - const unsigned long longdelay_ms = long_hold ? long_hold : ULONG_MAX; - /* We want a long delay occasionally to force massive contention. */ - if (!(torture_random(trsp) % - (cxt.nrealwriters_stress * 2000 * longdelay_ms))) - mdelay(longdelay_ms * 10); + if (long_hold && !(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * long_hold))) + mdelay(long_hold * 10); if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000))) torture_preempt_schedule(); /* Allow test to be preempted. */ } @@ -766,14 +750,11 @@ __acquires(torture_rwsem) static void torture_rwsem_read_delay(struct torture_random_state *trsp) { - const unsigned long longdelay_ms = 100; - /* We want a long delay occasionally to force massive contention. */ - if (!(torture_random(trsp) % - (cxt.nrealreaders_stress * 2000 * longdelay_ms))) - mdelay(longdelay_ms * 2); + if (long_hold && !(torture_random(trsp) % (cxt.nrealreaders_stress * 2000 * long_hold))) + mdelay(long_hold * 2); else - mdelay(longdelay_ms / 2); + mdelay(long_hold / 2); if (!(torture_random(trsp) % (cxt.nrealreaders_stress * 20000))) torture_preempt_schedule(); /* Allow test to be preempted. */ } @@ -1056,11 +1037,12 @@ lock_torture_print_module_parms(struct lock_torture_ops *cur_ops, cpumask_setall(&cpumask_all); pr_alert("%s" TORTURE_FLAG - "--- %s%s: nwriters_stress=%d nreaders_stress=%d nested_locks=%d stat_interval=%d verbose=%d shuffle_interval=%d stutter=%d shutdown_secs=%d onoff_interval=%d onoff_holdoff=%d readers_bind=%*pbl writers_bind=%*pbl\n", + "--- %s%s: acq_writer_lim=%d long_hold=%d nested_locks=%d nreaders_stress=%d nwriters_stress=%d onoff_holdoff=%d onoff_interval=%d rt_boost=%d rt_boost_factor=%d shuffle_interval=%d shutdown_secs=%d stat_interval=%d stutter=%d verbose=%d writer_fifo=%d readers_bind=%*pbl writers_bind=%*pbl\n", torture_type, tag, cxt.debug_lock ? " [debug]": "", - cxt.nrealwriters_stress, cxt.nrealreaders_stress, - nested_locks, stat_interval, verbose, shuffle_interval, - stutter, shutdown_secs, onoff_interval, onoff_holdoff, + acq_writer_lim, long_hold, nested_locks, cxt.nrealreaders_stress, + cxt.nrealwriters_stress, onoff_holdoff, onoff_interval, rt_boost, + rt_boost_factor, shuffle_interval, shutdown_secs, stat_interval, stutter, + verbose, writer_fifo, cpumask_pr_args(rcmp), cpumask_pr_args(wcmp)); } From 7f993623e9ebcd633c0f760991e5078b95a37db3 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 21 Aug 2023 19:36:10 -0700 Subject: [PATCH 38/56] locktorture: Add call_rcu_chains module parameter When running locktorture on large systems, there will normally be enough RCU activity to ensure that there is a grace period in flight at all times. However, on smaller systems, RCU might well be idle the majority of the time. This situation can be inconvenient in cases where the RCU CPU stall warning is part of the debugging process. This commit therefore adds an call_rcu_chains module parameter to locktorture, allowing the user to specify the desired number of self-propagating call_rcu() chains. For good measure, immediately before invoking call_rcu(), the self-propagating RCU callback invokes start_poll_synchronize_rcu() to force the immediate start of a grace period, with the call_rcu() forcing another to start shortly thereafter. Booting with locktorture.call_rcu_chains=2 increases the probability of a stuck locking primitive resulting in an RCU CPU stall warning from about 25% to nearly 100%. Signed-off-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker --- .../admin-guide/kernel-parameters.txt | 7 +++ kernel/locking/locktorture.c | 62 ++++++++++++++++++- 2 files changed, 67 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 0a1731a0f0ef..300e2c30986c 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2913,6 +2913,13 @@ to extract confidential information from the kernel are also disabled. + locktorture.call_rcu_chains= [KNL] + Specify the number of self-propagating call_rcu() + chains to set up. These are used to ensure that + there is a high probability of an RCU grace period + in progress at any given time. Defaults to 0, + which disables these call_rcu() chains. + locktorture.nreaders_stress= [KNL] Set the number of locking read-acquisition kthreads. Defaults to being automatically set based on the diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index d2a3a8cc1902..01d56e6c44d7 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -34,6 +34,7 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney "); torture_param(int, acq_writer_lim, 0, "Write_acquisition time limit (jiffies)."); +torture_param(int, call_rcu_chains, 0, "Self-propagate call_rcu() chains during test (0=disable)."); torture_param(int, long_hold, 100, "Do occasional long hold of lock (ms), 0=disable"); torture_param(int, nested_locks, 0, "Number of nested locks (max = 8)"); torture_param(int, nreaders_stress, -1, "Number of read-locking stress-test threads"); @@ -119,6 +120,12 @@ struct lock_stress_stats { long n_lock_acquired; }; +struct call_rcu_chain { + struct rcu_head crc_rh; + bool crc_stop; +}; +struct call_rcu_chain *call_rcu_chain; + /* Forward reference. */ static void lock_torture_cleanup(void); @@ -1037,15 +1044,60 @@ lock_torture_print_module_parms(struct lock_torture_ops *cur_ops, cpumask_setall(&cpumask_all); pr_alert("%s" TORTURE_FLAG - "--- %s%s: acq_writer_lim=%d long_hold=%d nested_locks=%d nreaders_stress=%d nwriters_stress=%d onoff_holdoff=%d onoff_interval=%d rt_boost=%d rt_boost_factor=%d shuffle_interval=%d shutdown_secs=%d stat_interval=%d stutter=%d verbose=%d writer_fifo=%d readers_bind=%*pbl writers_bind=%*pbl\n", + "--- %s%s: acq_writer_lim=%d call_rcu_chains=%d long_hold=%d nested_locks=%d nreaders_stress=%d nwriters_stress=%d onoff_holdoff=%d onoff_interval=%d rt_boost=%d rt_boost_factor=%d shuffle_interval=%d shutdown_secs=%d stat_interval=%d stutter=%d verbose=%d writer_fifo=%d readers_bind=%*pbl writers_bind=%*pbl\n", torture_type, tag, cxt.debug_lock ? " [debug]": "", - acq_writer_lim, long_hold, nested_locks, cxt.nrealreaders_stress, + acq_writer_lim, call_rcu_chains, long_hold, nested_locks, cxt.nrealreaders_stress, cxt.nrealwriters_stress, onoff_holdoff, onoff_interval, rt_boost, rt_boost_factor, shuffle_interval, shutdown_secs, stat_interval, stutter, verbose, writer_fifo, cpumask_pr_args(rcmp), cpumask_pr_args(wcmp)); } +// If requested, maintain call_rcu() chains to keep a grace period always +// in flight. These increase the probability of getting an RCU CPU stall +// warning and associated diagnostics when a locking primitive stalls. + +static void call_rcu_chain_cb(struct rcu_head *rhp) +{ + struct call_rcu_chain *crcp = container_of(rhp, struct call_rcu_chain, crc_rh); + + if (!smp_load_acquire(&crcp->crc_stop)) { + (void)start_poll_synchronize_rcu(); // Start one grace period... + call_rcu(&crcp->crc_rh, call_rcu_chain_cb); // ... and later start another. + } +} + +// Start the requested number of call_rcu() chains. +static int call_rcu_chain_init(void) +{ + int i; + + if (call_rcu_chains <= 0) + return 0; + call_rcu_chain = kcalloc(call_rcu_chains, sizeof(*call_rcu_chain), GFP_KERNEL); + if (!call_rcu_chains) + return -ENOMEM; + for (i = 0; i < call_rcu_chains; i++) { + call_rcu_chain[i].crc_stop = false; + call_rcu(&call_rcu_chain[i].crc_rh, call_rcu_chain_cb); + } + return 0; +} + +// Stop all of the call_rcu() chains. +static void call_rcu_chain_cleanup(void) +{ + int i; + + if (!call_rcu_chain) + return; + for (i = 0; i < call_rcu_chains; i++) + smp_store_release(&call_rcu_chain[i].crc_stop, true); + rcu_barrier(); + kfree(call_rcu_chain); + call_rcu_chain = NULL; +} + static void lock_torture_cleanup(void) { int i; @@ -1096,6 +1148,8 @@ static void lock_torture_cleanup(void) kfree(cxt.lrsa); cxt.lrsa = NULL; + call_rcu_chain_cleanup(); + end: if (cxt.init_called) { if (cxt.cur_ops->exit) @@ -1225,6 +1279,10 @@ static int __init lock_torture_init(void) } } + firsterr = call_rcu_chain_init(); + if (torture_init_error(firsterr)) + goto unwind; + lock_torture_print_module_parms(cxt.cur_ops, "Start of test"); /* Prepare torture context. */ From b1326d766b437a122fb072106ba462569c192a59 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 21 Aug 2023 20:17:26 -0700 Subject: [PATCH 39/56] doc: Catch-up update for locktorture module parameters This commit documents recently added locktorture module parameters. Signed-off-by: Paul E. McKenney Cc: Joel Fernandes Signed-off-by: Frederic Weisbecker --- .../admin-guide/kernel-parameters.txt | 50 +++++++++++++++++-- 1 file changed, 47 insertions(+), 3 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 300e2c30986c..47f8b1cef4fd 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2913,6 +2913,11 @@ to extract confidential information from the kernel are also disabled. + locktorture.acq_writer_lim= [KNL] + Set the time limit in jiffies for a lock + acquisition. Acquisitions exceeding this limit + will result in a splat once they do complete. + locktorture.call_rcu_chains= [KNL] Specify the number of self-propagating call_rcu() chains to set up. These are used to ensure that @@ -2920,6 +2925,18 @@ in progress at any given time. Defaults to 0, which disables these call_rcu() chains. + locktorture.long_hold= [KNL] + Specify the duration in milliseconds for the + occasional long-duration lock hold time. Defaults + to 100 milliseconds. Select 0 to disable. + + locktorture.nested_locks= [KNL] + Specify the maximum lock nesting depth that + locktorture is to exercise, up to a limit of 8 + (MAX_NESTED_LOCKS). Specify zero to disable. + Note that this parameter is ineffective on types + of locks that do not support nested acquisition. + locktorture.nreaders_stress= [KNL] Set the number of locking read-acquisition kthreads. Defaults to being automatically set based on the @@ -2935,6 +2952,29 @@ Set time (s) between CPU-hotplug operations, or zero to disable CPU-hotplug testing. + locktorture.readers_bind= [KNL] + Specify the list of CPUs to which the readers are + to be bound. + + locktorture.rt_boost= [KNL] + Do periodic testing of real-time lock priority + boosting. Select 0 to disable, 1 to boost + only rt_mutex, and 2 to boost unconditionally. + Defaults to 2, which might seem to be an + odd choice, but which should be harmless for + non-real-time spinlocks, due to their disabling + of preemption. Note that non-realtime mutexes + disable boosting. + + locktorture.rt_boost_factor= [KNL] + Number that determines how often and for how + long priority boosting is exercised. This is + scaled down by the number of writers, so that the + number of boosts per unit time remains roughly + constant as the number of writers increases. + On the other hand, the duration of each boost + increases with the number of writers. + locktorture.shuffle_interval= [KNL] Set task-shuffle interval (jiffies). Shuffling tasks allows some CPUs to go into dyntick-idle @@ -2957,13 +2997,17 @@ locktorture.torture_type= [KNL] Specify the locking implementation to test. + locktorture.verbose= [KNL] + Enable additional printk() statements. + + locktorture.writers_bind= [KNL] + Specify the list of CPUs to which the writers are + to be bound. + locktorture.writer_fifo= [KNL] Run the write-side locktorture kthreads at sched_set_fifo() real-time priority. - locktorture.verbose= [KNL] - Enable additional printk() statements. - logibm.irq= [HW,MOUSE] Logitech Bus Mouse Driver Format: From 2273799c292b033a20b0897e1ce2e2dee32304ef Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 22 Aug 2023 08:48:16 -0700 Subject: [PATCH 40/56] locktorture: Rename readers_bind/writers_bind to bind_readers/bind_writers This commit renames the readers_bind and writers_bind module parameters to bind_readers and bind_writers, respectively. This provides added clarity via the imperative mode and better organizes the documentation. Signed-off-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker --- .../admin-guide/kernel-parameters.txt | 16 +++++------ kernel/locking/locktorture.c | 28 +++++++++---------- 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 47f8b1cef4fd..1d539c6d9d1c 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2918,6 +2918,14 @@ acquisition. Acquisitions exceeding this limit will result in a splat once they do complete. + locktorture.bind_readers= [KNL] + Specify the list of CPUs to which the readers are + to be bound. + + locktorture.bind_writers= [KNL] + Specify the list of CPUs to which the writers are + to be bound. + locktorture.call_rcu_chains= [KNL] Specify the number of self-propagating call_rcu() chains to set up. These are used to ensure that @@ -2952,10 +2960,6 @@ Set time (s) between CPU-hotplug operations, or zero to disable CPU-hotplug testing. - locktorture.readers_bind= [KNL] - Specify the list of CPUs to which the readers are - to be bound. - locktorture.rt_boost= [KNL] Do periodic testing of real-time lock priority boosting. Select 0 to disable, 1 to boost @@ -3000,10 +3004,6 @@ locktorture.verbose= [KNL] Enable additional printk() statements. - locktorture.writers_bind= [KNL] - Specify the list of CPUs to which the writers are - to be bound. - locktorture.writer_fifo= [KNL] Run the write-side locktorture kthreads at sched_set_fifo() real-time priority. diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 01d56e6c44d7..a3abcd136f56 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -58,8 +58,8 @@ module_param(torture_type, charp, 0444); MODULE_PARM_DESC(torture_type, "Type of lock to torture (spin_lock, spin_lock_irq, mutex_lock, ...)"); -static cpumask_var_t readers_bind; // Bind the readers to the specified set of CPUs. -static cpumask_var_t writers_bind; // Bind the writers to the specified set of CPUs. +static cpumask_var_t bind_readers; // Bind the readers to the specified set of CPUs. +static cpumask_var_t bind_writers; // Bind the writers to the specified set of CPUs. // Parse a cpumask kernel parameter. If there are more users later on, // this might need to got to a more central location. @@ -102,8 +102,8 @@ static const struct kernel_param_ops lt_bind_ops = { .get = param_get_cpumask, }; -module_param_cb(readers_bind, <_bind_ops, &readers_bind, 0644); -module_param_cb(writers_bind, <_bind_ops, &writers_bind, 0644); +module_param_cb(bind_readers, <_bind_ops, &bind_readers, 0644); +module_param_cb(bind_writers, <_bind_ops, &bind_writers, 0644); long torture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask); @@ -1039,18 +1039,18 @@ lock_torture_print_module_parms(struct lock_torture_ops *cur_ops, const char *tag) { static cpumask_t cpumask_all; - cpumask_t *rcmp = cpumask_nonempty(readers_bind) ? readers_bind : &cpumask_all; - cpumask_t *wcmp = cpumask_nonempty(writers_bind) ? writers_bind : &cpumask_all; + cpumask_t *rcmp = cpumask_nonempty(bind_readers) ? bind_readers : &cpumask_all; + cpumask_t *wcmp = cpumask_nonempty(bind_writers) ? bind_writers : &cpumask_all; cpumask_setall(&cpumask_all); pr_alert("%s" TORTURE_FLAG - "--- %s%s: acq_writer_lim=%d call_rcu_chains=%d long_hold=%d nested_locks=%d nreaders_stress=%d nwriters_stress=%d onoff_holdoff=%d onoff_interval=%d rt_boost=%d rt_boost_factor=%d shuffle_interval=%d shutdown_secs=%d stat_interval=%d stutter=%d verbose=%d writer_fifo=%d readers_bind=%*pbl writers_bind=%*pbl\n", + "--- %s%s: acq_writer_lim=%d bind_readers=%*pbl bind_writers=%*pbl call_rcu_chains=%d long_hold=%d nested_locks=%d nreaders_stress=%d nwriters_stress=%d onoff_holdoff=%d onoff_interval=%d rt_boost=%d rt_boost_factor=%d shuffle_interval=%d shutdown_secs=%d stat_interval=%d stutter=%d verbose=%d writer_fifo=%d\n", torture_type, tag, cxt.debug_lock ? " [debug]": "", - acq_writer_lim, call_rcu_chains, long_hold, nested_locks, cxt.nrealreaders_stress, + acq_writer_lim, cpumask_pr_args(rcmp), cpumask_pr_args(wcmp), + call_rcu_chains, long_hold, nested_locks, cxt.nrealreaders_stress, cxt.nrealwriters_stress, onoff_holdoff, onoff_interval, rt_boost, rt_boost_factor, shuffle_interval, shutdown_secs, stat_interval, stutter, - verbose, writer_fifo, - cpumask_pr_args(rcmp), cpumask_pr_args(wcmp)); + verbose, writer_fifo); } // If requested, maintain call_rcu() chains to keep a grace period always @@ -1356,8 +1356,8 @@ static int __init lock_torture_init(void) writer_fifo ? sched_set_fifo : NULL); if (torture_init_error(firsterr)) goto unwind; - if (cpumask_nonempty(writers_bind)) - torture_sched_setaffinity(writer_tasks[i]->pid, writers_bind); + if (cpumask_nonempty(bind_writers)) + torture_sched_setaffinity(writer_tasks[i]->pid, bind_writers); create_reader: if (cxt.cur_ops->readlock == NULL || (j >= cxt.nrealreaders_stress)) @@ -1367,8 +1367,8 @@ static int __init lock_torture_init(void) reader_tasks[j]); if (torture_init_error(firsterr)) goto unwind; - if (cpumask_nonempty(readers_bind)) - torture_sched_setaffinity(reader_tasks[j]->pid, readers_bind); + if (cpumask_nonempty(bind_readers)) + torture_sched_setaffinity(reader_tasks[j]->pid, bind_readers); } if (stat_interval > 0) { firsterr = torture_create_kthread(lock_torture_stats, NULL, From 30639bfdac3e7a7383b6133d8ea9a55e397715d8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 23 Aug 2023 12:50:10 -0700 Subject: [PATCH 41/56] torture: Add kvm.sh --debug-info argument This commit adds a --debug-info argument to kvm.sh in order to ease interpretation of addresses printed on the console and the like. This argument also disables KASLR. Signed-off-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker --- tools/testing/selftests/rcutorture/bin/kvm.sh | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index b0f36a638a69..7af73ddc148d 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -49,6 +49,7 @@ TORTURE_SHUTDOWN_GRACE=180 TORTURE_SUITE=rcu TORTURE_MOD=rcutorture TORTURE_TRUST_MAKE="" +debuginfo="CONFIG_DEBUG_INFO_NONE=n CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y" resdir="" configs="" cpus=0 @@ -68,6 +69,7 @@ usage () { echo " --cpus N" echo " --datestamp string" echo " --defconfig string" + echo " --debug-info" echo " --dryrun batches|scenarios|sched|script" echo " --duration minutes | s | h | d" echo " --gdb" @@ -135,6 +137,15 @@ do ds=$2 shift ;; + --debug-info|--debuginfo) + if test -z "$TORTURE_KCONFIG_KCSAN_ARG" && test -z "$TORTURE_BOOT_GDB_ARG" + then + TORTURE_KCONFIG_KCSAN_ARG="$debuginfo"; export TORTURE_KCONFIG_KCSAN_ARG + TORTURE_BOOT_GDB_ARG="nokaslr"; export TORTURE_BOOT_GDB_ARG + else + echo "Ignored redundant --debug-info (implied by --kcsan &c)" + fi + ;; --defconfig) checkarg --defconfig "defconfigtype" "$#" "$2" '^[^/][^/]*$' '^--' TORTURE_DEFCONFIG=$2 @@ -163,7 +174,7 @@ do shift ;; --gdb) - TORTURE_KCONFIG_GDB_ARG="CONFIG_DEBUG_INFO_NONE=n CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y"; export TORTURE_KCONFIG_GDB_ARG + TORTURE_KCONFIG_GDB_ARG="$debuginfo"; export TORTURE_KCONFIG_GDB_ARG TORTURE_BOOT_GDB_ARG="nokaslr"; export TORTURE_BOOT_GDB_ARG TORTURE_QEMU_GDB_ARG="-s -S"; export TORTURE_QEMU_GDB_ARG ;; @@ -179,7 +190,7 @@ do shift ;; --kasan) - TORTURE_KCONFIG_KASAN_ARG="CONFIG_DEBUG_INFO_NONE=n CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_KASAN=y"; export TORTURE_KCONFIG_KASAN_ARG + TORTURE_KCONFIG_KASAN_ARG="$debuginfo CONFIG_KASAN=y"; export TORTURE_KCONFIG_KASAN_ARG if test -n "$torture_qemu_mem_default" then TORTURE_QEMU_MEM=2G @@ -191,7 +202,7 @@ do shift ;; --kcsan) - TORTURE_KCONFIG_KCSAN_ARG="CONFIG_DEBUG_INFO_NONE=n CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_KCSAN=y CONFIG_KCSAN_STRICT=y CONFIG_KCSAN_REPORT_ONCE_IN_MS=100000 CONFIG_KCSAN_VERBOSE=y CONFIG_DEBUG_LOCK_ALLOC=y CONFIG_PROVE_LOCKING=y"; export TORTURE_KCONFIG_KCSAN_ARG + TORTURE_KCONFIG_KCSAN_ARG="$debuginfo CONFIG_KCSAN=y CONFIG_KCSAN_STRICT=y CONFIG_KCSAN_REPORT_ONCE_IN_MS=100000 CONFIG_KCSAN_VERBOSE=y CONFIG_DEBUG_LOCK_ALLOC=y CONFIG_PROVE_LOCKING=y"; export TORTURE_KCONFIG_KCSAN_ARG ;; --kmake-arg|--kmake-args) checkarg --kmake-arg "(kernel make arguments)" $# "$2" '.*' '^error$' From 66bcb1321b104cce9c767087656e16883f17dfde Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Wed, 16 Aug 2023 20:49:12 +0000 Subject: [PATCH 42/56] rcutorture: Replace schedule_timeout*() 1-jiffy waits with HZ/20 In the past, spinning on schedule_timeout* with a wait of 1 jiffy has hung the kernel. See for example d52d3a2bf408 ("torture: Fix hang during kthread shutdown phase"). This issue recently recurred in torture's stutter code. The result is that the function instantly returns and never goes to sleep, preempting whatever might otherwise make useful forward progress. To prevent future issues, apply the commit-d52d3a2bf408 fix throughout rcutorture, moving from a 1-jiffy wait to a 50-millisecond wait. Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker --- kernel/rcu/rcutorture.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 7e82fb887d09..8136fec0310b 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1149,7 +1149,7 @@ static int rcu_torture_boost(void *arg) mutex_unlock(&boost_mutex); break; } - schedule_timeout_uninterruptible(1); + schedule_timeout_uninterruptible(HZ / 20); } /* Go do the stutter. */ @@ -1160,7 +1160,7 @@ checkwait: if (stutter_wait("rcu_torture_boost")) /* Clean up and exit. */ while (!kthread_should_stop()) { torture_shutdown_absorb("rcu_torture_boost"); - schedule_timeout_uninterruptible(1); + schedule_timeout_uninterruptible(HZ / 20); } torture_kthread_stopping("rcu_torture_boost"); return 0; @@ -1183,7 +1183,7 @@ rcu_torture_fqs(void *arg) fqs_resume_time = jiffies + fqs_stutter * HZ; while (time_before(jiffies, fqs_resume_time) && !kthread_should_stop()) { - schedule_timeout_interruptible(1); + schedule_timeout_interruptible(HZ / 20); } fqs_burst_remaining = fqs_duration; while (fqs_burst_remaining > 0 && @@ -2899,7 +2899,7 @@ static int rcu_torture_fwd_prog(void *args) WRITE_ONCE(rcu_fwd_seq, rcu_fwd_seq + 1); } else { while (READ_ONCE(rcu_fwd_seq) == oldseq && !torture_must_stop()) - schedule_timeout_interruptible(1); + schedule_timeout_interruptible(HZ / 20); oldseq = READ_ONCE(rcu_fwd_seq); } pr_alert("%s: Starting forward-progress test %d\n", __func__, rfp->rcu_fwd_id); @@ -3200,7 +3200,7 @@ static int rcu_torture_read_exit_child(void *trsp_in) set_user_nice(current, MAX_NICE); // Minimize time between reading and exiting. while (!kthread_should_stop()) - schedule_timeout_uninterruptible(1); + schedule_timeout_uninterruptible(HZ / 20); (void)rcu_torture_one_read(trsp, -1); return 0; } @@ -3248,7 +3248,7 @@ static int rcu_torture_read_exit(void *unused) smp_mb(); // Store before wakeup. wake_up(&read_exit_wq); while (!torture_must_stop()) - schedule_timeout_uninterruptible(1); + schedule_timeout_uninterruptible(HZ / 20); torture_kthread_stopping("rcu_torture_read_exit"); return 0; } From 771a92b85a388b751bb9473300ea24c53f54387e Mon Sep 17 00:00:00 2001 From: Zqiang Date: Thu, 24 Aug 2023 16:42:06 +0800 Subject: [PATCH 43/56] rcutorture: Traverse possible cpu to set maxcpu in rcu_nocb_toggle() Currently, the maxcpu is set by traversing online CPUs, however, if the rcutorture.onoff_holdoff is set zero and onoff_interval is set non-zero, and the some CPUs with larger cpuid has been offline before setting maxcpu, for these CPUs, even if they are online again, also cannot be offload or deoffload. This can result in rcutorture attempting to (de-)offload CPUs that have never been online, but the (de-)offload code handles this. This commit therefore use for_each_possible_cpu() instead of for_each_online_cpu() in rcu_nocb_toggle(). Signed-off-by: Zqiang Reviewed-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker --- kernel/rcu/rcutorture.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 8136fec0310b..b17ad45cd67e 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -2126,7 +2126,7 @@ static int rcu_nocb_toggle(void *arg) VERBOSE_TOROUT_STRING("rcu_nocb_toggle task started"); while (!rcu_inkernel_boot_has_ended()) schedule_timeout_interruptible(HZ / 10); - for_each_online_cpu(cpu) + for_each_possible_cpu(cpu) maxcpu = cpu; WARN_ON(maxcpu < 0); if (toggle_interval > ULONG_MAX) From 3e9b009c168e2448ab16f98f10045360b5b41816 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 8 Sep 2023 05:13:18 -0700 Subject: [PATCH 44/56] torture: Convert parse-console.sh to mktemp This commit does the long-overdue conversion of the parse-console.sh file to use mktemp to create its temporary directory. Signed-off-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker --- tools/testing/selftests/rcutorture/bin/parse-console.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/rcutorture/bin/parse-console.sh b/tools/testing/selftests/rcutorture/bin/parse-console.sh index e3d2f69ec0fb..b07c11cf6929 100755 --- a/tools/testing/selftests/rcutorture/bin/parse-console.sh +++ b/tools/testing/selftests/rcutorture/bin/parse-console.sh @@ -11,7 +11,7 @@ # # Authors: Paul E. McKenney -T=${TMPDIR-/tmp}/parse-console.sh.$$ +T="`mktemp -d ${TMPDIR-/tmp}/parse-console.sh.XXXXXX`" file="$1" title="$2" From d8d5b7bf6f2105883bbd91bbd4d5b67e4e3dff71 Mon Sep 17 00:00:00 2001 From: Denis Arefev Date: Mon, 4 Sep 2023 15:21:14 +0300 Subject: [PATCH 45/56] srcu: Fix srcu_struct node grpmask overflow on 64-bit systems The value of a bitwise expression 1 << (cpu - sdp->mynode->grplo) is subject to overflow due to a failure to cast operands to a larger data type before performing the bitwise operation. The maximum result of this subtraction is defined by the RCU_FANOUT_LEAF Kconfig option, which on 64-bit systems defaults to 16 (resulting in a maximum shift of 15), but which can be set up as high as 64 (resulting in a maximum shift of 63). A value of 31 can result in sign extension, resulting in 0xffffffff80000000 instead of the desired 0x80000000. A value of 32 or greater triggers undefined behavior per the C standard. This bug has not been known to cause issues because almost all kernels take the default CONFIG_RCU_FANOUT_LEAF=16. Furthermore, as long as a given compiler gives a deterministic non-zero result for 1<=32, the code correctly invokes all SRCU callbacks, albeit wasting CPU time along the way. This commit therefore substitutes the correct 1UL for the buggy 1. Found by Linux Verification Center (linuxtesting.org) with SVACE. Signed-off-by: Denis Arefev Reviewed-by: Mathieu Desnoyers Reviewed-by: Joel Fernandes (Google) Cc: David Laight Signed-off-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker --- kernel/rcu/srcutree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 833a8f848a90..5602042856b1 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -223,7 +223,7 @@ static bool init_srcu_struct_nodes(struct srcu_struct *ssp, gfp_t gfp_flags) snp->grplo = cpu; snp->grphi = cpu; } - sdp->grpmask = 1 << (cpu - sdp->mynode->grplo); + sdp->grpmask = 1UL << (cpu - sdp->mynode->grplo); } smp_store_release(&ssp->srcu_sup->srcu_size_state, SRCU_SIZE_WAIT_BARRIER); return true; @@ -835,7 +835,7 @@ static void srcu_schedule_cbs_snp(struct srcu_struct *ssp, struct srcu_node *snp int cpu; for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) { - if (!(mask & (1 << (cpu - snp->grplo)))) + if (!(mask & (1UL << (cpu - snp->grplo)))) continue; srcu_schedule_cbs_sdp(per_cpu_ptr(ssp->sda, cpu), delay); } From 5f98fd034ca6fd1ab8c91a3488968a0e9caaabf6 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Sat, 30 Sep 2023 17:46:56 +0000 Subject: [PATCH 46/56] rcu: kmemleak: Ignore kmemleak false positives when RCU-freeing objects Since the actual slab freeing is deferred when calling kvfree_rcu(), so is the kmemleak_free() callback informing kmemleak of the object deletion. From the perspective of the kvfree_rcu() caller, the object is freed and it may remove any references to it. Since kmemleak does not scan RCU internal data storing the pointer, it will report such objects as leaks during the grace period. Tell kmemleak to ignore such objects on the kvfree_call_rcu() path. Note that the tiny RCU implementation does not have such issue since the objects can be tracked from the rcu_ctrlblk structure. Signed-off-by: Catalin Marinas Reported-by: Christoph Paasch Closes: https://lore.kernel.org/all/F903A825-F05F-4B77-A2B5-7356282FBA2C@apple.com/ Cc: Tested-by: Christoph Paasch Reviewed-by: Paul E. McKenney Signed-off-by: Joel Fernandes (Google) Signed-off-by: Frederic Weisbecker --- kernel/rcu/tree.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a83ecab77917..4dd7df30df31 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -3389,6 +3390,14 @@ void kvfree_call_rcu(struct rcu_head *head, void *ptr) success = true; } + /* + * The kvfree_rcu() caller considers the pointer freed at this point + * and likely removes any references to it. Since the actual slab + * freeing (and kmemleak_free()) is deferred, tell kmemleak to ignore + * this object (no scanning or false positives reporting). + */ + kmemleak_ignore(ptr); + // Set timer to drain after KFREE_DRAIN_JIFFIES. if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING) schedule_delayed_monitor_work(krcp); From 7df2a2a024145d0dcc1e51dc378c527000b35b07 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 8 Sep 2023 22:35:54 +0200 Subject: [PATCH 47/56] rcu: Use rcu_segcblist_segempty() instead of open coding it This makes the code more readable. Reviewed-by: Qiuxu Zhuo Reviewed-by: Joel Fernandes (Google) Reviewed-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker --- kernel/rcu/rcu_segcblist.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index f71fac422c8f..1693ea22ef1b 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c @@ -368,7 +368,7 @@ bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp, smp_mb(); /* Ensure counts are updated before callback is entrained. */ rhp->next = NULL; for (i = RCU_NEXT_TAIL; i > RCU_DONE_TAIL; i--) - if (rsclp->tails[i] != rsclp->tails[i - 1]) + if (!rcu_segcblist_segempty(rsclp, i)) break; rcu_segcblist_inc_seglen(rsclp, i); WRITE_ONCE(*rsclp->tails[i], rhp); @@ -551,7 +551,7 @@ bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq) * as their ->gp_seq[] grace-period completion sequence number. */ for (i = RCU_NEXT_READY_TAIL; i > RCU_DONE_TAIL; i--) - if (rsclp->tails[i] != rsclp->tails[i - 1] && + if (!rcu_segcblist_segempty(rsclp, i) && ULONG_CMP_LT(rsclp->gp_seq[i], seq)) break; From 358662a9616c5078dc4d389d6bceeb5974f4aa97 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 8 Sep 2023 22:35:58 +0200 Subject: [PATCH 48/56] rcu: Assume IRQS disabled from rcu_report_dead() rcu_report_dead() is the last RCU word from the CPU down through the hotplug path. It is called in the idle loop right before the CPU shuts down for good. Because it removes the CPU from the grace period state machine and reports an ultimate quiescent state if necessary, no further use of RCU is allowed. Therefore it is expected that IRQs are disabled upon calling this function and are not to be re-enabled again until the CPU shuts down. Remove the IRQs disablement from that function and verify instead that it is actually called with IRQs disabled as it is expected at that special point in the idle path. Reviewed-by: Joel Fernandes (Google) Reviewed-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker --- kernel/rcu/tree.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 4dd7df30df31..8c2954502e55 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4562,11 +4562,16 @@ void rcu_cpu_starting(unsigned int cpu) */ void rcu_report_dead(unsigned int cpu) { - unsigned long flags, seq_flags; + unsigned long flags; unsigned long mask; struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ + /* + * IRQS must be disabled from now on and until the CPU dies, or an interrupt + * may introduce a new READ-side while it is actually off the QS masks. + */ + lockdep_assert_irqs_disabled(); // Do any dangling deferred wakeups. do_nocb_deferred_wakeup(rdp); @@ -4574,7 +4579,6 @@ void rcu_report_dead(unsigned int cpu) /* Remove outgoing CPU from mask in the leaf rcu_node structure. */ mask = rdp->grpmask; - local_irq_save(seq_flags); arch_spin_lock(&rcu_state.ofl_lock); raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */ rdp->rcu_ofl_gp_seq = READ_ONCE(rcu_state.gp_seq); @@ -4588,8 +4592,6 @@ void rcu_report_dead(unsigned int cpu) WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext & ~mask); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); arch_spin_unlock(&rcu_state.ofl_lock); - local_irq_restore(seq_flags); - rdp->cpu_started = false; } From c964c1f5ee96e1460606d44f80a47bdacd8fe568 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 8 Sep 2023 22:35:59 +0200 Subject: [PATCH 49/56] rcu: Assume rcu_report_dead() is always called locally rcu_report_dead() has to be called locally by the CPU that is going to exit the RCU state machine. Passing a cpu argument here is error-prone and leaves the possibility for a racy remote call. Use local access instead. Reviewed-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker --- arch/arm64/kernel/smp.c | 2 +- include/linux/rcupdate.h | 2 +- kernel/cpu.c | 2 +- kernel/rcu/tree.c | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index 960b98b43506..8fa646c90c67 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -401,7 +401,7 @@ void __noreturn cpu_die_early(void) /* Mark this CPU absent */ set_cpu_present(cpu, 0); - rcu_report_dead(cpu); + rcu_report_dead(); if (IS_ENABLED(CONFIG_HOTPLUG_CPU)) { update_cpu_boot_status(CPU_KILL_ME); diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 5e5f920ade90..aa351ddcbe8d 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -122,7 +122,7 @@ static inline void call_rcu_hurry(struct rcu_head *head, rcu_callback_t func) void rcu_init(void); extern int rcu_scheduler_active; void rcu_sched_clock_irq(int user); -void rcu_report_dead(unsigned int cpu); +void rcu_report_dead(void); void rcutree_migrate_callbacks(int cpu); #ifdef CONFIG_TASKS_RCU_GENERIC diff --git a/kernel/cpu.c b/kernel/cpu.c index 6de7c6bb74ee..076e75fed8bb 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1388,7 +1388,7 @@ void cpuhp_report_idle_dead(void) struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state); BUG_ON(st->state != CPUHP_AP_OFFLINE); - rcu_report_dead(smp_processor_id()); + rcu_report_dead(); st->state = CPUHP_AP_IDLE_DEAD; /* * We cannot call complete after rcu_report_dead() so we delegate it diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 8c2954502e55..2e1e7eadf2cc 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4560,11 +4560,11 @@ void rcu_cpu_starting(unsigned int cpu) * from the outgoing CPU rather than from the cpuhp_step mechanism. * This is because this function must be invoked at a precise location. */ -void rcu_report_dead(unsigned int cpu) +void rcu_report_dead(void) { unsigned long flags; unsigned long mask; - struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + struct rcu_data *rdp = this_cpu_ptr(&rcu_data); struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ /* From a9930e85290ef0a3ebefc90b30638722db3a947c Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 8 Sep 2023 22:36:02 +0200 Subject: [PATCH 50/56] rcu: Remove references to rcu_migrate_callbacks() from diagrams This function is gone since: 53b46303da84 (rcu: Remove rsp parameter from rcu_boot_init_percpu_data() and friends) Reviewed-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker --- .../Design/Memory-Ordering/TreeRCU-callback-registry.svg | 9 --------- Documentation/RCU/Design/Memory-Ordering/TreeRCU-gp.svg | 9 --------- 2 files changed, 18 deletions(-) diff --git a/Documentation/RCU/Design/Memory-Ordering/TreeRCU-callback-registry.svg b/Documentation/RCU/Design/Memory-Ordering/TreeRCU-callback-registry.svg index 7ac6f9269806..63eff867175a 100644 --- a/Documentation/RCU/Design/Memory-Ordering/TreeRCU-callback-registry.svg +++ b/Documentation/RCU/Design/Memory-Ordering/TreeRCU-callback-registry.svg @@ -564,15 +564,6 @@ font-size="192" id="text202-7-9-6" style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">rcutree_migrate_callbacks() - rcu_migrate_callbacks() rcutree_migrate_callbacks() - rcu_migrate_callbacks() Date: Fri, 8 Sep 2023 22:36:00 +0200 Subject: [PATCH 51/56] rcu: Conditionally build CPU-hotplug teardown callbacks Among the three CPU-hotplug teardown RCU callbacks, two of them early exit if CONFIG_HOTPLUG_CPU=n, and one is left unchanged. In any case all of them have an implementation when CONFIG_HOTPLUG_CPU=n. Align instead with the common way to deal with CPU-hotplug teardown callbacks and provide a proper stub when they are not supported. Reviewed-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker --- include/linux/rcutree.h | 13 +++-- kernel/rcu/tree.c | 114 +++++++++++++++++++--------------------- 2 files changed, 64 insertions(+), 63 deletions(-) diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 153cfc7bbffd..46875c4e9f56 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -110,9 +110,16 @@ void rcu_all_qs(void); /* RCUtree hotplug events */ int rcutree_prepare_cpu(unsigned int cpu); int rcutree_online_cpu(unsigned int cpu); -int rcutree_offline_cpu(unsigned int cpu); -int rcutree_dead_cpu(unsigned int cpu); -int rcutree_dying_cpu(unsigned int cpu); void rcu_cpu_starting(unsigned int cpu); +#ifdef CONFIG_HOTPLUG_CPU +int rcutree_dead_cpu(unsigned int cpu); +int rcutree_dying_cpu(unsigned int cpu); +int rcutree_offline_cpu(unsigned int cpu); +#else +#define rcutree_dead_cpu NULL +#define rcutree_dying_cpu NULL +#define rcutree_offline_cpu NULL +#endif + #endif /* __LINUX_RCUTREE_H */ diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 2e1e7eadf2cc..f9c6b2680cbb 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4237,25 +4237,6 @@ static bool rcu_init_invoked(void) return !!rcu_state.n_online_cpus; } -/* - * Near the end of the offline process. Trace the fact that this CPU - * is going offline. - */ -int rcutree_dying_cpu(unsigned int cpu) -{ - bool blkd; - struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); - struct rcu_node *rnp = rdp->mynode; - - if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) - return 0; - - blkd = !!(READ_ONCE(rnp->qsmask) & rdp->grpmask); - trace_rcu_grace_period(rcu_state.name, READ_ONCE(rnp->gp_seq), - blkd ? TPS("cpuofl-bgp") : TPS("cpuofl")); - return 0; -} - /* * All CPUs for the specified rcu_node structure have gone offline, * and all tasks that were preempted within an RCU read-side critical @@ -4301,23 +4282,6 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) } } -/* - * The CPU has been completely removed, and some other CPU is reporting - * this fact from process context. Do the remainder of the cleanup. - * There can only be one CPU hotplug operation at a time, so no need for - * explicit locking. - */ -int rcutree_dead_cpu(unsigned int cpu) -{ - if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) - return 0; - - WRITE_ONCE(rcu_state.n_online_cpus, rcu_state.n_online_cpus - 1); - // Stop-machine done, so allow nohz_full to disable tick. - tick_dep_clear(TICK_DEP_BIT_RCU); - return 0; -} - /* * Propagate ->qsinitmask bits up the rcu_node tree to account for the * first CPU in a given leaf rcu_node structure coming online. The caller @@ -4470,29 +4434,6 @@ int rcutree_online_cpu(unsigned int cpu) return 0; } -/* - * Near the beginning of the process. The CPU is still very much alive - * with pretty much all services enabled. - */ -int rcutree_offline_cpu(unsigned int cpu) -{ - unsigned long flags; - struct rcu_data *rdp; - struct rcu_node *rnp; - - rdp = per_cpu_ptr(&rcu_data, cpu); - rnp = rdp->mynode; - raw_spin_lock_irqsave_rcu_node(rnp, flags); - rnp->ffmask &= ~rdp->grpmask; - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - - rcutree_affinity_setting(cpu, cpu); - - // nohz_full CPUs need the tick for stop-machine to work quickly - tick_dep_set(TICK_DEP_BIT_RCU); - return 0; -} - /* * Mark the specified CPU as being online so that subsequent grace periods * (both expedited and normal) will wait on it. Note that this means that @@ -4646,7 +4587,60 @@ void rcutree_migrate_callbacks(int cpu) cpu, rcu_segcblist_n_cbs(&rdp->cblist), rcu_segcblist_first_cb(&rdp->cblist)); } -#endif + +/* + * The CPU has been completely removed, and some other CPU is reporting + * this fact from process context. Do the remainder of the cleanup. + * There can only be one CPU hotplug operation at a time, so no need for + * explicit locking. + */ +int rcutree_dead_cpu(unsigned int cpu) +{ + WRITE_ONCE(rcu_state.n_online_cpus, rcu_state.n_online_cpus - 1); + // Stop-machine done, so allow nohz_full to disable tick. + tick_dep_clear(TICK_DEP_BIT_RCU); + return 0; +} + +/* + * Near the end of the offline process. Trace the fact that this CPU + * is going offline. + */ +int rcutree_dying_cpu(unsigned int cpu) +{ + bool blkd; + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + struct rcu_node *rnp = rdp->mynode; + + blkd = !!(READ_ONCE(rnp->qsmask) & rdp->grpmask); + trace_rcu_grace_period(rcu_state.name, READ_ONCE(rnp->gp_seq), + blkd ? TPS("cpuofl-bgp") : TPS("cpuofl")); + return 0; +} + +/* + * Near the beginning of the process. The CPU is still very much alive + * with pretty much all services enabled. + */ +int rcutree_offline_cpu(unsigned int cpu) +{ + unsigned long flags; + struct rcu_data *rdp; + struct rcu_node *rnp; + + rdp = per_cpu_ptr(&rcu_data, cpu); + rnp = rdp->mynode; + raw_spin_lock_irqsave_rcu_node(rnp, flags); + rnp->ffmask &= ~rdp->grpmask; + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + + rcutree_affinity_setting(cpu, cpu); + + // nohz_full CPUs need the tick for stop-machine to work quickly + tick_dep_set(TICK_DEP_BIT_RCU); + return 0; +} +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ /* * On non-huge systems, use expedited RCU grace periods to make suspend From 448e9f34d91d1a4799fdb06a93c2c24b34b6fd9d Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 8 Sep 2023 22:36:01 +0200 Subject: [PATCH 52/56] rcu: Standardize explicit CPU-hotplug calls rcu_report_dead() and rcutree_migrate_callbacks() have their headers in rcupdate.h while those are pure rcutree calls, like the other CPU-hotplug functions. Also rcu_cpu_starting() and rcu_report_dead() have different naming conventions while they mirror each other's effects. Fix the headers and propose a naming that relates both functions and aligns with the prefix of other rcutree CPU-hotplug functions. Reviewed-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker --- .../Expedited-Grace-Periods.rst | 2 +- .../RCU/Design/Memory-Ordering/TreeRCU-gp-fqs.svg | 4 ++-- .../RCU/Design/Memory-Ordering/TreeRCU-gp.svg | 4 ++-- .../RCU/Design/Memory-Ordering/TreeRCU-hotplug.svg | 4 ++-- .../RCU/Design/Requirements/Requirements.rst | 4 ++-- arch/arm64/kernel/smp.c | 4 ++-- arch/powerpc/kernel/smp.c | 2 +- arch/s390/kernel/smp.c | 2 +- arch/x86/kernel/smpboot.c | 2 +- include/linux/interrupt.h | 2 +- include/linux/rcupdate.h | 2 -- include/linux/rcutiny.h | 2 +- include/linux/rcutree.h | 7 ++++++- kernel/cpu.c | 6 +++--- kernel/rcu/tree.c | 12 ++++++++---- 15 files changed, 33 insertions(+), 26 deletions(-) diff --git a/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.rst b/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.rst index 93d899d53258..414f8a2012d6 100644 --- a/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.rst +++ b/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.rst @@ -181,7 +181,7 @@ operations is carried out at several levels: of this wait (or series of waits, as the case may be) is to permit a concurrent CPU-hotplug operation to complete. #. In the case of RCU-sched, one of the last acts of an outgoing CPU is - to invoke ``rcu_report_dead()``, which reports a quiescent state for + to invoke ``rcutree_report_cpu_dead()``, which reports a quiescent state for that CPU. However, this is likely paranoia-induced redundancy. +-----------------------------------------------------------------------+ diff --git a/Documentation/RCU/Design/Memory-Ordering/TreeRCU-gp-fqs.svg b/Documentation/RCU/Design/Memory-Ordering/TreeRCU-gp-fqs.svg index 7ddc094d7f28..d82a77d03d8c 100644 --- a/Documentation/RCU/Design/Memory-Ordering/TreeRCU-gp-fqs.svg +++ b/Documentation/RCU/Design/Memory-Ordering/TreeRCU-gp-fqs.svg @@ -1135,7 +1135,7 @@ font-weight="bold" font-size="192" id="text202-7-5-3-27-6-5" - style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">rcu_report_dead() + style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">rcutree_report_cpu_dead() rcu_cpu_starting() + xml:space="preserve">rcutree_report_cpu_starting() rcu_report_dead() + style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">rcutree_report_cpu_dead() rcu_cpu_starting() + xml:space="preserve">rcutree_report_cpu_starting() rcu_report_dead() + style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">rcutree_report_cpu_dead() rcu_cpu_starting() + xml:space="preserve">rcutree_report_cpu_starting() setup_cpu) diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index a4edb7ea66ea..214a1b67f80a 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -898,7 +898,7 @@ static void smp_start_secondary(void *cpuvoid) S390_lowcore.restart_flags = 0; restore_access_regs(S390_lowcore.access_regs_save_area); cpu_init(); - rcu_cpu_starting(cpu); + rcutree_report_cpu_starting(cpu); init_cpu_timer(); vtime_init(); vdso_getcpu_init(); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 4e45ff44aa07..4ccb76f89af8 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -288,7 +288,7 @@ static void notrace start_secondary(void *unused) cpu_init(); fpu__init_cpu(); - rcu_cpu_starting(raw_smp_processor_id()); + rcutree_report_cpu_starting(raw_smp_processor_id()); x86_cpuinit.early_percpu_clock_init(); ap_starting(); diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index a92bce40b04b..d05e1e9a553c 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -566,7 +566,7 @@ enum * * _ RCU: * 1) rcutree_migrate_callbacks() migrates the queue. - * 2) rcu_report_dead() reports the final quiescent states. + * 2) rcutree_report_cpu_dead() reports the final quiescent states. * * _ IRQ_POLL: irq_poll_cpu_dead() migrates the queue */ diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index aa351ddcbe8d..f7206b2623c9 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -122,8 +122,6 @@ static inline void call_rcu_hurry(struct rcu_head *head, rcu_callback_t func) void rcu_init(void); extern int rcu_scheduler_active; void rcu_sched_clock_irq(int user); -void rcu_report_dead(void); -void rcutree_migrate_callbacks(int cpu); #ifdef CONFIG_TASKS_RCU_GENERIC void rcu_init_tasks_generic(void); diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 7b949292908a..d9ac7b136aea 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -171,6 +171,6 @@ static inline void rcu_all_qs(void) { barrier(); } #define rcutree_offline_cpu NULL #define rcutree_dead_cpu NULL #define rcutree_dying_cpu NULL -static inline void rcu_cpu_starting(unsigned int cpu) { } +static inline void rcutree_report_cpu_starting(unsigned int cpu) { } #endif /* __LINUX_RCUTINY_H */ diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 46875c4e9f56..254244202ea9 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -110,7 +110,7 @@ void rcu_all_qs(void); /* RCUtree hotplug events */ int rcutree_prepare_cpu(unsigned int cpu); int rcutree_online_cpu(unsigned int cpu); -void rcu_cpu_starting(unsigned int cpu); +void rcutree_report_cpu_starting(unsigned int cpu); #ifdef CONFIG_HOTPLUG_CPU int rcutree_dead_cpu(unsigned int cpu); @@ -122,4 +122,9 @@ int rcutree_offline_cpu(unsigned int cpu); #define rcutree_offline_cpu NULL #endif +void rcutree_migrate_callbacks(int cpu); + +/* Called from hotplug and also arm64 early secondary boot failure */ +void rcutree_report_cpu_dead(void); + #endif /* __LINUX_RCUTREE_H */ diff --git a/kernel/cpu.c b/kernel/cpu.c index 076e75fed8bb..2491766e1fd5 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1388,10 +1388,10 @@ void cpuhp_report_idle_dead(void) struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state); BUG_ON(st->state != CPUHP_AP_OFFLINE); - rcu_report_dead(); + rcutree_report_cpu_dead(); st->state = CPUHP_AP_IDLE_DEAD; /* - * We cannot call complete after rcu_report_dead() so we delegate it + * We cannot call complete after rcutree_report_cpu_dead() so we delegate it * to an online cpu. */ smp_call_function_single(cpumask_first(cpu_online_mask), @@ -1617,7 +1617,7 @@ void notify_cpu_starting(unsigned int cpu) struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); enum cpuhp_state target = min((int)st->target, CPUHP_AP_ONLINE); - rcu_cpu_starting(cpu); /* Enables RCU usage on this CPU. */ + rcutree_report_cpu_starting(cpu); /* Enables RCU usage on this CPU. */ cpumask_set_cpu(cpu, &cpus_booted_once_mask); /* diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index f9c6b2680cbb..36d8818eaec1 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4216,7 +4216,7 @@ bool rcu_lockdep_current_cpu_online(void) rdp = this_cpu_ptr(&rcu_data); /* * Strictly, we care here about the case where the current CPU is - * in rcu_cpu_starting() and thus has an excuse for rdp->grpmask + * in rcutree_report_cpu_starting() and thus has an excuse for rdp->grpmask * not being up to date. So arch_spin_is_locked() might have a * false positive if it's held by some *other* CPU, but that's * OK because that just means a false *negative* on the warning. @@ -4445,8 +4445,10 @@ int rcutree_online_cpu(unsigned int cpu) * from the incoming CPU rather than from the cpuhp_step mechanism. * This is because this function must be invoked at a precise location. * This incoming CPU must not have enabled interrupts yet. + * + * This mirrors the effects of rcutree_report_cpu_dead(). */ -void rcu_cpu_starting(unsigned int cpu) +void rcutree_report_cpu_starting(unsigned int cpu) { unsigned long mask; struct rcu_data *rdp; @@ -4500,8 +4502,10 @@ void rcu_cpu_starting(unsigned int cpu) * Note that this function is special in that it is invoked directly * from the outgoing CPU rather than from the cpuhp_step mechanism. * This is because this function must be invoked at a precise location. + * + * This mirrors the effect of rcutree_report_cpu_starting(). */ -void rcu_report_dead(void) +void rcutree_report_cpu_dead(void) { unsigned long flags; unsigned long mask; @@ -5072,7 +5076,7 @@ void __init rcu_init(void) pm_notifier(rcu_pm_notify, 0); WARN_ON(num_online_cpus() > 1); // Only one CPU this early in boot. rcutree_prepare_cpu(cpu); - rcu_cpu_starting(cpu); + rcutree_report_cpu_starting(cpu); rcutree_online_cpu(cpu); /* Create workqueue for Tree SRCU and for expedited GPs. */ From a28ab03b499601c14b8f502d875c2bee23209659 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 8 Sep 2023 22:36:03 +0200 Subject: [PATCH 53/56] rcu: Comment why callbacks migration can't wait for CPUHP_RCUTREE_PREP The callbacks migration is performed through an explicit call from the hotplug control CPU right after the death of the target CPU and before proceeding with the CPUHP_ teardown functions. This is unusual but necessary and yet uncommented. Summarize the reason as explained in the changelog of: a58163d8ca2c (rcu: Migrate callbacks earlier in the CPU-offline timeline) Reviewed-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker --- kernel/cpu.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/kernel/cpu.c b/kernel/cpu.c index 2491766e1fd5..3b9d5c7eb4a2 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1372,7 +1372,14 @@ static int takedown_cpu(unsigned int cpu) cpuhp_bp_sync_dead(cpu); tick_cleanup_dead_cpu(cpu); + + /* + * Callbacks must be re-integrated right away to the RCU state machine. + * Otherwise an RCU callback could block a further teardown function + * waiting for its completion. + */ rcutree_migrate_callbacks(cpu); + return 0; } From 4a8e65b0c348e42107c64381e692e282900be361 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 4 Oct 2023 01:28:59 +0200 Subject: [PATCH 54/56] srcu: Fix callbacks acceleration mishandling SRCU callbacks acceleration might fail if the preceding callbacks advance also fails. This can happen when the following steps are met: 1) The RCU_WAIT_TAIL segment has callbacks (say for gp_num 8) and the RCU_NEXT_READY_TAIL also has callbacks (say for gp_num 12). 2) The grace period for RCU_WAIT_TAIL is observed as started but not yet completed so rcu_seq_current() returns 4 + SRCU_STATE_SCAN1 = 5. 3) This value is passed to rcu_segcblist_advance() which can't move any segment forward and fails. 4) srcu_gp_start_if_needed() still proceeds with callback acceleration. But then the call to rcu_seq_snap() observes the grace period for the RCU_WAIT_TAIL segment (gp_num 8) as completed and the subsequent one for the RCU_NEXT_READY_TAIL segment as started (ie: 8 + SRCU_STATE_SCAN1 = 9) so it returns a snapshot of the next grace period, which is 16. 5) The value of 16 is passed to rcu_segcblist_accelerate() but the freshly enqueued callback in RCU_NEXT_TAIL can't move to RCU_NEXT_READY_TAIL which already has callbacks for a previous grace period (gp_num = 12). So acceleration fails. 6) Note in all these steps, srcu_invoke_callbacks() hadn't had a chance to run srcu_invoke_callbacks(). Then some very bad outcome may happen if the following happens: 7) Some other CPU races and starts the grace period number 16 before the CPU handling previous steps had a chance. Therefore srcu_gp_start() isn't called on the latter sdp to fix the acceleration leak from previous steps with a new pair of call to advance/accelerate. 8) The grace period 16 completes and srcu_invoke_callbacks() is finally called. All the callbacks from previous grace periods (8 and 12) are correctly advanced and executed but callbacks in RCU_NEXT_READY_TAIL still remain. Then rcu_segcblist_accelerate() is called with a snaphot of 20. 9) Since nothing started the grace period number 20, callbacks stay unhandled. This has been reported in real load: [3144162.608392] INFO: task kworker/136:12:252684 blocked for more than 122 seconds. [3144162.615986] Tainted: G O K 5.4.203-1-tlinux4-0011.1 #1 [3144162.623053] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [3144162.631162] kworker/136:12 D 0 252684 2 0x90004000 [3144162.631189] Workqueue: kvm-irqfd-cleanup irqfd_shutdown [kvm] [3144162.631192] Call Trace: [3144162.631202] __schedule+0x2ee/0x660 [3144162.631206] schedule+0x33/0xa0 [3144162.631209] schedule_timeout+0x1c4/0x340 [3144162.631214] ? update_load_avg+0x82/0x660 [3144162.631217] ? raw_spin_rq_lock_nested+0x1f/0x30 [3144162.631218] wait_for_completion+0x119/0x180 [3144162.631220] ? wake_up_q+0x80/0x80 [3144162.631224] __synchronize_srcu.part.19+0x81/0xb0 [3144162.631226] ? __bpf_trace_rcu_utilization+0x10/0x10 [3144162.631227] synchronize_srcu+0x5f/0xc0 [3144162.631236] irqfd_shutdown+0x3c/0xb0 [kvm] [3144162.631239] ? __schedule+0x2f6/0x660 [3144162.631243] process_one_work+0x19a/0x3a0 [3144162.631244] worker_thread+0x37/0x3a0 [3144162.631247] kthread+0x117/0x140 [3144162.631247] ? process_one_work+0x3a0/0x3a0 [3144162.631248] ? __kthread_cancel_work+0x40/0x40 [3144162.631250] ret_from_fork+0x1f/0x30 Fix this with taking the snapshot for acceleration _before_ the read of the current grace period number. The only side effect of this solution is that callbacks advancing happen then _after_ the full barrier in rcu_seq_snap(). This is not a problem because that barrier only cares about: 1) Ordering accesses of the update side before call_srcu() so they don't bleed. 2) See all the accesses prior to the grace period of the current gp_num The only things callbacks advancing need to be ordered against are carried by snp locking. Reported-by: Yong He Co-developed-by:: Yong He Signed-off-by: Yong He Co-developed-by: Joel Fernandes (Google) Signed-off-by: Joel Fernandes (Google) Co-developed-by: Neeraj upadhyay Signed-off-by: Neeraj upadhyay Link: http://lore.kernel.org/CANZk6aR+CqZaqmMWrC2eRRPY12qAZnDZLwLnHZbNi=xXMB401g@mail.gmail.com Fixes: da915ad5cf25 ("srcu: Parallelize callback handling") Signed-off-by: Frederic Weisbecker --- kernel/rcu/srcutree.c | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 5602042856b1..9fab9ac36996 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -1244,10 +1244,37 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp, spin_lock_irqsave_sdp_contention(sdp, &flags); if (rhp) rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp); + /* + * The snapshot for acceleration must be taken _before_ the read of the + * current gp sequence used for advancing, otherwise advancing may fail + * and acceleration may then fail too. + * + * This could happen if: + * + * 1) The RCU_WAIT_TAIL segment has callbacks (gp_num = X + 4) and the + * RCU_NEXT_READY_TAIL also has callbacks (gp_num = X + 8). + * + * 2) The grace period for RCU_WAIT_TAIL is seen as started but not + * completed so rcu_seq_current() returns X + SRCU_STATE_SCAN1. + * + * 3) This value is passed to rcu_segcblist_advance() which can't move + * any segment forward and fails. + * + * 4) srcu_gp_start_if_needed() still proceeds with callback acceleration. + * But then the call to rcu_seq_snap() observes the grace period for the + * RCU_WAIT_TAIL segment as completed and the subsequent one for the + * RCU_NEXT_READY_TAIL segment as started (ie: X + 4 + SRCU_STATE_SCAN1) + * so it returns a snapshot of the next grace period, which is X + 12. + * + * 5) The value of X + 12 is passed to rcu_segcblist_accelerate() but the + * freshly enqueued callback in RCU_NEXT_TAIL can't move to + * RCU_NEXT_READY_TAIL which already has callbacks for a previous grace + * period (gp_num = X + 8). So acceleration fails. + */ + s = rcu_seq_snap(&ssp->srcu_sup->srcu_gp_seq); rcu_segcblist_advance(&sdp->srcu_cblist, rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq)); - s = rcu_seq_snap(&ssp->srcu_sup->srcu_gp_seq); - (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, s); + WARN_ON_ONCE(!rcu_segcblist_accelerate(&sdp->srcu_cblist, s) && rhp); if (ULONG_CMP_LT(sdp->srcu_gp_seq_needed, s)) { sdp->srcu_gp_seq_needed = s; needgp = true; From cefe8ce559b5ff301f59b44f97c5c313ec30c643 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 28 Sep 2023 10:06:11 +0300 Subject: [PATCH 55/56] locktorture: Check the correct variable for allocation failure There is a typo so this checks the wrong variable. "chains" plural vs "chain" singular. We already know that "chains" is non-zero. Fixes: 7f993623e9eb ("locktorture: Add call_rcu_chains module parameter") Signed-off-by: Dan Carpenter Reviewed-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker --- kernel/locking/locktorture.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index a3abcd136f56..69d3cd2cfc3b 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -1075,7 +1075,7 @@ static int call_rcu_chain_init(void) if (call_rcu_chains <= 0) return 0; call_rcu_chain = kcalloc(call_rcu_chains, sizeof(*call_rcu_chain), GFP_KERNEL); - if (!call_rcu_chains) + if (!call_rcu_chain) return -ENOMEM; for (i = 0; i < call_rcu_chains; i++) { call_rcu_chain[i].crc_stop = false; From 8a77f38bcd28d3c22ab7dd8eff3f299d43c00411 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 4 Oct 2023 01:29:00 +0200 Subject: [PATCH 56/56] srcu: Only accelerate on enqueue time Acceleration in SRCU happens on enqueue time for each new callback. This operation is expected not to fail and therefore any similar attempt from other places shouldn't find any remaining callbacks to accelerate. Moreover accelerations performed beyond enqueue time are error prone because rcu_seq_snap() then may return the snapshot for a new grace period that is not going to be started. Remove these dangerous and needless accelerations and introduce instead assertions reporting leaking unaccelerated callbacks beyond enqueue time. Co-developed-by: Yong He Signed-off-by: Yong He Co-developed-by: Joel Fernandes (Google) Signed-off-by: Joel Fernandes (Google) Co-developed-by: Neeraj upadhyay Signed-off-by: Neeraj upadhyay Reviewed-by: Like Xu Signed-off-by: Frederic Weisbecker --- kernel/rcu/srcutree.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 9fab9ac36996..560e99ec5333 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -784,8 +784,7 @@ static void srcu_gp_start(struct srcu_struct *ssp) spin_lock_rcu_node(sdp); /* Interrupts already disabled. */ rcu_segcblist_advance(&sdp->srcu_cblist, rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq)); - (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, - rcu_seq_snap(&ssp->srcu_sup->srcu_gp_seq)); + WARN_ON_ONCE(!rcu_segcblist_segempty(&sdp->srcu_cblist, RCU_NEXT_TAIL)); spin_unlock_rcu_node(sdp); /* Interrupts remain disabled. */ WRITE_ONCE(ssp->srcu_sup->srcu_gp_start, jiffies); WRITE_ONCE(ssp->srcu_sup->srcu_n_exp_nodelay, 0); @@ -1721,6 +1720,7 @@ static void srcu_invoke_callbacks(struct work_struct *work) ssp = sdp->ssp; rcu_cblist_init(&ready_cbs); spin_lock_irq_rcu_node(sdp); + WARN_ON_ONCE(!rcu_segcblist_segempty(&sdp->srcu_cblist, RCU_NEXT_TAIL)); rcu_segcblist_advance(&sdp->srcu_cblist, rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq)); if (sdp->srcu_cblist_invoking || @@ -1750,8 +1750,6 @@ static void srcu_invoke_callbacks(struct work_struct *work) */ spin_lock_irq_rcu_node(sdp); rcu_segcblist_add_len(&sdp->srcu_cblist, -len); - (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, - rcu_seq_snap(&ssp->srcu_sup->srcu_gp_seq)); sdp->srcu_cblist_invoking = false; more = rcu_segcblist_ready_cbs(&sdp->srcu_cblist); spin_unlock_irq_rcu_node(sdp);