From b58e733fd774f3f4b49d9e7640d172a57e35200e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 15 Jun 2020 18:24:27 +0200 Subject: rcu: Fixup noinstr warnings A KCSAN build revealed we have explicit annoations through atomic_*() usage, switch to arch_atomic_*() for the respective functions. vmlinux.o: warning: objtool: rcu_nmi_exit()+0x4d: call to __kcsan_check_access() leaves .noinstr.text section vmlinux.o: warning: objtool: rcu_dynticks_eqs_enter()+0x25: call to __kcsan_check_access() leaves .noinstr.text section vmlinux.o: warning: objtool: rcu_nmi_enter()+0x4f: call to __kcsan_check_access() leaves .noinstr.text section vmlinux.o: warning: objtool: rcu_dynticks_eqs_exit()+0x2a: call to __kcsan_check_access() leaves .noinstr.text section vmlinux.o: warning: objtool: __rcu_is_watching()+0x25: call to __kcsan_check_access() leaves .noinstr.text section Additionally, without the NOP in instrumentation_begin(), objtool would not detect the lack of the 'else instrumentation_begin();' branch in rcu_nmi_enter(). Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 32 +++++++++++++++++++++++++------- 1 file changed, 25 insertions(+), 7 deletions(-) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index c716eadc7617..6c6569e0586c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -250,7 +250,7 @@ static noinstr void rcu_dynticks_eqs_enter(void) * next idle sojourn. */ rcu_dynticks_task_trace_enter(); // Before ->dynticks update! - seq = atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks); + seq = arch_atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks); // RCU is no longer watching. Better be in extended quiescent state! WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && (seq & RCU_DYNTICK_CTRL_CTR)); @@ -274,13 +274,13 @@ static noinstr void rcu_dynticks_eqs_exit(void) * and we also must force ordering with the next RCU read-side * critical section. */ - seq = atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks); + seq = arch_atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks); // RCU is now watching. Better not be in an extended quiescent state! rcu_dynticks_task_trace_exit(); // After ->dynticks update! WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !(seq & RCU_DYNTICK_CTRL_CTR)); if (seq & RCU_DYNTICK_CTRL_MASK) { - atomic_andnot(RCU_DYNTICK_CTRL_MASK, &rdp->dynticks); + arch_atomic_andnot(RCU_DYNTICK_CTRL_MASK, &rdp->dynticks); smp_mb__after_atomic(); /* _exit after clearing mask. */ } } @@ -313,7 +313,7 @@ static __always_inline bool rcu_dynticks_curr_cpu_in_eqs(void) { struct rcu_data *rdp = this_cpu_ptr(&rcu_data); - return !(atomic_read(&rdp->dynticks) & RCU_DYNTICK_CTRL_CTR); + return !(arch_atomic_read(&rdp->dynticks) & RCU_DYNTICK_CTRL_CTR); } /* @@ -633,6 +633,10 @@ static noinstr void rcu_eqs_enter(bool user) do_nocb_deferred_wakeup(rdp); rcu_prepare_for_idle(); rcu_preempt_deferred_qs(current); + + // instrumentation for the noinstr rcu_dynticks_eqs_enter() + instrument_atomic_write(&rdp->dynticks, sizeof(rdp->dynticks)); + instrumentation_end(); WRITE_ONCE(rdp->dynticks_nesting, 0); /* Avoid irq-access tearing. */ // RCU is watching here ... @@ -692,6 +696,7 @@ noinstr void rcu_nmi_exit(void) { struct rcu_data *rdp = this_cpu_ptr(&rcu_data); + instrumentation_begin(); /* * Check for ->dynticks_nmi_nesting underflow and bad ->dynticks. 
* (We are exiting an NMI handler, so RCU better be paying attention @@ -705,7 +710,6 @@ noinstr void rcu_nmi_exit(void) * leave it in non-RCU-idle state. */ if (rdp->dynticks_nmi_nesting != 1) { - instrumentation_begin(); trace_rcu_dyntick(TPS("--="), rdp->dynticks_nmi_nesting, rdp->dynticks_nmi_nesting - 2, atomic_read(&rdp->dynticks)); WRITE_ONCE(rdp->dynticks_nmi_nesting, /* No store tearing. */ @@ -714,13 +718,15 @@ noinstr void rcu_nmi_exit(void) return; } - instrumentation_begin(); /* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */ trace_rcu_dyntick(TPS("Startirq"), rdp->dynticks_nmi_nesting, 0, atomic_read(&rdp->dynticks)); WRITE_ONCE(rdp->dynticks_nmi_nesting, 0); /* Avoid store tearing. */ if (!in_nmi()) rcu_prepare_for_idle(); + + // instrumentation for the noinstr rcu_dynticks_eqs_enter() + instrument_atomic_write(&rdp->dynticks, sizeof(rdp->dynticks)); instrumentation_end(); // RCU is watching here ... @@ -838,6 +844,10 @@ static void noinstr rcu_eqs_exit(bool user) rcu_dynticks_eqs_exit(); // ... but is watching here. instrumentation_begin(); + + // instrumentation for the noinstr rcu_dynticks_eqs_exit() + instrument_atomic_write(&rdp->dynticks, sizeof(rdp->dynticks)); + rcu_cleanup_after_idle(); trace_rcu_dyntick(TPS("End"), rdp->dynticks_nesting, 1, atomic_read(&rdp->dynticks)); WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)); @@ -983,13 +993,21 @@ noinstr void rcu_nmi_enter(void) if (!in_nmi()) rcu_cleanup_after_idle(); + instrumentation_begin(); + // instrumentation for the noinstr rcu_dynticks_curr_cpu_in_eqs() + instrument_atomic_read(&rdp->dynticks, sizeof(rdp->dynticks)); + // instrumentation for the noinstr rcu_dynticks_eqs_exit() + instrument_atomic_write(&rdp->dynticks, sizeof(rdp->dynticks)); + incby = 1; } else if (!in_nmi()) { instrumentation_begin(); rcu_irq_enter_check_tick(); instrumentation_end(); + } else { + instrumentation_begin(); } - instrumentation_begin(); + trace_rcu_dyntick(incby == 1 ? TPS("Endirq") : TPS("++="), rdp->dynticks_nmi_nesting, rdp->dynticks_nmi_nesting + incby, atomic_read(&rdp->dynticks)); -- cgit v1.2.3 From abfce0414814149f716e1d30da1fb3140d1b3473 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Sun, 19 Apr 2020 21:57:15 +0000 Subject: rcu: Simplify the calculation of rcu_state.ncpus There is only 1 bit set in mask, which means that the only difference between oldmask and the new one will be at the position where the bit is set in mask. This commit therefore updates rcu_state.ncpus by checking whether the bit in mask is already set in rnp->expmaskinitnext. Signed-off-by: Wei Yang Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 6c6569e0586c..bef1dc91bfbe 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3842,10 +3842,9 @@ void rcu_cpu_starting(unsigned int cpu) { unsigned long flags; unsigned long mask; - int nbits; - unsigned long oldmask; struct rcu_data *rdp; struct rcu_node *rnp; + bool newcpu; if (per_cpu(rcu_cpu_started, cpu)) return; @@ -3857,12 +3856,10 @@ void rcu_cpu_starting(unsigned int cpu) mask = rdp->grpmask; raw_spin_lock_irqsave_rcu_node(rnp, flags); WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext | mask); - oldmask = rnp->expmaskinitnext; + newcpu = !(rnp->expmaskinitnext & mask); rnp->expmaskinitnext |= mask; - oldmask ^= rnp->expmaskinitnext; - nbits = bitmap_weight(&oldmask, BITS_PER_LONG); /* Allow lockless access for expedited grace periods. */ - smp_store_release(&rcu_state.ncpus, rcu_state.ncpus + nbits); /* ^^^ */ + smp_store_release(&rcu_state.ncpus, rcu_state.ncpus + newcpu); /* ^^^ */ ASSERT_EXCLUSIVE_WRITER(rcu_state.ncpus); rcu_gpnum_ovf(rnp, rdp); /* Offline-induced counter wrap? */ rdp->rcu_onl_gp_seq = READ_ONCE(rcu_state.gp_seq); -- cgit v1.2.3 From e816d56fad57ba9817cef6606b12f5e14647c3bf Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 1 May 2020 16:49:48 -0700 Subject: rcu: Add callbacks-invoked counters This commit adds a count of the callbacks invoked to the per-CPU rcu_data structure. This count is printed by the show_rcu_gp_kthreads() that is invoked by rcutorture and the RCU CPU stall-warning code. It is also intended for use by drgn. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 1 + kernel/rcu/tree.h | 1 + kernel/rcu/tree_stall.h | 3 +++ 3 files changed, 5 insertions(+) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index bef1dc91bfbe..874c831bcc45 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2443,6 +2443,7 @@ static void rcu_do_batch(struct rcu_data *rdp) local_irq_save(flags); rcu_nocb_lock(rdp); count = -rcl.len; + rdp->n_cbs_invoked += count; trace_rcu_batch_end(rcu_state.name, count, !!rcl.head, need_resched(), is_idle_task(current), rcu_is_callbacks_kthread()); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 43991a40b084..9c6f7343bec0 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -171,6 +171,7 @@ struct rcu_data { /* different grace periods. */ long qlen_last_fqs_check; /* qlen at last check for QS forcing */ + unsigned long n_cbs_invoked; /* # callbacks invoked since boot. */ unsigned long n_force_qs_snap; /* did other CPU force QS recently? 
*/ long blimit; /* Upper limit on a processed batch */ diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 54a6dba0280d..2768ce6bf657 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -649,6 +649,7 @@ static void check_cpu_stall(struct rcu_data *rdp) */ void show_rcu_gp_kthreads(void) { + unsigned long cbs = 0; int cpu; unsigned long j; unsigned long ja; @@ -690,9 +691,11 @@ void show_rcu_gp_kthreads(void) } for_each_possible_cpu(cpu) { rdp = per_cpu_ptr(&rcu_data, cpu); + cbs += data_race(rdp->n_cbs_invoked); if (rcu_segcblist_is_offloaded(&rdp->cblist)) show_rcu_nocb_state(rdp); } + pr_info("RCU callbacks invoked since boot: %lu\n", cbs); show_rcu_tasks_gp_kthreads(); } EXPORT_SYMBOL_GPL(show_rcu_gp_kthreads); -- cgit v1.2.3 From 77865dea25c4f45ce0c5bf61a8470af01fccd944 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 7 May 2020 15:44:46 -0700 Subject: rcu: Grace-period-kthread related sleeps to idle priority This commit converts the long-standing schedule_timeout_interruptible() and schedule_timeout_uninterruptible() calls used by RCU's grace-period kthread to schedule_timeout_idle(). This conversion avoids polluting the load-average with RCU-related sleeping. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 874c831bcc45..feb31c201dee 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1638,7 +1638,7 @@ static void rcu_gp_slow(int delay) if (delay > 0 && !(rcu_seq_ctr(rcu_state.gp_seq) % (rcu_num_nodes * PER_RCU_NODE_PERIOD * delay))) - schedule_timeout_uninterruptible(delay); + schedule_timeout_idle(delay); } static unsigned long sleep_duration; @@ -1661,7 +1661,7 @@ static void rcu_gp_torture_wait(void) duration = xchg(&sleep_duration, 0UL); if (duration > 0) { pr_alert("%s: Waiting %lu jiffies\n", __func__, duration); - schedule_timeout_uninterruptible(duration); + schedule_timeout_idle(duration); pr_alert("%s: Wait complete\n", __func__); } } @@ -2727,7 +2727,7 @@ static void rcu_cpu_kthread(unsigned int cpu) } *statusp = RCU_KTHREAD_YIELDING; trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield")); - schedule_timeout_interruptible(2); + schedule_timeout_idle(2); trace_rcu_utilization(TPS("End CPU kthread@rcu_yield")); *statusp = RCU_KTHREAD_WAITING; } -- cgit v1.2.3 From 04b25a495bd68c1dad07263fb91e8b5a31c00a9e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 19 May 2020 17:00:54 -0700 Subject: rcu: Mark rcu_nmi_enter() call to rcu_cleanup_after_idle() noinstr The objtool complains about the call to rcu_cleanup_after_idle() from rcu_nmi_enter(), so this commit adds instrumentation_begin() before that call and instrumentation_end() after it. Acked-by: Peter Zijlstra (Intel) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index feb31c201dee..d17e5a08bf43 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -990,8 +990,11 @@ noinstr void rcu_nmi_enter(void) rcu_dynticks_eqs_exit(); // ... but is watching here. 
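As an aside on the instrumentation_begin()/instrumentation_end() bracketing that the hunk below adds around rcu_cleanup_after_idle(): the following is a minimal userspace sketch of the pattern, not kernel code. The stand-in macros only model the "instrumentation allowed" state that objtool and KCSAN track, and my_noinstr_entry()/my_cleanup_after_idle() are invented names.

#include <stdio.h>
#include <stdbool.h>

static int instr_depth;                         /* models the "instrumentation allowed" state */

#define instrumentation_begin()  (instr_depth++)
#define instrumentation_end()    (instr_depth--)

static void my_cleanup_after_idle(void)         /* an instrumentable helper */
{
        if (!instr_depth)
                fprintf(stderr, "BUG: instrumented call from noinstr context\n");
        else
                printf("helper ran inside an instrumentation_begin()/end() pair\n");
}

static void my_noinstr_entry(bool from_nmi)     /* models a noinstr entry function */
{
        /* ...non-instrumentable entry work would happen here... */
        if (!from_nmi) {
                instrumentation_begin();        /* open a window where instrumented code is fine */
                my_cleanup_after_idle();
                instrumentation_end();
        }
}

int main(void)
{
        my_noinstr_entry(false);
        return 0;
}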
- if (!in_nmi()) + if (!in_nmi()) { + instrumentation_begin(); rcu_cleanup_after_idle(); + instrumentation_end(); + } instrumentation_begin(); // instrumentation for the noinstr rcu_dynticks_curr_cpu_in_eqs() -- cgit v1.2.3 From c6dfd72b7a3b70a2054db0f73245ea2f762a8452 Mon Sep 17 00:00:00 2001 From: Peter Enderborg Date: Thu, 4 Jun 2020 12:23:20 +0200 Subject: rcu: Stop shrinker loop The count and scan can be separated in time, and there is a fair chance that all work is already done when the scan starts, which might in turn result in a needless retry. This commit therefore avoids this retry by returning SHRINK_STOP. Reviewed-by: Uladzislau Rezki (Sony) Signed-off-by: Peter Enderborg Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index d17e5a08bf43..c8196fab563c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3332,7 +3332,7 @@ kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) break; } - return freed; + return freed == 0 ? SHRINK_STOP : freed; } static struct shrinker kfree_rcu_shrinker = { -- cgit v1.2.3 From c3cb47a6cc74af0b79579ba167d7124eb669fbaa Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 15 Jun 2020 12:28:05 -0700 Subject: kernel/rcu/tree.c: Fix kernel-doc warnings Fix kernel-doc warning: ../kernel/rcu/tree.c:959: warning: Excess function parameter 'irq' description in 'rcu_nmi_enter' Fixes: cf7614e13c8f ("rcu: Refactor rcu_{nmi,irq}_{enter,exit}()") Signed-off-by: Randy Dunlap Cc: Byungchul Park Cc: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index c8196fab563c..ef05aac7f9d3 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -954,7 +954,6 @@ void __rcu_irq_enter_check_tick(void) /** * rcu_nmi_enter - inform RCU of entry to NMI context - * @irq: Is this call from rcu_irq_enter? * * If the CPU was idle from RCU's viewpoint, update rdp->dynticks and * rdp->dynticks_nmi_nesting to let the RCU grace-period handling know -- cgit v1.2.3 From 8e11690d2f5a9823d66f68918c3986b4e9e160ab Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 4 May 2020 14:35:00 +0200 Subject: rcu: Fix a kernel-doc warnings for "count" There are some kernel-doc warnings: ./kernel/rcu/tree.c:2915: warning: Function parameter or member 'count' not described in 'kfree_rcu_cpu' This commit therefore moves the comment for "count" to the kernel-doc markup. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 6c6569e0586c..ba4c477495b5 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3004,6 +3004,7 @@ struct kfree_rcu_cpu_work { * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES * @monitor_todo: Tracks whether a @monitor_work delayed work is pending * @initialized: The @lock and @rcu_work fields have been initialized + * @count: Number of objects for which GP not started * * This is a per-CPU structure. 
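Stepping back to the SHRINK_STOP change earlier in this series: the following userspace model of a count/scan shrinker loop (the object list and retry loop are invented for the example) shows why returning the sentinel stops needless rescans once everything has already been reclaimed.

#include <stdio.h>

#define SHRINK_STOP (~0UL)

static unsigned long pending;                   /* objects queued for reclaim */

static unsigned long scan_objects(unsigned long nr_to_scan)
{
        unsigned long freed = 0;

        while (pending && nr_to_scan--) {
                pending--;
                freed++;
        }
        /* Nothing left to do: tell the caller not to retry. */
        return freed == 0 ? SHRINK_STOP : freed;
}

int main(void)
{
        unsigned long ret;

        pending = 3;
        while ((ret = scan_objects(8)) != SHRINK_STOP)
                printf("freed %lu objects\n", ret);
        printf("scan returned SHRINK_STOP; no retry needed\n");
        return 0;
}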
The reason that it is not included in * the rcu_data structure is to permit this code to be extracted from @@ -3019,7 +3020,6 @@ struct kfree_rcu_cpu { struct delayed_work monitor_work; bool monitor_todo; bool initialized; - // Number of objects for which GP not started int count; }; -- cgit v1.2.3 From 8ac88f7177c75bf9b7b8c29a8054115e1c712baf Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Mon, 25 May 2020 23:47:45 +0200 Subject: rcu/tree: Keep kfree_rcu() awake during lock contention On PREEMPT_RT kernels, the krcp spinlock gets converted to an rt-mutex and causes kfree_rcu() callers to sleep. This makes it unusable for callers in purely atomic sections such as non-threaded IRQ handlers and raw spinlock sections. Fix it by converting the spinlock to a raw spinlock. Vetting all code paths, there is no reason to believe that the raw spinlock will hurt RT latencies as it is not held for a long time. Cc: bigeasy@linutronix.de Cc: Uladzislau Rezki Reviewed-by: Uladzislau Rezki Signed-off-by: Joel Fernandes (Google) Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index ba4c477495b5..c5de5adca0dd 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3016,7 +3016,7 @@ struct kfree_rcu_cpu { struct kfree_rcu_bulk_data *bhead; struct kfree_rcu_bulk_data *bcached; struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES]; - spinlock_t lock; + raw_spinlock_t lock; struct delayed_work monitor_work; bool monitor_todo; bool initialized; @@ -3049,12 +3049,12 @@ static void kfree_rcu_work(struct work_struct *work) krwp = container_of(to_rcu_work(work), struct kfree_rcu_cpu_work, rcu_work); krcp = krwp->krcp; - spin_lock_irqsave(&krcp->lock, flags); + raw_spin_lock_irqsave(&krcp->lock, flags); head = krwp->head_free; krwp->head_free = NULL; bhead = krwp->bhead_free; krwp->bhead_free = NULL; - spin_unlock_irqrestore(&krcp->lock, flags); + raw_spin_unlock_irqrestore(&krcp->lock, flags); /* "bhead" is now private, so traverse locklessly. */ for (; bhead; bhead = bnext) { @@ -3157,14 +3157,14 @@ static inline void kfree_rcu_drain_unlock(struct kfree_rcu_cpu *krcp, krcp->monitor_todo = false; if (queue_kfree_rcu_work(krcp)) { // Success! Our job is done here. - spin_unlock_irqrestore(&krcp->lock, flags); + raw_spin_unlock_irqrestore(&krcp->lock, flags); return; } // Previous RCU batch still in progress, try again later. krcp->monitor_todo = true; schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES); - spin_unlock_irqrestore(&krcp->lock, flags); + raw_spin_unlock_irqrestore(&krcp->lock, flags); } /* @@ -3177,11 +3177,11 @@ static void kfree_rcu_monitor(struct work_struct *work) struct kfree_rcu_cpu *krcp = container_of(work, struct kfree_rcu_cpu, monitor_work.work); - spin_lock_irqsave(&krcp->lock, flags); + raw_spin_lock_irqsave(&krcp->lock, flags); if (krcp->monitor_todo) kfree_rcu_drain_unlock(krcp, flags); else - spin_unlock_irqrestore(&krcp->lock, flags); + raw_spin_unlock_irqrestore(&krcp->lock, flags); } static inline bool @@ -3252,7 +3252,7 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) local_irq_save(flags); // For safely calling this_cpu_ptr(). krcp = this_cpu_ptr(&krc); if (krcp->initialized) - spin_lock(&krcp->lock); + raw_spin_lock(&krcp->lock); // Queue the object but don't yet schedule the batch. 
if (debug_rcu_head_queue(head)) { @@ -3283,7 +3283,7 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) unlock_return: if (krcp->initialized) - spin_unlock(&krcp->lock); + raw_spin_unlock(&krcp->lock); local_irq_restore(flags); } EXPORT_SYMBOL_GPL(kfree_call_rcu); @@ -3315,11 +3315,11 @@ kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); count = krcp->count; - spin_lock_irqsave(&krcp->lock, flags); + raw_spin_lock_irqsave(&krcp->lock, flags); if (krcp->monitor_todo) kfree_rcu_drain_unlock(krcp, flags); else - spin_unlock_irqrestore(&krcp->lock, flags); + raw_spin_unlock_irqrestore(&krcp->lock, flags); sc->nr_to_scan -= count; freed += count; @@ -3346,15 +3346,15 @@ void __init kfree_rcu_scheduler_running(void) for_each_online_cpu(cpu) { struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); - spin_lock_irqsave(&krcp->lock, flags); + raw_spin_lock_irqsave(&krcp->lock, flags); if (!krcp->head || krcp->monitor_todo) { - spin_unlock_irqrestore(&krcp->lock, flags); + raw_spin_unlock_irqrestore(&krcp->lock, flags); continue; } krcp->monitor_todo = true; schedule_delayed_work_on(cpu, &krcp->monitor_work, KFREE_DRAIN_JIFFIES); - spin_unlock_irqrestore(&krcp->lock, flags); + raw_spin_unlock_irqrestore(&krcp->lock, flags); } } @@ -4250,7 +4250,7 @@ static void __init kfree_rcu_batch_init(void) for_each_possible_cpu(cpu) { struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); - spin_lock_init(&krcp->lock); + raw_spin_lock_init(&krcp->lock); for (i = 0; i < KFREE_N_BATCHES; i++) { INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work); krcp->krw_arr[i].krcp = krcp; -- cgit v1.2.3 From 4d2919411867848fab78c7cb13139e17ad8b85bc Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Mon, 25 May 2020 23:47:46 +0200 Subject: rcu/tree: Skip entry into the page allocator for PREEMPT_RT To keep the kfree_rcu() code working in purely atomic sections on RT, such as non-threaded IRQ handlers and raw spinlock sections, avoid calling into the page allocator which uses sleeping locks on RT. In fact, even if the caller is preemptible, the kfree_rcu() code is not, as the krcp->lock is a raw spinlock. Calling into the page allocator is optional and avoiding it should be Ok, especially with the page pre-allocation support in future patches. Such pre-allocation would further avoid the a need for a dynamically allocated page in the first place. Cc: Sebastian Andrzej Siewior Reviewed-by: Uladzislau Rezki Co-developed-by: Uladzislau Rezki Signed-off-by: Uladzislau Rezki Signed-off-by: Joel Fernandes (Google) Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index c5de5adca0dd..e0425faf3b3b 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3202,6 +3202,18 @@ kfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, if (!bnode) { WARN_ON_ONCE(sizeof(struct kfree_rcu_bulk_data) > PAGE_SIZE); + /* + * To keep this path working on raw non-preemptible + * sections, prevent the optional entry into the + * allocator as it uses sleeping locks. In fact, even + * if the caller of kfree_rcu() is preemptible, this + * path still is not, as krcp->lock is a raw spinlock. + * With additional page pre-allocation in the works, + * hitting this return is going to be much less likely. 
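A hedged sketch of the compile-time gate used in the hunk this comment belongs to: IS_ENABLED() below is a simplified stand-in for the kernel macro of the same name, and add_ptr_to_bulk() is an invented placeholder, but it shows how the PREEMPT_RT case collapses to an unconditional "return false" at build time.

#include <stdio.h>
#include <stdbool.h>

#ifdef CONFIG_PREEMPT_RT                        /* e.g. built with -DCONFIG_PREEMPT_RT */
#define IS_ENABLED(option) 1
#else
#define IS_ENABLED(option) 0
#endif

static bool add_ptr_to_bulk(void *ptr)
{
        /* On RT, refuse the optional page allocation and fall back. */
        if (IS_ENABLED(CONFIG_PREEMPT_RT))
                return false;

        printf("would attempt an optional page allocation for %p\n", ptr);
        return true;
}

int main(void)
{
        int x;

        if (!add_ptr_to_bulk(&x))
                printf("bulk path skipped; the emergency path would queue the rcu_head\n");
        return 0;
}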
+ */ + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + return false; + bnode = (struct kfree_rcu_bulk_data *) __get_free_page(GFP_NOWAIT | __GFP_NOWARN); } -- cgit v1.2.3 From 594aa5975b9b5cfe9edaec06170e43b8c0607377 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Mon, 25 May 2020 23:47:47 +0200 Subject: rcu/tree: Repeat the monitor if any free channel is busy It is possible that one of the channels cannot be detached because its free channel is busy and previously queued data has not been processed yet. On the other hand, another channel can be successfully detached causing the monitor work to stop. Prevent that by rescheduling the monitor work if there are any channels in the pending state after a detach attempt. Fixes: 34c881745549e ("rcu: Support kfree_bulk() interface in kfree_rcu()") Acked-by: Joel Fernandes (Google) Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index e0425faf3b3b..5151fe4e1429 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3105,7 +3105,7 @@ static void kfree_rcu_work(struct work_struct *work) static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp) { struct kfree_rcu_cpu_work *krwp; - bool queued = false; + bool repeat = false; int i; lockdep_assert_held(&krcp->lock); @@ -3143,11 +3143,14 @@ static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp) * been detached following each other, one by one. */ queue_rcu_work(system_wq, &krwp->rcu_work); - queued = true; } + + /* Repeat if any "free" corresponding channel is still busy. */ + if (krcp->bhead || krcp->head) + repeat = true; } - return queued; + return !repeat; } static inline void kfree_rcu_drain_unlock(struct kfree_rcu_cpu *krcp, -- cgit v1.2.3 From 446044eb9c9c335d3ae1be4665193ab43ebb284e Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Mon, 25 May 2020 23:47:48 +0200 Subject: rcu/tree: Make debug_objects logic independent of rcu_head kfree_rcu()'s debug_objects logic uses the address of the object's embedded rcu_head to queue/unqueue. Instead of this, make use of the object's address itself as preparation for future headless kfree_rcu() support. Reviewed-by: Uladzislau Rezki Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree.c | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 5151fe4e1429..143c1e9265b6 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2970,13 +2970,11 @@ EXPORT_SYMBOL_GPL(call_rcu); * @nr_records: Number of active pointers in the array * @records: Array of the kfree_rcu() pointers * @next: Next bulk object in the block chain - * @head_free_debug: For debug, when CONFIG_DEBUG_OBJECTS_RCU_HEAD is set */ struct kfree_rcu_bulk_data { unsigned long nr_records; void *records[KFREE_BULK_MAX_ENTR]; struct kfree_rcu_bulk_data *next; - struct rcu_head *head_free_debug; }; /** @@ -3026,11 +3024,13 @@ struct kfree_rcu_cpu { static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc); static __always_inline void -debug_rcu_head_unqueue_bulk(struct rcu_head *head) +debug_rcu_bhead_unqueue(struct kfree_rcu_bulk_data *bhead) { #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD - for (; head; head = head->next) - debug_rcu_head_unqueue(head); + int i; + + for (i = 0; i < bhead->nr_records; i++) + debug_rcu_head_unqueue((struct rcu_head *)(bhead->records[i])); #endif } @@ -3060,7 +3060,7 @@ static void kfree_rcu_work(struct work_struct *work) for (; bhead; bhead = bnext) { bnext = bhead->next; - debug_rcu_head_unqueue_bulk(bhead->head_free_debug); + debug_rcu_bhead_unqueue(bhead); rcu_lock_acquire(&rcu_callback_map); trace_rcu_invoke_kfree_bulk_callback(rcu_state.name, @@ -3082,14 +3082,15 @@ static void kfree_rcu_work(struct work_struct *work) */ for (; head; head = next) { unsigned long offset = (unsigned long)head->func; + void *ptr = (void *)head - offset; next = head->next; - debug_rcu_head_unqueue(head); + debug_rcu_head_unqueue((struct rcu_head *)ptr); rcu_lock_acquire(&rcu_callback_map); trace_rcu_invoke_kfree_callback(rcu_state.name, head, offset); if (!WARN_ON_ONCE(!__is_kfree_rcu_offset(offset))) - kfree((void *)head - offset); + kfree(ptr); rcu_lock_release(&rcu_callback_map); cond_resched_tasks_rcu_qs(); @@ -3228,18 +3229,11 @@ kfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, /* Initialize the new block. */ bnode->nr_records = 0; bnode->next = krcp->bhead; - bnode->head_free_debug = NULL; /* Attach it to the head. */ krcp->bhead = bnode; } -#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD - head->func = func; - head->next = krcp->bhead->head_free_debug; - krcp->bhead->head_free_debug = head; -#endif - /* Finally insert. */ krcp->bhead->records[krcp->bhead->nr_records++] = (void *) head - (unsigned long) func; @@ -3263,14 +3257,17 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) { unsigned long flags; struct kfree_rcu_cpu *krcp; + void *ptr; local_irq_save(flags); // For safely calling this_cpu_ptr(). krcp = this_cpu_ptr(&krc); if (krcp->initialized) raw_spin_lock(&krcp->lock); + ptr = (void *)head - (unsigned long)func; + // Queue the object but don't yet schedule the batch. - if (debug_rcu_head_queue(head)) { + if (debug_rcu_head_queue(ptr)) { // Probable double kfree_rcu(), just leak. WARN_ONCE(1, "%s(): Double-freed call. rcu_head %p\n", __func__, head); -- cgit v1.2.3 From 3af84862817403d317dc33312e7a88d76e79401a Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Mon, 25 May 2020 23:47:49 +0200 Subject: rcu/tree: Simplify KFREE_BULK_MAX_ENTR macro We can simplify KFREE_BULK_MAX_ENTR macro and get rid of magic numbers which were used to make the structure to be exactly one page. 
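To make the sizing arithmetic above concrete, here is a small standalone C illustration (PAGE_SIZE is assumed to be 4096 purely for the example): with records[] as a flexible array member, the per-page capacity falls out of sizeof() rather than a hand-counted magic number.

#include <stdio.h>
#include <assert.h>

#define PAGE_SIZE 4096UL

struct bulk_data {
        unsigned long nr_records;
        struct bulk_data *next;
        void *records[];                        /* flexible array member */
};

#define BULK_MAX_ENTR \
        ((PAGE_SIZE - sizeof(struct bulk_data)) / sizeof(void *))

int main(void)
{
        /* Header plus a full records[] array occupies exactly one page. */
        assert(sizeof(struct bulk_data) + BULK_MAX_ENTR * sizeof(void *) == PAGE_SIZE);
        printf("header = %zu bytes, %lu record slots per %lu-byte page\n",
               sizeof(struct bulk_data), (unsigned long)BULK_MAX_ENTR, PAGE_SIZE);
        return 0;
}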
Suggested-by: Boqun Feng Reviewed-by: Joel Fernandes (Google) Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 143c1e9265b6..bcdc06364426 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2958,13 +2958,6 @@ EXPORT_SYMBOL_GPL(call_rcu); #define KFREE_DRAIN_JIFFIES (HZ / 50) #define KFREE_N_BATCHES 2 -/* - * This macro defines how many entries the "records" array - * will contain. It is based on the fact that the size of - * kfree_rcu_bulk_data structure becomes exactly one page. - */ -#define KFREE_BULK_MAX_ENTR ((PAGE_SIZE / sizeof(void *)) - 3) - /** * struct kfree_rcu_bulk_data - single block to store kfree_rcu() pointers * @nr_records: Number of active pointers in the array @@ -2973,10 +2966,18 @@ EXPORT_SYMBOL_GPL(call_rcu); */ struct kfree_rcu_bulk_data { unsigned long nr_records; - void *records[KFREE_BULK_MAX_ENTR]; struct kfree_rcu_bulk_data *next; + void *records[]; }; +/* + * This macro defines how many entries the "records" array + * will contain. It is based on the fact that the size of + * kfree_rcu_bulk_data structure becomes exactly one page. + */ +#define KFREE_BULK_MAX_ENTR \ + ((PAGE_SIZE - sizeof(struct kfree_rcu_bulk_data)) / sizeof(void *)) + /** * struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests * @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period -- cgit v1.2.3 From 952371d6fc0bc360d1d5780f86bb355836117ca2 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Mon, 25 May 2020 23:47:50 +0200 Subject: rcu/tree: Move kfree_rcu_cpu locking/unlocking to separate functions Introduce helpers to lock and unlock per-cpu "kfree_rcu_cpu" structures. That will make kfree_call_rcu() more readable and prevent programming errors. Reviewed-by: Joel Fernandes (Google) Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 31 +++++++++++++++++++++++-------- 1 file changed, 23 insertions(+), 8 deletions(-) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index bcdc06364426..368bdc441ffb 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3035,6 +3035,27 @@ debug_rcu_bhead_unqueue(struct kfree_rcu_bulk_data *bhead) #endif } +static inline struct kfree_rcu_cpu * +krc_this_cpu_lock(unsigned long *flags) +{ + struct kfree_rcu_cpu *krcp; + + local_irq_save(*flags); // For safely calling this_cpu_ptr(). + krcp = this_cpu_ptr(&krc); + if (likely(krcp->initialized)) + raw_spin_lock(&krcp->lock); + + return krcp; +} + +static inline void +krc_this_cpu_unlock(struct kfree_rcu_cpu *krcp, unsigned long flags) +{ + if (likely(krcp->initialized)) + raw_spin_unlock(&krcp->lock); + local_irq_restore(flags); +} + /* * This function is invoked in workqueue context after a grace period. * It frees all the objects queued on ->bhead_free or ->head_free. @@ -3260,11 +3281,7 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) struct kfree_rcu_cpu *krcp; void *ptr; - local_irq_save(flags); // For safely calling this_cpu_ptr(). - krcp = this_cpu_ptr(&krc); - if (krcp->initialized) - raw_spin_lock(&krcp->lock); - + krcp = krc_this_cpu_lock(&flags); ptr = (void *)head - (unsigned long)func; // Queue the object but don't yet schedule the batch. 
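As a rough userspace model of the krc_this_cpu_lock()/krc_this_cpu_unlock() pairing introduced above (a single bucket and a pthread mutex stand in for the per-CPU data and its raw spinlock; the names are invented for the example): the lock helper hands back the locked structure, so the lock and unlock halves cannot be mismatched by callers.

#include <pthread.h>
#include <stdio.h>

struct bucket {
        pthread_mutex_t lock;
        int count;
};

static struct bucket krc = { .lock = PTHREAD_MUTEX_INITIALIZER };

static struct bucket *krc_lock(void)
{
        struct bucket *b = &krc;                /* this_cpu_ptr() stand-in */

        pthread_mutex_lock(&b->lock);
        return b;
}

static void krc_unlock(struct bucket *b)
{
        pthread_mutex_unlock(&b->lock);
}

static void queue_object(void)
{
        struct bucket *b = krc_lock();

        b->count++;                             /* critical section: queue the object */
        krc_unlock(b);
}

int main(void)
{
        queue_object();
        printf("queued objects: %d\n", krc.count);
        return 0;
}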
@@ -3295,9 +3312,7 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) } unlock_return: - if (krcp->initialized) - raw_spin_unlock(&krcp->lock); - local_irq_restore(flags); + krc_this_cpu_unlock(krcp, flags); } EXPORT_SYMBOL_GPL(kfree_call_rcu); -- cgit v1.2.3 From 69f08d3999dbef1553a3332b8055282dd3893b6c Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 25 May 2020 23:47:51 +0200 Subject: rcu/tree: Use static initializer for krc.lock The per-CPU variable is initialized at runtime in kfree_rcu_batch_init(). This function is invoked before 'rcu_scheduler_active' is set to 'RCU_SCHEDULER_RUNNING'. After the initialisation, '->initialized' is to true. The raw_spin_lock is only acquired if '->initialized' is set to true. The worqueue item is only used if 'rcu_scheduler_active' set to RCU_SCHEDULER_RUNNING which happens after initialisation. Use a static initializer for krc.lock and remove the runtime initialisation of the lock. Since the lock can now be always acquired, remove the '->initialized' check. Cc: Sebastian Andrzej Siewior Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 368bdc441ffb..a42a4693f161 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3002,7 +3002,7 @@ struct kfree_rcu_cpu_work { * @lock: Synchronize access to this structure * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES * @monitor_todo: Tracks whether a @monitor_work delayed work is pending - * @initialized: The @lock and @rcu_work fields have been initialized + * @initialized: The @rcu_work fields have been initialized * @count: Number of objects for which GP not started * * This is a per-CPU structure. The reason that it is not included in @@ -3022,7 +3022,9 @@ struct kfree_rcu_cpu { int count; }; -static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc); +static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc) = { + .lock = __RAW_SPIN_LOCK_UNLOCKED(krc.lock), +}; static __always_inline void debug_rcu_bhead_unqueue(struct kfree_rcu_bulk_data *bhead) @@ -3042,8 +3044,7 @@ krc_this_cpu_lock(unsigned long *flags) local_irq_save(*flags); // For safely calling this_cpu_ptr(). krcp = this_cpu_ptr(&krc); - if (likely(krcp->initialized)) - raw_spin_lock(&krcp->lock); + raw_spin_lock(&krcp->lock); return krcp; } @@ -3051,8 +3052,7 @@ krc_this_cpu_lock(unsigned long *flags) static inline void krc_this_cpu_unlock(struct kfree_rcu_cpu *krcp, unsigned long flags) { - if (likely(krcp->initialized)) - raw_spin_unlock(&krcp->lock); + raw_spin_unlock(&krcp->lock); local_irq_restore(flags); } @@ -4278,7 +4278,6 @@ static void __init kfree_rcu_batch_init(void) for_each_possible_cpu(cpu) { struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); - raw_spin_lock_init(&krcp->lock); for (i = 0; i < KFREE_N_BATCHES; i++) { INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work); krcp->krw_arr[i].krcp = krcp; -- cgit v1.2.3 From 53c72b590b3a0afd6747d6f7957e6838003e90a4 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Mon, 25 May 2020 23:47:52 +0200 Subject: rcu/tree: cache specified number of objects In order to reduce the dynamic need for pages in kfree_rcu(), pre-allocate a configurable number of pages per CPU and link them in a list. 
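The commit description continues below; as a rough userspace model of the cached-pages idea it introduces (malloc()/free() stand in for __get_free_page()/free_page(), and the limit of 2 mirrors the default rcu_min_cached_objs), freed blocks are parked on a small singly linked list and handed out again before falling back to the allocator:

#include <stdio.h>
#include <stdlib.h>

#define MIN_CACHED_OBJS 2

struct bnode {
        struct bnode *next;
        /* payload would follow in a real block */
};

static struct bnode *cache_head;
static int nr_cached;

static struct bnode *get_cached_bnode(void)
{
        struct bnode *b = cache_head;

        if (!b)
                return NULL;
        cache_head = b->next;
        nr_cached--;
        return b;
}

static int put_cached_bnode(struct bnode *b)
{
        if (nr_cached >= MIN_CACHED_OBJS)
                return 0;                       /* cache full: caller frees the block */
        b->next = cache_head;
        cache_head = b;
        nr_cached++;
        return 1;
}

int main(void)
{
        struct bnode *b = get_cached_bnode();

        if (!b)                                 /* cache empty: fall back to the allocator */
                b = malloc(4096);
        printf("got block %p, cached=%d\n", (void *)b, nr_cached);

        if (!put_cached_bnode(b))               /* reclaim path: try to cache, else free */
                free(b);
        printf("after put, cached=%d\n", nr_cached);
        return 0;
}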
When kfree_rcu() reclaims objects, the object's container page is cached into a list instead of being released to the low-level page allocator. Such an approach provides O(1) access to free pages while also reducing the number of requests to the page allocator. It also makes the kfree_rcu() code to have free pages available during a low memory condition. A read-only sysfs parameter (rcu_min_cached_objs) reflects the minimum number of allowed cached pages per CPU. Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Paul E. McKenney --- Documentation/admin-guide/kernel-parameters.txt | 8 +++ kernel/rcu/tree.c | 66 +++++++++++++++++++++++-- 2 files changed, 70 insertions(+), 4 deletions(-) (limited to 'kernel/rcu/tree.c') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index fb95fad81c79..befaa63652ff 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4038,6 +4038,14 @@ latencies, which will choose a value aligned with the appropriate hardware boundaries. + rcutree.rcu_min_cached_objs= [KNL] + Minimum number of objects which are cached and + maintained per one CPU. Object size is equal + to PAGE_SIZE. The cache allows to reduce the + pressure to page allocator, also it makes the + whole algorithm to behave better in low memory + condition. + rcutree.jiffies_till_first_fqs= [KNL] Set delay from grace-period initialization to first attempt to force quiescent states. diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a42a4693f161..37c0cd0332f8 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -175,6 +175,15 @@ module_param(gp_init_delay, int, 0444); static int gp_cleanup_delay; module_param(gp_cleanup_delay, int, 0444); +/* + * This rcu parameter is runtime-read-only. It reflects + * a minimum allowed number of objects which can be cached + * per-CPU. Object size is equal to one page. This value + * can be changed at boot time. + */ +static int rcu_min_cached_objs = 2; +module_param(rcu_min_cached_objs, int, 0444); + /* Retrieve RCU kthreads priority for rcutorture */ int rcu_get_gp_kthreads_prio(void) { @@ -2997,7 +3006,6 @@ struct kfree_rcu_cpu_work { * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period * @head: List of kfree_rcu() objects not yet waiting for a grace period * @bhead: Bulk-List of kfree_rcu() objects not yet waiting for a grace period - * @bcached: Keeps at most one object for later reuse when build chain blocks * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period * @lock: Synchronize access to this structure * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES @@ -3013,13 +3021,22 @@ struct kfree_rcu_cpu_work { struct kfree_rcu_cpu { struct rcu_head *head; struct kfree_rcu_bulk_data *bhead; - struct kfree_rcu_bulk_data *bcached; struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES]; raw_spinlock_t lock; struct delayed_work monitor_work; bool monitor_todo; bool initialized; int count; + + /* + * A simple cache list that contains objects for + * reuse purpose. In order to save some per-cpu + * space the list is singular. Even though it is + * lockless an access has to be protected by the + * per-cpu lock. 
+ */ + struct llist_head bkvcache; + int nr_bkv_objs; }; static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc) = { @@ -3056,6 +3073,31 @@ krc_this_cpu_unlock(struct kfree_rcu_cpu *krcp, unsigned long flags) local_irq_restore(flags); } +static inline struct kfree_rcu_bulk_data * +get_cached_bnode(struct kfree_rcu_cpu *krcp) +{ + if (!krcp->nr_bkv_objs) + return NULL; + + krcp->nr_bkv_objs--; + return (struct kfree_rcu_bulk_data *) + llist_del_first(&krcp->bkvcache); +} + +static inline bool +put_cached_bnode(struct kfree_rcu_cpu *krcp, + struct kfree_rcu_bulk_data *bnode) +{ + // Check the limit. + if (krcp->nr_bkv_objs >= rcu_min_cached_objs) + return false; + + llist_add((struct llist_node *) bnode, &krcp->bkvcache); + krcp->nr_bkv_objs++; + return true; + +} + /* * This function is invoked in workqueue context after a grace period. * It frees all the objects queued on ->bhead_free or ->head_free. @@ -3091,7 +3133,12 @@ static void kfree_rcu_work(struct work_struct *work) kfree_bulk(bhead->nr_records, bhead->records); rcu_lock_release(&rcu_callback_map); - if (cmpxchg(&krcp->bcached, NULL, bhead)) + krcp = krc_this_cpu_lock(&flags); + if (put_cached_bnode(krcp, bhead)) + bhead = NULL; + krc_this_cpu_unlock(krcp, flags); + + if (bhead) free_page((unsigned long) bhead); cond_resched_tasks_rcu_qs(); @@ -3224,7 +3271,7 @@ kfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, /* Check if a new block is required. */ if (!krcp->bhead || krcp->bhead->nr_records == KFREE_BULK_MAX_ENTR) { - bnode = xchg(&krcp->bcached, NULL); + bnode = get_cached_bnode(krcp); if (!bnode) { WARN_ON_ONCE(sizeof(struct kfree_rcu_bulk_data) > PAGE_SIZE); @@ -4277,12 +4324,23 @@ static void __init kfree_rcu_batch_init(void) for_each_possible_cpu(cpu) { struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); + struct kfree_rcu_bulk_data *bnode; for (i = 0; i < KFREE_N_BATCHES; i++) { INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work); krcp->krw_arr[i].krcp = krcp; } + for (i = 0; i < rcu_min_cached_objs; i++) { + bnode = (struct kfree_rcu_bulk_data *) + __get_free_page(GFP_NOWAIT | __GFP_NOWARN); + + if (bnode) + put_cached_bnode(krcp, bnode); + else + pr_err("Failed to preallocate for %d CPU!\n", cpu); + } + INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor); krcp->initialized = true; } -- cgit v1.2.3 From 5f3c8d620447d509e534962e23f7edfb85f4e533 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Mon, 25 May 2020 23:47:53 +0200 Subject: rcu/tree: Maintain separate array for vmalloc ptrs To do so, we use an array of kvfree_rcu_bulk_data structures. It consists of two elements: - index number 0 corresponds to slab pointers. - index number 1 corresponds to vmalloc pointers. Keeping vmalloc pointers separated from slab pointers makes it possible to invoke the right freeing API for the right kind of pointer. It also prepares us for future headless support for vmalloc and SLAB objects. Such objects cannot be queued on a linked list and are instead directly into an array. Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Joel Fernandes (Google) Reviewed-by: Joel Fernandes (Google) Co-developed-by: Joel Fernandes (Google) Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree.c | 173 +++++++++++++++++++++++++++++++----------------------- 1 file changed, 100 insertions(+), 73 deletions(-) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 37c0cd0332f8..67c4b984c499 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -57,6 +57,8 @@ #include #include #include +#include +#include #include "../time/tick-internal.h" #include "tree.h" @@ -2966,46 +2968,47 @@ EXPORT_SYMBOL_GPL(call_rcu); /* Maximum number of jiffies to wait before draining a batch. */ #define KFREE_DRAIN_JIFFIES (HZ / 50) #define KFREE_N_BATCHES 2 +#define FREE_N_CHANNELS 2 /** - * struct kfree_rcu_bulk_data - single block to store kfree_rcu() pointers + * struct kvfree_rcu_bulk_data - single block to store kvfree_rcu() pointers * @nr_records: Number of active pointers in the array - * @records: Array of the kfree_rcu() pointers * @next: Next bulk object in the block chain + * @records: Array of the kvfree_rcu() pointers */ -struct kfree_rcu_bulk_data { +struct kvfree_rcu_bulk_data { unsigned long nr_records; - struct kfree_rcu_bulk_data *next; + struct kvfree_rcu_bulk_data *next; void *records[]; }; /* * This macro defines how many entries the "records" array * will contain. It is based on the fact that the size of - * kfree_rcu_bulk_data structure becomes exactly one page. + * kvfree_rcu_bulk_data structure becomes exactly one page. */ -#define KFREE_BULK_MAX_ENTR \ - ((PAGE_SIZE - sizeof(struct kfree_rcu_bulk_data)) / sizeof(void *)) +#define KVFREE_BULK_MAX_ENTR \ + ((PAGE_SIZE - sizeof(struct kvfree_rcu_bulk_data)) / sizeof(void *)) /** * struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests * @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period * @head_free: List of kfree_rcu() objects waiting for a grace period - * @bhead_free: Bulk-List of kfree_rcu() objects waiting for a grace period + * @bkvhead_free: Bulk-List of kvfree_rcu() objects waiting for a grace period * @krcp: Pointer to @kfree_rcu_cpu structure */ struct kfree_rcu_cpu_work { struct rcu_work rcu_work; struct rcu_head *head_free; - struct kfree_rcu_bulk_data *bhead_free; + struct kvfree_rcu_bulk_data *bkvhead_free[FREE_N_CHANNELS]; struct kfree_rcu_cpu *krcp; }; /** * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period * @head: List of kfree_rcu() objects not yet waiting for a grace period - * @bhead: Bulk-List of kfree_rcu() objects not yet waiting for a grace period + * @bkvhead: Bulk-List of kvfree_rcu() objects not yet waiting for a grace period * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period * @lock: Synchronize access to this structure * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES @@ -3020,7 +3023,7 @@ struct kfree_rcu_cpu_work { */ struct kfree_rcu_cpu { struct rcu_head *head; - struct kfree_rcu_bulk_data *bhead; + struct kvfree_rcu_bulk_data *bkvhead[FREE_N_CHANNELS]; struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES]; raw_spinlock_t lock; struct delayed_work monitor_work; @@ -3044,7 +3047,7 @@ static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc) = { }; static __always_inline void -debug_rcu_bhead_unqueue(struct kfree_rcu_bulk_data *bhead) +debug_rcu_bhead_unqueue(struct kvfree_rcu_bulk_data *bhead) { #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD int i; @@ -3073,20 +3076,20 @@ krc_this_cpu_unlock(struct kfree_rcu_cpu *krcp, unsigned long flags) local_irq_restore(flags); } -static inline struct kfree_rcu_bulk_data * +static inline struct 
kvfree_rcu_bulk_data * get_cached_bnode(struct kfree_rcu_cpu *krcp) { if (!krcp->nr_bkv_objs) return NULL; krcp->nr_bkv_objs--; - return (struct kfree_rcu_bulk_data *) + return (struct kvfree_rcu_bulk_data *) llist_del_first(&krcp->bkvcache); } static inline bool put_cached_bnode(struct kfree_rcu_cpu *krcp, - struct kfree_rcu_bulk_data *bnode) + struct kvfree_rcu_bulk_data *bnode) { // Check the limit. if (krcp->nr_bkv_objs >= rcu_min_cached_objs) @@ -3105,43 +3108,63 @@ put_cached_bnode(struct kfree_rcu_cpu *krcp, static void kfree_rcu_work(struct work_struct *work) { unsigned long flags; + struct kvfree_rcu_bulk_data *bkvhead[FREE_N_CHANNELS], *bnext; struct rcu_head *head, *next; - struct kfree_rcu_bulk_data *bhead, *bnext; struct kfree_rcu_cpu *krcp; struct kfree_rcu_cpu_work *krwp; + int i, j; krwp = container_of(to_rcu_work(work), struct kfree_rcu_cpu_work, rcu_work); krcp = krwp->krcp; + raw_spin_lock_irqsave(&krcp->lock, flags); + // Channels 1 and 2. + for (i = 0; i < FREE_N_CHANNELS; i++) { + bkvhead[i] = krwp->bkvhead_free[i]; + krwp->bkvhead_free[i] = NULL; + } + + // Channel 3. head = krwp->head_free; krwp->head_free = NULL; - bhead = krwp->bhead_free; - krwp->bhead_free = NULL; raw_spin_unlock_irqrestore(&krcp->lock, flags); - /* "bhead" is now private, so traverse locklessly. */ - for (; bhead; bhead = bnext) { - bnext = bhead->next; - - debug_rcu_bhead_unqueue(bhead); - - rcu_lock_acquire(&rcu_callback_map); - trace_rcu_invoke_kfree_bulk_callback(rcu_state.name, - bhead->nr_records, bhead->records); - - kfree_bulk(bhead->nr_records, bhead->records); - rcu_lock_release(&rcu_callback_map); + // Handle two first channels. + for (i = 0; i < FREE_N_CHANNELS; i++) { + for (; bkvhead[i]; bkvhead[i] = bnext) { + bnext = bkvhead[i]->next; + debug_rcu_bhead_unqueue(bkvhead[i]); + + rcu_lock_acquire(&rcu_callback_map); + if (i == 0) { // kmalloc() / kfree(). + trace_rcu_invoke_kfree_bulk_callback( + rcu_state.name, bkvhead[i]->nr_records, + bkvhead[i]->records); + + kfree_bulk(bkvhead[i]->nr_records, + bkvhead[i]->records); + } else { // vmalloc() / vfree(). + for (j = 0; j < bkvhead[i]->nr_records; j++) { + trace_rcu_invoke_kfree_callback( + rcu_state.name, + bkvhead[i]->records[j], 0); + + vfree(bkvhead[i]->records[j]); + } + } + rcu_lock_release(&rcu_callback_map); - krcp = krc_this_cpu_lock(&flags); - if (put_cached_bnode(krcp, bhead)) - bhead = NULL; - krc_this_cpu_unlock(krcp, flags); + krcp = krc_this_cpu_lock(&flags); + if (put_cached_bnode(krcp, bkvhead[i])) + bkvhead[i] = NULL; + krc_this_cpu_unlock(krcp, flags); - if (bhead) - free_page((unsigned long) bhead); + if (bkvhead[i]) + free_page((unsigned long) bkvhead[i]); - cond_resched_tasks_rcu_qs(); + cond_resched_tasks_rcu_qs(); + } } /* @@ -3159,7 +3182,7 @@ static void kfree_rcu_work(struct work_struct *work) trace_rcu_invoke_kfree_callback(rcu_state.name, head, offset); if (!WARN_ON_ONCE(!__is_kfree_rcu_offset(offset))) - kfree(ptr); + kvfree(ptr); rcu_lock_release(&rcu_callback_map); cond_resched_tasks_rcu_qs(); @@ -3176,7 +3199,7 @@ static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp) { struct kfree_rcu_cpu_work *krwp; bool repeat = false; - int i; + int i, j; lockdep_assert_held(&krcp->lock); @@ -3184,21 +3207,25 @@ static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp) krwp = &(krcp->krw_arr[i]); /* - * Try to detach bhead or head and attach it over any + * Try to detach bkvhead or head and attach it over any * available corresponding free channel. 
It can be that * a previous RCU batch is in progress, it means that * immediately to queue another one is not possible so * return false to tell caller to retry. */ - if ((krcp->bhead && !krwp->bhead_free) || + if ((krcp->bkvhead[0] && !krwp->bkvhead_free[0]) || + (krcp->bkvhead[1] && !krwp->bkvhead_free[1]) || (krcp->head && !krwp->head_free)) { - /* Channel 1. */ - if (!krwp->bhead_free) { - krwp->bhead_free = krcp->bhead; - krcp->bhead = NULL; + // Channel 1 corresponds to SLAB ptrs. + // Channel 2 corresponds to vmalloc ptrs. + for (j = 0; j < FREE_N_CHANNELS; j++) { + if (!krwp->bkvhead_free[j]) { + krwp->bkvhead_free[j] = krcp->bkvhead[j]; + krcp->bkvhead[j] = NULL; + } } - /* Channel 2. */ + // Channel 3 corresponds to emergency path. if (!krwp->head_free) { krwp->head_free = krcp->head; krcp->head = NULL; @@ -3207,16 +3234,17 @@ static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp) WRITE_ONCE(krcp->count, 0); /* - * One work is per one batch, so there are two "free channels", - * "bhead_free" and "head_free" the batch can handle. It can be - * that the work is in the pending state when two channels have - * been detached following each other, one by one. + * One work is per one batch, so there are three + * "free channels", the batch can handle. It can + * be that the work is in the pending state when + * channels have been detached following by each + * other. */ queue_rcu_work(system_wq, &krwp->rcu_work); } - /* Repeat if any "free" corresponding channel is still busy. */ - if (krcp->bhead || krcp->head) + // Repeat if any "free" corresponding channel is still busy. + if (krcp->bkvhead[0] || krcp->bkvhead[1] || krcp->head) repeat = true; } @@ -3258,23 +3286,22 @@ static void kfree_rcu_monitor(struct work_struct *work) } static inline bool -kfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, - struct rcu_head *head, rcu_callback_t func) +kvfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, void *ptr) { - struct kfree_rcu_bulk_data *bnode; + struct kvfree_rcu_bulk_data *bnode; + int idx; if (unlikely(!krcp->initialized)) return false; lockdep_assert_held(&krcp->lock); + idx = !!is_vmalloc_addr(ptr); /* Check if a new block is required. */ - if (!krcp->bhead || - krcp->bhead->nr_records == KFREE_BULK_MAX_ENTR) { + if (!krcp->bkvhead[idx] || + krcp->bkvhead[idx]->nr_records == KVFREE_BULK_MAX_ENTR) { bnode = get_cached_bnode(krcp); if (!bnode) { - WARN_ON_ONCE(sizeof(struct kfree_rcu_bulk_data) > PAGE_SIZE); - /* * To keep this path working on raw non-preemptible * sections, prevent the optional entry into the @@ -3287,7 +3314,7 @@ kfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, if (IS_ENABLED(CONFIG_PREEMPT_RT)) return false; - bnode = (struct kfree_rcu_bulk_data *) + bnode = (struct kvfree_rcu_bulk_data *) __get_free_page(GFP_NOWAIT | __GFP_NOWARN); } @@ -3297,30 +3324,30 @@ kfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, /* Initialize the new block. */ bnode->nr_records = 0; - bnode->next = krcp->bhead; + bnode->next = krcp->bkvhead[idx]; /* Attach it to the head. */ - krcp->bhead = bnode; + krcp->bkvhead[idx] = bnode; } /* Finally insert. */ - krcp->bhead->records[krcp->bhead->nr_records++] = - (void *) head - (unsigned long) func; + krcp->bkvhead[idx]->records + [krcp->bkvhead[idx]->nr_records++] = ptr; return true; } /* - * Queue a request for lazy invocation of kfree_bulk()/kfree() after a grace - * period. 
Please note there are two paths are maintained, one is the main one - * that uses kfree_bulk() interface and second one is emergency one, that is - * used only when the main path can not be maintained temporary, due to memory - * pressure. + * Queue a request for lazy invocation of appropriate free routine after a + * grace period. Please note there are three paths are maintained, two are the + * main ones that use array of pointers interface and third one is emergency + * one, that is used only when the main path can not be maintained temporary, + * due to memory pressure. * * Each kfree_call_rcu() request is added to a batch. The batch will be drained * every KFREE_DRAIN_JIFFIES number of jiffies. All the objects in the batch will * be free'd in workqueue context. This allows us to: batch requests together to - * reduce the number of grace periods during heavy kfree_rcu() load. + * reduce the number of grace periods during heavy kfree_rcu()/kvfree_rcu() load. */ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) { @@ -3343,7 +3370,7 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) * Under high memory pressure GFP_NOWAIT can fail, * in that case the emergency path is maintained. */ - if (unlikely(!kfree_call_rcu_add_ptr_to_bulk(krcp, head, func))) { + if (unlikely(!kvfree_call_rcu_add_ptr_to_bulk(krcp, ptr))) { head->func = func; head->next = krcp->head; krcp->head = head; @@ -4324,7 +4351,7 @@ static void __init kfree_rcu_batch_init(void) for_each_possible_cpu(cpu) { struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); - struct kfree_rcu_bulk_data *bnode; + struct kvfree_rcu_bulk_data *bnode; for (i = 0; i < KFREE_N_BATCHES; i++) { INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work); @@ -4332,7 +4359,7 @@ static void __init kfree_rcu_batch_init(void) } for (i = 0; i < rcu_min_cached_objs; i++) { - bnode = (struct kfree_rcu_bulk_data *) + bnode = (struct kvfree_rcu_bulk_data *) __get_free_page(GFP_NOWAIT | __GFP_NOWARN); if (bnode) -- cgit v1.2.3 From c408b215f58f7156bb6bafb64c0263ee907033df Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Mon, 25 May 2020 23:47:55 +0200 Subject: rcu: Rename *_kfree_callback/*_kfree_rcu_offset/kfree_call_* The following changes are introduced: 1. Rename rcu_invoke_kfree_callback() to rcu_invoke_kvfree_callback(), as well as the associated trace events, so the rcu_kfree_callback(), becomes rcu_kvfree_callback(). The reason is to be aligned with kvfree() notation. 2. Rename __is_kfree_rcu_offset to __is_kvfree_rcu_offset. All RCU paths use kvfree() now instead of kfree(), thus rename it. 3. Rename kfree_call_rcu() to the kvfree_call_rcu(). The reason is, it is capable of freeing vmalloc() memory now. Do the same with __kfree_rcu() macro, it becomes __kvfree_rcu(), the goal is the same. Reviewed-by: Joel Fernandes (Google) Co-developed-by: Joel Fernandes (Google) Signed-off-by: Joel Fernandes (Google) Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Paul E. 
McKenney --- include/linux/rcupdate.h | 14 +++++++------- include/linux/rcutiny.h | 2 +- include/linux/rcutree.h | 2 +- include/trace/events/rcu.h | 8 ++++---- kernel/rcu/tiny.c | 4 ++-- kernel/rcu/tree.c | 16 ++++++++-------- 6 files changed, 23 insertions(+), 23 deletions(-) (limited to 'kernel/rcu/tree.c') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 659cbfa7581a..b344fc800a9b 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -828,17 +828,17 @@ static inline notrace void rcu_read_unlock_sched_notrace(void) /* * Does the specified offset indicate that the corresponding rcu_head - * structure can be handled by kfree_rcu()? + * structure can be handled by kvfree_rcu()? */ -#define __is_kfree_rcu_offset(offset) ((offset) < 4096) +#define __is_kvfree_rcu_offset(offset) ((offset) < 4096) /* * Helper macro for kfree_rcu() to prevent argument-expansion eyestrain. */ -#define __kfree_rcu(head, offset) \ +#define __kvfree_rcu(head, offset) \ do { \ - BUILD_BUG_ON(!__is_kfree_rcu_offset(offset)); \ - kfree_call_rcu(head, (rcu_callback_t)(unsigned long)(offset)); \ + BUILD_BUG_ON(!__is_kvfree_rcu_offset(offset)); \ + kvfree_call_rcu(head, (rcu_callback_t)(unsigned long)(offset)); \ } while (0) /** @@ -857,7 +857,7 @@ static inline notrace void rcu_read_unlock_sched_notrace(void) * Because the functions are not allowed in the low-order 4096 bytes of * kernel virtual memory, offsets up to 4095 bytes can be accommodated. * If the offset is larger than 4095 bytes, a compile-time error will - * be generated in __kfree_rcu(). If this error is triggered, you can + * be generated in __kvfree_rcu(). If this error is triggered, you can * either fall back to use of call_rcu() or rearrange the structure to * position the rcu_head structure into the first 4096 bytes. * @@ -872,7 +872,7 @@ do { \ typeof (ptr) ___p = (ptr); \ \ if (___p) \ - __kfree_rcu(&((___p)->rhf), offsetof(typeof(*(ptr)), rhf)); \ + __kvfree_rcu(&((___p)->rhf), offsetof(typeof(*(ptr)), rhf)); \ } while (0) /* diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 8512caeb7682..fb2eb39c484f 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -34,7 +34,7 @@ static inline void synchronize_rcu_expedited(void) synchronize_rcu(); } -static inline void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) +static inline void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func) { call_rcu(head, func); } diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index d5cc9d675987..d2f4064ebd1d 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -33,7 +33,7 @@ static inline void rcu_virt_note_context_switch(int cpu) } void synchronize_rcu_expedited(void); -void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func); +void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func); void rcu_barrier(void); bool rcu_eqs_special_set(int cpu); diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index f9a7811148e2..0ee93d0b1daa 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -506,13 +506,13 @@ TRACE_EVENT_RCU(rcu_callback, /* * Tracepoint for the registration of a single RCU callback of the special - * kfree() form. The first argument is the RCU type, the second argument + * kvfree() form. 
The first argument is the RCU type, the second argument * is a pointer to the RCU callback, the third argument is the offset * of the callback within the enclosing RCU-protected data structure, * the fourth argument is the number of lazy callbacks queued, and the * fifth argument is the total number of callbacks queued. */ -TRACE_EVENT_RCU(rcu_kfree_callback, +TRACE_EVENT_RCU(rcu_kvfree_callback, TP_PROTO(const char *rcuname, struct rcu_head *rhp, unsigned long offset, long qlen), @@ -596,12 +596,12 @@ TRACE_EVENT_RCU(rcu_invoke_callback, /* * Tracepoint for the invocation of a single RCU callback of the special - * kfree() form. The first argument is the RCU flavor, the second + * kvfree() form. The first argument is the RCU flavor, the second * argument is a pointer to the RCU callback, and the third argument * is the offset of the callback within the enclosing RCU-protected * data structure. */ -TRACE_EVENT_RCU(rcu_invoke_kfree_callback, +TRACE_EVENT_RCU(rcu_invoke_kvfree_callback, TP_PROTO(const char *rcuname, struct rcu_head *rhp, unsigned long offset), diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 4b99f7b88bee..aa897c3f2e92 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -85,8 +85,8 @@ static inline bool rcu_reclaim_tiny(struct rcu_head *head) unsigned long offset = (unsigned long)head->func; rcu_lock_acquire(&rcu_callback_map); - if (__is_kfree_rcu_offset(offset)) { - trace_rcu_invoke_kfree_callback("", head, offset); + if (__is_kvfree_rcu_offset(offset)) { + trace_rcu_invoke_kvfree_callback("", head, offset); kvfree((void *)head - offset); rcu_lock_release(&rcu_callback_map); return true; diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 67c4b984c499..f22c47e72287 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2905,8 +2905,8 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func) return; // Enqueued onto ->nocb_bypass, so just leave. // If no-CBs CPU gets here, rcu_nocb_try_bypass() acquired ->nocb_lock. rcu_segcblist_enqueue(&rdp->cblist, head); - if (__is_kfree_rcu_offset((unsigned long)func)) - trace_rcu_kfree_callback(rcu_state.name, head, + if (__is_kvfree_rcu_offset((unsigned long)func)) + trace_rcu_kvfree_callback(rcu_state.name, head, (unsigned long)func, rcu_segcblist_n_cbs(&rdp->cblist)); else @@ -3146,7 +3146,7 @@ static void kfree_rcu_work(struct work_struct *work) bkvhead[i]->records); } else { // vmalloc() / vfree(). for (j = 0; j < bkvhead[i]->nr_records; j++) { - trace_rcu_invoke_kfree_callback( + trace_rcu_invoke_kvfree_callback( rcu_state.name, bkvhead[i]->records[j], 0); @@ -3179,9 +3179,9 @@ static void kfree_rcu_work(struct work_struct *work) next = head->next; debug_rcu_head_unqueue((struct rcu_head *)ptr); rcu_lock_acquire(&rcu_callback_map); - trace_rcu_invoke_kfree_callback(rcu_state.name, head, offset); + trace_rcu_invoke_kvfree_callback(rcu_state.name, head, offset); - if (!WARN_ON_ONCE(!__is_kfree_rcu_offset(offset))) + if (!WARN_ON_ONCE(!__is_kvfree_rcu_offset(offset))) kvfree(ptr); rcu_lock_release(&rcu_callback_map); @@ -3344,12 +3344,12 @@ kvfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, void *ptr) * one, that is used only when the main path can not be maintained temporary, * due to memory pressure. * - * Each kfree_call_rcu() request is added to a batch. The batch will be drained + * Each kvfree_call_rcu() request is added to a batch. The batch will be drained * every KFREE_DRAIN_JIFFIES number of jiffies. All the objects in the batch will * be free'd in workqueue context. 
This allows us to: batch requests together to * reduce the number of grace periods during heavy kfree_rcu()/kvfree_rcu() load. */ -void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) +void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func) { unsigned long flags; struct kfree_rcu_cpu *krcp; @@ -3388,7 +3388,7 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) unlock_return: krc_this_cpu_unlock(krcp, flags); } -EXPORT_SYMBOL_GPL(kfree_call_rcu); +EXPORT_SYMBOL_GPL(kvfree_call_rcu); static unsigned long kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc) -- cgit v1.2.3 From 3042f83f19bec2e0cd356f72b39e4d816e8cd5ff Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Mon, 25 May 2020 23:47:58 +0200 Subject: rcu: Support reclaim for head-less object Update the kvfree_call_rcu() function with head-less support. This allows RCU to reclaim objects without an embedded rcu_head. tree-RCU: We introduce two chains of arrays to store SLAB-backed and vmalloc pointers, each. Storage in either of these arrays does not require embedding an rcu_head within the object. Maintaining the arrays may become impossible due to high memory pressure. For such cases there is an emergency path. Objects with rcu_head inside are just queued on a backup rcu_head list. Later on that list is drained. As for the head-less variant, as the current context can sleep, the following emergency measures are applied: a) Synchronously wait until a grace period has elapsed. b) Call kvfree(). tiny-RCU: For double argument calls, there are no new changes in behavior. For single argument call, kvfree() is directly inlined on the current stack after a synchronize_rcu() call. Note that for tiny-RCU, any call to synchronize_rcu() is actually a quiescent state, therefore it does nothing. Reviewed-by: Joel Fernandes (Google) Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Joel Fernandes (Google) Co-developed-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- include/linux/rcutiny.h | 18 +++++++++++++++++- kernel/rcu/tree.c | 45 +++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 60 insertions(+), 3 deletions(-) (limited to 'kernel/rcu/tree.c') diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index fb2eb39c484f..5cc9637cac16 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -34,9 +34,25 @@ static inline void synchronize_rcu_expedited(void) synchronize_rcu(); } +/* + * Add one more declaration of kvfree() here. It is + * not so straight forward to just include + * where it is defined due to getting many compile + * errors caused by that include. + */ +extern void kvfree(const void *addr); + static inline void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func) { - call_rcu(head, func); + if (head) { + call_rcu(head, func); + return; + } + + // kvfree_rcu(one_arg) call. + might_sleep(); + synchronize_rcu(); + kvfree((void *) func); } void rcu_qs(void); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index f22c47e72287..01f29e4500ba 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3314,6 +3314,13 @@ kvfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, void *ptr) if (IS_ENABLED(CONFIG_PREEMPT_RT)) return false; + /* + * NOTE: For one argument of kvfree_rcu() we can + * drop the lock and get the page in sleepable + * context. That would allow to maintain an array + * for the CONFIG_PREEMPT_RT as well if no cached + * pages are available. 
+ */ bnode = (struct kvfree_rcu_bulk_data *) __get_free_page(GFP_NOWAIT | __GFP_NOWARN); } @@ -3353,16 +3360,33 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func) { unsigned long flags; struct kfree_rcu_cpu *krcp; + bool success; void *ptr; + if (head) { + ptr = (void *) head - (unsigned long) func; + } else { + /* + * Please note there is a limitation for the head-less + * variant, that is why there is a clear rule for such + * objects: it can be used from might_sleep() context + * only. For other places please embed an rcu_head to + * your data. + */ + might_sleep(); + ptr = (unsigned long *) func; + } + krcp = krc_this_cpu_lock(&flags); - ptr = (void *)head - (unsigned long)func; // Queue the object but don't yet schedule the batch. if (debug_rcu_head_queue(ptr)) { // Probable double kfree_rcu(), just leak. WARN_ONCE(1, "%s(): Double-freed call. rcu_head %p\n", __func__, head); + + // Mark as success and leave. + success = true; goto unlock_return; } @@ -3370,10 +3394,16 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func) * Under high memory pressure GFP_NOWAIT can fail, * in that case the emergency path is maintained. */ - if (unlikely(!kvfree_call_rcu_add_ptr_to_bulk(krcp, ptr))) { + success = kvfree_call_rcu_add_ptr_to_bulk(krcp, ptr); + if (!success) { + if (head == NULL) + // Inline if kvfree_rcu(one_arg) call. + goto unlock_return; + head->func = func; head->next = krcp->head; krcp->head = head; + success = true; } WRITE_ONCE(krcp->count, krcp->count + 1); @@ -3387,6 +3417,17 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func) unlock_return: krc_this_cpu_unlock(krcp, flags); + + /* + * Inline kvfree() after synchronize_rcu(). We can do + * it from might_sleep() context only, so the current + * CPU can pass the QS state. + */ + if (!success) { + debug_rcu_head_unqueue((struct rcu_head *) ptr); + synchronize_rcu(); + kvfree(ptr); + } } EXPORT_SYMBOL_GPL(kvfree_call_rcu); -- cgit v1.2.3 From 26e760c9a7c8ec31fa1a6bfbbce3f63f189ccef0 Mon Sep 17 00:00:00 2001 From: Walter Wu Date: Thu, 6 Aug 2020 23:24:35 -0700 Subject: rcu: kasan: record and print call_rcu() call stack Patch series "kasan: memorize and print call_rcu stack", v8. This patchset improves KASAN reports by making them to have call_rcu() call stack information. It is useful for programmers to solve use-after-free or double-free memory issue. The KASAN report was as follows(cleaned up slightly): BUG: KASAN: use-after-free in kasan_rcu_reclaim+0x58/0x60 Freed by task 0: kasan_save_stack+0x24/0x50 kasan_set_track+0x24/0x38 kasan_set_free_info+0x18/0x20 __kasan_slab_free+0x10c/0x170 kasan_slab_free+0x10/0x18 kfree+0x98/0x270 kasan_rcu_reclaim+0x1c/0x60 Last call_rcu(): kasan_save_stack+0x24/0x50 kasan_record_aux_stack+0xbc/0xd0 call_rcu+0x8c/0x580 kasan_rcu_uaf+0xf4/0xf8 Generic KASAN will record the last two call_rcu() call stacks and print up to 2 call_rcu() call stacks in KASAN report. it is only suitable for generic KASAN. This feature considers the size of struct kasan_alloc_meta and kasan_free_meta, we try to optimize the structure layout and size, lets it get better memory consumption. [1]https://bugzilla.kernel.org/show_bug.cgi?id=198437 [2]https://groups.google.com/forum/#!searchin/kasan-dev/better$20stack$20traces$20for$20rcu%7Csort:date/kasan-dev/KQsjT_88hDE/7rNUZprRBgAJ This patch (of 4): This feature will record the last two call_rcu() call stacks and prints up to 2 call_rcu() call stacks in KASAN report. 
When call_rcu() is called, we store the call_rcu() call stack into slub alloc meta-data, so that the KASAN report can print rcu stack. [1]https://bugzilla.kernel.org/show_bug.cgi?id=198437 [2]https://groups.google.com/forum/#!searchin/kasan-dev/better$20stack$20traces$20for$20rcu%7Csort:date/kasan-dev/KQsjT_88hDE/7rNUZprRBgAJ [walter-zh.wu@mediatek.com: build fix] Link: http://lkml.kernel.org/r/20200710162401.23816-1-walter-zh.wu@mediatek.com Suggested-by: Dmitry Vyukov Signed-off-by: Walter Wu Signed-off-by: Andrew Morton Tested-by: Dmitry Vyukov Reviewed-by: Dmitry Vyukov Reviewed-by: Andrey Konovalov Acked-by: Paul E. McKenney Cc: Andrey Ryabinin Cc: Alexander Potapenko Cc: Josh Triplett Cc: Mathieu Desnoyers Cc: Lai Jiangshan Cc: Joel Fernandes Cc: Jonathan Corbet Cc: Matthias Brugger Link: http://lkml.kernel.org/r/20200710162123.23713-1-walter-zh.wu@mediatek.com Link: http://lkml.kernel.org/r/20200601050847.1096-1-walter-zh.wu@mediatek.com Link: http://lkml.kernel.org/r/20200601050927.1153-1-walter-zh.wu@mediatek.com Signed-off-by: Linus Torvalds --- include/linux/kasan.h | 2 ++ kernel/rcu/tree.c | 2 ++ mm/kasan/common.c | 4 ++-- mm/kasan/generic.c | 21 +++++++++++++++++++++ mm/kasan/kasan.h | 9 +++++++++ mm/kasan/report.c | 28 +++++++++++++++++++++++----- 6 files changed, 59 insertions(+), 7 deletions(-) (limited to 'kernel/rcu/tree.c') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 82522e996c76..18452e35e7b2 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -174,11 +174,13 @@ static inline size_t kasan_metadata_size(struct kmem_cache *cache) { return 0; } void kasan_cache_shrink(struct kmem_cache *cache); void kasan_cache_shutdown(struct kmem_cache *cache); +void kasan_record_aux_stack(void *ptr); #else /* CONFIG_KASAN_GENERIC */ static inline void kasan_cache_shrink(struct kmem_cache *cache) {} static inline void kasan_cache_shutdown(struct kmem_cache *cache) {} +static inline void kasan_record_aux_stack(void *ptr) {} #endif /* CONFIG_KASAN_GENERIC */ diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index ac7198ed3197..8ce77d9ac716 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -59,6 +59,7 @@ #include #include #include +#include #include "../time/tick-internal.h" #include "tree.h" @@ -2890,6 +2891,7 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func) head->func = func; head->next = NULL; local_irq_save(flags); + kasan_record_aux_stack(head); rdp = this_cpu_ptr(&rcu_data); /* Add the callback to our list. 
*/ diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 757d4074fe28..ad24666f50e4 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -40,7 +40,7 @@ #include "kasan.h" #include "../slab.h" -static inline depot_stack_handle_t save_stack(gfp_t flags) +depot_stack_handle_t kasan_save_stack(gfp_t flags) { unsigned long entries[KASAN_STACK_DEPTH]; unsigned int nr_entries; @@ -53,7 +53,7 @@ static inline depot_stack_handle_t save_stack(gfp_t flags) static inline void set_track(struct kasan_track *track, gfp_t flags) { track->pid = current->pid; - track->stack = save_stack(flags); + track->stack = kasan_save_stack(flags); } void kasan_enable_current(void) diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index 098a7dbaced6..d70586393b04 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -324,3 +324,24 @@ DEFINE_ASAN_SET_SHADOW(f2); DEFINE_ASAN_SET_SHADOW(f3); DEFINE_ASAN_SET_SHADOW(f5); DEFINE_ASAN_SET_SHADOW(f8); + +void kasan_record_aux_stack(void *addr) +{ + struct page *page = kasan_addr_to_page(addr); + struct kmem_cache *cache; + struct kasan_alloc_meta *alloc_info; + void *object; + + if (!(page && PageSlab(page))) + return; + + cache = page->slab_cache; + object = nearest_obj(cache, page, addr); + alloc_info = get_alloc_info(cache, object); + + /* + * record the last two call_rcu() call stacks. + */ + alloc_info->aux_stack[1] = alloc_info->aux_stack[0]; + alloc_info->aux_stack[0] = kasan_save_stack(GFP_NOWAIT); +} diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index cfade6413528..f89a195e336a 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -104,6 +104,13 @@ struct kasan_track { struct kasan_alloc_meta { struct kasan_track alloc_track; +#ifdef CONFIG_KASAN_GENERIC + /* + * call_rcu() call stack is stored into struct kasan_alloc_meta. + * The free stack is stored into struct kasan_free_meta. 
+ */ + depot_stack_handle_t aux_stack[2]; +#endif struct kasan_track free_track[KASAN_NR_FREE_STACKS]; #ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY u8 free_pointer_tag[KASAN_NR_FREE_STACKS]; @@ -159,6 +166,8 @@ void kasan_report_invalid_free(void *object, unsigned long ip); struct page *kasan_addr_to_page(const void *addr); +depot_stack_handle_t kasan_save_stack(gfp_t flags); + #if defined(CONFIG_KASAN_GENERIC) && \ (defined(CONFIG_SLAB) || defined(CONFIG_SLUB)) void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache); diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 51ec45407a0b..445a9d56eb13 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -106,15 +106,20 @@ static void end_report(unsigned long *flags) kasan_enable_current(); } +static void print_stack(depot_stack_handle_t stack) +{ + unsigned long *entries; + unsigned int nr_entries; + + nr_entries = stack_depot_fetch(stack, &entries); + stack_trace_print(entries, nr_entries, 0); +} + static void print_track(struct kasan_track *track, const char *prefix) { pr_err("%s by task %u:\n", prefix, track->pid); if (track->stack) { - unsigned long *entries; - unsigned int nr_entries; - - nr_entries = stack_depot_fetch(track->stack, &entries); - stack_trace_print(entries, nr_entries, 0); + print_stack(track->stack); } else { pr_err("(stack is not available)\n"); } @@ -193,6 +198,19 @@ static void describe_object(struct kmem_cache *cache, void *object, free_track = kasan_get_free_track(cache, object, tag); print_track(free_track, "Freed"); pr_err("\n"); + +#ifdef CONFIG_KASAN_GENERIC + if (alloc_info->aux_stack[0]) { + pr_err("Last call_rcu():\n"); + print_stack(alloc_info->aux_stack[0]); + pr_err("\n"); + } + if (alloc_info->aux_stack[1]) { + pr_err("Second to last call_rcu():\n"); + print_stack(alloc_info->aux_stack[1]); + pr_err("\n"); + } +#endif } describe_object_addr(cache, object, addr); -- cgit v1.2.3
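Taken together, the kvfree_call_rcu() changes above leave callers with two forms of deferred freeing: the double-argument form for objects that embed an rcu_head, and the single-argument head-less form that may sleep. The sketch below illustrates both; it is not code from these patches. The structures and function names are hypothetical, and it assumes the kvfree_rcu() wrapper macros introduced elsewhere in this series on top of the kvfree_call_rcu() entry point shown in the diffs.

/*
 * Illustrative sketch only: the structures and function below are made up.
 * Assumes the kvfree_rcu() wrapper macros from elsewhere in this series,
 * which expand to the kvfree_call_rcu() entry point shown above.
 */
#include <linux/slab.h>
#include <linux/rcupdate.h>

struct foo_with_head {
	unsigned long data;
	struct rcu_head rhf;		/* needed for the double-argument form */
};

struct bar_headless {
	unsigned long data;		/* no rcu_head embedded anywhere */
};

static void example_deferred_free(struct foo_with_head *fp,
				  struct bar_headless *bp)
{
	/*
	 * Double-argument form: the rcu_head offset is encoded in the
	 * callback pointer, so this does not sleep and may be used from
	 * atomic context. Under memory pressure the object is queued on
	 * the backup rcu_head list (the emergency path above).
	 */
	kvfree_rcu(fp, rhf);

	/*
	 * Single-argument, head-less form: no rcu_head is required, but
	 * the call may sleep. If neither an array slot nor a page can be
	 * obtained, it falls back to synchronize_rcu() followed by
	 * kvfree(), so it must only be used where might_sleep() is legal.
	 */
	kvfree_rcu(bp);
}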
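The sample report quoted in the commit message above comes from a use-after-free that occurs inside an RCU callback. A minimal sketch of that access pattern follows; the struct and function names are hypothetical stand-ins (the in-tree reproducer behind the quoted report is KASAN's kasan_rcu_uaf test). With kasan_record_aux_stack() wired into __call_rcu() as in the diff above, the report for the bad access below would also carry the "Last call_rcu():" stack.

/*
 * Illustrative sketch only: struct and function names are hypothetical;
 * the in-tree reproducer for the quoted report is the kasan_rcu_uaf test.
 */
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/rcupdate.h>

struct rcu_uaf_obj {
	int payload;
	struct rcu_head rcu;
};

static struct rcu_uaf_obj *rcu_uaf_ptr;

static void rcu_uaf_reclaim(struct rcu_head *rp)
{
	struct rcu_uaf_obj *obj = container_of(rp, struct rcu_uaf_obj, rcu);

	kfree(obj);

	/*
	 * Use after free: KASAN flags the read below, and with this patch
	 * the report additionally prints the stack of the call_rcu()
	 * invocation recorded by kasan_record_aux_stack() in __call_rcu().
	 */
	((volatile struct rcu_uaf_obj *)obj)->payload;
}

static void rcu_uaf_trigger(void)
{
	rcu_uaf_ptr = kmalloc(sizeof(*rcu_uaf_ptr), GFP_KERNEL);
	if (!rcu_uaf_ptr)
		return;

	call_rcu(&rcu_uaf_ptr->rcu, rcu_uaf_reclaim);
}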