Diffstat (limited to 'kernel/rcu/tree_plugin.h')
-rw-r--r--  kernel/rcu/tree_plugin.h | 122
1 file changed, 102 insertions(+), 20 deletions(-)
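The hunks below test and assign DEFER_QS_IDLE and DEFER_QS_PENDING, the states that replace the old boolean rdp->defer_qs_iw_pending. Their definition is introduced elsewhere in this series (in kernel/rcu/tree.h) and is not part of this diff; the following is only a plausible sketch for reference while reading the hunks, and the exact spelling upstream may differ:

/* Sketch only -- not part of this diff; the real definition lives in tree.h. */
enum rcu_defer_qs_state {
	DEFER_QS_IDLE = 0,	/* No deferred-QS irq_work is outstanding. */
	DEFER_QS_PENDING,	/* irq_work queued; its handler has not yet run. */
};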
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 0b0f56f6abc8..fc14adf15cbb 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -486,13 +486,16 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
 	struct rcu_node *rnp;
 	union rcu_special special;
 
+	rdp = this_cpu_ptr(&rcu_data);
+	if (rdp->defer_qs_iw_pending == DEFER_QS_PENDING)
+		rdp->defer_qs_iw_pending = DEFER_QS_IDLE;
+
 	/*
 	 * If RCU core is waiting for this CPU to exit its critical section,
 	 * report the fact that it has exited.  Because irqs are disabled,
 	 * t->rcu_read_unlock_special cannot change.
 	 */
 	special = t->rcu_read_unlock_special;
-	rdp = this_cpu_ptr(&rcu_data);
 	if (!special.s && !rdp->cpu_no_qs.b.exp) {
 		local_irq_restore(flags);
 		return;
@@ -534,7 +537,6 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
 		WARN_ON_ONCE(rnp->completedqs == rnp->gp_seq &&
 			     (!empty_norm || rnp->qsmask));
 		empty_exp = sync_rcu_exp_done(rnp);
-		smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
 		np = rcu_next_node_entry(t, rnp);
 		list_del_init(&t->rcu_node_entry);
 		t->rcu_blocked_node = NULL;
@@ -624,10 +626,98 @@ notrace void rcu_preempt_deferred_qs(struct task_struct *t)
  */
 static void rcu_preempt_deferred_qs_handler(struct irq_work *iwp)
 {
+	unsigned long flags;
 	struct rcu_data *rdp;
 
 	rdp = container_of(iwp, struct rcu_data, defer_qs_iw);
-	rdp->defer_qs_iw_pending = false;
+	local_irq_save(flags);
+
+	/*
+	 * If the IRQ work handler happens to run in the middle of an RCU
+	 * read-side critical section, it could be ineffective in getting the
+	 * scheduler's attention to report a deferred quiescent state (the
+	 * whole point of the IRQ work). For this reason, requeue the IRQ work.
+	 *
+	 * Basically, we want to avoid the following situation:
+	 * 1. rcu_read_unlock() queues IRQ work (state -> DEFER_QS_PENDING)
+	 * 2. CPU enters a new rcu_read_lock()
+	 * 3. IRQ work runs but cannot report QS due to rcu_preempt_depth() > 0
+	 * 4. rcu_read_unlock() does not re-queue the work (state still PENDING)
+	 * 5. Deferred QS reporting does not happen.
+	 */
+	if (rcu_preempt_depth() > 0)
+		WRITE_ONCE(rdp->defer_qs_iw_pending, DEFER_QS_IDLE);
+
+	local_irq_restore(flags);
+}
+
+/*
+ * Check if expedited grace period processing during unlock is needed.
+ *
+ * This function determines whether expedited handling is required based on:
+ * 1. Task blocking an expedited grace period (based on a heuristic;
+ *    could be a false positive, see below)
+ * 2. CPU participating in an expedited grace period
+ * 3. Strict grace period mode requiring expedited handling
+ * 4. RCU priority deboosting needs when interrupts were disabled
+ *
+ * @t: The task being checked
+ * @rdp: The per-CPU RCU data
+ * @rnp: The RCU node for this CPU
+ * @irqs_were_disabled: Whether interrupts were disabled before rcu_read_unlock()
+ *
+ * Returns true if expedited processing of the rcu_read_unlock() is needed.
+ */
+static bool rcu_unlock_needs_exp_handling(struct task_struct *t,
+				      struct rcu_data *rdp,
+				      struct rcu_node *rnp,
+				      bool irqs_were_disabled)
+{
+	/*
+	 * Check if this task is blocking an expedited grace period. If the
+	 * task was preempted within an RCU read-side critical section and is
+	 * on the expedited grace period blockers list (exp_tasks), we need
+	 * expedited handling to unblock the expedited GP. This is not an exact
+	 * check because 't' might not be on the exp_tasks list at all - it's
+	 * just a fast heuristic that can sometimes yield a false positive.
+	 */
+	if (t->rcu_blocked_node && READ_ONCE(t->rcu_blocked_node->exp_tasks))
+		return true;
+
+	/*
+	 * Check if this CPU is participating in an expedited grace period.
+	 * The expmask bitmap tracks which CPUs need to check in for the
+	 * current expedited GP. If our CPU's bit is set, we need expedited
+	 * handling to help complete the expedited GP.
+	 */
+	if (rdp->grpmask & READ_ONCE(rnp->expmask))
+		return true;
+
+	/*
+	 * In CONFIG_RCU_STRICT_GRACE_PERIOD=y kernels, all grace periods
+	 * are treated as short for testing purposes even if that means
+	 * disturbing the system more. Check if either:
+	 * - This CPU has not yet reported a quiescent state, or
+	 * - This task was preempted within an RCU critical section
+	 * In either case, require expedited handling for strict GP mode.
+	 */
+	if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) &&
+	    ((rdp->grpmask & READ_ONCE(rnp->qsmask)) || t->rcu_blocked_node))
+		return true;
+
+	/*
+	 * RCU priority boosting case: If a task is subject to RCU priority
+	 * boosting and exits an RCU read-side critical section with interrupts
+	 * disabled, we need expedited handling to ensure timely deboosting.
+	 * Without this, a low-priority task could incorrectly run at high
+	 * real-time priority for an extended period, degrading real-time
+	 * responsiveness. This applies to all CONFIG_RCU_BOOST=y kernels,
+	 * not just to PREEMPT_RT.
+	 */
+	if (IS_ENABLED(CONFIG_RCU_BOOST) && irqs_were_disabled && t->rcu_blocked_node)
+		return true;
+
+	return false;
 }
 
 /*
@@ -649,18 +739,14 @@ static void rcu_read_unlock_special(struct task_struct *t)
 	local_irq_save(flags);
 	irqs_were_disabled = irqs_disabled_flags(flags);
 	if (preempt_bh_were_disabled || irqs_were_disabled) {
-		bool expboost; // Expedited GP in flight or possible boosting.
+		bool needs_exp; // Expedited handling needed.
 		struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
 		struct rcu_node *rnp = rdp->mynode;
 
-		expboost = (t->rcu_blocked_node && READ_ONCE(t->rcu_blocked_node->exp_tasks)) ||
-			   (rdp->grpmask & READ_ONCE(rnp->expmask)) ||
-			   (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) &&
-			   ((rdp->grpmask & READ_ONCE(rnp->qsmask)) || t->rcu_blocked_node)) ||
-			   (IS_ENABLED(CONFIG_RCU_BOOST) && irqs_were_disabled &&
-			    t->rcu_blocked_node);
+		needs_exp = rcu_unlock_needs_exp_handling(t, rdp, rnp, irqs_were_disabled);
+
 		// Need to defer quiescent state until everything is enabled.
-		if (use_softirq && (in_hardirq() || (expboost && !irqs_were_disabled))) {
+		if (use_softirq && (in_hardirq() || (needs_exp && !irqs_were_disabled))) {
 			// Using softirq, safe to awaken, and either the
 			// wakeup is free or there is either an expedited
 			// GP in flight or a potential need to deboost.
@@ -673,17 +759,13 @@ static void rcu_read_unlock_special(struct task_struct *t)
 			set_tsk_need_resched(current);
 			set_preempt_need_resched();
 			if (IS_ENABLED(CONFIG_IRQ_WORK) && irqs_were_disabled &&
-			    expboost && !rdp->defer_qs_iw_pending && cpu_online(rdp->cpu)) {
+			    needs_exp && rdp->defer_qs_iw_pending != DEFER_QS_PENDING &&
+			    cpu_online(rdp->cpu)) {
 				// Get scheduler to re-evaluate and call hooks.
 				// If !IRQ_WORK, FQS scan will eventually IPI.
-				if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) &&
-				    IS_ENABLED(CONFIG_PREEMPT_RT))
-					rdp->defer_qs_iw = IRQ_WORK_INIT_HARD(
-								rcu_preempt_deferred_qs_handler);
-				else
-					init_irq_work(&rdp->defer_qs_iw,
-						      rcu_preempt_deferred_qs_handler);
-				rdp->defer_qs_iw_pending = true;
+				rdp->defer_qs_iw =
+					IRQ_WORK_INIT_HARD(rcu_preempt_deferred_qs_handler);
+				rdp->defer_qs_iw_pending = DEFER_QS_PENDING;
 				irq_work_queue_on(&rdp->defer_qs_iw, rdp->cpu);
 			}
 		}
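To make the five-step scenario in the rcu_preempt_deferred_qs_handler() comment concrete, here is a minimal single-threaded model of the state machine (plain userspace C, not kernel code: queueing, the handler, and the scheduler-driven QS report are collapsed into simple functions, and every name other than the two DEFER_QS_* states is invented for illustration):

#include <assert.h>
#include <stdio.h>

enum { DEFER_QS_IDLE, DEFER_QS_PENDING };

static int state = DEFER_QS_IDLE;	/* models rdp->defer_qs_iw_pending */
static int work_queued;			/* an irq_work instance is in flight */
static int preempt_depth;		/* models rcu_preempt_depth() */
static int qs_reported;

/* rcu_read_unlock() slow path: queue the irq_work unless one is pending. */
static void unlock_queue_work(void)
{
	if (state != DEFER_QS_PENDING) {
		state = DEFER_QS_PENDING;
		work_queued = 1;	/* stands in for irq_work_queue_on() */
	}
}

/* The irq_work handler; "fixed" selects the old vs. new behavior. */
static void run_pending_work(int fixed)
{
	if (!work_queued)
		return;
	work_queued = 0;
	if (preempt_depth > 0) {	/* ran inside a read-side critical section */
		if (fixed)
			state = DEFER_QS_IDLE;	/* new: let the next unlock requeue */
		return;				/* old: state stays PENDING */
	}
	state = DEFER_QS_IDLE;
	qs_reported = 1;	/* scheduler poke and deferred-QS report, collapsed */
}

/* Replay the five-step scenario from the comment in the patch. */
static int race(int fixed)
{
	state = DEFER_QS_IDLE;
	work_queued = preempt_depth = qs_reported = 0;

	unlock_queue_work();		/* 1. rcu_read_unlock() queues the work */
	preempt_depth = 1;		/* 2. CPU enters a new rcu_read_lock()  */
	run_pending_work(fixed);	/* 3. work runs with depth > 0          */
	preempt_depth = 0;		/* 4. the matching rcu_read_unlock()... */
	unlock_queue_work();		/*    ...requeues only if state is IDLE */
	run_pending_work(fixed);	/* 5. a requeued work can now report    */
	return qs_reported;
}

int main(void)
{
	printf("old behavior: QS reported = %d\n", race(0));	/* 0: report lost */
	printf("with fix:     QS reported = %d\n", race(1));	/* 1: reported    */
	assert(!race(0) && race(1));
	return 0;
}

With the old boolean (race(0)), the work is consumed inside the critical section while the state stays pending, so step 4 never requeues it and the deferred QS report is lost until something like an FQS scan intervenes; with the fix (race(1)), the handler drops back to DEFER_QS_IDLE and the next rcu_read_unlock() requeues the work.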

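The replacement of the inline expboost expression with rcu_unlock_needs_exp_handling() in the last two hunks is intended as a pure refactor. One way to convince yourself is to model each predicate and config option as an independent boolean and compare the old expression with the new helper over all inputs; the sketch below does that exhaustively (again userspace C, with parameter names that are invented stand-ins for the kernel-side tests):

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

/* The old inline expression from rcu_read_unlock_special(). */
static bool old_expboost(bool blocked, bool exp_tasks, bool in_expmask,
			 bool in_qsmask, bool irqs_off, bool strict, bool boost)
{
	return (blocked && exp_tasks) || in_expmask ||
	       (strict && (in_qsmask || blocked)) ||
	       (boost && irqs_off && blocked);
}

/* The refactored helper, clause by clause. */
static bool new_needs_exp(bool blocked, bool exp_tasks, bool in_expmask,
			  bool in_qsmask, bool irqs_off, bool strict, bool boost)
{
	if (blocked && exp_tasks)		/* task blocks an expedited GP */
		return true;
	if (in_expmask)				/* CPU owes the expedited GP a QS */
		return true;
	if (strict && (in_qsmask || blocked))	/* strict-GP testing mode */
		return true;
	if (boost && irqs_off && blocked)	/* timely RCU deboosting */
		return true;
	return false;
}

int main(void)
{
	for (int i = 0; i < (1 << 7); i++) {
		bool v[7];

		for (int b = 0; b < 7; b++)
			v[b] = (i >> b) & 1;
		assert(old_expboost(v[0], v[1], v[2], v[3], v[4], v[5], v[6]) ==
		       new_needs_exp(v[0], v[1], v[2], v[3], v[4], v[5], v[6]));
	}
	printf("expboost and rcu_unlock_needs_exp_handling agree on all %d inputs\n",
	       1 << 7);
	return 0;
}

Both functions encode the same four-clause disjunction, so the assert never fires; the refactor only changes where the clauses live and gives each one its own explanatory comment.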