path: root/kernel/rcu/tree_plugin.h
Diffstat (limited to 'kernel/rcu/tree_plugin.h')
-rw-r--r--  kernel/rcu/tree_plugin.h | 195
1 file changed, 144 insertions, 51 deletions
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index c569da65b421..4cd170b2d655 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -24,11 +24,12 @@ static bool rcu_rdp_is_offloaded(struct rcu_data *rdp)
* timers have their own means of synchronization against the
* offloaded state updaters.
*/
- RCU_LOCKDEP_WARN(
+ RCU_NOCB_LOCKDEP_WARN(
!(lockdep_is_held(&rcu_state.barrier_mutex) ||
(IS_ENABLED(CONFIG_HOTPLUG_CPU) && lockdep_is_cpus_held()) ||
- rcu_lockdep_is_held_nocb(rdp) ||
- (!(IS_ENABLED(CONFIG_PREEMPT_COUNT) && preemptible()) &&
+ lockdep_is_held(&rdp->nocb_lock) ||
+ lockdep_is_held(&rcu_state.nocb_mutex) ||
+ ((!(IS_ENABLED(CONFIG_PREEMPT_COUNT) && preemptible()) || softirq_count()) &&
rdp == this_cpu_ptr(&rcu_data)) ||
rcu_current_is_nocb_kthread(rdp)),
"Unsafe read of RCU_NOCB offloaded state"
@@ -182,9 +183,9 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp)
switch (blkd_state) {
case 0:
case RCU_EXP_TASKS:
- case RCU_EXP_TASKS + RCU_GP_BLKD:
+ case RCU_EXP_TASKS | RCU_GP_BLKD:
case RCU_GP_TASKS:
- case RCU_GP_TASKS + RCU_EXP_TASKS:
+ case RCU_GP_TASKS | RCU_EXP_TASKS:
/*
* Blocking neither GP, or first task blocking the normal
@@ -197,10 +198,10 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp)
case RCU_EXP_BLKD:
case RCU_GP_BLKD:
- case RCU_GP_BLKD + RCU_EXP_BLKD:
- case RCU_GP_TASKS + RCU_EXP_BLKD:
- case RCU_GP_TASKS + RCU_GP_BLKD + RCU_EXP_BLKD:
- case RCU_GP_TASKS + RCU_EXP_TASKS + RCU_GP_BLKD + RCU_EXP_BLKD:
+ case RCU_GP_BLKD | RCU_EXP_BLKD:
+ case RCU_GP_TASKS | RCU_EXP_BLKD:
+ case RCU_GP_TASKS | RCU_GP_BLKD | RCU_EXP_BLKD:
+ case RCU_GP_TASKS | RCU_EXP_TASKS | RCU_GP_BLKD | RCU_EXP_BLKD:
/*
* First task arriving that blocks either GP, or first task
@@ -213,9 +214,9 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp)
list_add_tail(&t->rcu_node_entry, &rnp->blkd_tasks);
break;
- case RCU_EXP_TASKS + RCU_EXP_BLKD:
- case RCU_EXP_TASKS + RCU_GP_BLKD + RCU_EXP_BLKD:
- case RCU_GP_TASKS + RCU_EXP_TASKS + RCU_EXP_BLKD:
+ case RCU_EXP_TASKS | RCU_EXP_BLKD:
+ case RCU_EXP_TASKS | RCU_GP_BLKD | RCU_EXP_BLKD:
+ case RCU_GP_TASKS | RCU_EXP_TASKS | RCU_EXP_BLKD:
/*
* Second or subsequent task blocking the expedited GP.
@@ -226,8 +227,8 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp)
list_add(&t->rcu_node_entry, rnp->exp_tasks);
break;
- case RCU_GP_TASKS + RCU_GP_BLKD:
- case RCU_GP_TASKS + RCU_EXP_TASKS + RCU_GP_BLKD:
+ case RCU_GP_TASKS | RCU_GP_BLKD:
+ case RCU_GP_TASKS | RCU_EXP_TASKS | RCU_GP_BLKD:
/*
* Second or subsequent task blocking the normal GP.
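
As a stand-alone illustration (not part of the patch), the hunks above switch
the case labels from '+' to '|' when combining the RCU_*_TASKS / RCU_*_BLKD
bits. The sketch below uses made-up DEMO_* flag values, not the kernel's
definitions, to show why bitwise OR is the safer composition for single-bit
flags:

#include <assert.h>

#define DEMO_GP_TASKS   0x1     /* made-up stand-ins for the flag bits */
#define DEMO_EXP_TASKS  0x2
#define DEMO_GP_BLKD    0x4

int main(void)
{
        /* For disjoint single-bit flags, '+' and '|' build the same value. */
        assert((DEMO_GP_TASKS + DEMO_EXP_TASKS) ==
               (DEMO_GP_TASKS | DEMO_EXP_TASKS));

        /*
         * But '|' is idempotent: repeating a flag cannot change the mask,
         * whereas '+' silently double-counts it into a bogus value.
         */
        assert((DEMO_GP_BLKD | DEMO_GP_BLKD) == DEMO_GP_BLKD);
        assert((DEMO_GP_BLKD + DEMO_GP_BLKD) != DEMO_GP_BLKD);
        return 0;
}
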
@@ -274,6 +275,7 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp)
rcu_report_exp_rdp(rdp);
else
WARN_ON_ONCE(rdp->cpu_no_qs.b.exp);
+ ASSERT_EXCLUSIVE_WRITER_SCOPED(rdp->cpu_no_qs.b.exp);
}
/*
@@ -484,13 +486,16 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
struct rcu_node *rnp;
union rcu_special special;
+ rdp = this_cpu_ptr(&rcu_data);
+ if (rdp->defer_qs_iw_pending == DEFER_QS_PENDING)
+ rdp->defer_qs_iw_pending = DEFER_QS_IDLE;
+
/*
* If RCU core is waiting for this CPU to exit its critical section,
* report the fact that it has exited. Because irqs are disabled,
* t->rcu_read_unlock_special cannot change.
*/
special = t->rcu_read_unlock_special;
- rdp = this_cpu_ptr(&rcu_data);
if (!special.s && !rdp->cpu_no_qs.b.exp) {
local_irq_restore(flags);
return;
@@ -532,7 +537,6 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
WARN_ON_ONCE(rnp->completedqs == rnp->gp_seq &&
(!empty_norm || rnp->qsmask));
empty_exp = sync_rcu_exp_done(rnp);
- smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
np = rcu_next_node_entry(t, rnp);
list_del_init(&t->rcu_node_entry);
t->rcu_blocked_node = NULL;
@@ -622,10 +626,98 @@ notrace void rcu_preempt_deferred_qs(struct task_struct *t)
*/
static void rcu_preempt_deferred_qs_handler(struct irq_work *iwp)
{
+ unsigned long flags;
struct rcu_data *rdp;
rdp = container_of(iwp, struct rcu_data, defer_qs_iw);
- rdp->defer_qs_iw_pending = false;
+ local_irq_save(flags);
+
+ /*
+ * If the IRQ work handler happens to run in the middle of an RCU read-side
+ * critical section, it could be ineffective in getting the scheduler's
+ * attention to report a deferred quiescent state (the whole point of the
+ * IRQ work). For this reason, requeue the IRQ work.
+ *
+ * Basically, we want to avoid the following situation:
+ * 1. rcu_read_unlock() queues IRQ work (state -> DEFER_QS_PENDING)
+ * 2. CPU enters new rcu_read_lock()
+ * 3. IRQ work runs but cannot report QS due to rcu_preempt_depth() > 0
+ * 4. rcu_read_unlock() does not re-queue work (state still PENDING)
+ * 5. Deferred QS reporting does not happen.
+ */
+ if (rcu_preempt_depth() > 0)
+ WRITE_ONCE(rdp->defer_qs_iw_pending, DEFER_QS_IDLE);
+
+ local_irq_restore(flags);
+}
+
+/*
+ * Check if expedited grace period processing during unlock is needed.
+ *
+ * This function determines whether expedited handling is required based on:
+ * 1. Task blocking an expedited grace period (based on a heuristic; could be
+ *    a false positive, see below)
+ * 2. CPU participating in an expedited grace period
+ * 3. Strict grace period mode requiring expedited handling
+ * 4. RCU priority deboosting needs when interrupts were disabled
+ *
+ * @t: The task being checked
+ * @rdp: The per-CPU RCU data
+ * @rnp: The RCU node for this CPU
+ * @irqs_were_disabled: Whether interrupts were disabled before rcu_read_unlock()
+ *
+ * Returns true if expedited processing of the rcu_read_unlock() is needed.
+ */
+static bool rcu_unlock_needs_exp_handling(struct task_struct *t,
+ struct rcu_data *rdp,
+ struct rcu_node *rnp,
+ bool irqs_were_disabled)
+{
+ /*
+ * Check if this task is blocking an expedited grace period. If the
+ * task was preempted within an RCU read-side critical section and is
+ * on the expedited grace period blockers list (exp_tasks), we need
+ * expedited handling to unblock the expedited GP. This is not an exact
+ * check because 't' might not be on the exp_tasks list at all - its
+ * just a fast heuristic that can be false-positive sometimes.
+ */
+ if (t->rcu_blocked_node && READ_ONCE(t->rcu_blocked_node->exp_tasks))
+ return true;
+
+ /*
+ * Check if this CPU is participating in an expedited grace period.
+ * The expmask bitmap tracks which CPUs need to check in for the
+ * current expedited GP. If our CPU's bit is set, we need expedited
+ * handling to help complete the expedited GP.
+ */
+ if (rdp->grpmask & READ_ONCE(rnp->expmask))
+ return true;
+
+ /*
+ * In CONFIG_RCU_STRICT_GRACE_PERIOD=y kernels, all grace periods
+ * are treated as short for testing purposes even if that means
+ * disturbing the system more. Check if either:
+ * - This CPU has not yet reported a quiescent state, or
+ * - This task was preempted within an RCU critical section
+ * In either case, require expedited handling for strict GP mode.
+ */
+ if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) &&
+ ((rdp->grpmask & READ_ONCE(rnp->qsmask)) || t->rcu_blocked_node))
+ return true;
+
+ /*
+ * RCU priority boosting case: If a task is subject to RCU priority
+ * boosting and exits an RCU read-side critical section with interrupts
+ * disabled, we need expedited handling to ensure timely deboosting.
+ * Without this, a low-priority task could incorrectly run at high
+ * real-time priority for an extended period, degrading real-time
+ * responsiveness. This applies to all CONFIG_RCU_BOOST=y kernels,
+ * not just to PREEMPT_RT.
+ */
+ if (IS_ENABLED(CONFIG_RCU_BOOST) && irqs_were_disabled && t->rcu_blocked_node)
+ return true;
+
+ return false;
}
/*
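
As a stand-alone illustration (not part of the patch), the sketch below models
the hand-off described in the handler comment above. Only the DEFER_QS_IDLE /
DEFER_QS_PENDING state names mirror the patch; the demo_* helpers, the fake
read-side depth counter, and the printf harness are hypothetical, and exist
only to show why the handler drops back to IDLE when it fires inside a
read-side critical section, so that a later rcu_read_unlock() can requeue the
work rather than lose the deferred quiescent-state report:

#include <stdio.h>

enum demo_defer_qs { DEMO_DEFER_QS_IDLE, DEMO_DEFER_QS_PENDING };

static enum demo_defer_qs demo_state = DEMO_DEFER_QS_IDLE;
static int demo_read_depth;             /* stand-in for rcu_preempt_depth() */

/* rcu_read_unlock() slow path: queue the IRQ work unless one is pending. */
static void demo_unlock_special(void)
{
        if (demo_state != DEMO_DEFER_QS_PENDING) {
                demo_state = DEMO_DEFER_QS_PENDING;
                printf("irq_work queued\n");
        }
}

/*
 * IRQ work handler: if it fires inside a new read-side critical section it
 * cannot report the deferred QS, so it returns to IDLE and relies on the
 * next unlock to requeue the work instead of leaving it stuck PENDING.
 */
static void demo_irq_work_handler(void)
{
        if (demo_read_depth > 0)
                demo_state = DEMO_DEFER_QS_IDLE;
}

int main(void)
{
        demo_unlock_special();          /* 1. unlock queues the IRQ work      */
        demo_read_depth = 1;            /* 2. CPU enters a new read section   */
        demo_irq_work_handler();        /* 3. handler runs too early -> IDLE  */
        demo_read_depth = 0;            /* 4. read section ends               */
        demo_unlock_special();          /* 5. unlock requeues, QS is reported */
        return 0;
}
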
@@ -647,18 +739,14 @@ static void rcu_read_unlock_special(struct task_struct *t)
local_irq_save(flags);
irqs_were_disabled = irqs_disabled_flags(flags);
if (preempt_bh_were_disabled || irqs_were_disabled) {
- bool expboost; // Expedited GP in flight or possible boosting.
+ bool needs_exp; // Expedited handling needed.
struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
struct rcu_node *rnp = rdp->mynode;
- expboost = (t->rcu_blocked_node && READ_ONCE(t->rcu_blocked_node->exp_tasks)) ||
- (rdp->grpmask & READ_ONCE(rnp->expmask)) ||
- (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) &&
- ((rdp->grpmask & READ_ONCE(rnp->qsmask)) || t->rcu_blocked_node)) ||
- (IS_ENABLED(CONFIG_RCU_BOOST) && irqs_were_disabled &&
- t->rcu_blocked_node);
+ needs_exp = rcu_unlock_needs_exp_handling(t, rdp, rnp, irqs_were_disabled);
+
// Need to defer quiescent state until everything is enabled.
- if (use_softirq && (in_hardirq() || (expboost && !irqs_were_disabled))) {
+ if (use_softirq && (in_hardirq() || (needs_exp && !irqs_were_disabled))) {
// Using softirq, safe to awaken, and either the
// wakeup is free or there is either an expedited
// GP in flight or a potential need to deboost.
@@ -671,17 +759,11 @@ static void rcu_read_unlock_special(struct task_struct *t)
set_tsk_need_resched(current);
set_preempt_need_resched();
if (IS_ENABLED(CONFIG_IRQ_WORK) && irqs_were_disabled &&
- expboost && !rdp->defer_qs_iw_pending && cpu_online(rdp->cpu)) {
+ needs_exp && rdp->defer_qs_iw_pending != DEFER_QS_PENDING &&
+ cpu_online(rdp->cpu)) {
// Get scheduler to re-evaluate and call hooks.
// If !IRQ_WORK, FQS scan will eventually IPI.
- if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) &&
- IS_ENABLED(CONFIG_PREEMPT_RT))
- rdp->defer_qs_iw = IRQ_WORK_INIT_HARD(
- rcu_preempt_deferred_qs_handler);
- else
- init_irq_work(&rdp->defer_qs_iw,
- rcu_preempt_deferred_qs_handler);
- rdp->defer_qs_iw_pending = true;
+ rdp->defer_qs_iw_pending = DEFER_QS_PENDING;
irq_work_queue_on(&rdp->defer_qs_iw, rdp->cpu);
}
}
@@ -820,6 +902,10 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck)
}
}
+static void rcu_preempt_deferred_qs_init(struct rcu_data *rdp)
+{
+ rdp->defer_qs_iw = IRQ_WORK_INIT_HARD(rcu_preempt_deferred_qs_handler);
+}
#else /* #ifdef CONFIG_PREEMPT_RCU */
/*
@@ -831,8 +917,17 @@ void rcu_read_unlock_strict(void)
{
struct rcu_data *rdp;
- if (irqs_disabled() || preempt_count() || !rcu_state.gp_kthread)
+ if (irqs_disabled() || in_atomic_preempt_off() || !rcu_state.gp_kthread)
return;
+
+ /*
+ * rcu_report_qs_rdp() can only be invoked with a stable rdp and
+ * from the local CPU.
+ *
+ * The in_atomic_preempt_off() check ensures that we come here holding
+ * the last preempt_count (which will get dropped once we return to
+ * __rcu_read_unlock()).
+ */
rdp = this_cpu_ptr(&rcu_data);
rdp->cpu_no_qs.b.norm = false;
rcu_report_qs_rdp(rdp);
@@ -869,7 +964,7 @@ static void rcu_qs(void)
/*
* Register an urgently needed quiescent state. If there is an
- * emergency, invoke rcu_momentary_dyntick_idle() to do a heavy-weight
+ * emergency, invoke rcu_momentary_eqs() to do a heavy-weight
* dyntick-idle quiescent state visible to other CPUs, which will in
* some cases serve for expedited as well as normal grace periods.
* Either way, register a lightweight quiescent state.
@@ -889,7 +984,7 @@ void rcu_all_qs(void)
this_cpu_write(rcu_data.rcu_urgent_qs, false);
if (unlikely(raw_cpu_read(rcu_data.rcu_need_heavy_qs))) {
local_irq_save(flags);
- rcu_momentary_dyntick_idle();
+ rcu_momentary_eqs();
local_irq_restore(flags);
}
rcu_qs();
@@ -909,7 +1004,7 @@ void rcu_note_context_switch(bool preempt)
goto out;
this_cpu_write(rcu_data.rcu_urgent_qs, false);
if (unlikely(raw_cpu_read(rcu_data.rcu_need_heavy_qs)))
- rcu_momentary_dyntick_idle();
+ rcu_momentary_eqs();
out:
rcu_tasks_qs(current, preempt);
trace_rcu_utilization(TPS("End context switch"));
@@ -973,13 +1068,16 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
*/
static void rcu_flavor_sched_clock_irq(int user)
{
- if (user || rcu_is_cpu_rrupt_from_idle()) {
+ if (user || rcu_is_cpu_rrupt_from_idle() ||
+ (IS_ENABLED(CONFIG_PREEMPT_COUNT) &&
+ (preempt_count() == HARDIRQ_OFFSET))) {
/*
* Get here if this CPU took its interrupt from user
- * mode or from the idle loop, and if this is not a
- * nested interrupt. In this case, the CPU is in
- * a quiescent state, so note it.
+ * mode, from the idle loop without this being a nested
+ * interrupt, or while not holding the task preempt count
+ * (with PREEMPT_COUNT=y). In this case, the CPU is in a
+ * quiescent state, so note it.
*
* No memory barrier is required here because rcu_qs()
* references only CPU-local variables that other CPUs
@@ -1007,6 +1105,8 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck)
WARN_ON_ONCE(!list_empty(&rnp->blkd_tasks));
}
+static void rcu_preempt_deferred_qs_init(struct rcu_data *rdp) { }
+
#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
/*
@@ -1216,16 +1316,13 @@ static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp)
raw_spin_lock_irqsave_rcu_node(rnp, flags);
rnp->boost_kthread_task = t;
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+
sp.sched_priority = kthread_prio;
sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
+ rcu_thread_affine_rnp(t, rnp);
wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */
}
-static struct task_struct *rcu_boost_task(struct rcu_node *rnp)
-{
- return READ_ONCE(rnp->boost_kthread_task);
-}
-
#else /* #ifdef CONFIG_RCU_BOOST */
static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
@@ -1242,10 +1339,6 @@ static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp)
{
}
-static struct task_struct *rcu_boost_task(struct rcu_node *rnp)
-{
- return NULL;
-}
#endif /* #else #ifdef CONFIG_RCU_BOOST */
/*