Diffstat (limited to 'kernel/sched/syscalls.c')
-rw-r--r--  kernel/sched/syscalls.c  237
1 file changed, 63 insertions(+), 174 deletions(-)
diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c
index ae1b42775ef9..77ae87f36e84 100644
--- a/kernel/sched/syscalls.c
+++ b/kernel/sched/syscalls.c
@@ -57,7 +57,7 @@ static int effective_prio(struct task_struct *p)
* keep the priority unchanged. Otherwise, update priority
* to the normal priority:
*/
- if (!rt_prio(p->prio))
+ if (!rt_or_dl_prio(p->prio))
return p->normal_prio;
return p->prio;
}
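
For context, rt_or_dl_prio() widens the old rt_prio() test so that effective_prio() still treats deadline-boosted tasks as boosted now that rt_prio() matches the RT range only. A minimal sketch of the distinction, assuming the usual MAX_DL_PRIO == 0 and MAX_RT_PRIO == 100 layout (these are not the verbatim kernel definitions):

    /* Deadline tasks sit below prio 0; RT tasks occupy [0, MAX_RT_PRIO). */
    static inline bool sketch_rt_prio(int prio)
    {
        return prio >= 0 && prio < 100;   /* RT range only */
    }

    static inline bool sketch_rt_or_dl_prio(int prio)
    {
        return prio < 100;                /* RT or deadline */
    }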
@@ -91,7 +91,7 @@ void set_user_nice(struct task_struct *p, long nice)
}
queued = task_on_rq_queued(p);
- running = task_current(rq, p);
+ running = task_current_donor(rq, p);
if (queued)
dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
if (running)
@@ -174,7 +174,7 @@ SYSCALL_DEFINE1(nice, int, increment)
return 0;
}
-#endif
+#endif /* __ARCH_WANT_SYS_NICE */
/**
* task_prio - return the priority value of a given task.
@@ -209,10 +209,8 @@ int idle_cpu(int cpu)
if (rq->nr_running)
return 0;
-#ifdef CONFIG_SMP
if (rq->ttwu_pending)
return 0;
-#endif
return 1;
}
@@ -255,109 +253,7 @@ int sched_core_idle_cpu(int cpu)
return idle_cpu(cpu);
}
-
-#endif
-
-#ifdef CONFIG_SMP
-/*
- * This function computes an effective utilization for the given CPU, to be
- * used for frequency selection given the linear relation: f = u * f_max.
- *
- * The scheduler tracks the following metrics:
- *
- * cpu_util_{cfs,rt,dl,irq}()
- * cpu_bw_dl()
- *
- * Where the cfs,rt and dl util numbers are tracked with the same metric and
- * synchronized windows and are thus directly comparable.
- *
- * The cfs,rt,dl utilization are the running times measured with rq->clock_task
- * which excludes things like IRQ and steal-time. These latter are then accrued
- * in the IRQ utilization.
- *
- * The DL bandwidth number OTOH is not a measured metric but a value computed
- * based on the task model parameters and gives the minimal utilization
- * required to meet deadlines.
- */
-unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
- unsigned long *min,
- unsigned long *max)
-{
- unsigned long util, irq, scale;
- struct rq *rq = cpu_rq(cpu);
-
- scale = arch_scale_cpu_capacity(cpu);
-
- /*
- * Early check to see if IRQ/steal time saturates the CPU, can be
- * because of inaccuracies in how we track these -- see
- * update_irq_load_avg().
- */
- irq = cpu_util_irq(rq);
- if (unlikely(irq >= scale)) {
- if (min)
- *min = scale;
- if (max)
- *max = scale;
- return scale;
- }
-
- if (min) {
- /*
- * The minimum utilization returns the highest level between:
- * - the computed DL bandwidth needed with the IRQ pressure which
- * steals time to the deadline task.
- * - The minimum performance requirement for CFS and/or RT.
- */
- *min = max(irq + cpu_bw_dl(rq), uclamp_rq_get(rq, UCLAMP_MIN));
-
- /*
- * When an RT task is runnable and uclamp is not used, we must
- * ensure that the task will run at maximum compute capacity.
- */
- if (!uclamp_is_used() && rt_rq_is_runnable(&rq->rt))
- *min = max(*min, scale);
- }
-
- /*
- * Because the time spend on RT/DL tasks is visible as 'lost' time to
- * CFS tasks and we use the same metric to track the effective
- * utilization (PELT windows are synchronized) we can directly add them
- * to obtain the CPU's actual utilization.
- */
- util = util_cfs + cpu_util_rt(rq);
- util += cpu_util_dl(rq);
-
- /*
- * The maximum hint is a soft bandwidth requirement, which can be lower
- * than the actual utilization because of uclamp_max requirements.
- */
- if (max)
- *max = min(scale, uclamp_rq_get(rq, UCLAMP_MAX));
-
- if (util >= scale)
- return scale;
-
- /*
- * There is still idle time; further improve the number by using the
- * IRQ metric. Because IRQ/steal time is hidden from the task clock we
- * need to scale the task numbers:
- *
- * max - irq
- * U' = irq + --------- * U
- * max
- */
- util = scale_irq_capacity(util, irq, scale);
- util += irq;
-
- return min(scale, util);
-}
-
-unsigned long sched_cpu_util(int cpu)
-{
- return effective_cpu_util(cpu, cpu_util_cfs(cpu), NULL, NULL);
-}
-#endif /* CONFIG_SMP */
+#endif /* CONFIG_SCHED_CORE */
/**
* find_process_by_pid - find a process with a matching PID value.
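
The deleted helper deserves a gloss, since its comment carries the whole frequency-selection model: with PELT windows synchronized, the cfs/rt/dl utilizations can simply be summed, and the final step rescales by IRQ time that the task clock never saw, U' = irq + ((max - irq) / max) * U. A freestanding sketch of that arithmetic with hypothetical numbers (not the kernel's fixed-point helpers):

    #include <stdio.h>

    /* IRQ/steal time is invisible to rq->clock_task, so the task-clock
     * utilization only describes the capacity left after IRQs. */
    static unsigned long scale_irq_util(unsigned long util, unsigned long irq,
                                        unsigned long scale)
    {
        if (irq >= scale)
            return scale;                 /* IRQ pressure alone saturates */
        util = util * (scale - irq) / scale;
        util += irq;
        return util < scale ? util : scale;
    }

    int main(void)
    {
        /* hypothetical values on a capacity-1024 CPU */
        unsigned long cfs = 300, rt = 100, dl = 50, irq = 128, scale = 1024;
        unsigned long util = cfs + rt + dl;   /* comparable PELT sums */

        printf("effective util: %lu\n", scale_irq_util(util, irq, scale));
        return 0;
    }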
@@ -404,7 +300,15 @@ static void __setscheduler_params(struct task_struct *p,
if (dl_policy(policy))
__setparam_dl(p, attr);
else if (fair_policy(policy))
- p->static_prio = NICE_TO_PRIO(attr->sched_nice);
+ __setparam_fair(p, attr);
+
+ /* rt-policy tasks do not have a timerslack */
+ if (rt_or_dl_task_policy(p)) {
+ p->timer_slack_ns = 0;
+ } else if (p->timer_slack_ns == 0) {
+ /* when switching back to non-rt policy, restore timerslack */
+ p->timer_slack_ns = p->default_timer_slack_ns;
+ }
/*
* __sched_setscheduler() ensures attr->sched_priority == 0 when
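
The timer-slack rule added here matches what prctl(2) documents for PR_SET_TIMERSLACK: slack lets the kernel coalesce timer expirations, which a real-time task cannot tolerate, so switching to an RT/DL policy forces it to zero and switching back restores the per-task default. A small userspace probe of the current value:

    #include <stdio.h>
    #include <sys/prctl.h>

    int main(void)
    {
        /* PR_GET_TIMERSLACK returns the slack in nanoseconds; the usual
         * default is 50000 (50 us), and with this patch an RT-policy
         * task should read back 0. */
        long slack = prctl(PR_GET_TIMERSLACK, 0, 0, 0, 0);

        printf("timer slack: %ld ns\n", slack);
        return 0;
    }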
@@ -461,7 +365,7 @@ static int uclamp_validate(struct task_struct *p,
* blocking operation which obviously cannot be done while holding
* scheduler locks.
*/
- static_branch_enable(&sched_uclamp_used);
+ sched_uclamp_enable();
return 0;
}
@@ -541,7 +445,7 @@ static inline int uclamp_validate(struct task_struct *p,
}
static void __setscheduler_uclamp(struct task_struct *p,
const struct sched_attr *attr) { }
-#endif
+#endif /* !CONFIG_UCLAMP_TASK */
/*
* Allow unprivileged RT tasks to decrease priority.
@@ -612,7 +516,7 @@ int __sched_setscheduler(struct task_struct *p,
{
int oldpolicy = -1, policy = attr->sched_policy;
int retval, oldprio, newprio, queued, running;
- const struct sched_class *prev_class;
+ const struct sched_class *prev_class, *next_class;
struct balance_callback *head;
struct rq_flags rf;
int reset_on_fork;
@@ -695,12 +599,18 @@ recheck:
goto unlock;
}
+ retval = scx_check_setscheduler(p, policy);
+ if (retval)
+ goto unlock;
+
/*
* If not changing anything there's no need to proceed further,
* but store a possible modification of reset_on_fork.
*/
if (unlikely(policy == p->policy)) {
- if (fair_policy(policy) && attr->sched_nice != task_nice(p))
+ if (fair_policy(policy) &&
+ (attr->sched_nice != task_nice(p) ||
+ (attr->sched_runtime != p->se.slice)))
goto change;
if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
goto change;
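
The extra sched_runtime comparison is what allows sched_setattr() on an unchanged SCHED_NORMAL policy to still take the change path and update the task's custom slice. A hedged userspace sketch (raw syscall with a local copy of the uapi layout; the exact clamping of the requested slice is up to the kernel):

    #define _GNU_SOURCE
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    struct sched_attr {                  /* matches SCHED_ATTR_SIZE_VER1 */
        uint32_t size;
        uint32_t sched_policy;
        uint64_t sched_flags;
        int32_t  sched_nice;
        uint32_t sched_priority;
        uint64_t sched_runtime;
        uint64_t sched_deadline;
        uint64_t sched_period;
        uint32_t sched_util_min;
        uint32_t sched_util_max;
    };

    int main(void)
    {
        struct sched_attr attr;

        memset(&attr, 0, sizeof(attr));
        attr.size          = sizeof(attr);
        attr.sched_policy  = 0;          /* SCHED_NORMAL */
        attr.sched_runtime = 3000000;    /* request a 3 ms slice */

        if (syscall(SYS_sched_setattr, 0, &attr, 0))
            perror("sched_setattr");
        return 0;
    }

The matching read-back appears later in this diff: get_params() now reports p->se.slice through attr->sched_runtime for fair tasks.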
@@ -721,14 +631,14 @@ change:
* Do not allow real-time tasks into groups that have no runtime
* assigned.
*/
- if (rt_bandwidth_enabled() && rt_policy(policy) &&
+ if (rt_group_sched_enabled() &&
+ rt_bandwidth_enabled() && rt_policy(policy) &&
task_group(p)->rt_bandwidth.rt_runtime == 0 &&
!task_group_is_autogroup(task_group(p))) {
retval = -EPERM;
goto unlock;
}
-#endif
-#ifdef CONFIG_SMP
+#endif /* CONFIG_RT_GROUP_SCHED */
if (dl_bandwidth_enabled() && dl_policy(policy) &&
!(attr->sched_flags & SCHED_FLAG_SUGOV)) {
cpumask_t *span = rq->rd->span;
@@ -744,7 +654,6 @@ change:
goto unlock;
}
}
-#endif
}
/* Re-check policy now with rq lock held: */
@@ -783,20 +692,26 @@ change:
queue_flags &= ~DEQUEUE_MOVE;
}
+ prev_class = p->sched_class;
+ next_class = __setscheduler_class(policy, newprio);
+
+ if (prev_class != next_class && p->se.sched_delayed)
+ dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
+
queued = task_on_rq_queued(p);
- running = task_current(rq, p);
+ running = task_current_donor(rq, p);
if (queued)
dequeue_task(rq, p, queue_flags);
if (running)
put_prev_task(rq, p);
- prev_class = p->sched_class;
-
if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) {
__setscheduler_params(p, attr);
- __setscheduler_prio(p, newprio);
+ p->sched_class = next_class;
+ p->prio = newprio;
}
__setscheduler_uclamp(p, attr);
+ check_class_changing(rq, p, prev_class);
if (queued) {
/*
@@ -846,6 +761,9 @@ static int _sched_setscheduler(struct task_struct *p, int policy,
.sched_nice = PRIO_TO_NICE(p->static_prio),
};
+ if (p->se.custom_slice)
+ attr.sched_runtime = p->se.slice;
+
/* Fixup the legacy SCHED_RESET_ON_FORK hack. */
if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) {
attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
@@ -953,7 +871,7 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
{
struct sched_param lparam;
- if (!param || pid < 0)
+ if (unlikely(!param || pid < 0))
return -EINVAL;
if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
return -EFAULT;
@@ -1012,12 +930,14 @@ err_size:
static void get_params(struct task_struct *p, struct sched_attr *attr)
{
- if (task_has_dl_policy(p))
+ if (task_has_dl_policy(p)) {
__getparam_dl(p, attr);
- else if (task_has_rt_policy(p))
+ } else if (task_has_rt_policy(p)) {
attr->sched_priority = p->rt_priority;
- else
+ } else {
attr->sched_nice = task_nice(p);
+ attr->sched_runtime = p->se.slice;
+ }
}
/**
@@ -1060,7 +980,7 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
struct sched_attr attr;
int retval;
- if (!uattr || pid < 0 || flags)
+ if (unlikely(!uattr || pid < 0 || flags))
return -EINVAL;
retval = sched_copy_attr(uattr, &attr);
@@ -1125,7 +1045,7 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
struct task_struct *p;
int retval;
- if (!param || pid < 0)
+ if (unlikely(!param || pid < 0))
return -EINVAL;
scoped_guard (rcu) {
@@ -1147,45 +1067,6 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
return copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
}
-/*
- * Copy the kernel size attribute structure (which might be larger
- * than what user-space knows about) to user-space.
- *
- * Note that all cases are valid: user-space buffer can be larger or
- * smaller than the kernel-space buffer. The usual case is that both
- * have the same size.
- */
-static int
-sched_attr_copy_to_user(struct sched_attr __user *uattr,
- struct sched_attr *kattr,
- unsigned int usize)
-{
- unsigned int ksize = sizeof(*kattr);
-
- if (!access_ok(uattr, usize))
- return -EFAULT;
-
- /*
- * sched_getattr() ABI forwards and backwards compatibility:
- *
- * If usize == ksize then we just copy everything to user-space and all is good.
- *
- * If usize < ksize then we only copy as much as user-space has space for,
- * this keeps ABI compatibility as well. We skip the rest.
- *
- * If usize > ksize then user-space is using a newer version of the ABI,
- * which part the kernel doesn't know about. Just ignore it - tooling can
- * detect the kernel's knowledge of attributes from the attr->size value
- * which is set to ksize in this case.
- */
- kattr->size = min(usize, ksize);
-
- if (copy_to_user(uattr, kattr, kattr->size))
- return -EFAULT;
-
- return 0;
-}
-
/**
* sys_sched_getattr - similar to sched_getparam, but with sched_attr
* @pid: the pid in question.
@@ -1200,8 +1081,8 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
struct task_struct *p;
int retval;
- if (!uattr || pid < 0 || usize > PAGE_SIZE ||
- usize < SCHED_ATTR_SIZE_VER0 || flags)
+ if (unlikely(!uattr || pid < 0 || usize > PAGE_SIZE ||
+ usize < SCHED_ATTR_SIZE_VER0 || flags))
return -EINVAL;
scoped_guard (rcu) {
@@ -1230,10 +1111,10 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
#endif
}
- return sched_attr_copy_to_user(uattr, &kattr, usize);
+ kattr.size = min(usize, sizeof(kattr));
+ return copy_struct_to_user(uattr, usize, &kattr, sizeof(kattr), NULL);
}
-#ifdef CONFIG_SMP
int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask)
{
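
copy_struct_to_user() folds the deleted open-coded compatibility logic into one generic helper: copy min(usize, ksize) bytes, and when user-space passes a larger (newer-ABI) buffer, zero the tail the kernel knows nothing about; setting kattr.size first preserves the old behaviour of reporting how much the kernel filled in. A userspace model of those cases (the real helper also uses clear_user() and can report dropped nonzero tail bytes):

    #include <stdio.h>
    #include <string.h>

    static void model_copy_struct(char *dst, size_t usize,
                                  const char *src, size_t ksize)
    {
        size_t n = usize < ksize ? usize : ksize;

        memcpy(dst, src, n);                     /* shared prefix */
        if (usize > ksize)                       /* newer user ABI */
            memset(dst + ksize, 0, usize - ksize);
    }

    int main(void)
    {
        char kattr[8] = "KERNEL!", uattr[12];

        model_copy_struct(uattr, sizeof(uattr), kattr, sizeof(kattr));
        printf("%s\n", uattr);                   /* tail is zeroed */
        return 0;
    }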
/*
@@ -1244,6 +1125,13 @@ int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask)
return 0;
/*
+ * The special/sugov task isn't part of regular bandwidth/admission
+ * control so let userspace change affinities.
+ */
+ if (dl_entity_is_special(&p->dl))
+ return 0;
+
+ /*
* Since bandwidth control happens on root_domain basis,
* if admission test is enabled, we only admit -deadline
* tasks allowed to run on all the CPUs in the task's
@@ -1255,7 +1143,6 @@ int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask)
return 0;
}
-#endif /* CONFIG_SMP */
int __sched_setaffinity(struct task_struct *p, struct affinity_context *ctx)
{
@@ -1304,7 +1191,7 @@ int __sched_setaffinity(struct task_struct *p, struct affinity_context *ctx)
bool empty = !cpumask_and(new_mask, new_mask,
ctx->user_mask);
- if (WARN_ON_ONCE(empty))
+ if (empty)
cpumask_copy(new_mask, cpus_allowed);
}
__set_cpus_allowed_ptr(p, ctx);
@@ -1348,7 +1235,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
user_mask = alloc_user_cpus_ptr(NUMA_NO_NODE);
if (user_mask) {
cpumask_copy(user_mask, in_mask);
- } else if (IS_ENABLED(CONFIG_SMP)) {
+ } else {
return -ENOMEM;
}
@@ -1537,7 +1424,7 @@ int __sched yield_to(struct task_struct *p, bool preempt)
struct rq *rq, *p_rq;
int yielded = 0;
- scoped_guard (irqsave) {
+ scoped_guard (raw_spinlock_irqsave, &p->pi_lock) {
rq = this_rq();
again:
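
Pinning p->pi_lock here uses the cleanup.h guard infrastructure: scoped_guard() takes the lock (and disables IRQs for the *_irqsave classes) on entry and releases it on every exit from the scope. A sketch of the pattern with a hypothetical lock:

    #include <linux/cleanup.h>
    #include <linux/spinlock.h>

    static DEFINE_RAW_SPINLOCK(demo_lock);      /* hypothetical */

    static void demo(void)
    {
        scoped_guard (raw_spinlock_irqsave, &demo_lock) {
            /* critical section: unlock + irqrestore happen
             * automatically when this scope is left */
        }
    }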
@@ -1602,6 +1489,7 @@ SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
case SCHED_NORMAL:
case SCHED_BATCH:
case SCHED_IDLE:
+ case SCHED_EXT:
ret = 0;
break;
}
@@ -1629,6 +1517,7 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
case SCHED_NORMAL:
case SCHED_BATCH:
case SCHED_IDLE:
+ case SCHED_EXT:
ret = 0;
}
return ret;
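
With the new cases, sched_ext behaves like the other non-RT policies: both syscalls report a priority range of 0..0. A quick userspace check (SCHED_EXT taken as policy 7 per recent uapi headers; treat that constant as an assumption, and note that older kernels return -1/EINVAL for unknown policies):

    #include <sched.h>
    #include <stdio.h>

    #ifndef SCHED_EXT
    #define SCHED_EXT 7     /* assumed uapi value */
    #endif

    int main(void)
    {
        printf("max=%d min=%d\n",
               sched_get_priority_max(SCHED_EXT),
               sched_get_priority_min(SCHED_EXT));
        return 0;
    }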